def main(start, end, kw):
    iq = PriorityQueue()
    oq = PriorityQueue()
    result_q = Queue()
    crawler = Crawler(iq, oq, result_q)

    default_headers = deepcopy(DEFAULT_HEADERS)
    default_headers.update({'Referer': 'http://www.guokr.com/scientific/'})
    save_path = SCRIPT_CONFIG['SAVE_PATH']
    book_name = '果壳网'

    task = Task.make_task({
        'url': API_URL.format(start),
        'method': 'GET',
        'meta': {'headers': default_headers, 'verify': False},
        'parser': parser_list,
        'priority': 0,
        'save': {'cursor': start,
                 'start': start,
                 'end': end,
                 'kw': kw,
                 'save_path': SCRIPT_CONFIG['SAVE_PATH']},
        'retry': 3,
    })
    iq.put(task)

    # Init DB
    with ArticleDB(save_path, VERSION=0) as db:
        pass

    crawler.start()

    items = []
    with ArticleDB(save_path) as db:
        items.extend(db.select_article())
        db.insert_meta_data(['BOOK_NAME', book_name])
        db.increase_version()

    with HTML2Kindle(items, save_path, book_name, MAIN_CONFIG.get('KINDLEGEN_PATH')) as html2kindle:
        html2kindle.make_metadata(window=kw.get('window', 50))
        html2kindle.make_book_multi(save_path)

    if kw.get('email'):
        with SendEmail2Kindle() as s:
            s.send_all_mobi(SCRIPT_CONFIG['SAVE_PATH'])
    os._exit(0)
def resulter_collection(task):
    with ArticleDB(task['save']['save_path']) as article_db:
        # Fetch the collection's name and insert it into the meta table
        global GET_BOOK_NAME_FLAG
        if GET_BOOK_NAME_FLAG is False:
            try:
                article_db.insert_meta_data(
                    ['BOOK_NAME', '知乎收藏夹_' + task['save']['book_name']],
                    update=False)
                GET_BOOK_NAME_FLAG = True
            except:
                # BOOK_NAME may already exist; ignore the duplicate insert
                pass
        article_db.insert_article(task['parsed_data'])
def make_mobi(path, window=50, multi=True):
    from web2kindle.libs.db import ArticleDB
    from web2kindle import MAIN_CONFIG
    from web2kindle.libs.html2kindle import HTML2Kindle

    if not path:
        import os
        path = os.getcwd()

    items = []
    with ArticleDB(path) as db:
        db.reset_version()
        items.extend(db.select_article())
        book_name = db.select_meta('BOOK_NAME')

    if items:
        with HTML2Kindle(items, path, book_name, MAIN_CONFIG.get('KINDLEGEN_PATH')) as html2kindle:
            html2kindle.make_metadata(window)
            if multi:
                html2kindle.make_book_multi(path)
            else:
                html2kindle.make_book(path)
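# Usage sketch (added for illustration, not part of the original source): rebuild the
# e-books from an already-crawled save directory without re-running the crawler. The
# directory path below is a hypothetical placeholder.
#
#     make_mobi('/path/to/save_dir', window=50, multi=True)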
def main(start, end, kw):
    iq = PriorityQueue()
    oq = PriorityQueue()
    result_q = Queue()
    crawler = Crawler(iq, oq, result_q,
                      MAIN_CONFIG.get('PARSER_WORKER', 1),
                      MAIN_CONFIG.get('DOWNLOADER_WORKER', 1),
                      MAIN_CONFIG.get('RESULTER_WORKER', 1))

    default_headers = deepcopy(DEFAULT_HEADERS)
    default_headers.update({'Referer': 'http://www.guokr.com/scientific/'})
    save_path = SCRIPT_CONFIG['SAVE_PATH']
    book_name = '果壳网'

    task = Task.make_task({
        'url': API_URL.format(start),
        'method': 'GET',
        'meta': {'headers': default_headers, 'verify': False},
        'parser': parser_list,
        'priority': 0,
        'save': {'cursor': start,
                 'start': start,
                 'end': end,
                 'kw': kw,
                 'save_path': SCRIPT_CONFIG['SAVE_PATH']},
        'retry': 10,
        'retry_delay': 10
    })
    iq.put(task)

    # Init DB
    with ArticleDB(save_path, VERSION=0) as db:
        _ = db.select_all_article_id()
        if _:
            for each in _:
                ARTICLE_ID_SET.add(each[0])

    crawler.start()

    items = []
    with ArticleDB(save_path) as db:
        items.extend(db.select_article())
        db.insert_meta_data(['BOOK_NAME', book_name])
        db.increase_version()
        db.reset()

    if items:
        new = True
        with HTML2Kindle(items, save_path, book_name, MAIN_CONFIG.get('KINDLEGEN_PATH')) as html2kindle:
            html2kindle.make_metadata(window=kw.get('window', 50))
            html2kindle.make_book_multi(save_path)
    else:
        LOG.log_it('无新项目', 'INFO')
        new = False

    if new and kw.get('email'):
        with SendEmail2Kindle() as s:
            s.send_all_mobi(SCRIPT_CONFIG['SAVE_PATH'])
def resulter_content(task):
    LOG.log_it("正在将任务 {} 插入数据库".format(task['tid']), 'INFO')
    with ArticleDB(task['save']['save_path']) as article_db:
        article_db.insert_article(task['parsed_data'])
def main(zhuanti_list, start, end, kw):
    """start defaults to 1; end is the last page number; each page holds 9 items."""
    iq = PriorityQueue()
    oq = PriorityQueue()
    result_q = Queue()
    crawler = Crawler(iq, oq, result_q,
                      MAIN_CONFIG.get('PARSER_WORKER', 1),
                      MAIN_CONFIG.get('DOWNLOADER_WORKER', 1),
                      MAIN_CONFIG.get('RESULTER_WORKER', 1))
    start = int(start)
    end = int(end)

    for zhuanti in zhuanti_list:
        new_header = deepcopy(DEFAULT_HEADERS)
        new_header.update({'Referer': BASE_URL.format(zhuanti)})

        # Use the topic's numeric id as the sub-directory name
        save_path = os.path.join(SCRIPT_CONFIG['SAVE_PATH'], str(zhuanti))

        if kw.get('order_by') == 'comment':
            order_by = ORDER_COMMENT
        elif kw.get('order_by') == 'add':
            order_by = ORDER_ADD
        elif kw.get('order_by') == 'top':
            order_by = ORDER_TOP
        else:
            # Default to 'add'
            order_by = ORDER_ADD

        task = Task.make_task({
            'url': API_URL.format(zhuanti, order_by, start),
            'method': 'GET',
            'meta': {'headers': new_header, 'verify': False},
            'parser': parser_list,
            'priority': 0,
            'save': {'cursor': start,
                     'save_path': save_path,
                     'start': start,
                     'end': end,
                     'kw': kw,
                     'name': zhuanti,
                     'order_by': order_by},
            'retry': 10,
            'retry_delay': 10
        })
        iq.put(task)

        # Init DB; collect known article ids into a set for deduplication
        with ArticleDB(save_path, VERSION=0) as db:
            _ = db.select_all_article_id()
            if _:
                for each in _:
                    ARTICLE_ID_SET.add(each[0])

    # Start the crawler
    crawler.start()

    # Build the e-books
    for zhuanti in zhuanti_list:
        items = []
        save_path = os.path.join(SCRIPT_CONFIG['SAVE_PATH'], str(zhuanti))
        with ArticleDB(save_path, VERSION=0) as db:
            # Load every article
            items.extend(db.select_article())
            # Read the topic name from the database
            book_name = db.select_meta('BOOK_NAME')
            # Bump the database version
            db.increase_version()
            # Database clean-up
            db.reset()

        if items:
            with HTML2Kindle(items, save_path, book_name, MAIN_CONFIG.get('KINDLEGEN_PATH')) as html2kindle:
                html2kindle.make_metadata(window=kw.get('window', 50))
                html2kindle.make_book_multi(save_path)

            if kw.get('email'):
                with SendEmail2Kindle() as s:
                    s.send_all_mobi(save_path)
        else:
            LOG.log_it('无新项目', 'INFO')
def parser_list(task):
    response = task['response']
    new_tasks = []
    to_next = True

    if not response:
        raise RetryDownload

    try:
        text = response.text
        bs = BeautifulSoup(text, 'lxml')
    except Exception as e:
        LOG.log_it('解析网页出错(如一直出现,而且浏览器能正常访问,可能是网站代码升级,请通知开发者。)ERRINFO:{}'
                   .format(str(e)), 'WARN')
        raise RetryDownload

    book_name = bs.title.string if bs.title else task['save']['name']

    # Insert the collection's name into the meta table
    with ArticleDB(task['save']['save_path']) as article_db:
        article_db.insert_meta_data(['BOOK_NAME', format_file_name('简书专题_' + book_name)], update=False)

    # Reverse the order of the list
    items = bs.select('a.title')
    items.reverse()
    for item in items:
        # Skip articles that are already in the database
        url = 'https://www.jianshu.com' + item.attrs['href']
        if md5string(url) in ARTICLE_ID_SET:
            to_next = False
            continue

        try:
            title = item.string
        except:
            LOG.log_it('解析标题出错(如一直出现,而且浏览器能正常访问,可能是网站代码升级,请通知开发者。)', 'WARN')
            raise RetryDownload

        new_task = Task.make_task({
            'url': url,
            'method': 'GET',
            'meta': task['meta'],
            'parser': parser_content,
            'resulter': resulter_content,
            'priority': 5,
            'save': task['save'],
            'title': title,
        })
        new_tasks.append(new_task)

    # Next page
    if to_next and len(items) != 0:
        if task['save']['cursor'] < task['save']['end']:
            next_page_task = deepcopy(task)
            next_page_task.update(
                {'url': API_URL.format(task['save']['name'], task['save']['order_by'], task['save']['cursor'] + 1)})
            next_page_task['save'].update({'cursor': next_page_task['save']['cursor'] + 1})
            new_tasks.append(next_page_task)

    return None, new_tasks
def main(zhuanlan_name_list, start, end, kw):
    iq = PriorityQueue()
    oq = PriorityQueue()
    result_q = Queue()
    crawler = Crawler(iq, oq, result_q)

    for zhuanlan_name in zhuanlan_name_list:
        new_header = deepcopy(DEFAULT_HEADERS)
        new_header.update(
            {'Referer': 'https://zhuanlan.zhihu.com/{}'.format(zhuanlan_name)})
        save_path = os.path.join(SCRIPT_CONFIG['SAVE_PATH'], str(zhuanlan_name))

        task = Task.make_task({
            'url': 'https://zhuanlan.zhihu.com/api/columns/{}/posts?limit=20&offset={}'
                   .format(zhuanlan_name, start),
            'method': 'GET',
            'meta': {'headers': new_header, 'verify': False},
            'parser': parser_list,
            'priority': 0,
            'save': {'cursor': start,
                     'save_path': save_path,
                     'start': start,
                     'end': end,
                     'kw': kw,
                     'name': zhuanlan_name},
            'retry': 3,
        })
        iq.put(task)

        # Init DB
        with ArticleDB(save_path, VERSION=0) as db:
            pass

    crawler.start()

    for zhuanlan_name in zhuanlan_name_list:
        items = []
        save_path = os.path.join(SCRIPT_CONFIG['SAVE_PATH'], str(zhuanlan_name))
        with ArticleDB(save_path, VERSION=0) as db:
            db.insert_meta_data(['BOOK_NAME', zhuanlan_name])
            items.extend(db.select_article())
            db.increase_version()

        with HTML2Kindle(items, save_path, zhuanlan_name, MAIN_CONFIG.get('KINDLEGEN_PATH')) as html2kindle:
            html2kindle.make_metadata(window=kw.get('window', 50))
            html2kindle.make_book_multi(save_path)

    if kw.get('email'):
        for zhuanlan_name in zhuanlan_name_list:
            with SendEmail2Kindle() as s:
                s.send_all_mobi(
                    os.path.join(SCRIPT_CONFIG['SAVE_PATH'], str(zhuanlan_name)))
    os._exit(0)
def main(start, end, kw):
    # start: '2017-12-11'
    # end:   '2017-12-12'
    iq = PriorityQueue()
    oq = PriorityQueue()
    result_q = Queue()
    crawler = Crawler(iq, oq, result_q)

    try:
        start_l = [int(_) for _ in start.split('-')]
        end_l = [int(_) for _ in end.split('-')]
        start_t = int(
            datetime.datetime(start_l[0], start_l[1], start_l[2]).timestamp()) + 60 * 60 * 24
        end_t = int(
            datetime.datetime(end_l[0], end_l[1], end_l[2]).timestamp())
    except:
        LOG.log_it('日期格式错误', 'WARN')
        traceback.print_exc()
        return

    global API_URL
    if 'type' in kw:
        if kw['type'] == 'business':
            API_URL = API_BUSINESS
        elif kw['type'] == 'intelligent':
            API_URL = API_INTELLIGENT
        elif kw['type'] == 'design':
            API_URL = API_DESIGN
        elif kw['type'] == 'fashion':
            API_URL = API_FASHION
        elif kw['type'] == 'entertainment':
            API_URL = API_ENTERTAINMENT
        elif kw['type'] == 'city':
            API_URL = API_CITY
        elif kw['type'] == 'game':
            API_URL = API_GAME
        elif kw['type'] == 'long':
            API_URL = API_LONG
        elif kw['type'] == 'home':
            pass
        else:
            kw.update({'type': 'home'})
    else:
        # kw['type'] is read below unconditionally; fall back to 'home' when missing
        kw.update({'type': 'home'})

    new_header = deepcopy(SCRIPT_CONFIG.get('DEFAULT_HEADERS'))
    new_header.update({'Referer': 'https://www.qdaily.com/'})

    save_path = os.path.join(SCRIPT_CONFIG['SAVE_PATH'], 'qdaily_{}'.format(kw['type']))
    book_name = '好奇心日报_{}_{}_{}'.format(kw['type'], start, end)

    task = Task.make_task({
        'url': API_URL.format(start_t),
        'method': 'GET',
        'meta': {'headers': new_header, 'verify': False},
        'parser': parser_list,
        'priority': 0,
        'save': {'cursor': start_t,
                 'save_path': save_path,
                 'start': start_t,
                 'end': end_t,
                 'kw': kw,
                 'page': 1,
                 'name': book_name},
        'retry': 3,
    })
    iq.put(task)

    # Init DB
    with ArticleDB(save_path, VERSION=0) as db:
        pass

    crawler.start()

    items = []
    with ArticleDB(save_path) as db:
        items.extend(db.select_article())
        db.insert_meta_data(['BOOK_NAME', book_name])
        db.increase_version()

    with HTML2Kindle(items, save_path, book_name, MAIN_CONFIG.get('KINDLEGEN_PATH')) as html2kindle:
        html2kindle.make_metadata(window=kw.get('window', 50))
        html2kindle.make_book_multi(save_path)

    if kw.get('email'):
        with SendEmail2Kindle() as s:
            s.send_all_mobi(save_path)
    os._exit(0)
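# Invocation sketch (illustrative, not from the original source): dates are parsed with
# str.split('-'), so they are expected as 'YYYY-MM-DD' strings; the kw keys shown
# ('type', 'window', 'email') are the ones this function reads.
#
#     main('2017-12-11', '2017-12-12', {'type': 'home', 'window': 50, 'email': False})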
def main(zhuanlan_name_list, start, end, kw):
    iq = PriorityQueue()
    oq = PriorityQueue()
    result_q = Queue()
    crawler = Crawler(iq, oq, result_q,
                      MAIN_CONFIG.get('PARSER_WORKER', 1),
                      MAIN_CONFIG.get('DOWNLOADER_WORKER', 1),
                      MAIN_CONFIG.get('RESULTER_WORKER', 1))
    new = True

    for zhuanlan_name in zhuanlan_name_list:
        new_header = deepcopy(DEFAULT_HEADERS)
        new_header.update(
            {'Referer': 'https://zhuanlan.zhihu.com/{}'.format(zhuanlan_name)})
        save_path = os.path.join(SCRIPT_CONFIG['SAVE_PATH'], str(zhuanlan_name))

        task = Task.make_task({
            'url': 'https://zhuanlan.zhihu.com/api/columns/{}/posts?limit=20&offset={}'
                   .format(zhuanlan_name, start),
            'method': 'GET',
            'meta': {'headers': new_header, 'verify': False},
            'parser': parser_list,
            'priority': 0,
            'save': {'cursor': start,
                     'save_path': save_path,
                     'start': start,
                     'end': end,
                     'kw': kw,
                     'name': zhuanlan_name},
            'retry': 10,
            'retry_delay': 10
        })
        iq.put(task)

        # Init DB
        with ArticleDB(save_path, VERSION=0) as db:
            _ = db.select_all_article_id()
            if _:
                for each in _:
                    ARTICLE_ID_SET.add(each[0])

    crawler.start()

    for zhuanlan_name in zhuanlan_name_list:
        items = []
        book_name = '知乎专栏_{}'.format(zhuanlan_name)
        save_path = os.path.join(SCRIPT_CONFIG['SAVE_PATH'], str(zhuanlan_name))
        with ArticleDB(save_path, VERSION=0) as db:
            db.insert_meta_data(['BOOK_NAME', zhuanlan_name])
            items.extend(db.select_article())
            db.increase_version()
            db.reset()

        if items:
            new = True
            with HTML2Kindle(items, save_path, book_name, MAIN_CONFIG.get('KINDLEGEN_PATH')) as html2kindle:
                html2kindle.make_metadata(window=kw.get('window', 50))
                html2kindle.make_book_multi(save_path)
        else:
            LOG.log_it('无新项目', 'INFO')
            new = False

    if new and kw.get('email'):
        for zhuanlan_name in zhuanlan_name_list:
            with SendEmail2Kindle() as s:
                s.send_all_mobi(
                    os.path.join(SCRIPT_CONFIG['SAVE_PATH'], str(zhuanlan_name)))
def main(start, end, kw):
    iq = PriorityQueue()
    oq = PriorityQueue()
    result_q = Queue()
    crawler = Crawler(iq, oq, result_q,
                      MAIN_CONFIG.get('PARSER_WORKER', 1),
                      MAIN_CONFIG.get('DOWNLOADER_WORKER', 1),
                      MAIN_CONFIG.get('RESULTER_WORKER', 1))
    new_header = deepcopy(DEFAULT_HEADERS)

    global IS_TODAY_URL
    if start is None:
        IS_TODAY_URL = True
        save_path = os.path.join(
            SCRIPT_CONFIG['SAVE_PATH'], 'zhihu_daily_' + get_datetime_string('%Y%m%d'))
        book_name = '知乎日报_' + get_datetime_string('%Y%m%d')
    else:
        if end is None:
            end = datetime.datetime.fromtimestamp(time.time()).strftime('%Y%m%d')
        save_path = os.path.join(SCRIPT_CONFIG['SAVE_PATH'],
                                 'zhihu_daily_{}_{}'.format(start, end))
        book_name = '知乎日报_{}_{}'.format(start, end)
        IS_TODAY_URL = False

    url = TODAY_URL if IS_TODAY_URL else YESTERDAY_URL.format(start)

    task = Task.make_task({
        'url': url,
        'method': 'GET',
        'meta': {'headers': new_header, 'verify': False},
        'parser': parser_list,
        'priority': 0,
        'save': {'cursor': start,
                 'save_path': save_path,
                 'start': start,
                 'end': end,
                 'kw': kw},
        'retry': 99,
        'retry_delay': 10
    })
    iq.put(task)

    # Init DB
    with ArticleDB(save_path, VERSION=0) as db:
        _ = db.select_all_article_id()
        if _:
            for each in _:
                ARTICLE_ID_SET.add(each[0])

    crawler.start()

    items = []
    with ArticleDB(save_path, VERSION=0) as db:
        db.insert_meta_data(['BOOK_NAME', book_name])
        items.extend(db.select_article())
        db.increase_version()
        db.reset()

    if items:
        new = True
        with HTML2Kindle(items, save_path, book_name, MAIN_CONFIG.get('KINDLEGEN_PATH')) as html2kindle:
            html2kindle.make_metadata(window=kw.get('window', 50))
            html2kindle.make_book_multi(save_path)
    else:
        LOG.log_it('无新项目', 'INFO')
        new = False

    if new and kw.get('email'):
        with SendEmail2Kindle() as s:
            s.send_all_mobi(save_path)
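# Invocation sketch (illustrative, not from the original source): passing start=None
# crawls today's issue via TODAY_URL; otherwise start/end appear to be date strings in
# the same '%Y%m%d' form used for the default end value.
#
#     main(None, None, {'window': 50})                              # today's issue
#     main('20171211', '20171212', {'window': 50, 'email': True})   # a date range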
def main(collection_num_list, start, end, kw):
    iq = PriorityQueue()
    oq = PriorityQueue()
    result_q = Queue()
    crawler = Crawler(iq, oq, result_q,
                      MAIN_CONFIG.get('PARSER_WORKER', 1),
                      MAIN_CONFIG.get('DOWNLOADER_WORKER', 1),
                      MAIN_CONFIG.get('RESULTER_WORKER', 1))
    new = True

    for collection_num in collection_num_list:
        save_path = os.path.join(SCRIPT_CONFIG['SAVE_PATH'], str(collection_num))

        task = Task.make_task({
            'url': 'https://www.zhihu.com/collection/{}?page={}'.format(collection_num, start),
            'method': 'GET',
            'meta': {'headers': DEFAULT_HEADERS, 'verify': False},
            'parser': parser_collection,
            'resulter': resulter_collection,
            'priority': 0,
            'retry': 10,
            'save': {'start': start,
                     'end': end,
                     'kw': kw,
                     'save_path': save_path,
                     'name': collection_num},
            'retry_delay': 10
        })
        iq.put(task)

        # Init DB
        with ArticleDB(save_path, VERSION=0) as db:
            _ = db.select_all_article_id()
            if _:
                for each in _:
                    ARTICLE_ID_SET.add(each[0])

    crawler.start()

    for collection_num in collection_num_list:
        items = []
        save_path = os.path.join(SCRIPT_CONFIG['SAVE_PATH'], str(collection_num))
        with ArticleDB(save_path) as db:
            items.extend(db.select_article())
            book_name = db.select_meta('BOOK_NAME')
            db.increase_version()
            db.reset()

        if items:
            new = True
            with HTML2Kindle(items, save_path, book_name, MAIN_CONFIG.get('KINDLEGEN_PATH')) as html2kindle:
                html2kindle.make_metadata(window=kw.get('window', 50))
                html2kindle.make_book_multi(save_path)
        else:
            LOG.log_it('无新项目', 'INFO')
            new = False

    if new and kw.get('email'):
        for collection_num in collection_num_list:
            save_path = os.path.join(SCRIPT_CONFIG['SAVE_PATH'], str(collection_num))
            with SendEmail2Kindle() as s:
                s.send_all_mobi(save_path)