def send_mobi(path):
    if not path:
        import os
        path = os.getcwd()
    from web2kindle.libs.send_email import SendEmail2Kindle

    with SendEmail2Kindle() as s:
        s.send_all_mobi(path)
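# A minimal usage sketch, assuming send_mobi is exposed as a tiny CLI entry point.
# The helper name, the argparse wiring, and the optional positional 'path' argument
# are illustrative assumptions, not part of the original web2kindle scripts.
def _send_mobi_cli():
    import argparse
    cli = argparse.ArgumentParser(
        description='Mail every .mobi under a folder to the configured Kindle address')
    cli.add_argument('path', nargs='?', default=None,
                     help='folder containing .mobi files; defaults to the current working directory')
    args = cli.parse_args()
    send_mobi(args.path)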
def main(start, end, kw):
    iq = PriorityQueue()
    oq = PriorityQueue()
    result_q = Queue()
    crawler = Crawler(iq, oq, result_q)
    default_headers = deepcopy(DEFAULT_HEADERS)
    default_headers.update({'Referer': 'http://www.guokr.com/scientific/'})
    save_path = SCRIPT_CONFIG['SAVE_PATH']
    book_name = '果壳网'

    task = Task.make_task({
        'url': API_URL.format(start),
        'method': 'GET',
        'meta': {'headers': default_headers, 'verify': False},
        'parser': parser_list,
        'priority': 0,
        'save': {'cursor': start,
                 'start': start,
                 'end': end,
                 'kw': kw,
                 'save_path': SCRIPT_CONFIG['SAVE_PATH']},
        'retry': 3,
    })
    iq.put(task)

    # Init DB
    with ArticleDB(save_path, VERSION=0) as db:
        pass

    crawler.start()

    items = []
    with ArticleDB(save_path) as db:
        items.extend(db.select_article())
        db.insert_meta_data(['BOOK_NAME', book_name])
        db.increase_version()

    with HTML2Kindle(items, save_path, book_name, MAIN_CONFIG.get('KINDLEGEN_PATH')) as html2kindle:
        html2kindle.make_metadata(window=kw.get('window', 50))
        html2kindle.make_book_multi(save_path)

    if kw.get('email'):
        with SendEmail2Kindle() as s:
            s.send_all_mobi(SCRIPT_CONFIG['SAVE_PATH'])
    os._exit(0)
def main(start, end, kw):
    iq = PriorityQueue()
    oq = PriorityQueue()
    result_q = Queue()
    crawler = Crawler(iq, oq, result_q,
                      MAIN_CONFIG.get('PARSER_WORKER', 1),
                      MAIN_CONFIG.get('DOWNLOADER_WORKER', 1),
                      MAIN_CONFIG.get('RESULTER_WORKER', 1))
    default_headers = deepcopy(DEFAULT_HEADERS)
    default_headers.update({'Referer': 'http://www.guokr.com/scientific/'})
    save_path = SCRIPT_CONFIG['SAVE_PATH']
    book_name = '果壳网'

    task = Task.make_task({
        'url': API_URL.format(start),
        'method': 'GET',
        'meta': {'headers': default_headers, 'verify': False},
        'parser': parser_list,
        'priority': 0,
        'save': {'cursor': start,
                 'start': start,
                 'end': end,
                 'kw': kw,
                 'save_path': SCRIPT_CONFIG['SAVE_PATH']},
        'retry': 10,
        'retry_delay': 10
    })
    iq.put(task)

    # Init DB; preload already-crawled article ids so parsers can skip them
    with ArticleDB(save_path, VERSION=0) as db:
        _ = db.select_all_article_id()
        if _:
            for each in _:
                ARTICLE_ID_SET.add(each[0])

    crawler.start()

    items = []
    with ArticleDB(save_path) as db:
        items.extend(db.select_article())
        db.insert_meta_data(['BOOK_NAME', book_name])
        db.increase_version()
        db.reset()

    if items:
        new = True
        with HTML2Kindle(items, save_path, book_name, MAIN_CONFIG.get('KINDLEGEN_PATH')) as html2kindle:
            html2kindle.make_metadata(window=kw.get('window', 50))
            html2kindle.make_book_multi(save_path)
    else:
        LOG.log_it('无新项目', 'INFO')
        new = False

    if new and kw.get('email'):
        with SendEmail2Kindle() as s:
            s.send_all_mobi(SCRIPT_CONFIG['SAVE_PATH'])
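# A small sketch of the id-preloading step above, factored into a helper for clarity.
# It reuses only calls already present in this file (ArticleDB.select_all_article_id
# and ARTICLE_ID_SET); the helper name itself is hypothetical.
def _preload_seen_article_ids(save_path):
    with ArticleDB(save_path, VERSION=0) as db:
        rows = db.select_all_article_id()
    for row in rows or []:
        # Each row is a one-element tuple holding an article id; parsers skip ids already seen
        ARTICLE_ID_SET.add(row[0])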
def main(zhuanti_list, start, end, kw):
    """start defaults to 1; end is the last page to fetch; each page holds 9 items."""
    iq = PriorityQueue()
    oq = PriorityQueue()
    result_q = Queue()
    crawler = Crawler(iq, oq, result_q,
                      MAIN_CONFIG.get('PARSER_WORKER', 1),
                      MAIN_CONFIG.get('DOWNLOADER_WORKER', 1),
                      MAIN_CONFIG.get('RESULTER_WORKER', 1))
    start = int(start)
    end = int(end)

    for zhuanti in zhuanti_list:
        new_header = deepcopy(DEFAULT_HEADERS)
        new_header.update({'Referer': BASE_URL.format(zhuanti)})

        # Use the numeric topic id as the subdirectory name
        save_path = os.path.join(SCRIPT_CONFIG['SAVE_PATH'], str(zhuanti))

        if kw.get('order_by') == 'comment':
            order_by = ORDER_COMMENT
        elif kw.get('order_by') == 'add':
            order_by = ORDER_ADD
        elif kw.get('order_by') == 'top':
            order_by = ORDER_TOP
        else:
            # Default to 'add'; see the table-lookup sketch after this function
            order_by = ORDER_ADD

        task = Task.make_task({
            'url': API_URL.format(zhuanti, order_by, start),
            'method': 'GET',
            'meta': {'headers': new_header, 'verify': False},
            'parser': parser_list,
            'priority': 0,
            'save': {'cursor': start,
                     'save_path': save_path,
                     'start': start,
                     'end': end,
                     'kw': kw,
                     'name': zhuanti,
                     'order_by': order_by},
            'retry': 10,
            'retry_delay': 10
        })
        iq.put(task)

        # Init DB
        with ArticleDB(save_path, VERSION=0) as db:
            _ = db.select_all_article_id()
            # Deduplicate with a set of already-crawled article ids
            if _:
                for each in _:
                    ARTICLE_ID_SET.add(each[0])

    # Start the crawler
    crawler.start()

    # Build the ebooks
    for zhuanti in zhuanti_list:
        items = []
        save_path = os.path.join(SCRIPT_CONFIG['SAVE_PATH'], str(zhuanti))
        with ArticleDB(save_path, VERSION=0) as db:
            # Read every article
            items.extend(db.select_article())
            # Read the topic name recorded in the database
            book_name = db.select_meta('BOOK_NAME')
            # Bump the database version
            db.increase_version()
            # Database housekeeping
            db.reset()

        if items:
            with HTML2Kindle(items, save_path, book_name, MAIN_CONFIG.get('KINDLEGEN_PATH')) as html2kindle:
                html2kindle.make_metadata(window=kw.get('window', 50))
                html2kindle.make_book_multi(save_path)

            if kw.get('email'):
                with SendEmail2Kindle() as s:
                    s.send_all_mobi(save_path)
        else:
            LOG.log_it('无新项目', 'INFO')
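# A hedged alternative to the order_by branching above, written as a table lookup.
# It is meant to behave the same way (unknown or missing values fall back to ORDER_ADD);
# the names _ORDER_BY_MAP and _pick_order_by are illustrative only.
_ORDER_BY_MAP = {'comment': ORDER_COMMENT, 'add': ORDER_ADD, 'top': ORDER_TOP}


def _pick_order_by(kw):
    return _ORDER_BY_MAP.get(kw.get('order_by'), ORDER_ADD)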
def main(zhuanlan_name_list, start, end, kw):
    iq = PriorityQueue()
    oq = PriorityQueue()
    result_q = Queue()
    crawler = Crawler(iq, oq, result_q)

    for zhuanlan_name in zhuanlan_name_list:
        new_header = deepcopy(DEFAULT_HEADERS)
        new_header.update({'Referer': 'https://zhuanlan.zhihu.com/{}'.format(zhuanlan_name)})
        save_path = os.path.join(SCRIPT_CONFIG['SAVE_PATH'], str(zhuanlan_name))

        task = Task.make_task({
            'url': 'https://zhuanlan.zhihu.com/api/columns/{}/posts?limit=20&offset={}'.format(zhuanlan_name, start),
            'method': 'GET',
            'meta': {'headers': new_header, 'verify': False},
            'parser': parser_list,
            'priority': 0,
            'save': {'cursor': start,
                     'save_path': save_path,
                     'start': start,
                     'end': end,
                     'kw': kw,
                     'name': zhuanlan_name},
            'retry': 3,
        })
        iq.put(task)

        # Init DB
        with ArticleDB(save_path, VERSION=0) as db:
            pass

    crawler.start()

    for zhuanlan_name in zhuanlan_name_list:
        items = []
        save_path = os.path.join(SCRIPT_CONFIG['SAVE_PATH'], str(zhuanlan_name))
        with ArticleDB(save_path, VERSION=0) as db:
            db.insert_meta_data(['BOOK_NAME', zhuanlan_name])
            items.extend(db.select_article())
            db.increase_version()

        with HTML2Kindle(items, save_path, zhuanlan_name, MAIN_CONFIG.get('KINDLEGEN_PATH')) as html2kindle:
            html2kindle.make_metadata(window=kw.get('window', 50))
            html2kindle.make_book_multi(save_path)

    if kw.get('email'):
        for zhuanlan_name in zhuanlan_name_list:
            with SendEmail2Kindle() as s:
                s.send_all_mobi(os.path.join(SCRIPT_CONFIG['SAVE_PATH'], str(zhuanlan_name)))
    os._exit(0)
def main(start, end, kw):
    # start/end are date strings such as '2017-12-11' and '2017-12-12'
    iq = PriorityQueue()
    oq = PriorityQueue()
    result_q = Queue()
    crawler = Crawler(iq, oq, result_q)

    try:
        start_l = [int(_) for _ in start.split('-')]
        end_l = [int(_) for _ in end.split('-')]
        # Shift the start bound forward by one day (in seconds)
        start_t = int(datetime.datetime(start_l[0], start_l[1], start_l[2]).timestamp()) + 60 * 60 * 24
        end_t = int(datetime.datetime(end_l[0], end_l[1], end_l[2]).timestamp())
    except Exception:
        LOG.log_it('日期格式错误', 'WARN')
        traceback.print_exc()
        return

    global API_URL
    if 'type' in kw:
        if kw['type'] == 'business':
            API_URL = API_BUSINESS
        elif kw['type'] == 'intelligent':
            API_URL = API_INTELLIGENT
        elif kw['type'] == 'design':
            API_URL = API_DESIGN
        elif kw['type'] == 'fashion':
            API_URL = API_FASHION
        elif kw['type'] == 'entertainment':
            API_URL = API_ENTERTAINMENT
        elif kw['type'] == 'city':
            API_URL = API_CITY
        elif kw['type'] == 'game':
            API_URL = API_GAME
        elif kw['type'] == 'long':
            API_URL = API_LONG
        elif kw['type'] == 'home':
            pass
    else:
        kw.update({'type': 'home'})

    new_header = deepcopy(SCRIPT_CONFIG.get('DEFAULT_HEADERS'))
    new_header.update({'Referer': 'https://www.qdaily.com/'})

    save_path = os.path.join(SCRIPT_CONFIG['SAVE_PATH'], 'qdaily_{}'.format(kw['type']))
    book_name = '好奇心日报_{}_{}_{}'.format(kw['type'], start, end)

    task = Task.make_task({
        'url': API_URL.format(start_t),
        'method': 'GET',
        'meta': {'headers': new_header, 'verify': False},
        'parser': parser_list,
        'priority': 0,
        'save': {'cursor': start_t,
                 'save_path': save_path,
                 'start': start_t,
                 'end': end_t,
                 'kw': kw,
                 'page': 1,
                 'name': book_name},
        'retry': 3,
    })
    iq.put(task)

    # Init DB
    with ArticleDB(save_path, VERSION=0) as db:
        pass

    crawler.start()

    items = []
    with ArticleDB(save_path) as db:
        items.extend(db.select_article())
        db.insert_meta_data(['BOOK_NAME', book_name])
        db.increase_version()

    with HTML2Kindle(items, save_path, book_name, MAIN_CONFIG.get('KINDLEGEN_PATH')) as html2kindle:
        html2kindle.make_metadata(window=kw.get('window', 50))
        html2kindle.make_book_multi(save_path)

    if kw.get('email'):
        with SendEmail2Kindle() as s:
            s.send_all_mobi(save_path)
    os._exit(0)
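# A worked sketch of the date handling above, isolated as a helper. Given
# start='2017-12-11' and end='2017-12-12', it returns Unix timestamps for local
# midnight of each day, with the start bound shifted forward by one day
# (60 * 60 * 24 seconds) exactly as main() does. The helper name is hypothetical.
def _qdaily_timestamps(start, end):
    start_l = [int(part) for part in start.split('-')]
    end_l = [int(part) for part in end.split('-')]
    start_t = int(datetime.datetime(start_l[0], start_l[1], start_l[2]).timestamp()) + 60 * 60 * 24
    end_t = int(datetime.datetime(end_l[0], end_l[1], end_l[2]).timestamp())
    return start_t, end_t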
def main(zhuanlan_name_list, start, end, kw):
    iq = PriorityQueue()
    oq = PriorityQueue()
    result_q = Queue()
    crawler = Crawler(iq, oq, result_q,
                      MAIN_CONFIG.get('PARSER_WORKER', 1),
                      MAIN_CONFIG.get('DOWNLOADER_WORKER', 1),
                      MAIN_CONFIG.get('RESULTER_WORKER', 1))
    new = True

    for zhuanlan_name in zhuanlan_name_list:
        new_header = deepcopy(DEFAULT_HEADERS)
        new_header.update({'Referer': 'https://zhuanlan.zhihu.com/{}'.format(zhuanlan_name)})
        save_path = os.path.join(SCRIPT_CONFIG['SAVE_PATH'], str(zhuanlan_name))

        task = Task.make_task({
            'url': 'https://zhuanlan.zhihu.com/api/columns/{}/posts?limit=20&offset={}'.format(zhuanlan_name, start),
            'method': 'GET',
            'meta': {'headers': new_header, 'verify': False},
            'parser': parser_list,
            'priority': 0,
            'save': {'cursor': start,
                     'save_path': save_path,
                     'start': start,
                     'end': end,
                     'kw': kw,
                     'name': zhuanlan_name},
            'retry': 10,
            'retry_delay': 10
        })
        iq.put(task)

        # Init DB; preload already-crawled article ids
        with ArticleDB(save_path, VERSION=0) as db:
            _ = db.select_all_article_id()
            if _:
                for each in _:
                    ARTICLE_ID_SET.add(each[0])

    crawler.start()

    for zhuanlan_name in zhuanlan_name_list:
        items = []
        book_name = '知乎专栏_{}'.format(zhuanlan_name)
        save_path = os.path.join(SCRIPT_CONFIG['SAVE_PATH'], str(zhuanlan_name))
        with ArticleDB(save_path, VERSION=0) as db:
            db.insert_meta_data(['BOOK_NAME', zhuanlan_name])
            items.extend(db.select_article())
            db.increase_version()
            db.reset()

        if items:
            new = True
            with HTML2Kindle(items, save_path, book_name, MAIN_CONFIG.get('KINDLEGEN_PATH')) as html2kindle:
                html2kindle.make_metadata(window=kw.get('window', 50))
                html2kindle.make_book_multi(save_path)
        else:
            LOG.log_it('无新项目', 'INFO')
            new = False

    # Note: 'new' ends up reflecting only the last column processed in the loop above
    if new and kw.get('email'):
        for zhuanlan_name in zhuanlan_name_list:
            with SendEmail2Kindle() as s:
                s.send_all_mobi(os.path.join(SCRIPT_CONFIG['SAVE_PATH'], str(zhuanlan_name)))
def main(start, end, kw):
    iq = PriorityQueue()
    oq = PriorityQueue()
    result_q = Queue()
    crawler = Crawler(iq, oq, result_q,
                      MAIN_CONFIG.get('PARSER_WORKER', 1),
                      MAIN_CONFIG.get('DOWNLOADER_WORKER', 1),
                      MAIN_CONFIG.get('RESULTER_WORKER', 1))
    new_header = deepcopy(DEFAULT_HEADERS)

    global IS_TODAY_URL
    if start is None:
        IS_TODAY_URL = True
        save_path = os.path.join(SCRIPT_CONFIG['SAVE_PATH'], 'zhihu_daily_' + get_datetime_string('%Y%m%d'))
        book_name = '知乎日报_' + get_datetime_string('%Y%m%d')
    else:
        if end is None:
            end = datetime.datetime.fromtimestamp(time.time()).strftime('%Y%m%d')
        save_path = os.path.join(SCRIPT_CONFIG['SAVE_PATH'], 'zhihu_daily_{}_{}'.format(start, end))
        book_name = '知乎日报_{}_{}'.format(start, end)
        IS_TODAY_URL = False

    url = TODAY_URL if IS_TODAY_URL else YESTERDAY_URL.format(start)

    task = Task.make_task({
        'url': url,
        'method': 'GET',
        'meta': {'headers': new_header, 'verify': False},
        'parser': parser_list,
        'priority': 0,
        'save': {'cursor': start,
                 'save_path': save_path,
                 'start': start,
                 'end': end,
                 'kw': kw},
        'retry': 99,
        'retry_delay': 10
    })
    iq.put(task)

    # Init DB
    with ArticleDB(save_path, VERSION=0) as db:
        _ = db.select_all_article_id()
        if _:
            for each in _:
                ARTICLE_ID_SET.add(each[0])

    crawler.start()

    items = []
    with ArticleDB(save_path, VERSION=0) as db:
        db.insert_meta_data(['BOOK_NAME', book_name])
        items.extend(db.select_article())
        db.increase_version()
        db.reset()

    if items:
        new = True
        with HTML2Kindle(items, save_path, book_name, MAIN_CONFIG.get('KINDLEGEN_PATH')) as html2kindle:
            html2kindle.make_metadata(window=kw.get('window', 50))
            html2kindle.make_book_multi(save_path)
    else:
        LOG.log_it('无新项目', 'INFO')
        new = False

    if new and kw.get('email'):
        with SendEmail2Kindle() as s:
            s.send_all_mobi(os.path.join(save_path))
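# Hedged usage sketch for the Zhihu Daily entry point above. Passing start=None
# builds today's issue; a 'YYYYMMDD' start crawls backwards until end (which
# defaults to today). The kw keys mirror the ones read in main(); the values
# shown are examples only.
#
#     main(None, None, {'window': 50, 'email': False})     # today's issue only
#     main('20171201', '20171211', {'email': True})        # a date range, then mail it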
def main(collection_num_list, start, end, kw):
    iq = PriorityQueue()
    oq = PriorityQueue()
    result_q = Queue()
    crawler = Crawler(iq, oq, result_q,
                      MAIN_CONFIG.get('PARSER_WORKER', 1),
                      MAIN_CONFIG.get('DOWNLOADER_WORKER', 1),
                      MAIN_CONFIG.get('RESULTER_WORKER', 1))
    new = True

    for collection_num in collection_num_list:
        save_path = os.path.join(SCRIPT_CONFIG['SAVE_PATH'], str(collection_num))

        task = Task.make_task({
            'url': 'https://www.zhihu.com/collection/{}?page={}'.format(collection_num, start),
            'method': 'GET',
            'meta': {'headers': DEFAULT_HEADERS, 'verify': False},
            'parser': parser_collection,
            'resulter': resulter_collection,
            'priority': 0,
            'retry': 10,
            'save': {'start': start,
                     'end': end,
                     'kw': kw,
                     'save_path': save_path,
                     'name': collection_num},
            'retry_delay': 10
        })
        iq.put(task)

        # Init DB
        with ArticleDB(save_path, VERSION=0) as db:
            _ = db.select_all_article_id()
            if _:
                for each in _:
                    ARTICLE_ID_SET.add(each[0])

    crawler.start()

    for collection_num in collection_num_list:
        items = []
        save_path = os.path.join(SCRIPT_CONFIG['SAVE_PATH'], str(collection_num))
        with ArticleDB(save_path) as db:
            items.extend(db.select_article())
            book_name = db.select_meta('BOOK_NAME')
            db.increase_version()
            db.reset()

        if items:
            new = True
            with HTML2Kindle(items, save_path, book_name, MAIN_CONFIG.get('KINDLEGEN_PATH')) as html2kindle:
                html2kindle.make_metadata(window=kw.get('window', 50))
                html2kindle.make_book_multi(save_path)
        else:
            LOG.log_it('无新项目', 'INFO')
            new = False

    if new and kw.get('email'):
        for collection_num in collection_num_list:
            save_path = os.path.join(SCRIPT_CONFIG['SAVE_PATH'], str(collection_num))
            with SendEmail2Kindle() as s:
                s.send_all_mobi(save_path)
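# Hedged usage sketch for the collection entry point above: crawl two collections
# from page 1 to page 3, then mail the generated .mobi files. The collection ids
# below are placeholders, not real collections; arguments mirror main()'s signature.
#
#     main(['123456789', '987654321'], 1, 3, {'window': 50, 'email': True})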