class Downloader(Thread):
    """Worker thread that performs the HTTP request for each queued task.

    Tasks are taken from ``to_download_q``; once a response has been
    attached under ``task['response']`` the task is put on
    ``downloader_parser_q``.
    """

    def __init__(self,
                 to_download_q: PriorityQueue,
                 downloader_parser_q: PriorityQueue,
                 result_q: Queue,
                 name: str,
                 session=requests.session()):
        # The default session is built once, when this signature is
        # evaluated, so every instance created without an explicit session
        # shares that single object.
        super().__init__(name=name)
        self.session = session
        self.to_download_q = to_download_q
        self.downloader_parser_q = downloader_parser_q
        self.result_q = result_q
        self._exit = False
        self.log = Log(self.name)

    def exit(self):
        """Ask ``run`` to stop after the current ``request`` call returns."""
        self._exit = True

    def request(self):
        """Handle at most one task from the download queue.

        With an empty queue the thread waits on ``COND`` until notified and
        then returns without doing any work. A request that raises is
        logged and handed to ``retry``.
        """
        reply = None
        try:
            job = self.to_download_q.get_nowait()
            TaskManager.register(job['tid'])
        except Empty:
            idle_msg = "Scheduler to Downloader队列为空,{}等待中。".format(self.name)
            self.log.log_it(idle_msg, 'DEBUG')
            with COND:
                COND.wait()
                woke_msg = "Downloader to Parser队列不为空。{}被唤醒。".format(self.name)
                self.log.log_it(woke_msg, 'DEBUG')
            return

        target = job['url']
        self.log.log_it("请求 {}".format(target), 'INFO')

        try:
            reply = self.session.request(job['method'], target,
                                         **job.get('meta', {}))
        except Exception as err:
            traceback.print_exc()
            failure_msg = "网络请求错误。错误信息:{} URL:{} Response:{}".format(
                str(err), target, reply)
            self.log.log_it(failure_msg, 'INFO')
            retry(job, self.to_download_q)
            return

        # A falsy response is stored as None.
        job['response'] = reply if reply else None
        self.downloader_parser_q.put(job)

    def run(self):
        """Thread body: keep calling ``request`` until ``exit`` is called."""
        while not self._exit:
            self.request()
class Downloader(Thread):
    """Worker thread that performs the HTTP request for each queued task.

    Tasks are taken from ``to_download_q``; once a response has been
    attached under ``task['response']`` the task is put on
    ``downloader_parser_q``.
    """

    def __init__(self,
                 to_download_q: PriorityQueue,
                 downloader_parser_q: PriorityQueue,
                 result_q: Queue,
                 name: str,
                 lock,
                 session=requests.session()):
        # The default session is built once, when this signature is
        # evaluated, so every instance created without an explicit session
        # shares that single object.
        super().__init__(name=name)
        self.to_download_q = to_download_q
        self.downloader_parser_q = downloader_parser_q
        self.result_q = result_q
        self.session = session
        self._exit = False
        self.log = Log(self.name)
        self.lock = lock
        self.task_manager = TaskManager(self.lock)

    def exit(self):
        """Ask ``run`` to stop after the current ``request`` call returns."""
        self._exit = True

    def _requeue_for_retry(self, task):
        """Put ``task`` back on the download queue if it has retries left.

        ``task['retry']`` is the number of retries allowed and
        ``task['retried']`` counts the retries already made. The counter
        starts at 0 for both the comparison and the increment, so a task
        with ``retry == n`` is re-queued exactly ``n`` times.
        """
        allowed = task.get('retry', None)
        if not allowed:
            return
        done = task.get('retried', 0)
        if done < allowed:
            task.update({'retried': done + 1})
            self.to_download_q.put(task)

    def request(self):
        """Handle at most one task from the download queue.

        With an empty queue the thread waits on ``COND`` until notified and
        then returns without doing any work. A request that raises is
        logged and re-queued while the task still has retries left.
        """
        response = None
        try:
            task = self.to_download_q.get_nowait()
            self.task_manager.register(task['tid'])
        except Empty:
            self.log.log_it(
                "Scheduler to Downloader队列为空,{}等待中。".format(self.name),
                'DEBUG')
            with COND:
                COND.wait()
                self.log.log_it(
                    "Downloader to Parser队列不为空。{}被唤醒。".format(self.name),
                    'DEBUG')
            return

        self.log.log_it("请求 {}".format(task['url']), 'INFO')

        try:
            response = self.session.request(task['method'], task['url'],
                                            **task.get('meta', {}))
        except Exception as e:
            traceback.print_exc()
            self.log.log_it(
                "网络请求错误。错误信息:{} URL:{} Response:{}".format(
                    str(e), task['url'], response),
                'INFO')
            self._requeue_for_retry(task)
            return

        # A falsy response is stored as None.
        task.update({'response': response if response else None})
        self.downloader_parser_q.put(task)

    def run(self):
        """Thread body: keep calling ``request`` until ``exit`` is called."""
        while not self._exit:
            self.request()
class SendEmail:
    """Send files as e-mail attachments to the configured Kindle address.

    Usable as a context manager: entering connects and logs in to the SMTP
    server, leaving closes the connection.
    """

    def __init__(self):
        self.log = Log('SendEmail2Kindle')
        try:
            self.username = MAIN_CONFIG['EMAIL_USERNAME']
            self.password = MAIN_CONFIG['PASSWORD']
            self.smtp_addr = MAIN_CONFIG['SMTP_ADDR']
            self.kindle_addr = MAIN_CONFIG['KINDLE_ADDR']
        except KeyError:
            self.log.log_it("无法实例化SendEmail2Kindle,请确保config.yml配置完整", 'ERROR')
            # NOTE(review): the instance is returned half-initialised here
            # (no sender / sended / client attributes). Kept as-is so that
            # construction never raises for callers relying on that.
            return

        self.sender = self.username
        # Paths of the files that were handed to the server successfully.
        self.sended = []
        self.client = smtplib.SMTP()

    def connect(self) -> bool:
        """Connect and log in to the SMTP server.

        Returns:
            True on success, False if connecting or authenticating failed
            (the reason is logged).
        """
        try:
            self.log.log_it("正在连接邮件服务器", 'INFO')
            self.client.connect(self.smtp_addr)
            self.log.log_it("正在登录服务器", 'INFO')
            self.client.login(self.username, self.password)
            return True
        except smtplib.SMTPAuthenticationError:
            self.log.log_it("邮箱用户名或密码错误", 'WARN')
            return False
        except Exception as e:
            self.log.log_it("连接错误。错误信息:{}".format(str(e)), 'INFO')
            return False

    def disconnect(self) -> None:
        """Close the SMTP session."""
        self.client.quit()

    def __enter__(self):
        if not self.connect():
            raise Exception("SendEmail2Kindle连接服务器错误")
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.disconnect()

    def send_file(self, file_path: str) -> None:
        """Mail one file as an attachment to the Kindle address.

        On success ``file_path`` is appended to ``self.sended``. SMTP errors
        are logged and swallowed; errors while reading the file propagate.
        """
        msg = MIMEMultipart()
        msg['Subject'] = 'Web2kindle'
        msg['From'] = self.sender
        msg['To'] = self.kindle_addr

        # Read through a context manager so the handle is always closed.
        with open(file_path, 'rb') as attachment_file:
            attachment = MIMEApplication(attachment_file.read())
        attachment.add_header('Content-Disposition', 'attachment',
                              filename=file_path)
        msg.attach(attachment)

        try:
            self.client.sendmail(self.sender, self.kindle_addr,
                                 msg.as_string())
            self.sended.append(file_path)
        except smtplib.SMTPRecipientsRefused:
            self.log.log_it("所有收件人都被拒绝。", 'WARN')
        except smtplib.SMTPSenderRefused:
            self.log.log_it("发件人地址被拒绝。", 'WARN')
        except smtplib.SMTPDataError:
            self.log.log_it("服务器拒绝接受邮件数据。", 'WARN')
        except smtplib.SMTPException as e:
            self.log.log_it(
                "未知错误。FILE_PATH:{},ERRINFO:{}".format(file_path, str(e)),
                'WARN')

    def send_files(self, file_paths: list) -> None:
        """Send every path in ``file_paths``, logging the outcome of each."""
        for file_path in file_paths:
            self.log.log_it("正在发送:{}".format(file_path), 'INFO')
            sent_before = len(self.sended)
            self.send_file(file_path)
            # send_file swallows SMTP errors, so success is detected by
            # whether it recorded the path in self.sended.
            if len(self.sended) > sent_before:
                self.log.log_it("{}发送成功".format(file_path), 'INFO')
            else:
                self.log.log_it("{}发送失败".format(file_path), 'WARN')
class HTML2Kindle:
    """Render crawled items to HTML/OPF/NCX files and run kindlegen on them.

    Usable as a context manager: on exit every intermediate file recorded in
    ``to_remove`` is deleted.
    """

    # Templates are loaded once, when the class body is executed.
    content_template = Template(
        read_file('./web2kindle/templates/kindle_content.html'))
    opf_template = Template(
        read_file('./web2kindle/templates/kindle_opf.html'))
    index_template = Template(
        read_file('./web2kindle/templates/kindle_table.html'))
    ncx_template = Template(read_file('./web2kindle/templates/kindle_ncx.ncx'))

    def __init__(self,
                 items: list,
                 path: str,
                 book_name: str,
                 kindlegen_path: str = KINDLE_GEN_PATH) -> None:
        """
        Args:
            items: Sequence of records; ``make_metadata`` reads indices 1-5
                of each one (title, content, created time, vote-up count,
                author name).
            path: Output directory; created if it does not exist.
            book_name: Base name used for the generated files.
            kindlegen_path: kindlegen executable; ``None`` falls back to
                ``KINDLE_GEN_PATH``.
        """
        self.kindlegen_path = kindlegen_path if kindlegen_path is not None else KINDLE_GEN_PATH
        self.items = items
        self.book_name = str(book_name)
        self.path = path
        # Paths of intermediate files to delete in remove().
        self.to_remove = set()
        self.log = Log('HTML2Kindle')

        if not os.path.exists(path):
            os.makedirs(path)

    def __exit__(self, exc_type: None, exc_val: None, exc_tb: None) -> None:
        self.remove()

    def __enter__(self):
        return self

    def remove(self) -> None:
        """Delete every recorded intermediate file, ignoring missing ones."""
        for i in self.to_remove:
            try:
                os.remove(i)
            except FileNotFoundError:
                pass

    def make_metadata(self, window: int = 20) -> None:
        """Write article, table-of-contents, OPF and NCX files.

        The items are split into groups of ``window``; each group becomes
        one book named ``<book_name>_<index>``.
        """
        window = int(window)
        # Split the book into volumes of `window` items each.
        spilt_items = split_list(self.items, window)

        for index, items in enumerate(spilt_items):
            self.log.log_it("制作 {}_{} 的元数据".format(self.book_name, str(index)), 'INFO')
            opf = []
            table = []
            table_name = '{}_{}.html'.format(self.book_name, str(index))
            opf_name = '{}_{}.opf'.format(self.book_name, str(index))
            ncx_name = '{}_{}.ncx'.format(self.book_name, str(index))
            table_path = os.path.join(self.path, table_name)
            opf_path = os.path.join(self.path, opf_name)
            ncx_path = os.path.join(self.path, ncx_name)

            # Record the files so they can be removed later.
            self.to_remove.add(table_path)
            self.to_remove.add(opf_path)
            self.to_remove.add(ncx_path)

            for item in items:
                kw = {
                    'author_name': item[5],
                    'voteup_count': item[4],
                    'created_time': item[3]
                }

                # File name = title + author.
                article_path = os.path.join(
                    self.path, format_file_name(item[1], item[5]) + '.html')
                if os.path.exists(article_path):
                    # Avoid clobbering an existing file with the same name.
                    # Only the extension is split off, so a '.html' that
                    # appears elsewhere in the path is left untouched.
                    stem, ext = os.path.splitext(article_path)
                    article_path = stem + ''.join(random_char(3)) + ext

                self.make_content(item[1], item[2], article_path, kw)
                # Record the file so it can be removed later.
                self.to_remove.add(article_path)

                opf.append({
                    'id': article_path,
                    'href': article_path,
                    'title': item[1]
                })
                table.append({'href': article_path, 'name': item[1]})

            self.make_table(table, table_path)
            self.make_opf(self.book_name + '_' + str(index), opf, table_path,
                          opf_path, ncx_path)
            self.make_ncx(self.book_name + '_' + str(index), opf, table_path,
                          ncx_path)

    def make_opf(self, title: str, navigation: list, table_path: str,
                 opf_path: str, ncx_path: str) -> None:
        """Render the OPF template and write it to ``opf_path``."""
        rendered_content = self.opf_template.render(title=title,
                                                    navigation=navigation,
                                                    table_href=table_path,
                                                    ncx_href=ncx_path)
        with codecs.open(opf_path, 'w', 'utf_8_sig') as f:
            f.write(rendered_content)

    def make_ncx(self, title: str, navigation: list, table_path: str,
                 opf_path: str) -> None:
        """Render the NCX template and write it to ``opf_path``.

        Despite its name, ``opf_path`` is the output path of the NCX file
        (``make_metadata`` passes the ``.ncx`` path here).
        """
        rendered_content = self.ncx_template.render(title=title,
                                                    navigation=navigation,
                                                    table_href=table_path)
        with codecs.open(opf_path, 'w', 'utf_8_sig') as f:
            f.write(rendered_content)

    def make_content(self, title: str, content: str, path: str,
                     kw: dict = None) -> None:
        """Render one article and write it to ``path``."""
        rendered_content = self.content_template.render(title=title,
                                                        content=content,
                                                        kw=kw)
        with codecs.open(path, 'w', 'utf_8_sig') as f:
            f.write(rendered_content)

    def make_table(self, navigation: list, path: str) -> None:
        """Render the table-of-contents page and write it to ``path``."""
        rendered_content = self.index_template.render(navigation=navigation)
        with codecs.open(path, 'w', 'utf_8_sig') as f:
            f.write(rendered_content)

    @staticmethod
    def _make_book(kindlegen_path: str, log_path: str, path: str) -> None:
        """Run kindlegen on one OPF file.

        ``log_path`` is accepted for signature compatibility but is not
        used; kindlegen's output is not redirected here.
        """
        # NOTE(review): the command is built by plain string formatting, so
        # a path containing spaces or shell metacharacters is not quoted.
        os.system("{} -dont_append_source {}".format(kindlegen_path, path))

    def make_book_multi(self, rootdir: str, overwrite: bool = True) -> None:
        """Convert every OPF file under ``rootdir`` using a process pool."""
        from multiprocessing import Pool
        self.log.log_it("新建 {} 个线程制作mobi文件.正在制作中,请稍后".format(str(cpu_count())), 'INFO')
        opf_list = self.get_opf(rootdir, overwrite)
        worker = partial(self._make_book, self.kindlegen_path,
                         os.path.join(self.path, 'kindlegen.log'))
        # The context manager releases the worker processes once the
        # (blocking) map call has finished.
        with Pool(cpu_count()) as pool:
            pool.map(worker, opf_list)

    def make_book(self, rootdir: str, overwrite: bool = True) -> None:
        """Convert every OPF file under ``rootdir`` sequentially.

        kindlegen's output is redirected to ``kindlegen.log`` in the output
        directory; each run overwrites that file.
        """
        opf_list = self.get_opf(rootdir, overwrite)
        self.log.log_it("正在制作中,请稍后", 'INFO')
        log_path = os.path.join(self.path, 'kindlegen.log')
        for opf_path in opf_list:
            # get_opf already returns paths joined with rootdir; joining
            # again would duplicate a relative rootdir in the path.
            # NOTE(review): paths are interpolated into the shell command
            # unquoted.
            os.system("{} -dont_append_source {} > {}".format(
                self.kindlegen_path, opf_path, log_path))

    def get_opf(self, rootdir: str, overwrite: bool) -> list:
        """Return the paths of the OPF files directly inside ``rootdir``.

        Args:
            rootdir: Directory to scan (not recursive).
            overwrite: When False, skip OPF files for which a ``.mobi``
                file with the same base name already exists.
        """
        file_names = [
            name for name in os.listdir(rootdir)
            if not os.path.isdir(os.path.join(rootdir, name))
        ]

        # Compare on the base name only, so "opf" occurring elsewhere in a
        # file name or an upper-case extension cannot break the match.
        existing_mobi = set()
        for name in file_names:
            stem, ext = os.path.splitext(name)
            if ext.lower() == '.mobi':
                existing_mobi.add(stem.lower())

        result = []
        for name in file_names:
            stem, ext = os.path.splitext(name)
            if ext.lower() != '.opf':
                continue
            if overwrite or stem.lower() not in existing_mobi:
                result.append(os.path.join(rootdir, name))
        return result
class Crawler:
    """Start the downloader, parser and resulter threads and wait for them.

    ``start`` blocks until the task manager reports that no task is left
    and every resulter thread has finished.
    """

    def __init__(self,
                 to_download_q: PriorityQueue,
                 downloader_parser_q: PriorityQueue,
                 result_q: Queue,
                 parser_worker_count,
                 downloader_worker_count,
                 resulter_worker_count,
                 session=requests.session()):
        # Worker counts are coerced to int before anything else is set up.
        self.parser_worker_count = int(parser_worker_count)
        self.downloader_worker_count = int(downloader_worker_count)
        self.resulter_worker_count = int(resulter_worker_count)

        # Started worker threads, grouped by role.
        self.downloader_worker = []
        self.parser_worker = []
        self.resulter_worker = []

        self.log = Log("Crawler")

        self.to_download_q = to_download_q
        self.downloader_parser_q = downloader_parser_q
        self.result_q = result_q

        self.task_manager = TaskManager(self.to_download_q)
        self.session = session
        self.lock = LOCK
        self.task_manager_thread = Thread(target=self.task_manager.run)

    def start(self):
        """Launch all workers, then block until the crawl has finished."""
        self.task_manager_thread.start()

        for idx in range(self.downloader_worker_count):
            fetcher = Downloader(
                self.to_download_q,
                self.downloader_parser_q,
                self.result_q,
                "Downloader {}".format(idx),
                self.session,
            )
            self.downloader_worker.append(fetcher)
            self.log.log_it("启动 Downloader {}".format(idx), 'INFO')
            fetcher.start()

        for idx in range(self.parser_worker_count):
            parsing = Parser(self.to_download_q,
                             self.downloader_parser_q,
                             self.result_q,
                             "Parser {}".format(idx))
            self.parser_worker.append(parsing)
            self.log.log_it("启动 Parser {}".format(idx), 'INFO')
            parsing.start()

        for idx in range(self.resulter_worker_count):
            sink = Resulter(self.to_download_q,
                            self.downloader_parser_q,
                            self.result_q,
                            "Resulter {}".format(idx))
            self.resulter_worker.append(sink)
            self.log.log_it("启动 Resulter {}".format(idx), 'INFO')
            sink.start()

        while True:
            time.sleep(1)
            if not self.task_manager.is_empty():
                continue

            # Nothing left to do: tell downloaders and parsers to stop.
            for fetcher in self.downloader_worker:
                fetcher.exit()
            for parsing in self.parser_worker:
                parsing.exit()

            # Poll once per second until no resulter thread is alive.
            while True:
                time.sleep(1)
                if not any(sink.is_alive() for sink in self.resulter_worker):
                    break

            for sink in self.resulter_worker:
                sink.exit()
            self.task_manager.exit()
            TaskManager.ALLDONE = False
            return
class Resulter(Thread):
    """Worker thread that runs each task's ``resulter`` callback.

    Tasks are taken from ``result_q``. A callback can ask for the task to
    be re-queued at an earlier stage by raising one of the ``Retry*``
    exceptions handled in ``result``.
    """

    def __init__(self, to_download_q: PriorityQueue,
                 downloader_parser_q: PriorityQueue, result_q: Queue,
                 name: str):
        super().__init__(name=name)
        self.result_q = result_q
        self.downloader_parser_q = downloader_parser_q
        self.to_download_q = to_download_q
        self._exit = False
        self.log = Log(self.name)

    def exit(self):
        """Set the exit flag (``run`` itself loops on ``TaskManager.ALLDONE``)."""
        self._exit = True

    def result(self):
        """Handle at most one task from ``result_q``.

        Every call first notifies all threads waiting on ``COND``. With an
        empty queue it sleeps 0.1 s and returns. Each ``Retry*`` exception
        is logged under its own name and the task is re-queued on the
        matching queue, either directly or through ``retry`` /
        ``retry_nodelay``. Any other exception is logged and the task is
        passed to ``retry`` for ``result_q``.
        """
        with COND:
            COND.notify_all()

        try:
            task = self.result_q.get_nowait()
        except Empty:
            time.sleep(0.1)
            return

        try:
            self.log.log_it("正在处理{}".format(task['tid']))
            task['resulter'](task)
        except RetryDownload:
            self.log.log_it("RetryDownload Exception.Task{}".format(task), 'INFO')
            retry(task, self.to_download_q)
            return
        except RetryDownloadEnForceNodelay:
            # The message used to say "RetryDownloadEnForce", which is not
            # the exception handled here.
            self.log.log_it(
                "RetryDownloadEnForceNodelay Exception.Task{}".format(task),
                'INFO')
            self.to_download_q.put(task)
            return
        except RetryDownloadNodelay:
            self.log.log_it(
                "RetryDownloadNodelay Exception.Task{}".format(task), 'INFO')
            retry_nodelay(task, self.to_download_q)
            return
        except RetryParse:
            self.log.log_it("RetryParse Exception.Task{}".format(task), 'INFO')
            retry(task, self.downloader_parser_q)
            return
        except RetryParseEnForceNodelay:
            # The message used to say "RetryParse", which is not the
            # exception handled here.
            self.log.log_it(
                "RetryParseEnForceNodelay Exception.Task{}".format(task),
                'INFO')
            self.downloader_parser_q.put(task)
            return
        except RetryParseNodelay:
            self.log.log_it("RetryParseNodelay Exception.Task{}".format(task),
                            'INFO')
            retry_nodelay(task, self.downloader_parser_q)
            return
        except RetryResult:
            self.log.log_it("RetryResult Exception.Task{}".format(task), 'INFO')
            retry(task, self.result_q)
            return
        except RetryResultEnForceNodelay:
            self.log.log_it("RetryResultEnForce Exception.Task{}".format(task),
                            'INFO')
            self.result_q.put(task)
            return
        except RetryResultNodelay:
            self.log.log_it("RetryResultNodelay Exception.Task{}".format(task),
                            'INFO')
            retry_nodelay(task, self.result_q)
            return
        except Exception as e:
            traceback.print_exc()
            self.log.log_it(
                "Resulter函数错误。错误信息:{}。Task:{}".format(str(e), task), 'WARN')
            retry(task, self.result_q)
            return

    def run(self):
        """Thread body: keep going until ALLDONE is set and the queue is empty."""
        while (not TaskManager.ALLDONE) or (not self.result_q.empty()):
            self.result()
class Parser(Thread):
    """Worker thread that runs each downloaded task's ``parser`` callback.

    Tasks are taken from ``downloader_parser_q``. New tasks returned by the
    callback are registered and put on ``to_download_q``; the parsed task
    is put on ``result_q`` by ``run``.
    """

    def __init__(self, to_download_q: PriorityQueue,
                 downloader_parser_q: PriorityQueue, result_q: Queue,
                 name: str):
        super().__init__(name=name)
        self.downloader_parser_q = downloader_parser_q
        self.to_download_q = to_download_q
        self.result_q = result_q
        self._exit = False
        self.log = Log(self.name)

    def exit(self):
        """Ask ``run`` to stop after the current ``parser`` call returns."""
        self._exit = True

    def parser(self):
        """Handle at most one task from ``downloader_parser_q``.

        Every call first notifies all threads waiting on ``COND``. With an
        empty queue it sleeps 0.1 s, notifies again and returns ``None``.

        Returns:
            The first element of the pair returned by ``task['parser']``,
            or ``None`` when the queue was empty or an exception was
            handled.
        """
        with COND:
            COND.notify_all()

        try:
            task = self.downloader_parser_q.get_nowait()
        except Empty:
            time.sleep(0.1)
            with COND:
                COND.notify_all()
            return

        try:
            task_with_parsed_data, tasks = task['parser'](task)
            if tasks:
                # A single new task may be returned bare instead of in a list.
                if not isinstance(tasks, list):
                    tasks = [tasks]
                self.log.log_it("获取新任务{}个。".format(len(tasks)), 'INFO')
                for each_task in tasks:
                    TaskManager.register(each_task['tid'])
                    self.to_download_q.put(each_task)
        except RetryDownload:
            self.log.log_it("RetryDownload Exception.Task{}".format(task), 'INFO')
            retry(task, self.to_download_q)
            return
        except RetryDownloadEnForce:
            self.log.log_it(
                "RetryDownloadEnForce Exception.Task{}".format(task), 'INFO')
            self.to_download_q.put(task)
            return
        except RetryParse:
            self.log.log_it("RetryParse Exception.Task{}".format(task), 'INFO')
            retry(task, self.downloader_parser_q)
            return
        except RetryParseEnForce:
            # The message used to say "RetryParse", which is not the
            # exception handled here.
            self.log.log_it("RetryParseEnForce Exception.Task{}".format(task),
                            'INFO')
            self.downloader_parser_q.put(task)
            return
        except Exception as e:
            self.log.log_it("解析错误。错误信息:{}。Task:{}".format(str(e), task), 'WARN')
            traceback.print_exc()
            # NOTE(review): the task is dropped here without being
            # unregistered from TaskManager - confirm that is intended.
            return

        TaskManager.unregister(task['tid'])
        return task_with_parsed_data

    def run(self):
        """Thread body: parse tasks and forward results until ``exit`` is called."""
        while not self._exit:
            task_with_parsed_data = self.parser()
            if task_with_parsed_data:
                self.result_q.put(task_with_parsed_data)
class Crawler:
    """Start the downloader, parser and resulter threads and wait for them.

    ``start`` blocks until the task manager reports that no task is left
    and every resulter thread has finished.
    """

    def __init__(self,
                 to_download_q,
                 downloader_parser_q,
                 result_q,
                 parser_worker_count=CRAWLER_CONFIG.get('PARSER_WORKER', 1),
                 downloader_worker_count=CRAWLER_CONFIG.get('DOWNLOADER_WORKER', 1),
                 resulter_worker_count=CRAWLER_CONFIG.get('RESULTER_WORKER', 1),
                 session=requests.session()):
        # Defaults are read from CRAWLER_CONFIG once, when this signature
        # is evaluated.
        self.parser_worker_count = parser_worker_count
        self.downloader_worker_count = downloader_worker_count
        self.resulter_worker_count = resulter_worker_count

        # Started worker threads, grouped by role.
        self.downloader_worker = []
        self.parser_worker = []
        self.resulter_worker = []

        self.log = Log("Crawler")

        self.to_download_q = to_download_q
        self.downloader_parser_q = downloader_parser_q
        self.result_q = result_q
        self.session = session

        # The same lock object is handed to every worker.
        self.lock = Lock()
        self.task_manager = TaskManager(self.lock)

    def start(self):
        """Launch all workers, then block until the crawl has finished."""
        for idx in range(self.downloader_worker_count):
            fetcher = Downloader(
                self.to_download_q,
                self.downloader_parser_q,
                self.result_q,
                "Downloader {}".format(idx),
                self.lock,
                self.session,
            )
            self.downloader_worker.append(fetcher)
            self.log.log_it("启动 Downloader {}".format(idx), 'INFO')
            fetcher.start()

        for idx in range(self.parser_worker_count):
            parsing = Parser(self.to_download_q,
                             self.downloader_parser_q,
                             self.result_q,
                             "Parser {}".format(idx),
                             self.lock)
            self.parser_worker.append(parsing)
            self.log.log_it("启动 Parser {}".format(idx), 'INFO')
            parsing.start()

        for idx in range(self.resulter_worker_count):
            sink = Resulter(self.to_download_q,
                            self.downloader_parser_q,
                            self.result_q,
                            "Resulter {}".format(idx),
                            self.lock)
            self.resulter_worker.append(sink)
            self.log.log_it("启动 Resulter {}".format(idx), 'INFO')
            sink.start()

        while True:
            time.sleep(1)
            if not self.task_manager.is_empty():
                continue

            # Nothing left to do: tell downloaders and parsers to stop.
            for fetcher in self.downloader_worker:
                fetcher.exit()
            for parsing in self.parser_worker:
                parsing.exit()

            # Poll once per second until no resulter thread is alive.
            while True:
                time.sleep(1)
                if not any(sink.is_alive() for sink in self.resulter_worker):
                    break
            return
class Resulter(Thread):
    """Worker thread that runs each task's ``resulter`` callback.

    Tasks are taken from ``result_q``. A callback can ask for the task to
    be re-queued at an earlier stage by raising one of the ``Retry*``
    exceptions handled in ``result``.
    """

    def __init__(
            self,
            to_download_q: PriorityQueue,
            downloader_parser_q: PriorityQueue,
            result_q: Queue,
            name: str,
            lock):
        super().__init__(name=name)
        self.result_q = result_q
        self.downloader_parser_q = downloader_parser_q
        self.to_download_q = to_download_q
        self._exit = False
        self.log = Log(self.name)
        self.lock = lock
        self.task_manager = TaskManager(self.lock)

    def exit(self):
        """Set the exit flag (``run`` itself loops on ``TaskManager.ALLDONE``)."""
        self._exit = True

    @staticmethod
    def _requeue_with_retry(task, target_q):
        """Put ``task`` on ``target_q`` if it still has retries left.

        ``task['retry']`` is the number of retries allowed and
        ``task['retried']`` counts the retries already made. The counter
        starts at 0 for both the comparison and the increment, so a task
        with ``retry == n`` is re-queued exactly ``n`` times.
        """
        allowed = task.get('retry', None)
        if not allowed:
            return
        done = task.get('retried', 0)
        if done < allowed:
            task.update({'retried': done + 1})
            target_q.put(task)

    def result(self):
        """Handle at most one task from ``result_q``.

        Every call first notifies all threads waiting on ``COND``. With an
        empty queue it sleeps one second and returns. The plain ``Retry*``
        exceptions re-queue the task only while retries are left; the
        ``*EnForce`` variants re-queue it unconditionally. Any other
        exception is logged and the task is dropped.
        """
        with COND:
            COND.notify_all()

        try:
            task = self.result_q.get_nowait()
        except Empty:
            time.sleep(1)
            return

        try:
            task['resulter'](task)
        except RetryDownload:
            self.log.log_it("RetryDownload Exception.Task{}".format(task), 'INFO')
            self._requeue_with_retry(task, self.to_download_q)
            return
        except RetryDownloadEnForce:
            self.log.log_it("RetryDownloadEnForce Exception.Task{}".format(task), 'INFO')
            self.to_download_q.put(task)
            return
        except RetryParse:
            self.log.log_it("RetryParse Exception.Task{}".format(task), 'INFO')
            self._requeue_with_retry(task, self.downloader_parser_q)
            return
        except RetryParseEnForce:
            # The message used to say "RetryParse", which is not the
            # exception handled here.
            self.log.log_it("RetryParseEnForce Exception.Task{}".format(task), 'INFO')
            self.downloader_parser_q.put(task)
            return
        except RetryResult:
            self.log.log_it("RetryResult Exception.Task{}".format(task), 'INFO')
            self._requeue_with_retry(task, self.result_q)
            return
        except RetryResultEnForce:
            self.log.log_it("RetryResultEnForce Exception.Task{}".format(task), 'INFO')
            self.result_q.put(task)
            return
        except Exception as e:
            traceback.print_exc()
            self.log.log_it("Resulter函数错误。错误信息:{}。Task:{}".format(str(e), task), 'WARN')

    def run(self):
        """Thread body: keep going until ALLDONE is set and the queue is empty."""
        while not (TaskManager.ALLDONE and self.result_q.empty()):
            self.result()
class Parser(Thread):
    """Worker thread that runs each downloaded task's ``parser`` callback.

    Tasks are taken from ``downloader_parser_q``. New tasks returned by the
    callback are registered and put on ``to_download_q``; the parsed task
    is put on ``result_q`` by ``run``.
    """

    def __init__(
            self,
            to_download_q: PriorityQueue,
            downloader_parser_q: PriorityQueue,
            result_q: Queue,
            name: str,
            lock):
        super().__init__(name=name)
        self.downloader_parser_q = downloader_parser_q
        self.to_download_q = to_download_q
        self.result_q = result_q
        self._exit = False
        self.log = Log(self.name)
        self.lock = lock
        self.task_manager = TaskManager(self.lock)

    def exit(self):
        """Ask ``run`` to stop after the current ``parser`` call returns."""
        self._exit = True

    @staticmethod
    def _requeue_with_retry(task, target_q):
        """Put ``task`` on ``target_q`` if it still has retries left.

        ``task['retry']`` is the number of retries allowed and
        ``task['retried']`` counts the retries already made. The counter
        starts at 0 for both the comparison and the increment, so a task
        with ``retry == n`` is re-queued exactly ``n`` times.
        """
        allowed = task.get('retry', None)
        if not allowed:
            return
        done = task.get('retried', 0)
        if done < allowed:
            task.update({'retried': done + 1})
            target_q.put(task)

    def parser(self):
        """Handle at most one task from ``downloader_parser_q``.

        Every call first notifies all threads waiting on ``COND``. The
        queue is polled with a short timeout rather than a blocking
        ``get()``, so ``run`` gets to re-check the exit flag while the
        queue is idle. Once a task has been taken it is always
        unregistered from the task manager, whatever the outcome.

        Returns:
            The first element of the pair returned by ``task['parser']``,
            or ``None`` when no task was available or an exception was
            handled.
        """
        # Imported locally so the method does not depend on a module-level
        # import of Empty.
        from queue import Empty

        with COND:
            COND.notify_all()

        try:
            task = self.downloader_parser_q.get(timeout=0.1)
        except Empty:
            return None

        try:
            task_with_parsed_data, tasks = task['parser'](task)
            if tasks:
                # A single new task may be returned bare instead of in a list.
                new_tasks = tasks if isinstance(tasks, list) else [tasks]
                self.log.log_it("获取新任务{}个。".format(len(new_tasks)), 'INFO')
                for new_task in new_tasks:
                    self.task_manager.register(new_task['tid'])
                    self.to_download_q.put(new_task)
        except RetryDownload:
            self.log.log_it("RetryDownload Exception.Task{}".format(task), 'INFO')
            self._requeue_with_retry(task, self.to_download_q)
            return
        except RetryDownloadEnForce:
            self.log.log_it("RetryDownloadEnForce Exception.Task{}".format(task), 'INFO')
            self.to_download_q.put(task)
            return
        except RetryParse:
            self.log.log_it("RetryParse Exception.Task{}".format(task), 'INFO')
            self._requeue_with_retry(task, self.downloader_parser_q)
            return
        except RetryParseEnForce:
            # The message used to say "RetryParse", which is not the
            # exception handled here.
            self.log.log_it("RetryParseEnForce Exception.Task{}".format(task), 'INFO')
            self.downloader_parser_q.put(task)
            return
        except Exception as e:
            traceback.print_exc()
            self.log.log_it("解析错误。错误信息:{}。Task:{}".format(str(e), task), 'WARN')
            return
        finally:
            self.task_manager.unregister(task['tid'])

        return task_with_parsed_data

    def run(self):
        """Thread body: parse tasks and forward results until ``exit`` is called."""
        while not self._exit:
            task_with_parsed_data = self.parser()
            if task_with_parsed_data:
                self.result_q.put(task_with_parsed_data)