def __init__(self, to_download_q: PriorityQueue, downloader_parser_q: PriorityQueue, result_q: Queue,
             parser_worker_count, downloader_worker_count, resulter_worker_count,
             session=requests.session()):
    self.parser_worker_count = int(parser_worker_count)
    self.downloader_worker_count = int(downloader_worker_count)
    self.resulter_worker_count = int(resulter_worker_count)
    self.downloader_worker = []
    self.parser_worker = []
    self.resulter_worker = []
    self.log = Log("Crawler")
    self.to_download_q = to_download_q
    self.downloader_parser_q = downloader_parser_q
    self.result_q = result_q
    self.task_manager = TaskManager(self.to_download_q)
    # Note: the default `session` is evaluated once at definition time, so all
    # Crawler instances share one requests.Session unless one is passed in.
    self.session = session
    self.lock = LOCK
    self.task_manager_thread = Thread(target=self.task_manager.run)
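# TaskManager is referenced throughout these snippets but never defined in them.
# Below is a minimal, hypothetical sketch inferred from its call sites
# (register / unregister / is_empty / run / exit / the class-level ALLDONE
# flag). The variants above construct it with either a queue or a lock; this
# sketch keeps its own lock. The real web2kindle implementation may differ.
import time
from threading import Lock


class TaskManager:
    ALLDONE = False  # class-level flag polled by the Resulter threads

    def __init__(self, lock_or_queue=None):
        self._lock = Lock()
        self._pending = {}      # tid -> number of in-flight registrations
        self._seen_any = False  # becomes True after the first register()
        self._exit = False

    def register(self, tid):
        with self._lock:
            self._seen_any = True
            self._pending[tid] = self._pending.get(tid, 0) + 1

    def unregister(self, tid):
        with self._lock:
            if self._pending.get(tid, 0) <= 1:
                self._pending.pop(tid, None)
            else:
                self._pending[tid] -= 1

    def is_empty(self):
        with self._lock:
            return self._seen_any and not self._pending

    def run(self):
        # Background loop: flag completion once every registered task
        # has been unregistered again.
        while not self._exit:
            time.sleep(1)
            if self.is_empty():
                TaskManager.ALLDONE = True

    def exit(self):
        self._exit = True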
def __init__(self, to_download_q: PriorityQueue, downloader_parser_q: PriorityQueue, result_q: Queue,
             name: str):
    super().__init__(name=name)
    self.result_q = result_q
    self.downloader_parser_q = downloader_parser_q
    self.to_download_q = to_download_q
    self._exit = False
    self.log = Log(self.name)
def __init__(self, items: list, path: str, book_name: str, kindlegen_path: str = KINDLE_GEN_PATH) -> None:
    self.kindlegen_path = kindlegen_path if kindlegen_path is not None else KINDLE_GEN_PATH
    self.items = items
    self.book_name = str(book_name)
    self.path = path
    self.to_remove = set()
    self.log = Log('HTML2Kindle')

    if not os.path.exists(path):
        os.makedirs(path)
class Downloader(Thread):
    def __init__(self, to_download_q: PriorityQueue, downloader_parser_q: PriorityQueue, result_q: Queue,
                 name: str, session=requests.session()):
        super().__init__(name=name)
        self.to_download_q = to_download_q
        self.downloader_parser_q = downloader_parser_q
        self.result_q = result_q
        self.session = session
        self._exit = False
        self.log = Log(self.name)

    def exit(self):
        self._exit = True

    def request(self):
        response = None
        try:
            task = self.to_download_q.get_nowait()
            TaskManager.register(task['tid'])
        except Empty:
            self.log.log_it("Scheduler-to-Downloader queue is empty; {} is waiting.".format(self.name), 'DEBUG')
            with COND:
                COND.wait()
            self.log.log_it("Downloader-to-Parser queue is not empty; {} woke up.".format(self.name), 'DEBUG')
            return

        self.log.log_it("Requesting {}".format(task['url']), 'INFO')
        try:
            response = self.session.request(task['method'], task['url'], **task.get('meta', {}))
        except Exception as e:
            traceback.print_exc()
            self.log.log_it("Network error. Message: {} URL: {} Response: {}".format(str(e), task['url'], response),
                            'INFO')
            retry(task, self.to_download_q)
            return

        task['response'] = response if response else None
        self.downloader_parser_q.put(task)

    def run(self):
        while not self._exit:
            self.request()
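# `COND`, `retry` and `retry_nodelay` are module-level helpers that the workers
# above rely on but that are not included in these snippets. A plausible
# sketch, assuming `retry` honours the task's 'retry'/'retried' budget and
# `retry_nodelay` does the same without sleeping first; the exact semantics
# are an assumption, not the confirmed implementation.
import time
from threading import Condition

COND = Condition()  # downloaders wait on this; parsers/resulters notify it


def retry(task, queue, delay=1):
    """Re-queue `task` if it still has retry budget, after a short delay."""
    if task.get('retry') and task.get('retried', 0) < task['retry']:
        task['retried'] = task.get('retried', 0) + 1
        time.sleep(delay)
        queue.put(task)


def retry_nodelay(task, queue):
    """Same as retry(), but without the delay."""
    if task.get('retry') and task.get('retried', 0) < task['retry']:
        task['retried'] = task.get('retried', 0) + 1
        queue.put(task)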
def __init__(self):
    self.log = Log('SendEmail2Kindle')
    try:
        self.username = MAIN_CONFIG['EMAIL_USERNAME']
        self.password = MAIN_CONFIG['PASSWORD']
        self.smtp_addr = MAIN_CONFIG['SMTP_ADDR']
        self.kindle_addr = MAIN_CONFIG['KINDLE_ADDR']
    except KeyError:
        self.log.log_it("Cannot instantiate SendEmail2Kindle; make sure config.yml is complete.", 'ERROR')
        return

    self.sender = self.username
    self.sended = []
    self.client = smtplib.SMTP()
def __init__(self, to_download_q: PriorityQueue, downloader_parser_q: PriorityQueue, result_q: Queue,
             name: str, lock):
    super().__init__(name=name)
    self.result_q = result_q
    self.downloader_parser_q = downloader_parser_q
    self.to_download_q = to_download_q
    self._exit = False
    self.log = Log(self.name)
    self.lock = lock
    self.task_manager = TaskManager(self.lock)
def __init__(self, items, path, book_name, kindlegen_path=KINDLE_GEN_PATH):
    # self.template_env = Environment(loader=PackageLoader('web2kindle'))
    # self.content_template = self.template_env.get_template('kindle_content.html')
    # self.opf_template = self.template_env.get_template('kindle_opf.html')
    # self.index_template = self.template_env.get_template('kindle_table.html')
    # PackageLoader is buggy after packaging into an exe, so templates are read from files instead.
    self.kindlegen_path = kindlegen_path if kindlegen_path is not None else KINDLE_GEN_PATH
    self.items = items
    self.book_name = str(book_name)
    self.path = path
    self.to_remove = set()
    self.log = Log('HTML2Kindle')

    if not os.path.exists(os.path.split(path)[0]):
        os.makedirs(os.path.split(path)[0])
class Downloader(Thread):
    def __init__(self, to_download_q: PriorityQueue, downloader_parser_q: PriorityQueue, result_q: Queue,
                 name: str, lock, session=requests.session()):
        super().__init__(name=name)
        self.to_download_q = to_download_q
        self.downloader_parser_q = downloader_parser_q
        self.result_q = result_q
        self.session = session
        self._exit = False
        self.log = Log(self.name)
        self.lock = lock
        self.task_manager = TaskManager(self.lock)

    def exit(self):
        self._exit = True

    def request(self):
        response = None
        try:
            task = self.to_download_q.get_nowait()
            self.task_manager.register(task['tid'])
        except Empty:
            self.log.log_it("Scheduler-to-Downloader queue is empty; {} is waiting.".format(self.name), 'DEBUG')
            with COND:
                COND.wait()
            self.log.log_it("Downloader-to-Parser queue is not empty; {} woke up.".format(self.name), 'DEBUG')
            return

        self.log.log_it("Requesting {}".format(task['url']), 'INFO')
        try:
            response = self.session.request(task['method'], task['url'], **task.get('meta', {}))
        except Exception as e:
            # traceback.print_exc(file=open(os.path.join(config.get('LOG_PATH'), 'downlaoder_traceback'), 'a'))
            traceback.print_exc()
            self.log.log_it("Network error. Message: {} URL: {} Response: {}".format(str(e), task['url'], response),
                            'INFO')
            if task.get('retry'):
                if task.get('retried', 0) < task['retry']:
                    # Count from 0 so the first retry records retried == 1.
                    task['retried'] = task.get('retried', 0) + 1
                    self.to_download_q.put(task)
            return

        task['response'] = response if response else None
        self.downloader_parser_q.put(task)

    def run(self):
        while not self._exit:
            self.request()
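# Tasks travel through PriorityQueue instances, but plain dicts are not
# orderable in Python 3, so the `Task` type the scripts import presumably
# supplies ordering. A minimal sketch of one way to do that; the 'priority'
# key and the make_task helper are assumptions, not the confirmed API:
class Task(dict):
    def __lt__(self, other):
        # Lower 'priority' value is dequeued first.
        return self.get('priority', 0) < other.get('priority', 0)

    @staticmethod
    def make_task(params: dict) -> 'Task':
        return Task(params)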
def __init__(self):
    self.CONFIG = load_config('./web2kindle/config/config.yml')
    self.log = Log('SendEmail2Kindle')
    try:
        self.username = self.CONFIG['EMAIL_USERNAME']
        self.password = self.CONFIG['PASSWORD']
        self.smtp_addr = self.CONFIG['SMTP_ADDR']
        self.kindle_addr = self.CONFIG['KINDLE_ADDR']
    except KeyError:
        self.log.log_it("Cannot instantiate SendEmail2Kindle; make sure config.yml is complete.", 'ERROR')
        import os
        os._exit(1)

    self.sender = self.username
    self.sended = []
    self.client = smtplib.SMTP()
def __init__(self, to_download_q, downloader_parser_q, result_q,
             parser_worker_count=CRAWLER_CONFIG.get('PARSER_WORKER', 1),
             downloader_worker_count=CRAWLER_CONFIG.get('DOWNLOADER_WORKER', 1),
             resulter_worker_count=CRAWLER_CONFIG.get('RESULTER_WORKER', 1),
             session=requests.session()):
    self.parser_worker_count = parser_worker_count
    self.downloader_worker_count = downloader_worker_count
    self.resulter_worker_count = resulter_worker_count
    self.downloader_worker = []
    self.parser_worker = []
    self.resulter_worker = []
    self.log = Log("Crawler")
    self.to_download_q = to_download_q
    self.downloader_parser_q = downloader_parser_q
    self.result_q = result_q
    self.session = session
    self.lock = Lock()
    self.task_manager = TaskManager(self.lock)
class HTML2Kindle:
    content_template = Template(read_file('./web2kindle/templates/kindle_content.html'))
    opf_template = Template(read_file('./web2kindle/templates/kindle_opf.html'))
    index_template = Template(read_file('./web2kindle/templates/kindle_table.html'))
    ncx_template = Template(read_file('./web2kindle/templates/kindle_ncx.ncx'))

    def __init__(self, items: list, path: str, book_name: str, kindlegen_path: str = KINDLE_GEN_PATH) -> None:
        self.kindlegen_path = kindlegen_path if kindlegen_path is not None else KINDLE_GEN_PATH
        self.items = items
        self.book_name = str(book_name)
        self.path = path
        self.to_remove = set()
        self.log = Log('HTML2Kindle')

        if not os.path.exists(path):
            os.makedirs(path)

    def __exit__(self, exc_type: None, exc_val: None, exc_tb: None) -> None:
        self.remove()

    def __enter__(self):
        return self

    def remove(self) -> None:
        for i in self.to_remove:
            try:
                os.remove(i)
            except FileNotFoundError:
                pass

    def make_metadata(self, window: int = 20) -> None:
        window = int(window)
        # Split the book into volumes of `window` articles each.
        split_items = split_list(self.items, window)

        for index, items in enumerate(split_items):
            self.log.log_it("Building metadata for {}_{}".format(self.book_name, str(index)), 'INFO')
            opf = []
            table = []
            table_name = '{}_{}.html'.format(self.book_name, str(index))
            opf_name = '{}_{}.opf'.format(self.book_name, str(index))
            ncx_name = '{}_{}.ncx'.format(self.book_name, str(index))
            table_path = os.path.join(self.path, table_name)
            opf_path = os.path.join(self.path, opf_name)
            ncx_path = os.path.join(self.path, ncx_name)

            # Mark intermediate files for later deletion.
            self.to_remove.add(table_path)
            self.to_remove.add(opf_path)
            self.to_remove.add(ncx_path)

            for item in items:
                kw = {'author_name': item[5], 'voteup_count': item[4], 'created_time': item[3]}

                # File name is built from title + author.
                article_path = os.path.join(self.path, format_file_name(item[1], item[5]) + '.html')
                # Avoid clobbering an existing file with the same name.
                if os.path.exists(article_path):
                    article_path = article_path.replace('.html', '') + ''.join(random_char(3)) + '.html'

                self.make_content(item[1], item[2], article_path, kw)
                # Mark for later deletion.
                self.to_remove.add(article_path)

                opf.append({'id': article_path, 'href': article_path, 'title': item[1]})
                table.append({'href': article_path, 'name': item[1]})

            self.make_table(table, table_path)
            self.make_opf(self.book_name + '_' + str(index), opf, table_path, opf_path, ncx_path)
            self.make_ncx(self.book_name + '_' + str(index), opf, table_path, ncx_path)

    def make_opf(self, title: str, navigation: list, table_path: str, opf_path: str, ncx_path: str) -> None:
        rendered_content = self.opf_template.render(title=title, navigation=navigation,
                                                    table_href=table_path, ncx_href=ncx_path)
        with codecs.open(opf_path, 'w', 'utf_8_sig') as f:
            f.write(rendered_content)

    def make_ncx(self, title: str, navigation: list, table_path: str, ncx_path: str) -> None:
        # The last parameter is the .ncx output path (it was misleadingly named opf_path before).
        rendered_content = self.ncx_template.render(title=title, navigation=navigation, table_href=table_path)
        with codecs.open(ncx_path, 'w', 'utf_8_sig') as f:
            f.write(rendered_content)

    def make_content(self, title: str, content: str, path: str, kw: dict = None) -> None:
        rendered_content = self.content_template.render(title=title, content=content, kw=kw)
        with codecs.open(path, 'w', 'utf_8_sig') as f:
            f.write(rendered_content)

    def make_table(self, navigation: list, path: str) -> None:
        rendered_content = self.index_template.render(navigation=navigation)
        with codecs.open(path, 'w', 'utf_8_sig') as f:
            f.write(rendered_content)

    @staticmethod
    def _make_book(kindlegen_path: str, log_path: str, path: str) -> None:
        # Redirect kindlegen output to the log file, matching make_book below.
        os.system("{} -dont_append_source {} > {}".format(kindlegen_path, path, log_path))

    def make_book_multi(self, rootdir: str, overwrite: bool = True) -> None:
        from multiprocessing import Pool
        # Pool spawns processes, not threads.
        self.log.log_it("Spawning {} processes to build mobi files. Working, please wait.".format(str(cpu_count())),
                        'INFO')
        pool = Pool(cpu_count())
        opf_list = self.get_opf(rootdir, overwrite)
        pool.map(partial(self._make_book, self.kindlegen_path, os.path.join(self.path, 'kindlegen.log')), opf_list)

    def make_book(self, rootdir: str, overwrite: bool = True) -> None:
        opf_list = self.get_opf(rootdir, overwrite)
        self.log.log_it("Building mobi files. Working, please wait.", 'INFO')
        for i in opf_list:
            # get_opf already returns paths joined with rootdir.
            os.system("{} -dont_append_source {} > {}".format(
                self.kindlegen_path, i, os.path.join(self.path, 'kindlegen.log')))

    def get_opf(self, rootdir: str, overwrite: bool) -> list:
        result = []
        mobi = []
        for i in os.listdir(rootdir):
            if not os.path.isdir(os.path.join(rootdir, i)) and i.lower().endswith('mobi'):
                mobi.append(i)
        for i in os.listdir(rootdir):
            if not os.path.isdir(os.path.join(rootdir, i)) and i.lower().endswith('opf'):
                if overwrite:
                    result.append(os.path.join(rootdir, i))
                elif i.replace('opf', 'mobi') not in mobi:
                    result.append(os.path.join(rootdir, i))
        return result
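# A sketch of how HTML2Kindle is meant to be driven, based on the methods
# above. `items` rows follow the indexing used in make_metadata (item[1]=title,
# item[2]=content, item[3]=created_time, item[4]=voteup_count, item[5]=author);
# the sample row and save path are made up for illustration.
items = [
    (1, 'Sample title', '<p>Sample content</p>', '2018-01-01', 42, 'Sample author'),
]
with HTML2Kindle(items, './data/sample_book/', 'sample_book', KINDLE_GEN_PATH) as html2kindle:
    html2kindle.make_metadata(window=50)                # write per-volume .html/.opf/.ncx
    html2kindle.make_book_multi('./data/sample_book/')  # run kindlegen over the .opf files
# Leaving the `with` block calls remove(), deleting the intermediate files.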
import time
from copy import deepcopy
from queue import Queue, PriorityQueue
from urllib.parse import urlparse

from bs4 import BeautifulSoup

from web2kindle import MAIN_CONFIG
from web2kindle.libs.crawler import Crawler, RetryDownload, Task
from web2kindle.libs.db import ArticleDB
from web2kindle.libs.html2kindle import HTML2Kindle
from web2kindle.libs.send_email import SendEmail2Kindle
from web2kindle.libs.utils import write, load_config, check_config, md5string
from web2kindle.libs.log import Log

SCRIPT_CONFIG = load_config('./web2kindle/config/guoke_scientific.yml')
LOG = Log("guoke_scientific")
API_URL = "http://www.guokr.com/apis/minisite/article.json?retrieve_type=by_subject&limit=20&offset={}&_=1508757235776"
DEFAULT_HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/'
                  '61.0.3163.100 Safari/537.36'
}
check_config(MAIN_CONFIG, SCRIPT_CONFIG, 'SAVE_PATH', LOG)
ARTICLE_ID_SET = set()


def main(start, end, kw):
    iq = PriorityQueue()
    oq = PriorityQueue()
    result_q = Queue()
    crawler = Crawler(iq, oq, result_q,
                      MAIN_CONFIG.get('PARSER_WORKER', 1),
                      MAIN_CONFIG.get('DOWNLOADER_WORKER', 1),
                      MAIN_CONFIG.get('RESULTER_WORKER', 1))
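    # A sketch of how main() would typically continue: seed the to-download
    # queue and start the crawler. The parser callback name ('parser_list'),
    # the priority and the retry budget are illustrative, not this script's
    # confirmed code; the task shape matches what Downloader/Parser/Resulter
    # consume above.
    for offset in range(start, end, 20):
        url = API_URL.format(offset)
        task = Task.make_task({
            'tid': md5string(url),                # task id registered with TaskManager
            'url': url,
            'method': 'GET',
            'meta': {'headers': DEFAULT_HEADERS},
            'parser': parser_list,                # hypothetical parser callback
            'priority': 0,
            'retry': 3,
        })
        iq.put(task)
    crawler.start()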
import time
from copy import deepcopy
from queue import Queue, PriorityQueue
from urllib.parse import urlparse, unquote

from bs4 import BeautifulSoup

from web2kindle import MAIN_CONFIG
from web2kindle.libs.crawler import Crawler, RetryDownload, Task
from web2kindle.libs.db import ArticleDB
from web2kindle.libs.html2kindle import HTML2Kindle
from web2kindle.libs.send_email import SendEmail2Kindle
from web2kindle.libs.utils import write, md5string, load_config, check_config, format_file_name
from web2kindle.libs.log import Log

SCRIPT_CONFIG = load_config('./web2kindle/config/jianshu_user.yml')
LOG = Log("jianshu_user")
DEFAULT_HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/'
                  '61.0.3163.100 Safari/537.36'
}
check_config(MAIN_CONFIG, SCRIPT_CONFIG, 'SAVE_PATH', LOG)
ARTICLE_ID_SET = set()

ORDER_TOP = 'top'
ORDER_COMMENT = 'commented_at'
ORDER_ADD = 'added_at'
API_URL = 'https://www.jianshu.com/u/{}?order_by={}&page={}'
BASE_URL = 'https://www.jianshu.com/u/{}'
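# How the constants above compose: page N of a user's articles, ordered by
# publication time (the user slug '1562c7f16a04' is made up for illustration):
first_page_url = API_URL.format('1562c7f16a04', ORDER_ADD, 1)
# -> 'https://www.jianshu.com/u/1562c7f16a04?order_by=added_at&page=1'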
import re
import time
from copy import deepcopy
from queue import Queue, PriorityQueue, Empty
from urllib.parse import urlparse, unquote

from bs4 import BeautifulSoup

from web2kindle.libs.crawler import Crawler, RetryDownload, Task
from web2kindle.libs.db import ArticleDB
from web2kindle.libs.html2kindle import HTML2Kindle
from web2kindle.libs.send_email import SendEmail2Kindle
from web2kindle.libs.utils import write, md5string, load_config, check_config
from web2kindle.libs.log import Log

SCRIPT_CONFIG = load_config('./web2kindle/config/zhihu_zhuanlan_config.yml')
MAIN_CONFIG = load_config('./web2kindle/config/config.yml')
LOG = Log("zhihu_zhuanlan")
DEFAULT_HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/'
                  '61.0.3163.100 Safari/537.36'
}
check_config(MAIN_CONFIG, SCRIPT_CONFIG, 'SAVE_PATH', LOG)


def main(zhuanlan_name_list, start, end, kw):
    iq = PriorityQueue()
    oq = PriorityQueue()
    result_q = Queue()
    crawler = Crawler(iq, oq, result_q)
class SendEmail:
    def __init__(self):
        self.log = Log('SendEmail2Kindle')
        try:
            self.username = MAIN_CONFIG['EMAIL_USERNAME']
            self.password = MAIN_CONFIG['PASSWORD']
            self.smtp_addr = MAIN_CONFIG['SMTP_ADDR']
            self.kindle_addr = MAIN_CONFIG['KINDLE_ADDR']
        except KeyError:
            self.log.log_it("Cannot instantiate SendEmail2Kindle; make sure config.yml is complete.", 'ERROR')
            return

        self.sender = self.username
        self.sended = []
        self.client = smtplib.SMTP()

    def connect(self) -> bool:
        try:
            self.log.log_it("Connecting to the mail server", 'INFO')
            self.client.connect(self.smtp_addr)
            self.log.log_it("Logging in to the mail server", 'INFO')
            self.client.login(self.username, self.password)
            return True
        except smtplib.SMTPAuthenticationError:
            self.log.log_it("Wrong email username or password", 'WARN')
            return False
        except Exception as e:
            self.log.log_it("Connection error. Message: {}".format(str(e)), 'INFO')
            return False

    def disconnect(self) -> None:
        self.client.quit()

    def __enter__(self):
        if not self.connect():
            raise Exception("SendEmail2Kindle failed to connect to the server")
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.disconnect()

    def send_file(self, file_path: str) -> None:
        msg = MIMEMultipart()
        msg['Subject'] = 'Web2kindle'
        msg['From'] = self.sender
        msg['To'] = self.kindle_addr

        file = MIMEApplication(open(file_path, 'rb').read())
        file.add_header('Content-Disposition', 'attachment', filename=file_path)
        msg.attach(file)

        try:
            self.client.sendmail(self.sender, self.kindle_addr, msg.as_string())
            self.sended.append(file_path)
        except smtplib.SMTPRecipientsRefused:
            self.log.log_it("All recipients were refused.", 'WARN')
        except smtplib.SMTPSenderRefused:
            self.log.log_it("The sender address was refused.", 'WARN')
        except smtplib.SMTPDataError:
            self.log.log_it("The server refused to accept the message data.", 'WARN')
        except smtplib.SMTPException as e:
            self.log.log_it("Unknown error. FILE_PATH:{}, ERRINFO:{}".format(file_path, str(e)), 'WARN')

    def send_files(self, file_paths: list) -> None:
        for file_path in file_paths:
            self.log.log_it("Sending: {}".format(file_path), 'INFO')
            self.send_file(file_path)
            self.log.log_it("{} sent successfully".format(file_path), 'INFO')
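# A sketch of the intended usage: connect() runs on __enter__ and quit() on
# __exit__, so sending is wrapped in a `with` block. The mobi path is made up.
with SendEmail() as send_email:
    send_email.send_files(['./data/sample_book/sample_book_0.mobi'])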
class Resulter(Thread):
    def __init__(self, to_download_q: PriorityQueue, downloader_parser_q: PriorityQueue, result_q: Queue,
                 name: str):
        super().__init__(name=name)
        self.result_q = result_q
        self.downloader_parser_q = downloader_parser_q
        self.to_download_q = to_download_q
        self._exit = False
        self.log = Log(self.name)

    def exit(self):
        self._exit = True

    def result(self):
        with COND:
            COND.notify_all()
        try:
            task = self.result_q.get_nowait()
        except Empty:
            time.sleep(0.1)
            return

        try:
            self.log.log_it("Processing {}".format(task['tid']))
            task['resulter'](task)
        except RetryDownload:
            self.log.log_it("RetryDownload Exception. Task {}".format(task), 'INFO')
            retry(task, self.to_download_q)
            return
        except RetryDownloadEnForceNodelay:
            self.log.log_it("RetryDownloadEnForceNodelay Exception. Task {}".format(task), 'INFO')
            self.to_download_q.put(task)
            return
        except RetryDownloadNodelay:
            self.log.log_it("RetryDownloadNodelay Exception. Task {}".format(task), 'INFO')
            retry_nodelay(task, self.to_download_q)
            return
        except RetryParse:
            self.log.log_it("RetryParse Exception. Task {}".format(task), 'INFO')
            retry(task, self.downloader_parser_q)
            return
        except RetryParseEnForceNodelay:
            self.log.log_it("RetryParseEnForceNodelay Exception. Task {}".format(task), 'INFO')
            self.downloader_parser_q.put(task)
            return
        except RetryParseNodelay:
            self.log.log_it("RetryParseNodelay Exception. Task {}".format(task), 'INFO')
            retry_nodelay(task, self.downloader_parser_q)
            return
        except RetryResult:
            self.log.log_it("RetryResult Exception. Task {}".format(task), 'INFO')
            retry(task, self.result_q)
            return
        except RetryResultEnForceNodelay:
            self.log.log_it("RetryResultEnForceNodelay Exception. Task {}".format(task), 'INFO')
            self.result_q.put(task)
            return
        except RetryResultNodelay:
            self.log.log_it("RetryResultNodelay Exception. Task {}".format(task), 'INFO')
            retry_nodelay(task, self.result_q)
            return
        except Exception as e:
            traceback.print_exc()
            self.log.log_it("Error in resulter. Message: {}. Task: {}".format(str(e), task), 'WARN')
            retry(task, self.result_q)
            return

    def run(self):
        # Keep draining results until every task is done and the queue is empty.
        while (not TaskManager.ALLDONE) or (not self.result_q.empty()):
            self.result()
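# The Retry* exceptions caught above are control-flow signals raised inside
# the user-supplied parser/resulter callbacks to re-queue a task. They carry
# no state, so plausible definitions are bare Exception subclasses (a
# hypothetical sketch; the real module may arrange them in a hierarchy):
class RetryDownload(Exception):
    """Re-queue for download, honouring the task's retry budget."""


class RetryDownloadNodelay(Exception):
    """Same, but without the retry delay."""


class RetryDownloadEnForceNodelay(Exception):
    """Re-queue for download unconditionally, ignoring the retry budget."""


class RetryParse(Exception):
    """Re-queue for parsing, honouring the retry budget."""


class RetryParseNodelay(Exception):
    """Same, but without the retry delay."""


class RetryParseEnForceNodelay(Exception):
    """Re-queue for parsing unconditionally."""


class RetryResult(Exception):
    """Re-queue for the resulter, honouring the retry budget."""


class RetryResultNodelay(Exception):
    """Same, but without the retry delay."""


class RetryResultEnForceNodelay(Exception):
    """Re-queue for the resulter unconditionally."""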
import time
from queue import Queue, PriorityQueue
from threading import current_thread, active_count
from urllib.parse import urlparse, unquote

from bs4 import BeautifulSoup

from web2kindle import MAIN_CONFIG
from web2kindle.libs.crawler import Crawler, md5string, RetryDownload, Task
from web2kindle.libs.db import ArticleDB
from web2kindle.libs.utils import write, load_config, check_config
from web2kindle.libs.html2kindle import HTML2Kindle
from web2kindle.libs.log import Log
from web2kindle.libs.send_email import SendEmail2Kindle

SCRIPT_CONFIG = load_config('./web2kindle/config/zhihu_collection.yml')
GET_BOOK_NAME_FLAG = False
LOG = Log('zhihu_collection')
DEFAULT_HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/'
                  '61.0.3163.100 Safari/537.36'
}
check_config(MAIN_CONFIG, SCRIPT_CONFIG, 'SAVE_PATH', LOG)
ARTICLE_ID_SET = set()


def main(collection_num_list, start, end, kw):
    iq = PriorityQueue()
    oq = PriorityQueue()
    result_q = Queue()
    crawler = Crawler(iq, oq, result_q,
                      MAIN_CONFIG.get('PARSER_WORKER', 1),
                      MAIN_CONFIG.get('DOWNLOADER_WORKER', 1),
                      MAIN_CONFIG.get('RESULTER_WORKER', 1))
class Resulter(Thread):
    def __init__(self, to_download_q: PriorityQueue, downloader_parser_q: PriorityQueue, result_q: Queue,
                 name: str, lock):
        super().__init__(name=name)
        self.result_q = result_q
        self.downloader_parser_q = downloader_parser_q
        self.to_download_q = to_download_q
        self._exit = False
        self.log = Log(self.name)
        self.lock = lock
        self.task_manager = TaskManager(self.lock)

    def exit(self):
        self._exit = True

    def result(self):
        with COND:
            COND.notify_all()
        try:
            task = self.result_q.get_nowait()
        except Empty:
            time.sleep(1)
            return

        try:
            task['resulter'](task)
        except RetryDownload:
            self.log.log_it("RetryDownload Exception. Task {}".format(task), 'INFO')
            if task.get('retry'):
                if task.get('retried', 0) < task['retry']:
                    task['retried'] = task.get('retried', 0) + 1
                    self.to_download_q.put(task)
            return
        except RetryDownloadEnForce:
            self.log.log_it("RetryDownloadEnForce Exception. Task {}".format(task), 'INFO')
            self.to_download_q.put(task)
            return
        except RetryParse:
            self.log.log_it("RetryParse Exception. Task {}".format(task), 'INFO')
            if task.get('retry'):
                if task.get('retried', 0) < task['retry']:
                    task['retried'] = task.get('retried', 0) + 1
                    self.downloader_parser_q.put(task)
            return
        except RetryParseEnForce:
            self.log.log_it("RetryParseEnForce Exception. Task {}".format(task), 'INFO')
            self.downloader_parser_q.put(task)
            return
        except RetryResult:
            self.log.log_it("RetryResult Exception. Task {}".format(task), 'INFO')
            if task.get('retry'):
                if task.get('retried', 0) < task['retry']:
                    task['retried'] = task.get('retried', 0) + 1
                    self.result_q.put(task)
            return
        except RetryResultEnForce:
            self.log.log_it("RetryResultEnForce Exception. Task {}".format(task), 'INFO')
            self.result_q.put(task)
            return
        except Exception as e:
            # FIXME FileNotFoundError
            # traceback.print_exc(file=open(os.path.join(config.get('LOG_PATH'), 'parser_traceback'), 'a'))
            traceback.print_exc()
            self.log.log_it("Error in resulter. Message: {}. Task: {}".format(str(e), task), 'WARN')

    def run(self):
        # Run until every task is done and the result queue has been drained.
        while not (TaskManager.ALLDONE and self.result_q.empty()):
            self.result()
import time
from copy import deepcopy
from queue import Queue, PriorityQueue
from urllib.parse import urlparse, unquote

from bs4 import BeautifulSoup

from web2kindle import MAIN_CONFIG
from web2kindle.libs.crawler import Crawler, RetryDownload, Task
from web2kindle.libs.db import ArticleDB
from web2kindle.libs.html2kindle import HTML2Kindle
from web2kindle.libs.send_email import SendEmail2Kindle
from web2kindle.libs.utils import write, md5string, load_config, check_config, format_file_name
from web2kindle.libs.log import Log

SCRIPT_CONFIG = load_config('./web2kindle/config/jianshu_wenji.yml')
LOG = Log("jianshu_wenji")
DEFAULT_HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/'
                  '61.0.3163.100 Safari/537.36'
}
check_config(MAIN_CONFIG, SCRIPT_CONFIG, 'SAVE_PATH', LOG)
ARTICLE_ID_SET = set()

API_URL = 'http://www.jianshu.com/nb/{}?order_by={}&page={}'
BASE_URL = 'https://www.jianshu.com/nb/{}'
ORDER_SEQ = 'seq'
ORDER_COMMENT = 'commented_at'
ORDER_ADD = 'added_at'
import time
from copy import deepcopy
from queue import Queue, PriorityQueue
from urllib.parse import urlparse

from bs4 import BeautifulSoup

from web2kindle.libs.crawler import Crawler, RetryDownload, Task
from web2kindle.libs.db import ArticleDB
from web2kindle.libs.html2kindle import HTML2Kindle
from web2kindle.libs.send_email import SendEmail2Kindle
from web2kindle.libs.utils import write, format_file_name, load_config, check_config, md5string
from web2kindle.libs.log import Log

SCRIPT_CONFIG = load_config('./web2kindle/config/qdaily_config.yml')
MAIN_CONFIG = load_config('./web2kindle/config/config.yml')
LOG = Log("qdaily_home")
API_URL = 'https://www.qdaily.com/homes/articlemore/{}.json'
DEFAULT_HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/'
                  '61.0.3163.100 Safari/537.36'
}
check_config(MAIN_CONFIG, SCRIPT_CONFIG, 'SAVE_PATH', LOG)

API_BUSINESS = 'https://www.qdaily.com/categories/categorymore/18/{}.json'
API_INTELLIGENT = 'https://www.qdaily.com/categories/categorymore/4/{}.json'
API_DESIGN = 'https://www.qdaily.com/categories/categorymore/17/{}.json'
API_FASHION = 'https://www.qdaily.com/categories/categorymore/19/{}.json'
API_ENTERTAINMENT = 'https://www.qdaily.com/categories/categorymore/3/{}.json'
API_CITY = 'https://www.qdaily.com/categories/categorymore/5/{}.json'
API_GAME = 'https://www.qdaily.com/categories/categorymore/54/{}.json'
API_LONG = 'https://www.qdaily.com/tags/tagmore/1068/{}.json'
import time
from copy import deepcopy
from queue import Queue, PriorityQueue
from urllib.parse import urlparse, unquote

from bs4 import BeautifulSoup

from web2kindle import MAIN_CONFIG
from web2kindle.libs.crawler import Crawler, RetryDownload, Task
from web2kindle.libs.db import ArticleDB
from web2kindle.libs.html2kindle import HTML2Kindle
from web2kindle.libs.send_email import SendEmail2Kindle
from web2kindle.libs.utils import write, md5string, load_config, check_config, format_file_name
from web2kindle.libs.log import Log

SCRIPT_CONFIG = load_config('./web2kindle/config/jianshu_zhuanti.yml')
LOG = Log("jianshu_zhuanti")
DEFAULT_HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/'
                  '61.0.3163.100 Safari/537.36'
}
check_config(MAIN_CONFIG, SCRIPT_CONFIG, 'SAVE_PATH', LOG)
ARTICLE_ID_SET = set()

ORDER_TOP = 'top'
ORDER_COMMENT = 'commented_at'
ORDER_ADD = 'added_at'
API_URL = 'https://www.jianshu.com/c/{}?order_by={}&page={}'
BASE_URL = 'https://www.jianshu.com/c/{}'
from copy import deepcopy
from queue import Queue, PriorityQueue
from urllib.parse import urlparse, unquote

from bs4 import BeautifulSoup

from web2kindle import MAIN_CONFIG
from web2kindle.libs.crawler import Crawler, RetryDownload, Task
from web2kindle.libs.db import ArticleDB
from web2kindle.libs.html2kindle import HTML2Kindle
from web2kindle.libs.send_email import SendEmail2Kindle
from web2kindle.libs.utils import write, md5string, load_config, check_config, get_next_datetime_string, \
    compare_datetime_string, get_datetime_string
from web2kindle.libs.log import Log

SCRIPT_CONFIG = load_config('./web2kindle/config/zhihu_daily.yml')
LOG = Log("zhihu_daily")
DEFAULT_HEADERS = {
    'User-Agent': 'DailyApi/4 (Linux; Android 4.4.2; SM-T525 Build/samsung/picassoltezs/picassolte/KOT49H/zh_CN) '
                  'Google-HTTP-Java-Client/1.22.0 (gzip) Google-HTTP-Java-Client/1.22.0 (gzip)'
}
check_config(MAIN_CONFIG, SCRIPT_CONFIG, 'SAVE_PATH', LOG)
ARTICLE_ID_SET = set()

TODAY_URL = 'http://news-at.zhihu.com/api/4/stories/latest'
# e.g. http://news-at.zhihu.com/api/4/stories/before/20180212
YESTERDAY_URL = 'http://news-at.zhihu.com/api/4/stories/before/{}'
IS_TODAY_URL = True
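# The daily feed pages backwards by date: 'before/{YYYYMMDD}' returns the
# stories published the day before the given date. A standard-library sketch
# of generating those URLs (the project's own date helpers imported above are
# not reimplemented here):
from datetime import datetime, timedelta

day = datetime.now()
for _ in range(3):
    print(YESTERDAY_URL.format(day.strftime('%Y%m%d')))
    day -= timedelta(days=1)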
class Parser(Thread):
    def __init__(self, to_download_q: PriorityQueue, downloader_parser_q: PriorityQueue, result_q: Queue,
                 name: str, lock):
        super().__init__(name=name)
        self.downloader_parser_q = downloader_parser_q
        self.to_download_q = to_download_q
        self.result_q = result_q
        self._exit = False
        self.log = Log(self.name)
        self.lock = lock
        self.task_manager = TaskManager(self.lock)

    def exit(self):
        self._exit = True

    def parser(self):
        with COND:
            COND.notify_all()
        task = self.downloader_parser_q.get()
        try:
            task_with_parsed_data, tasks = task['parser'](task)
            if tasks and isinstance(tasks, list):
                self.log.log_it("Got {} new tasks.".format(len(tasks)), 'INFO')
                for new_task in tasks:
                    self.task_manager.register(new_task['tid'])
                    self.to_download_q.put(new_task)
            elif tasks:
                self.log.log_it("Got 1 new task.", 'INFO')
                self.task_manager.register(tasks['tid'])
                self.to_download_q.put(tasks)
        except RetryDownload:
            self.log.log_it("RetryDownload Exception. Task {}".format(task), 'INFO')
            if task.get('retry'):
                if task.get('retried', 0) < task['retry']:
                    task['retried'] = task.get('retried', 0) + 1
                    self.to_download_q.put(task)
            return
        except RetryDownloadEnForce:
            self.log.log_it("RetryDownloadEnForce Exception. Task {}".format(task), 'INFO')
            self.to_download_q.put(task)
            return
        except RetryParse:
            self.log.log_it("RetryParse Exception. Task {}".format(task), 'INFO')
            if task.get('retry'):
                if task.get('retried', 0) < task['retry']:
                    task['retried'] = task.get('retried', 0) + 1
                    self.downloader_parser_q.put(task)
            return
        except RetryParseEnForce:
            self.log.log_it("RetryParseEnForce Exception. Task {}".format(task), 'INFO')
            self.downloader_parser_q.put(task)
            return
        except Exception as e:
            # FIXME FileNotFoundError
            # traceback.print_exc(file=open(os.path.join(config.get('LOG_PATH'), 'parser_traceback'), 'a'))
            traceback.print_exc()
            self.log.log_it("Parse error. Message: {}. Task: {}".format(str(e), task), 'WARN')
            return
        finally:
            self.task_manager.unregister(task['tid'])
        return task_with_parsed_data

    def run(self):
        while not self._exit:
            task_with_parsed_data = self.parser()
            if task_with_parsed_data:
                self.result_q.put(task_with_parsed_data)
class Crawler:
    def __init__(self, to_download_q: PriorityQueue, downloader_parser_q: PriorityQueue, result_q: Queue,
                 parser_worker_count, downloader_worker_count, resulter_worker_count,
                 session=requests.session()):
        self.parser_worker_count = int(parser_worker_count)
        self.downloader_worker_count = int(downloader_worker_count)
        self.resulter_worker_count = int(resulter_worker_count)
        self.downloader_worker = []
        self.parser_worker = []
        self.resulter_worker = []
        self.log = Log("Crawler")
        self.to_download_q = to_download_q
        self.downloader_parser_q = downloader_parser_q
        self.result_q = result_q
        self.task_manager = TaskManager(self.to_download_q)
        self.session = session
        self.lock = LOCK
        self.task_manager_thread = Thread(target=self.task_manager.run)

    def start(self):
        self.task_manager_thread.start()

        for i in range(self.downloader_worker_count):
            _worker = Downloader(self.to_download_q, self.downloader_parser_q, self.result_q,
                                 "Downloader {}".format(i), self.session)
            self.downloader_worker.append(_worker)
            self.log.log_it("Starting Downloader {}".format(i), 'INFO')
            _worker.start()

        for i in range(self.parser_worker_count):
            _worker = Parser(self.to_download_q, self.downloader_parser_q, self.result_q,
                             "Parser {}".format(i))
            self.parser_worker.append(_worker)
            self.log.log_it("Starting Parser {}".format(i), 'INFO')
            _worker.start()

        for i in range(self.resulter_worker_count):
            _worker = Resulter(self.to_download_q, self.downloader_parser_q, self.result_q,
                               "Resulter {}".format(i))
            self.resulter_worker.append(_worker)
            self.log.log_it("Starting Resulter {}".format(i), 'INFO')
            _worker.start()

        while True:
            time.sleep(1)
            if self.task_manager.is_empty():
                for worker in self.downloader_worker:
                    worker.exit()
                for worker in self.parser_worker:
                    worker.exit()

                # Poll until every Resulter thread has drained the result queue and died.
                resulter_not_alive = False
                while not resulter_not_alive:
                    resulter_not_alive = True
                    time.sleep(1)
                    for worker in self.resulter_worker:
                        resulter_not_alive &= not worker.is_alive()

                for worker in self.resulter_worker:
                    worker.exit()
                self.task_manager.exit()
                # Reset the class-level flag so a new Crawler run can start clean.
                TaskManager.ALLDONE = False
                return
class Parser(Thread):
    def __init__(self, to_download_q: PriorityQueue, downloader_parser_q: PriorityQueue, result_q: Queue,
                 name: str):
        super().__init__(name=name)
        self.downloader_parser_q = downloader_parser_q
        self.to_download_q = to_download_q
        self.result_q = result_q
        self._exit = False
        self.log = Log(self.name)

    def exit(self):
        self._exit = True

    def parser(self):
        with COND:
            COND.notify_all()
        try:
            task = self.downloader_parser_q.get_nowait()
        except Empty:
            time.sleep(0.1)
            with COND:
                COND.notify_all()
            return

        try:
            task_with_parsed_data, tasks = task['parser'](task)
            if tasks:
                if not isinstance(tasks, list):
                    tasks = [tasks]
                self.log.log_it("Got {} new tasks.".format(len(tasks)), 'INFO')
                for each_task in tasks:
                    TaskManager.register(each_task['tid'])
                    self.to_download_q.put(each_task)
        except RetryDownload:
            self.log.log_it("RetryDownload Exception. Task {}".format(task), 'INFO')
            retry(task, self.to_download_q)
            return
        except RetryDownloadEnForce:
            self.log.log_it("RetryDownloadEnForce Exception. Task {}".format(task), 'INFO')
            self.to_download_q.put(task)
            return
        except RetryParse:
            self.log.log_it("RetryParse Exception. Task {}".format(task), 'INFO')
            retry(task, self.downloader_parser_q)
            return
        except RetryParseEnForce:
            self.log.log_it("RetryParseEnForce Exception. Task {}".format(task), 'INFO')
            self.downloader_parser_q.put(task)
            return
        except Exception as e:
            self.log.log_it("Parse error. Message: {}. Task: {}".format(str(e), task), 'WARN')
            traceback.print_exc()
            return

        TaskManager.unregister(task['tid'])
        return task_with_parsed_data

    def run(self):
        while not self._exit:
            task_with_parsed_data = self.parser()
            if task_with_parsed_data:
                self.result_q.put(task_with_parsed_data)
class Crawler:
    def __init__(self, to_download_q, downloader_parser_q, result_q,
                 parser_worker_count=CRAWLER_CONFIG.get('PARSER_WORKER', 1),
                 downloader_worker_count=CRAWLER_CONFIG.get('DOWNLOADER_WORKER', 1),
                 resulter_worker_count=CRAWLER_CONFIG.get('RESULTER_WORKER', 1),
                 session=requests.session()):
        self.parser_worker_count = parser_worker_count
        self.downloader_worker_count = downloader_worker_count
        self.resulter_worker_count = resulter_worker_count
        self.downloader_worker = []
        self.parser_worker = []
        self.resulter_worker = []
        self.log = Log("Crawler")
        self.to_download_q = to_download_q
        self.downloader_parser_q = downloader_parser_q
        self.result_q = result_q
        self.session = session
        self.lock = Lock()
        self.task_manager = TaskManager(self.lock)

    def start(self):
        for i in range(self.downloader_worker_count):
            _worker = Downloader(self.to_download_q, self.downloader_parser_q, self.result_q,
                                 "Downloader {}".format(i), self.lock, self.session)
            self.downloader_worker.append(_worker)
            self.log.log_it("Starting Downloader {}".format(i), 'INFO')
            _worker.start()

        for i in range(self.parser_worker_count):
            _worker = Parser(self.to_download_q, self.downloader_parser_q, self.result_q,
                             "Parser {}".format(i), self.lock)
            self.parser_worker.append(_worker)
            self.log.log_it("Starting Parser {}".format(i), 'INFO')
            _worker.start()

        for i in range(self.resulter_worker_count):
            _worker = Resulter(self.to_download_q, self.downloader_parser_q, self.result_q,
                               "Resulter {}".format(i), self.lock)
            self.resulter_worker.append(_worker)
            self.log.log_it("Starting Resulter {}".format(i), 'INFO')
            _worker.start()

        while True:
            time.sleep(1)
            if self.task_manager.is_empty():
                for worker in self.downloader_worker:
                    worker.exit()
                for worker in self.parser_worker:
                    worker.exit()

                # Poll until every Resulter thread has drained the result queue and died.
                resulter_not_alive = False
                while not resulter_not_alive:
                    resulter_not_alive = True
                    time.sleep(1)
                    for worker in self.resulter_worker:
                        resulter_not_alive &= not worker.is_alive()

                for worker in self.resulter_worker:
                    worker.exit()
                return
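# End-to-end wiring, mirroring the scripts above: three queues, one Crawler,
# seed tasks on the to-download queue, then block in start() until the
# TaskManager reports everything done. Worker counts fall back to the
# CRAWLER_CONFIG defaults in __init__.
iq = PriorityQueue()    # Scheduler -> Downloader
oq = PriorityQueue()    # Downloader -> Parser
result_q = Queue()      # Parser -> Resulter

crawler = Crawler(iq, oq, result_q)
# ...put seed tasks on iq here (see the Task sketch earlier)...
crawler.start()         # returns once all workers have shut down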