Ejemplo n.º 1
0
    def __init__(self,
                 to_download_q: PriorityQueue,
                 downloader_parser_q: PriorityQueue,
                 result_q: Queue,
                 parser_worker_count,
                 downloader_worker_count,
                 resulter_worker_count,
                 session=None):
        """Store the pipeline queues and worker counts and prepare the task manager.

        Args:
            to_download_q: Queue kept on the instance and handed to ``TaskManager``.
            downloader_parser_q: Queue kept on the instance.
            result_q: Queue kept on the instance.
            parser_worker_count: Number of parser workers; coerced with ``int()``.
            downloader_worker_count: Number of downloader workers; coerced with ``int()``.
            resulter_worker_count: Number of resulter workers; coerced with ``int()``.
            session: HTTP session stored on the instance. When omitted (``None``)
                a new ``requests`` session is created for this instance.

        Raises:
            ValueError, TypeError: If a worker count cannot be converted by ``int()``.
        """
        self.parser_worker_count = int(parser_worker_count)
        self.downloader_worker_count = int(downloader_worker_count)
        self.resulter_worker_count = int(resulter_worker_count)

        # Worker lists start empty; nothing in this constructor fills them.
        self.downloader_worker = []
        self.parser_worker = []
        self.resulter_worker = []
        self.log = Log("Crawler")

        self.to_download_q = to_download_q
        self.downloader_parser_q = downloader_parser_q
        self.result_q = result_q

        self.task_manager = TaskManager(self.to_download_q)
        # A default of ``requests.session()`` in the signature would be evaluated
        # once, at definition time, and silently shared by every instance.
        self.session = requests.session() if session is None else session
        self.lock = LOCK

        # The thread is only created here, not started.
        self.task_manager_thread = Thread(target=self.task_manager.run)
Ejemplo n.º 2
0
    def __init__(self,
                 to_download_q: PriorityQueue,
                 downloader_parser_q: PriorityQueue,
                 result_q: Queue,
                 name: str):
        """Initialise the worker with its name and the three shared queues.

        Args:
            to_download_q: Queue stored as ``self.to_download_q``.
            downloader_parser_q: Queue stored as ``self.downloader_parser_q``.
            result_q: Queue stored as ``self.result_q``.
            name: Forwarded to the parent initialiser; also used to label the log.
        """
        super().__init__(name=name)

        self.to_download_q = to_download_q
        self.downloader_parser_q = downloader_parser_q
        self.result_q = result_q

        # The log is labelled with the name set by the parent initialiser.
        self.log = Log(self.name)
        # Stop flag, initially cleared.
        self._exit = False
Ejemplo n.º 3
0
    def __init__(self,
                 items: list,
                 path: str,
                 book_name: str,
                 kindlegen_path: str = KINDLE_GEN_PATH) -> None:
        """Remember the book inputs and make sure the output directory exists.

        Args:
            items: Stored unchanged as ``self.items``.
            path: Output directory; created (with parents) when missing.
            book_name: Book title; stored as ``str(book_name)``.
            kindlegen_path: Location of the kindlegen executable. ``None``
                falls back to ``KINDLE_GEN_PATH``.

        Raises:
            OSError: If ``path`` cannot be created.
        """
        self.kindlegen_path = KINDLE_GEN_PATH if kindlegen_path is None else kindlegen_path
        self.items = items
        self.book_name = str(book_name)
        self.path = path
        # Paths collected here start empty.
        self.to_remove = set()
        self.log = Log('HTML2Kindle')

        # exist_ok closes the window between an "exists?" check and the
        # creation, where a concurrent creator would make makedirs() raise.
        os.makedirs(path, exist_ok=True)
Ejemplo n.º 4
0
class Downloader(Thread):
    """Worker thread that takes tasks from the download queue and performs the HTTP request.

    Each task is read as a mapping with the keys ``tid``, ``url``, ``method``
    and an optional ``meta`` mapping of extra keyword arguments for the request.
    """

    def __init__(self,
                 to_download_q: PriorityQueue,
                 downloader_parser_q: PriorityQueue,
                 result_q: Queue,
                 name: str,
                 session=None):
        """Store the queues and the HTTP session.

        Args:
            to_download_q: Queue the tasks are taken from.
            downloader_parser_q: Queue the finished tasks are put on.
            result_q: Queue kept on the instance.
            name: Thread name; also used to label the log.
            session: HTTP session used for requests. When omitted (``None``)
                a new ``requests`` session is created for this instance.
        """
        super().__init__(name=name)
        self.to_download_q = to_download_q
        self.downloader_parser_q = downloader_parser_q
        self.result_q = result_q
        # A default of ``requests.session()`` in the signature would be evaluated
        # once, at definition time, and silently shared by every instance.
        self.session = requests.session() if session is None else session

        self._exit = False

        self.log = Log(self.name)

    def exit(self):
        """Ask ``run`` to stop after the current ``request`` call returns."""
        self._exit = True

    def request(self):
        """Process at most one task from ``to_download_q``.

        When the queue is empty the thread waits on ``COND`` and returns once
        woken. Otherwise the request is issued; on any exception the task is
        handed to ``retry`` together with ``to_download_q``, on success the
        task (with its ``response`` key set) is put on ``downloader_parser_q``.
        """
        response = None

        try:
            task = self.to_download_q.get_nowait()
            TaskManager.register(task['tid'])
        except Empty:
            self.log.log_it(
                "Scheduler to Downloader队列为空,{}等待中。".format(self.name),
                'DEBUG')
            # Blocks without a timeout until another thread notifies COND.
            with COND:
                COND.wait()
                self.log.log_it(
                    "Downloader to Parser队列不为空。{}被唤醒。".format(self.name),
                    'DEBUG')
            return

        self.log.log_it("请求 {}".format(task['url']), 'INFO')
        try:
            response = self.session.request(task['method'], task['url'],
                                            **task.get('meta', {}))
        except Exception as e:
            traceback.print_exc()
            # ``response`` is still None here because the request itself raised.
            self.log.log_it(
                "网络请求错误。错误信息:{} URL:{} Response:{}".format(
                    str(e), task['url'], response), 'INFO')
            retry(task, self.to_download_q)
            return

        # A falsy response object is normalised to None before it is passed on.
        task['response'] = response if response else None

        self.downloader_parser_q.put(task)

    def run(self):
        """Call ``request`` repeatedly until ``exit`` has been called."""
        while not self._exit:
            self.request()
Ejemplo n.º 5
0
    def __init__(self):
        """Read the mail settings from ``MAIN_CONFIG`` and create the SMTP client.

        If any of the four required keys is missing an error is logged and the
        remaining attributes (``sender``, ``sended``, ``client``) are not set.
        """
        self.log = Log('SendEmail2Kindle')

        try:
            self.username = MAIN_CONFIG['EMAIL_USERNAME']
            self.password = MAIN_CONFIG['PASSWORD']
            self.smtp_addr = MAIN_CONFIG['SMTP_ADDR']
            self.kindle_addr = MAIN_CONFIG['KINDLE_ADDR']
        except KeyError:
            # Incomplete configuration: report it and leave the object as it is.
            self.log.log_it("无法实例化SendEmail2Kindle,请确保config.yml配置完整", 'ERROR')
        else:
            # Mail is sent from the login account itself.
            self.sender = self.username
            self.sended = []
            self.client = smtplib.SMTP()
Ejemplo n.º 6
0
    def __init__(self,
                 to_download_q: PriorityQueue,
                 downloader_parser_q: PriorityQueue,
                 result_q: Queue,
                 name: str,
                 lock):
        """Initialise the worker with its name, the three queues and a lock.

        Args:
            to_download_q: Queue stored as ``self.to_download_q``.
            downloader_parser_q: Queue stored as ``self.downloader_parser_q``.
            result_q: Queue stored as ``self.result_q``.
            name: Forwarded to the parent initialiser; also used to label the log.
            lock: Stored on the instance and passed to ``TaskManager``.
        """
        super().__init__(name=name)

        self.to_download_q = to_download_q
        self.downloader_parser_q = downloader_parser_q
        self.result_q = result_q

        # Stop flag, initially cleared.
        self._exit = False
        self.log = Log(self.name)

        # The task manager is built around the same lock object kept here.
        self.lock = lock
        self.task_manager = TaskManager(lock)
Ejemplo n.º 7
0
    def __init__(self, items, path, book_name, kindlegen_path=KINDLE_GEN_PATH):
        """Remember the book inputs and make sure the parent directory of ``path`` exists.

        Args:
            items: Stored unchanged as ``self.items``.
            path: Output path; only its parent directory is created here.
            book_name: Book title; stored as ``str(book_name)``.
            kindlegen_path: Location of the kindlegen executable. ``None``
                falls back to ``KINDLE_GEN_PATH``.

        Raises:
            OSError: If the parent directory cannot be created.
        """
        # NOTE: templates are deliberately not loaded here through
        # Environment(loader=PackageLoader('web2kindle')); that approach is
        # buggy once the program has been packaged as an exe.
        self.kindlegen_path = KINDLE_GEN_PATH if kindlegen_path is None else kindlegen_path

        self.items = items
        self.book_name = str(book_name)
        self.path = path
        self.to_remove = set()
        self.log = Log('HTML2Kindle')

        # A path without a directory component has an empty parent, and
        # os.makedirs('') raises FileNotFoundError, so skip that case.
        parent_dir = os.path.dirname(path)
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)
Ejemplo n.º 8
0
class Downloader(Thread):
    """Worker thread that takes tasks from the download queue and performs the HTTP request.

    Each task is read as a mapping with the keys ``tid``, ``url``, ``method``,
    an optional ``meta`` mapping of extra request keyword arguments, and the
    optional retry bookkeeping keys ``retry`` (limit) and ``retried`` (count).
    """

    def __init__(self, to_download_q: PriorityQueue,
                 downloader_parser_q: PriorityQueue,
                 result_q: Queue,
                 name: str,
                 lock,
                 session=None):
        """Store the queues, the lock and the HTTP session.

        Args:
            to_download_q: Queue the tasks are taken from.
            downloader_parser_q: Queue the finished tasks are put on.
            result_q: Queue kept on the instance.
            name: Thread name; also used to label the log.
            lock: Stored on the instance and passed to ``TaskManager``.
            session: HTTP session used for requests. When omitted (``None``)
                a new ``requests`` session is created for this instance.
        """
        super().__init__(name=name)
        self.to_download_q = to_download_q
        self.downloader_parser_q = downloader_parser_q
        self.result_q = result_q
        # A default of ``requests.session()`` in the signature would be evaluated
        # once, at definition time, and silently shared by every instance.
        self.session = requests.session() if session is None else session

        self._exit = False

        self.log = Log(self.name)
        self.lock = lock
        self.task_manager = TaskManager(self.lock)

    def exit(self):
        """Ask ``run`` to stop after the current ``request`` call returns."""
        self._exit = True

    @staticmethod
    def _requeue_for_retry(task, queue) -> bool:
        """Put ``task`` back on ``queue`` if its retry budget is not used up.

        A task without a truthy ``retry`` value is never retried. The counter
        ``retried`` starts at 0 and is incremented by one per requeue, so a
        task is requeued exactly ``retry`` times.

        Returns:
            True when the task was requeued, False otherwise.
        """
        max_retries = task.get('retry', None)
        if not max_retries:
            return False
        retried = task.get('retried', 0)
        if retried >= max_retries:
            return False
        # Count from the same default (0) that the limit check uses; starting
        # the increment from 1 would burn two attempts on the first retry.
        task.update({'retried': retried + 1})
        queue.put(task)
        return True

    def request(self):
        """Process at most one task from ``to_download_q``.

        When the queue is empty the thread waits on ``COND`` and returns once
        woken. Otherwise the request is issued; on any exception the task is
        requeued while its retry budget lasts, on success the task (with its
        ``response`` key set) is put on ``downloader_parser_q``.
        """
        response = None

        try:
            task = self.to_download_q.get_nowait()
            self.task_manager.register(task['tid'])
        except Empty:
            self.log.log_it("Scheduler to Downloader队列为空,{}等待中。".format(self.name), 'DEBUG')
            # Blocks without a timeout until another thread notifies COND.
            with COND:
                COND.wait()
                self.log.log_it("Downloader to Parser队列不为空。{}被唤醒。".format(self.name), 'DEBUG')
            return

        self.log.log_it("请求 {}".format(task['url']), 'INFO')
        try:
            response = self.session.request(task['method'], task['url'], **task.get('meta', {}))
        except Exception as e:
            traceback.print_exc()
            # ``response`` is still None here because the request itself raised.
            self.log.log_it("网络请求错误。错误信息:{} URL:{} Response:{}".format(str(e), task['url'], response), 'INFO')
            self._requeue_for_retry(task, self.to_download_q)
            return

        # A falsy response object is normalised to None before it is passed on.
        task.update({'response': response if response else None})
        self.downloader_parser_q.put(task)

    def run(self):
        """Call ``request`` repeatedly until ``exit`` has been called."""
        while not self._exit:
            self.request()
Ejemplo n.º 9
0
    def __init__(self):
        """Load the main configuration, read the mail settings and create the SMTP client.

        If any of the four required keys is missing an error is logged and the
        process is ended immediately via ``os._exit(1)``.
        """
        self.CONFIG = load_config('./web2kindle/config/config.yml')
        self.log = Log('SendEmail2Kindle')

        settings = self.CONFIG
        try:
            self.username = settings['EMAIL_USERNAME']
            self.password = settings['PASSWORD']
            self.smtp_addr = settings['SMTP_ADDR']
            self.kindle_addr = settings['KINDLE_ADDR']
        except KeyError:
            self.log.log_it("无法实例化SendEmail2Kindle,请确保config.yml配置完整", 'ERROR')
            # Hard exit: os._exit() does not return.
            import os
            os._exit(1)

        # Mail is sent from the login account itself.
        self.sender = self.username
        self.sended = []
        self.client = smtplib.SMTP()
Ejemplo n.º 10
0
    def __init__(self,
                 to_download_q,
                 downloader_parser_q,
                 result_q,
                 parser_worker_count=CRAWLER_CONFIG.get('PARSER_WORKER', 1),
                 downloader_worker_count=CRAWLER_CONFIG.get('DOWNLOADER_WORKER', 1),
                 resulter_worker_count=CRAWLER_CONFIG.get('RESULTER_WORKER', 1),
                 session=None):
        """Store the pipeline queues and worker counts and create the shared lock.

        Args:
            to_download_q: Queue kept on the instance.
            downloader_parser_q: Queue kept on the instance.
            result_q: Queue kept on the instance.
            parser_worker_count: Number of parser workers; defaults to the
                ``PARSER_WORKER`` entry of ``CRAWLER_CONFIG`` (1 if absent).
            downloader_worker_count: Number of downloader workers; defaults to
                the ``DOWNLOADER_WORKER`` entry of ``CRAWLER_CONFIG`` (1 if absent).
            resulter_worker_count: Number of resulter workers; defaults to the
                ``RESULTER_WORKER`` entry of ``CRAWLER_CONFIG`` (1 if absent).
            session: HTTP session stored on the instance. When omitted (``None``)
                a new ``requests`` session is created for this instance.
        """
        self.parser_worker_count = parser_worker_count
        self.downloader_worker_count = downloader_worker_count
        self.resulter_worker_count = resulter_worker_count

        # Worker lists start empty; nothing in this constructor fills them.
        self.downloader_worker = []
        self.parser_worker = []
        self.resulter_worker = []
        self.log = Log("Crawler")

        self.to_download_q = to_download_q
        self.downloader_parser_q = downloader_parser_q
        self.result_q = result_q

        # A default of ``requests.session()`` in the signature would be evaluated
        # once, at definition time, and silently shared by every instance.
        self.session = requests.session() if session is None else session
        # One lock per crawler, handed to the task manager.
        self.lock = Lock()
        self.task_manager = TaskManager(self.lock)
Ejemplo n.º 11
0
class HTML2Kindle:
    """Render crawled articles to HTML/OPF/NCX files and run kindlegen on them.

    Usable as a context manager: leaving the ``with`` block deletes every
    intermediate file recorded in ``self.to_remove``.
    """

    # Templates are read once, when the class body executes, from paths
    # relative to the current working directory.
    content_template = Template(
        read_file('./web2kindle/templates/kindle_content.html'))
    opf_template = Template(
        read_file('./web2kindle/templates/kindle_opf.html'))
    index_template = Template(
        read_file('./web2kindle/templates/kindle_table.html'))
    ncx_template = Template(read_file('./web2kindle/templates/kindle_ncx.ncx'))

    def __init__(self,
                 items: list,
                 path: str,
                 book_name: str,
                 kindlegen_path: str = KINDLE_GEN_PATH) -> None:
        """Remember the book inputs and make sure the output directory exists.

        Args:
            items: Article records; ``make_metadata`` reads indices 1-5 of each.
            path: Output directory; created (with parents) when missing.
            book_name: Book title; stored as ``str(book_name)``.
            kindlegen_path: Location of the kindlegen executable. ``None``
                falls back to ``KINDLE_GEN_PATH``.

        Raises:
            OSError: If ``path`` cannot be created.
        """
        self.kindlegen_path = KINDLE_GEN_PATH if kindlegen_path is None else kindlegen_path
        self.items = items
        self.book_name = str(book_name)
        self.path = path
        self.to_remove = set()
        self.log = Log('HTML2Kindle')

        # exist_ok closes the window between an "exists?" check and the
        # creation, where a concurrent creator would make makedirs() raise.
        os.makedirs(path, exist_ok=True)

    def __exit__(self, exc_type, exc_val, exc_tb) -> None:
        """Delete the intermediate files; exceptions are not suppressed."""
        self.remove()

    def __enter__(self):
        """Return the instance itself for use in a ``with`` statement."""
        return self

    def remove(self) -> None:
        """Delete every path in ``self.to_remove``, ignoring files already gone."""
        for file_path in self.to_remove:
            try:
                os.remove(file_path)
            except FileNotFoundError:
                pass

    def make_metadata(self, window: int = 20) -> None:
        """Write article, table-of-contents, OPF and NCX files for each chunk of items.

        ``self.items`` is split with ``split_list`` using ``window``; each
        chunk becomes one book named ``<book_name>_<index>``. Every generated
        file is recorded in ``self.to_remove``.

        Args:
            window: Chunk size passed to ``split_list``; coerced with ``int()``.
        """
        window = int(window)
        chunks = split_list(self.items, window)

        # Split the e-book according to ``window``.
        for index, items in enumerate(chunks):
            self.log.log_it("制作 {}_{} 的元数据".format(self.book_name, str(index)),
                            'INFO')
            opf = []
            table = []
            table_name = '{}_{}.html'.format(self.book_name, str(index))
            opf_name = '{}_{}.opf'.format(self.book_name, str(index))
            ncx_name = '{}_{}.ncx'.format(self.book_name, str(index))
            table_path = os.path.join(self.path, table_name)
            opf_path = os.path.join(self.path, opf_name)
            ncx_path = os.path.join(self.path, ncx_name)

            # Mark for later deletion.
            self.to_remove.add(table_path)
            self.to_remove.add(opf_path)
            self.to_remove.add(ncx_path)

            for item in items:
                # Item layout as used below: [1] title, [2] content,
                # [3] created time, [4] vote-up count, [5] author name.
                kw = {
                    'author_name': item[5],
                    'voteup_count': item[4],
                    'created_time': item[3]
                }
                # File name = title + author.
                article_path = os.path.join(
                    self.path,
                    format_file_name(item[1], item[5]) + '.html')
                if os.path.exists(article_path):
                    # Avoid clashing with an existing file of the same name.
                    article_path = article_path.replace('.html', '') + ''.join(
                        random_char(3)) + '.html'

                self.make_content(item[1], item[2], article_path, kw)
                # Mark for later deletion.
                self.to_remove.add(article_path)
                opf.append({
                    'id': article_path,
                    'href': article_path,
                    'title': item[1]
                })
                table.append({'href': article_path, 'name': item[1]})

            self.make_table(table, table_path)
            self.make_opf(self.book_name + '_' + str(index), opf, table_path,
                          opf_path, ncx_path)
            self.make_ncx(self.book_name + '_' + str(index), opf, table_path,
                          ncx_path)

    def make_opf(self, title: str, navigation: list, table_path: str,
                 opf_path: str, ncx_path: str) -> None:
        """Render the OPF template and write it to ``opf_path`` (UTF-8 with BOM)."""
        rendered_content = self.opf_template.render(title=title,
                                                    navigation=navigation,
                                                    table_href=table_path,
                                                    ncx_href=ncx_path)
        with codecs.open(opf_path, 'w', 'utf_8_sig') as f:
            f.write(rendered_content)

    def make_ncx(self, title: str, navigation: list, table_path: str,
                 opf_path: str) -> None:
        """Render the NCX template and write it to ``opf_path`` (UTF-8 with BOM).

        Despite its name, ``opf_path`` is simply the output file; the caller
        in ``make_metadata`` passes the ``.ncx`` path here.
        """
        rendered_content = self.ncx_template.render(title=title,
                                                    navigation=navigation,
                                                    table_href=table_path)
        with codecs.open(opf_path, 'w', 'utf_8_sig') as f:
            f.write(rendered_content)

    def make_content(self,
                     title: str,
                     content: str,
                     path: str,
                     kw: dict = None) -> None:
        """Render one article with the content template and write it to ``path``."""
        rendered_content = self.content_template.render(title=title,
                                                        content=content,
                                                        kw=kw)
        with codecs.open(path, 'w', 'utf_8_sig') as f:
            f.write(rendered_content)

    def make_table(self, navigation: list, path: str) -> None:
        """Render the table-of-contents template and write it to ``path``."""
        rendered_content = self.index_template.render(navigation=navigation)
        with codecs.open(path, 'w', 'utf_8_sig') as f:
            f.write(rendered_content)

    @staticmethod
    def _make_book(kindlegen_path: str, log_path: str, path: str) -> None:
        """Run kindlegen on one OPF file. ``log_path`` is accepted but not used."""
        # NOTE(review): the command is built by plain string formatting, so a
        # path containing spaces or shell metacharacters will break it.
        os.system("{} -dont_append_source {}".format(kindlegen_path, path))

    def make_book_multi(self, rootdir: str, overwrite: bool = True) -> None:
        """Run kindlegen on every selected OPF file using a process pool.

        Args:
            rootdir: Directory scanned by ``get_opf``.
            overwrite: Passed through to ``get_opf``.
        """
        from multiprocessing import Pool
        self.log.log_it("新建 {} 个线程制作mobi文件.正在制作中,请稍后".format(str(cpu_count())),
                        'INFO')
        opf_list = self.get_opf(rootdir, overwrite)
        make_one = partial(self._make_book, self.kindlegen_path,
                           os.path.join(self.path, 'kindlegen.log'))
        # map() blocks until all work is done, so releasing the pool on
        # leaving the ``with`` block cannot cut any job short.
        with Pool(cpu_count()) as pool:
            pool.map(make_one, opf_list)

    def make_book(self, rootdir: str, overwrite: bool = True) -> None:
        """Run kindlegen sequentially on every selected OPF file.

        Output of each run is redirected to ``kindlegen.log`` inside
        ``self.path`` (the file is overwritten by every run).

        Args:
            rootdir: Directory scanned by ``get_opf``.
            overwrite: Passed through to ``get_opf``.
        """
        opf_list = self.get_opf(rootdir, overwrite)
        self.log.log_it("正在制作中,请稍后", 'INFO')
        log_path = os.path.join(self.path, 'kindlegen.log')
        for opf_path in opf_list:
            # get_opf() already returns paths joined with rootdir; joining
            # again would double a relative rootdir ("out/out/x.opf").
            os.system("{} -dont_append_source {} > {}".format(
                self.kindlegen_path, opf_path, log_path))

    def get_opf(self, rootdir: str, overwrite: bool) -> list:
        """List the ``.opf`` files directly inside ``rootdir``.

        Args:
            rootdir: Directory to scan (not recursive).
            overwrite: When False, an OPF file is skipped if a ``.mobi`` file
                with the same base name already exists in ``rootdir``.

        Returns:
            Paths (``rootdir`` joined with the file name) in ``os.listdir`` order.
        """
        file_names = [
            name for name in os.listdir(rootdir)
            if not os.path.isdir(os.path.join(rootdir, name))
        ]
        # Compare real extensions rather than name suffixes or a blanket
        # str.replace('opf', 'mobi'), which would also rewrite "opf" inside
        # the base name and miss upper-case extensions.
        built = {
            os.path.splitext(name)[0]
            for name in file_names
            if os.path.splitext(name)[1].lower() == '.mobi'
        }

        result = []
        for name in file_names:
            stem, ext = os.path.splitext(name)
            if ext.lower() != '.opf':
                continue
            if overwrite or stem not in built:
                result.append(os.path.join(rootdir, name))
        return result
Ejemplo n.º 12
0
import time
from copy import deepcopy
from queue import Queue, PriorityQueue
from urllib.parse import urlparse
from bs4 import BeautifulSoup

from web2kindle import MAIN_CONFIG
from web2kindle.libs.crawler import Crawler, RetryDownload, Task
from web2kindle.libs.db import ArticleDB
from web2kindle.libs.html2kindle import HTML2Kindle
from web2kindle.libs.send_email import SendEmail2Kindle
from web2kindle.libs.utils import write, load_config, check_config, md5string
from web2kindle.libs.log import Log

# Script-specific settings, loaded at import time from a path relative to the
# current working directory.
SCRIPT_CONFIG = load_config('./web2kindle/config/guoke_scientific.yml')
# Log instance labelled with this script's name.
LOG = Log("guoke_scientific")
# URL template; the single `{}` placeholder fills the `offset` query parameter.
API_URL = "http://www.guokr.com/apis/minisite/article.json?retrieve_type=by_subject&limit=20&offset={}&_=1508757235776"
# Default request headers: only a desktop Chrome User-Agent string.
DEFAULT_HEADERS = {
    'User-Agent':
    'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/'
    '61.0.3163.100 Safari/537.36'
}
# NOTE(review): presumably verifies that 'SAVE_PATH' is configured in one of
# the two configs — confirm against check_config.
check_config(MAIN_CONFIG, SCRIPT_CONFIG, 'SAVE_PATH', LOG)
# Starts empty; presumably collects ids of articles already handled — verify
# against the code that uses it.
ARTICLE_ID_SET = set()


def main(start, end, kw):
    iq = PriorityQueue()
    oq = PriorityQueue()
    result_q = Queue()
    crawler = Crawler(iq, oq, result_q, MAIN_CONFIG.get('PARSER_WORKER', 1),
Ejemplo n.º 13
0
import time
from copy import deepcopy
from queue import Queue, PriorityQueue
from urllib.parse import urlparse, unquote
from bs4 import BeautifulSoup

from web2kindle import MAIN_CONFIG
from web2kindle.libs.crawler import Crawler, RetryDownload, Task
from web2kindle.libs.db import ArticleDB
from web2kindle.libs.html2kindle import HTML2Kindle
from web2kindle.libs.send_email import SendEmail2Kindle
from web2kindle.libs.utils import write, md5string, load_config, check_config, format_file_name
from web2kindle.libs.log import Log

# Script-specific settings, loaded at import time from a path relative to the
# current working directory.
SCRIPT_CONFIG = load_config('./web2kindle/config/jianshu_user.yml')
# Log instance labelled with this script's name.
LOG = Log("jianshu_user")
# Default request headers: only a desktop Chrome User-Agent string.
DEFAULT_HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/'
                  '61.0.3163.100 Safari/537.36'
}

# NOTE(review): presumably verifies that 'SAVE_PATH' is configured in one of
# the two configs — confirm against check_config.
check_config(MAIN_CONFIG, SCRIPT_CONFIG, 'SAVE_PATH', LOG)
# Starts empty; presumably collects ids of articles already handled — verify
# against the code that uses it.
ARTICLE_ID_SET = set()

# Values for the `order_by` query parameter of API_URL.
ORDER_TOP = 'top'
ORDER_COMMENT = 'commented_at'
ORDER_ADD = 'added_at'
# URL template with three placeholders: the path segment after /u/, the
# `order_by` value and the `page` value.
API_URL = 'https://www.jianshu.com/u/{}?order_by={}&page={}'
# URL template with one placeholder: the path segment after /u/.
BASE_URL = 'https://www.jianshu.com/u/{}'

Ejemplo n.º 14
0
import re
import time
from copy import deepcopy
from queue import Queue, PriorityQueue, Empty
from urllib.parse import urlparse, unquote
from web2kindle.libs.crawler import Crawler, RetryDownload, Task
from web2kindle.libs.db import ArticleDB
from web2kindle.libs.html2kindle import HTML2Kindle
from web2kindle.libs.send_email import SendEmail2Kindle
from web2kindle.libs.utils import write, md5string, load_config, check_config
from web2kindle.libs.log import Log
from bs4 import BeautifulSoup

# Script-specific and main settings, both loaded at import time from paths
# relative to the current working directory.
SCRIPT_CONFIG = load_config('./web2kindle/config/zhihu_zhuanlan_config.yml')
MAIN_CONFIG = load_config('./web2kindle/config/config.yml')
# Log instance labelled with this script's name.
LOG = Log("zhihu_zhuanlan")
# Default request headers: only a desktop Chrome User-Agent string.
DEFAULT_HEADERS = {
    'User-Agent':
    'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/'
    '61.0.3163.100 Safari/537.36'
}

# NOTE(review): presumably verifies that 'SAVE_PATH' is configured in one of
# the two configs — confirm against check_config.
check_config(MAIN_CONFIG, SCRIPT_CONFIG, 'SAVE_PATH', LOG)


def main(zhuanlan_name_list, start, end, kw):
    iq = PriorityQueue()
    oq = PriorityQueue()
    result_q = Queue()
    crawler = Crawler(iq, oq, result_q)
Ejemplo n.º 15
0
class SendEmail:
    """Send files as e-mail attachments to a Kindle address over SMTP.

    Connection settings are read from ``MAIN_CONFIG``. The instance can be
    used as a context manager: entering connects and logs in, leaving quits
    the SMTP session.
    """

    def __init__(self):
        """Read the mail settings and create the (not yet connected) SMTP client.

        If any of the four required keys is missing an error is logged and the
        remaining attributes (``sender``, ``sended``, ``client``) are not set.
        """
        self.log = Log('SendEmail2Kindle')

        try:
            self.username = MAIN_CONFIG['EMAIL_USERNAME']
            self.password = MAIN_CONFIG['PASSWORD']
            self.smtp_addr = MAIN_CONFIG['SMTP_ADDR']
            self.kindle_addr = MAIN_CONFIG['KINDLE_ADDR']
        except KeyError:
            self.log.log_it("无法实例化SendEmail2Kindle,请确保config.yml配置完整", 'ERROR')
            return

        # Mail is sent from the login account itself.
        self.sender = self.username
        # Paths of the files that were handed to the server successfully.
        self.sended = []
        self.client = smtplib.SMTP()

    def connect(self) -> bool:
        """Connect to the SMTP server and log in.

        Returns:
            True on success, False if connecting or logging in failed (the
            reason is logged).
        """
        try:
            self.log.log_it("正在连接邮件服务器", 'INFO')
            self.client.connect(self.smtp_addr)
            self.log.log_it("正在登录服务器", 'INFO')
            self.client.login(self.username, self.password)
            return True
        except smtplib.SMTPAuthenticationError:
            self.log.log_it("邮箱用户名或密码错误", 'WARN')
            return False
        except Exception as e:
            self.log.log_it("连接错误。错误信息:{}".format(str(e)), 'INFO')
            return False

    def disconnect(self) -> None:
        """End the SMTP session."""
        self.client.quit()

    def __enter__(self):
        """Connect and return the instance.

        Raises:
            Exception: If ``connect`` reports failure.
        """
        if not self.connect():
            raise Exception("SendEmail2Kindle连接服务器错误")
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        """Disconnect; exceptions from the ``with`` body are not suppressed."""
        self.disconnect()

    def send_file(self, file_path: str) -> None:
        """Mail one file as an attachment to the Kindle address.

        On success ``file_path`` is appended to ``self.sended``. SMTP errors
        are logged and swallowed.

        Args:
            file_path: File to attach; also used as the attachment file name.

        Raises:
            OSError: If the file cannot be read.
        """
        msg = MIMEMultipart()
        msg['Subject'] = 'Web2kindle'
        msg['From'] = self.sender
        msg['To'] = self.kindle_addr

        # Read through a context manager so the handle is closed right away
        # instead of lingering until garbage collection.
        with open(file_path, 'rb') as attachment_file:
            attachment = MIMEApplication(attachment_file.read())
        # NOTE(review): the full path, not just the base name, is sent as the
        # attachment file name — confirm that this is intended.
        attachment.add_header('Content-Disposition',
                              'attachment',
                              filename=file_path)
        msg.attach(attachment)
        try:
            self.client.sendmail(self.sender, self.kindle_addr,
                                 msg.as_string())
            self.sended.append(file_path)
        except smtplib.SMTPRecipientsRefused:
            self.log.log_it("所有收件人都被拒绝。", 'WARN')
        except smtplib.SMTPSenderRefused:
            self.log.log_it("发件人地址被拒绝。", 'WARN')
        except smtplib.SMTPDataError:
            self.log.log_it("服务器拒绝接受邮件数据。", 'WARN')
        except smtplib.SMTPException as e:
            self.log.log_it(
                "未知错误。FILE_PATH:{},ERRINFO:{}".format(file_path, str(e)),
                'WARN')

    def send_files(self, file_paths: list) -> None:
        """Mail each file in ``file_paths`` in order, logging progress.

        Args:
            file_paths: Paths passed one by one to ``send_file``.
        """
        for file_path in file_paths:
            self.log.log_it("正在发送:{}".format(file_path), 'INFO')
            sent_before = len(self.sended)
            self.send_file(file_path)
            # send_file() swallows SMTP errors and records only real
            # successes in self.sended, so report success only if it grew.
            if len(self.sended) > sent_before:
                self.log.log_it("{}发送成功".format(file_path), 'INFO')
Ejemplo n.º 16
0
class Resulter(Thread):
    """Worker thread that runs each task's ``resulter`` callable on tasks from ``result_q``.

    Retry exceptions raised by the callable send the task back to one of the
    three queues; which queue is selected by the exception class.
    """

    def __init__(self, to_download_q: PriorityQueue,
                 downloader_parser_q: PriorityQueue, result_q: Queue,
                 name: str):
        """Store the three queues, clear the stop flag and create the log."""
        super().__init__(name=name)
        self.result_q = result_q
        self.downloader_parser_q = downloader_parser_q
        self.to_download_q = to_download_q

        self._exit = False
        self.log = Log(self.name)

    def exit(self):
        """Set the stop flag. NOTE(review): ``run`` below never reads it."""
        self._exit = True

    def result(self):
        """Process at most one task from ``result_q``.

        Every call first notifies all threads waiting on ``COND``. If the
        queue is empty the call sleeps 0.1 s and returns.
        """
        # Wake every thread currently waiting on COND.
        with COND:
            COND.notify_all()

        try:
            task = self.result_q.get_nowait()
        except Empty:
            # Nothing to do yet; back off briefly instead of spinning.
            time.sleep(0.1)
            return

        # The order of the except clauses matters if the exception classes are
        # related by inheritance; the first matching clause wins.
        try:
            self.log.log_it("正在处理{}".format(task['tid']))
            task['resulter'](task)
        # --- back to the download queue ---
        except RetryDownload:
            self.log.log_it("RetryDownload Exception.Task{}".format(task),
                            'INFO')
            # retry() is defined elsewhere; presumably it applies the task's
            # retry limit and a delay before requeueing — TODO confirm.
            retry(task, self.to_download_q)
            return
        except RetryDownloadEnForceNodelay:
            self.log.log_it(
                "RetryDownloadEnForce Exception.Task{}".format(task), 'INFO')
            # Requeued directly, without going through a retry helper.
            self.to_download_q.put(task)
            return
        except RetryDownloadNodelay:
            self.log.log_it(
                "RetryDownloadNodelay Exception.Task{}".format(task), 'INFO')
            retry_nodelay(task, self.to_download_q)
            return

        # --- back to the parser queue ---
        except RetryParse:
            self.log.log_it("RetryParse Exception.Task{}".format(task), 'INFO')
            retry(task, self.downloader_parser_q)
            return
        except RetryParseEnForceNodelay:
            # NOTE(review): the message says "RetryParse" although this branch
            # handles RetryParseEnForceNodelay.
            self.log.log_it("RetryParse Exception.Task{}".format(task), 'INFO')
            self.downloader_parser_q.put(task)
            return
        except RetryParseNodelay:
            self.log.log_it("RetryParseNodelay Exception.Task{}".format(task),
                            'INFO')
            retry_nodelay(task, self.downloader_parser_q)
            return

        # --- back to the result queue ---
        except RetryResult:
            self.log.log_it("RetryResult Exception.Task{}".format(task),
                            'INFO')
            retry(task, self.result_q)
            return
        except RetryResultEnForceNodelay:
            self.log.log_it("RetryResultEnForce Exception.Task{}".format(task),
                            'INFO')
            self.result_q.put(task)
            return
        except RetryResultNodelay:
            self.log.log_it("RetryResultNodelay Exception.Task{}".format(task),
                            'INFO')
            retry_nodelay(task, self.result_q)
            return

        # Any other error: print the traceback, log it and hand the task to
        # retry() with the result queue.
        except Exception as e:
            traceback.print_exc()
            self.log.log_it(
                "Resulter函数错误。错误信息:{}。Task:{}".format(str(e), task), 'WARN')
            retry(task, self.result_q)
            return

    def run(self):
        """Call ``result`` until ``TaskManager.ALLDONE`` is true and ``result_q`` is empty."""
        while (not TaskManager.ALLDONE) or (not self.result_q.empty()):
            self.result()
Ejemplo n.º 17
0
from threading import current_thread, active_count
from urllib.parse import urlparse, unquote
import time
from bs4 import BeautifulSoup

from web2kindle import MAIN_CONFIG
from web2kindle.libs.crawler import Crawler, md5string, RetryDownload, Task
from web2kindle.libs.db import ArticleDB
from web2kindle.libs.utils import write, load_config, check_config
from web2kindle.libs.html2kindle import HTML2Kindle
from web2kindle.libs.log import Log
from web2kindle.libs.send_email import SendEmail2Kindle

# Script-specific settings, loaded at import time from a path relative to the
# current working directory.
SCRIPT_CONFIG = load_config('./web2kindle/config/zhihu_collection.yml')
# Module-level flag, initially False; presumably set once the book name has
# been fetched — verify against the code that uses it.
GET_BOOK_NAME_FLAG = False
# Log instance labelled with this script's name.
LOG = Log('zhihu_collection')
# Default request headers: only a desktop Chrome User-Agent string.
DEFAULT_HEADERS = {
    'User-Agent':
    'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/'
    '61.0.3163.100 Safari/537.36'
}
# NOTE(review): presumably verifies that 'SAVE_PATH' is configured in one of
# the two configs — confirm against check_config.
check_config(MAIN_CONFIG, SCRIPT_CONFIG, 'SAVE_PATH', LOG)
# Starts empty; presumably collects ids of articles already handled — verify
# against the code that uses it.
ARTICLE_ID_SET = set()


def main(collection_num_list, start, end, kw):
    iq = PriorityQueue()
    oq = PriorityQueue()
    result_q = Queue()
    crawler = Crawler(iq, oq, result_q, MAIN_CONFIG.get('PARSER_WORKER', 1),
                      MAIN_CONFIG.get('DOWNLOADER_WORKER', 1),
Ejemplo n.º 18
0
class Resulter(Thread):
    """Worker thread that runs each task's ``resulter`` callable on tasks from ``result_q``.

    Retry exceptions raised by the callable send the task back to one of the
    three queues; which queue is selected by the exception class. Tasks are
    read as mappings with the key ``resulter`` and the optional retry
    bookkeeping keys ``retry`` (limit) and ``retried`` (count).
    """

    def __init__(
            self,
            to_download_q: PriorityQueue,
            downloader_parser_q: PriorityQueue,
            result_q: Queue,
            name: str,
            lock):
        """Store the queues and the lock and create the task manager.

        Args:
            to_download_q: Queue that receives tasks to be downloaded again.
            downloader_parser_q: Queue that receives tasks to be parsed again.
            result_q: Queue the tasks are taken from.
            name: Thread name; also used to label the log.
            lock: Stored on the instance and passed to ``TaskManager``.
        """
        super().__init__(name=name)
        self.result_q = result_q
        self.downloader_parser_q = downloader_parser_q
        self.to_download_q = to_download_q

        self._exit = False
        self.log = Log(self.name)
        self.lock = lock
        self.task_manager = TaskManager(self.lock)

    def exit(self):
        """Set the stop flag. NOTE(review): ``run`` below never reads it."""
        self._exit = True

    @staticmethod
    def _requeue_for_retry(task, queue) -> bool:
        """Put ``task`` back on ``queue`` if its retry budget is not used up.

        A task without a truthy ``retry`` value is never retried. The counter
        ``retried`` starts at 0 and is incremented by one per requeue, so a
        task is requeued exactly ``retry`` times.

        Returns:
            True when the task was requeued, False otherwise.
        """
        max_retries = task.get('retry', None)
        if not max_retries:
            return False
        retried = task.get('retried', 0)
        if retried >= max_retries:
            return False
        # Count from the same default (0) that the limit check uses; starting
        # the increment from 1 would burn two attempts on the first retry.
        task.update({'retried': retried + 1})
        queue.put(task)
        return True

    def result(self):
        """Process at most one task from ``result_q``.

        Every call first notifies all threads waiting on ``COND``. If the
        queue is empty the call sleeps one second and returns.
        """
        # Wake every thread currently waiting on COND.
        with COND:
            COND.notify_all()

        try:
            task = self.result_q.get_nowait()
        except Empty:
            # Nothing to do yet; back off instead of spinning.
            time.sleep(1)
            return

        # The order of the except clauses is kept as is: if the exception
        # classes are related by inheritance, the first matching clause wins.
        try:
            task['resulter'](task)
        except RetryDownload:
            self.log.log_it("RetryDownload Exception.Task{}".format(task), 'INFO')
            self._requeue_for_retry(task, self.to_download_q)
        except RetryDownloadEnForce:
            # "EnForce" variants requeue unconditionally, ignoring the budget.
            self.log.log_it("RetryDownloadEnForce Exception.Task{}".format(task), 'INFO')
            self.to_download_q.put(task)
        except RetryParse:
            self.log.log_it("RetryParse Exception.Task{}".format(task), 'INFO')
            self._requeue_for_retry(task, self.downloader_parser_q)
        except RetryParseEnForce:
            self.log.log_it("RetryParseEnForce Exception.Task{}".format(task), 'INFO')
            self.downloader_parser_q.put(task)
        except RetryResult:
            self.log.log_it("RetryResult Exception.Task{}".format(task), 'INFO')
            self._requeue_for_retry(task, self.result_q)
        except RetryResultEnForce:
            self.log.log_it("RetryResultEnForce Exception.Task{}".format(task), 'INFO')
            self.result_q.put(task)
        except Exception as e:
            # FIXME FileNotFoundError
            # Any other error is reported and the task is dropped.
            traceback.print_exc()
            self.log.log_it("Resulter函数错误。错误信息:{}。Task:{}".format(str(e), task), 'WARN')

    def run(self):
        """Call ``result`` until ``TaskManager.ALLDONE`` is true and ``result_q`` is empty."""
        while not (TaskManager.ALLDONE and self.result_q.empty()):
            self.result()
Ejemplo n.º 19
0
import time
from copy import deepcopy
from queue import Queue, PriorityQueue
from urllib.parse import urlparse, unquote
from bs4 import BeautifulSoup

from web2kindle import MAIN_CONFIG
from web2kindle.libs.crawler import Crawler, RetryDownload, Task
from web2kindle.libs.db import ArticleDB
from web2kindle.libs.html2kindle import HTML2Kindle
from web2kindle.libs.send_email import SendEmail2Kindle
from web2kindle.libs.utils import write, md5string, load_config, check_config, format_file_name
from web2kindle.libs.log import Log

# Script-specific settings, loaded at import time from a path relative to the
# current working directory.
SCRIPT_CONFIG = load_config('./web2kindle/config/jianshu_wenji.yml')
# Log instance labelled with this script's name.
LOG = Log("jianshu_wenji")
# Default request headers: only a desktop Chrome User-Agent string.
DEFAULT_HEADERS = {
    'User-Agent':
    'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/'
    '61.0.3163.100 Safari/537.36'
}

# NOTE(review): presumably verifies that 'SAVE_PATH' is configured in one of
# the two configs — confirm against check_config.
check_config(MAIN_CONFIG, SCRIPT_CONFIG, 'SAVE_PATH', LOG)
# Starts empty; presumably collects ids of articles already handled — verify
# against the code that uses it.
ARTICLE_ID_SET = set()

# URL template with three placeholders: the path segment after /nb/, the
# `order_by` value and the `page` value.
# NOTE(review): this one uses http:// while BASE_URL uses https://.
API_URL = 'http://www.jianshu.com/nb/{}?order_by={}&page={}'
# URL template with one placeholder: the path segment after /nb/.
BASE_URL = 'https://www.jianshu.com/nb/{}'

# Values for the `order_by` query parameter of API_URL.
ORDER_SEQ = 'seq'
ORDER_COMMENT = 'commented_at'
ORDER_ADD = 'added_at'
Ejemplo n.º 20
0
import time
from copy import deepcopy
from queue import Queue, PriorityQueue
from urllib.parse import urlparse

from web2kindle.libs.crawler import Crawler, RetryDownload, Task
from web2kindle.libs.db import ArticleDB
from web2kindle.libs.html2kindle import HTML2Kindle
from web2kindle.libs.send_email import SendEmail2Kindle
from web2kindle.libs.utils import write, format_file_name, load_config, check_config, md5string
from web2kindle.libs.log import Log
from bs4 import BeautifulSoup

# Script-specific and main settings, both loaded at import time from paths
# relative to the current working directory.
SCRIPT_CONFIG = load_config('./web2kindle/config/qdaily_config.yml')
MAIN_CONFIG = load_config('./web2kindle/config/config.yml')
# Log instance labelled with this script's name.
LOG = Log("qdaily_home")
# URL template for the home feed; one `{}` placeholder before `.json`.
API_URL = 'https://www.qdaily.com/homes/articlemore/{}.json'
# Default request headers: only a desktop Chrome User-Agent string.
DEFAULT_HEADERS = {
    'User-Agent':
    'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/'
    '61.0.3163.100 Safari/537.36'
}
# NOTE(review): presumably verifies that 'SAVE_PATH' is configured in one of
# the two configs — confirm against check_config.
check_config(MAIN_CONFIG, SCRIPT_CONFIG, 'SAVE_PATH', LOG)
# Per-section URL templates. Each has one `{}` placeholder before `.json`;
# the number in the path differs per section.
API_BUSINESS = 'https://www.qdaily.com/categories/categorymore/18/{}.json'
API_INTELLIGENT = 'https://www.qdaily.com/categories/categorymore/4/{}.json'
API_DESIGN = 'https://www.qdaily.com/categories/categorymore/17/{}.json'
API_FASHION = 'https://www.qdaily.com/categories/categorymore/19/{}.json'
API_ENTERTAINMENT = 'https://www.qdaily.com/categories/categorymore/3/{}.json'
API_CITY = 'https://www.qdaily.com/categories/categorymore/5/{}.json'
API_GAME = 'https://www.qdaily.com/categories/categorymore/54/{}.json'
# Unlike the others, this template addresses a tag (tagmore) rather than a
# category (categorymore).
API_LONG = 'https://www.qdaily.com/tags/tagmore/1068/{}.json'
Ejemplo n.º 21
0
import time
from copy import deepcopy
from queue import Queue, PriorityQueue
from urllib.parse import urlparse, unquote
from bs4 import BeautifulSoup

from web2kindle import MAIN_CONFIG
from web2kindle.libs.crawler import Crawler, RetryDownload, Task
from web2kindle.libs.db import ArticleDB
from web2kindle.libs.html2kindle import HTML2Kindle
from web2kindle.libs.send_email import SendEmail2Kindle
from web2kindle.libs.utils import write, md5string, load_config, check_config, format_file_name
from web2kindle.libs.log import Log

# Module-level setup for the jianshu_zhuanti script. Runs at import time; the
# config path is relative to the process working directory.
SCRIPT_CONFIG = load_config('./web2kindle/config/jianshu_zhuanti.yml')
LOG = Log("jianshu_zhuanti")
# Header dict carrying a desktop Chrome User-Agent string; presumably attached
# to the crawler's HTTP requests -- confirm at the call sites.
DEFAULT_HEADERS = {
    'User-Agent':
    'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/'
    '61.0.3163.100 Safari/537.36'
}

# NOTE(review): presumably verifies that 'SAVE_PATH' is available in the main
# and/or script config and reports through LOG -- confirm in
# web2kindle.libs.utils.check_config.
check_config(MAIN_CONFIG, SCRIPT_CONFIG, 'SAVE_PATH', LOG)
# Module-level set, empty at import; presumably collects article ids that have
# already been seen -- confirm where it is populated.
ARTICLE_ID_SET = set()

# Values for the `order_by` query parameter of API_URL.
ORDER_TOP = 'top'
ORDER_COMMENT = 'commented_at'
ORDER_ADD = 'added_at'
# Listing URL template with three placeholders, in order: the path segment
# after /c/, the `order_by` value, and the `page` number.
API_URL = 'https://www.jianshu.com/c/{}?order_by={}&page={}'
# Same /c/{} path as API_URL, without the query string.
BASE_URL = 'https://www.jianshu.com/c/{}'
Ejemplo n.º 22
0
from copy import deepcopy
from queue import Queue, PriorityQueue
from urllib.parse import urlparse, unquote
from bs4 import BeautifulSoup

from web2kindle import MAIN_CONFIG
from web2kindle.libs.crawler import Crawler, RetryDownload, Task
from web2kindle.libs.db import ArticleDB
from web2kindle.libs.html2kindle import HTML2Kindle
from web2kindle.libs.send_email import SendEmail2Kindle
from web2kindle.libs.utils import write, md5string, load_config, check_config, get_next_datetime_string, \
    compare_datetime_string, get_datetime_string
from web2kindle.libs.log import Log

# Module-level setup for the zhihu_daily script. Runs at import time; the
# config path is relative to the process working directory.
SCRIPT_CONFIG = load_config('./web2kindle/config/zhihu_daily.yml')
LOG = Log("zhihu_daily")
# Header dict whose User-Agent identifies as "DailyApi/4" on Android rather
# than as a desktop browser; presumably attached to the crawler's HTTP
# requests -- confirm at the call sites.
DEFAULT_HEADERS = {
    'User-Agent':
    'DailyApi/4 (Linux; Android 4.4.2; SM-T525 Build/samsung/picassoltezs/picassolte/KOT49H/zh_CN) '
    'Google-HTTP-Java-Client/1.22.0 (gzip) Google-HTTP-Java-Client/1.22.0 (gzip)'
}

# NOTE(review): presumably verifies that 'SAVE_PATH' is available in the main
# and/or script config and reports through LOG -- confirm in
# web2kindle.libs.utils.check_config.
check_config(MAIN_CONFIG, SCRIPT_CONFIG, 'SAVE_PATH', LOG)
# Module-level set, empty at import; presumably collects article ids that have
# already been seen -- confirm where it is populated.
ARTICLE_ID_SET = set()

# Endpoint for the latest stories.
TODAY_URL = 'http://news-at.zhihu.com/api/4/stories/latest'
# Endpoint template for stories before a given date, e.g.
# http://news-at.zhihu.com/api/4/stories/before/20180212
YESTERDAY_URL = 'http://news-at.zhihu.com/api/4/stories/before/{}'
# Module-level flag, True at import. NOTE(review): presumably records whether
# TODAY_URL (rather than YESTERDAY_URL) is the one being fetched -- confirm
# where it is mutated.
IS_TODAY_URL = True

Ejemplo n.º 23
0
class Parser(Thread):
    """Worker thread that runs each task's parser callback and routes the outcome.

    Tasks are taken from ``downloader_parser_q``. Follow-up tasks returned by
    the callback are registered with the task manager and put on
    ``to_download_q``; the parsed task itself is put on ``result_q`` by
    :meth:`run`.

    A task is a dict. This class reads the keys ``'parser'`` (the callback),
    ``'tid'``, ``'retry'`` (maximum number of retries) and ``'retried'``
    (retries performed so far).
    """

    def __init__(
            self,
            to_download_q: PriorityQueue,
            downloader_parser_q: PriorityQueue,
            result_q: Queue,
            name: str,
            lock):
        """Store the shared queues and build a ``TaskManager`` around ``lock``.

        :param to_download_q: queue that receives new tasks and download retries.
        :param downloader_parser_q: queue this worker consumes; also receives
            parse retries.
        :param result_q: queue that receives successfully parsed tasks.
        :param name: thread name, also used as the logger name.
        :param lock: passed straight to ``TaskManager``.
        """
        super().__init__(name=name)
        self.downloader_parser_q = downloader_parser_q
        self.to_download_q = to_download_q
        self.result_q = result_q

        self._exit = False
        self.log = Log(self.name)
        self.lock = lock
        self.task_manager = TaskManager(self.lock)

    def exit(self):
        """Ask :meth:`run` to stop after the iteration currently in progress."""
        self._exit = True

    @staticmethod
    def _requeue_within_budget(task, target_q):
        """Put ``task`` back on ``target_q`` if it still has retries left.

        Tasks without a truthy ``'retry'`` value are dropped. The counter is
        read with a default of 0 both when it is checked and when it is
        incremented, so a task configured with ``retry=N`` is requeued exactly
        N times.
        """
        retry_limit = task.get('retry', None)
        if not retry_limit:
            return
        already_retried = task.get('retried', 0)
        if already_retried < retry_limit:
            task.update({'retried': already_retried + 1})
            target_q.put(task)

    def parser(self):
        """Parse one task and return it, or ``None`` if it did not parse.

        Blocks until a task is available on ``downloader_parser_q``. Whatever
        the outcome, the task's ``'tid'`` is unregistered from the task manager
        before returning.

        :return: the first element of the callback's result on success;
            ``None`` when the task was requeued or failed.
        """
        with COND:
            COND.notify_all()
        task = self.downloader_parser_q.get()

        try:
            task_with_parsed_data, tasks = task['parser'](task)
            if tasks:
                # The callback may return a single follow-up task or a list.
                if not isinstance(tasks, list):
                    tasks = [tasks]
                self.log.log_it("获取新任务{}个。".format(len(tasks)), 'INFO')
                for new_task in tasks:
                    self.task_manager.register(new_task['tid'])
                    self.to_download_q.put(new_task)
        except RetryDownload:
            self.log.log_it("RetryDownload Exception.Task{}".format(task), 'INFO')
            self._requeue_within_budget(task, self.to_download_q)
            return
        except RetryDownloadEnForce:
            # "EnForce" variants requeue unconditionally, ignoring the budget.
            self.log.log_it("RetryDownloadEnForce Exception.Task{}".format(task), 'INFO')
            self.to_download_q.put(task)
            return
        except RetryParse:
            self.log.log_it("RetryParse Exception.Task{}".format(task), 'INFO')
            self._requeue_within_budget(task, self.downloader_parser_q)
            return
        except RetryParseEnForce:
            self.log.log_it("RetryParseEnForce Exception.Task{}".format(task), 'INFO')
            self.downloader_parser_q.put(task)
            return
        except Exception as e:
            # FIXME: writing the traceback to a file under LOG_PATH raised
            # FileNotFoundError, so it is printed to stderr instead.
            traceback.print_exc()
            self.log.log_it("解析错误。错误信息:{}。Task:{}".format(str(e), task), 'WARN')
            return
        finally:
            self.task_manager.unregister(task['tid'])
        return task_with_parsed_data

    def run(self):
        """Thread body: parse tasks until :meth:`exit` is called."""
        while not self._exit:
            task_with_parsed_data = self.parser()
            if task_with_parsed_data:
                self.result_q.put(task_with_parsed_data)
Ejemplo n.º 24
0
class Crawler:
    """Builds the Downloader / Parser / Resulter pools and drives one crawl.

    All workers share the three queues handed to the constructor. A
    ``TaskManager`` built on ``to_download_q`` runs on its own thread and is
    polled by :meth:`start` to decide when the crawl has finished.
    """

    def __init__(self,
                 to_download_q: PriorityQueue,
                 downloader_parser_q: PriorityQueue,
                 result_q: Queue,
                 parser_worker_count,
                 downloader_worker_count,
                 resulter_worker_count,
                 session=requests.session()):
        """Record pool sizes and queues; no thread is started here.

        The three ``*_worker_count`` arguments are coerced with ``int()``.

        NOTE(review): the ``session`` default is evaluated once, when the class
        body is executed, so every Crawler created without an explicit session
        shares that single object.
        """
        # Pool sizes (coerced to int).
        self.parser_worker_count = int(parser_worker_count)
        self.downloader_worker_count = int(downloader_worker_count)
        self.resulter_worker_count = int(resulter_worker_count)

        # Worker instances, filled in by start().
        self.downloader_worker = []
        self.parser_worker = []
        self.resulter_worker = []

        self.log = Log("Crawler")

        # Queues shared by all workers.
        self.to_download_q = to_download_q
        self.downloader_parser_q = downloader_parser_q
        self.result_q = result_q

        self.task_manager = TaskManager(self.to_download_q)
        self.session = session
        self.lock = LOCK

        self.task_manager_thread = Thread(target=self.task_manager.run)

    def start(self):
        """Start every worker, then block until the crawl is finished.

        Once per second the task manager is asked whether it is empty. When it
        is, downloaders and parsers are told to exit, the method waits (again
        polling once per second) until no resulter thread is alive, then tells
        the resulters and the task manager to exit and resets
        ``TaskManager.ALLDONE`` to ``False``.
        """
        self.task_manager_thread.start()

        queues = (self.to_download_q, self.downloader_parser_q, self.result_q)

        for index in range(self.downloader_worker_count):
            downloader = Downloader(*queues, "Downloader {}".format(index),
                                    self.session)
            self.downloader_worker.append(downloader)
            self.log.log_it("启动 Downloader {}".format(index), 'INFO')
            downloader.start()

        for index in range(self.parser_worker_count):
            parser = Parser(*queues, "Parser {}".format(index))
            self.parser_worker.append(parser)
            self.log.log_it("启动 Parser {}".format(index), 'INFO')
            parser.start()

        for index in range(self.resulter_worker_count):
            resulter = Resulter(*queues, "Resulter {}".format(index))
            self.resulter_worker.append(resulter)
            self.log.log_it("启动 Resulter {}".format(index), 'INFO')
            resulter.start()

        while True:
            time.sleep(1)
            if not self.task_manager.is_empty():
                continue

            # Nothing left to do: stop the producers first.
            for producer in self.downloader_worker + self.parser_worker:
                producer.exit()

            # Wait for every resulter thread to finish.
            while True:
                time.sleep(1)
                if not any(r.is_alive() for r in self.resulter_worker):
                    break

            for resulter in self.resulter_worker:
                resulter.exit()

            self.task_manager.exit()
            TaskManager.ALLDONE = False
            return
Ejemplo n.º 25
0
class Parser(Thread):
    """Worker thread that runs each task's parser callback and routes the outcome.

    Tasks are polled from ``downloader_parser_q``. Follow-up tasks returned by
    the callback are registered with ``TaskManager`` and put on
    ``to_download_q``; the parsed task itself is put on ``result_q`` by
    :meth:`run`.

    A task is a dict. This class reads the keys ``'parser'`` (the callback)
    and ``'tid'``.
    """

    def __init__(self, to_download_q: PriorityQueue,
                 downloader_parser_q: PriorityQueue, result_q: Queue,
                 name: str):
        """Store the shared queues.

        :param to_download_q: queue that receives new tasks and download retries.
        :param downloader_parser_q: queue this worker consumes; also receives
            parse retries.
        :param result_q: queue that receives successfully parsed tasks.
        :param name: thread name, also used as the logger name.
        """
        super().__init__(name=name)
        self.downloader_parser_q = downloader_parser_q
        self.to_download_q = to_download_q
        self.result_q = result_q

        self._exit = False
        self.log = Log(self.name)

    def exit(self):
        """Ask :meth:`run` to stop after the iteration currently in progress."""
        self._exit = True

    def parser(self):
        """Parse at most one task and return it, or ``None``.

        Does not block: when the queue is empty it sleeps for 0.1 s, notifies
        ``COND`` again and returns ``None``.

        :return: the first element of the callback's result on success;
            ``None`` when the queue was empty, or the task was requeued or
            failed.
        """
        with COND:
            COND.notify_all()
        try:
            task = self.downloader_parser_q.get_nowait()
        except Empty:
            time.sleep(0.1)
            with COND:
                COND.notify_all()
            return

        try:
            task_with_parsed_data, tasks = task['parser'](task)
            if tasks:
                # The callback may return a single follow-up task or a list.
                if not isinstance(tasks, list):
                    tasks = [tasks]
                self.log.log_it("获取新任务{}个。".format(len(tasks)), 'INFO')
                for each_task in tasks:
                    TaskManager.register(each_task['tid'])
                    self.to_download_q.put(each_task)
        except RetryDownload:
            self.log.log_it("RetryDownload Exception.Task{}".format(task),
                            'INFO')
            retry(task, self.to_download_q)
            return
        except RetryDownloadEnForce:
            # "EnForce" variants requeue directly instead of going through
            # retry().
            self.log.log_it(
                "RetryDownloadEnForce Exception.Task{}".format(task), 'INFO')
            self.to_download_q.put(task)
            return
        except RetryParse:
            self.log.log_it("RetryParse Exception.Task{}".format(task), 'INFO')
            retry(task, self.downloader_parser_q)
            return
        except RetryParseEnForce:
            self.log.log_it(
                "RetryParseEnForce Exception.Task{}".format(task), 'INFO')
            self.downloader_parser_q.put(task)
            return
        except Exception as e:
            # NOTE(review): the task is dropped here without
            # TaskManager.unregister(); confirm that a failed task is not left
            # registered forever.
            self.log.log_it("解析错误。错误信息:{}。Task:{}".format(str(e), task),
                            'WARN')
            traceback.print_exc()
            return
        # Only a successfully parsed task is unregistered here.
        TaskManager.unregister(task['tid'])
        return task_with_parsed_data

    def run(self):
        """Thread body: parse tasks until :meth:`exit` is called."""
        while not self._exit:
            task_with_parsed_data = self.parser()
            if task_with_parsed_data:
                self.result_q.put(task_with_parsed_data)
Ejemplo n.º 26
0
class Crawler:
    """Builds the Downloader / Parser / Resulter pools and drives one crawl.

    All workers share the three queues and one ``Lock`` created in the
    constructor; a ``TaskManager`` built on that lock is polled by ``start``
    to decide when the crawl has finished.
    """

    def __init__(self,
                 to_download_q,
                 downloader_parser_q,
                 result_q,
                 parser_worker_count=CRAWLER_CONFIG.get('PARSER_WORKER', 1),
                 downloader_worker_count=CRAWLER_CONFIG.get('DOWNLOADER_WORKER', 1),
                 resulter_worker_count=CRAWLER_CONFIG.get('RESULTER_WORKER', 1),
                 session=requests.session()):
        """Record pool sizes and queues; no thread is started here.

        The worker-count defaults come from ``CRAWLER_CONFIG`` and fall back
        to 1 when the key is missing.

        NOTE(review): every default, including ``session``, is evaluated once
        when the class body is executed, so Crawlers created without an
        explicit session share a single session object.
        """
        # Stored as given (no int() coercion); start() passes them to range().
        self.parser_worker_count = parser_worker_count
        self.downloader_worker_count = downloader_worker_count
        self.resulter_worker_count = resulter_worker_count
        # Worker instances, filled in by start().
        self.downloader_worker = []
        self.parser_worker = []
        self.resulter_worker = []
        self.log = Log("Crawler")

        self.to_download_q = to_download_q
        self.downloader_parser_q = downloader_parser_q
        self.result_q = result_q

        self.session = session
        # One lock per Crawler, handed to the TaskManager and to every worker.
        self.lock = Lock()
        self.task_manager = TaskManager(self.lock)

    def start(self):
        """Start every worker, then block until the crawl is finished.

        Once per second the task manager is asked whether it is empty. When it
        is, downloaders and parsers are told to exit and the method polls
        (once per second) until no resulter thread is alive, then returns.
        """
        for i in range(self.downloader_worker_count):
            _worker = Downloader(self.to_download_q, self.downloader_parser_q, self.result_q, "Downloader {}".format(i),
                                 self.lock, self.session, )
            self.downloader_worker.append(_worker)
            self.log.log_it("启动 Downloader {}".format(i), 'INFO')
            _worker.start()

        for i in range(self.parser_worker_count):
            _worker = Parser(self.to_download_q, self.downloader_parser_q, self.result_q, "Parser {}".format(i),
                             self.lock)
            self.parser_worker.append(_worker)
            self.log.log_it("启动 Parser {}".format(i), 'INFO')
            _worker.start()

        for i in range(self.resulter_worker_count):
            _worker = Resulter(self.to_download_q, self.downloader_parser_q, self.result_q, "Resulter {}".format(i),
                               self.lock)
            self.resulter_worker.append(_worker)
            self.log.log_it("启动 Resulter {}".format(i), 'INFO')
            _worker.start()

        while True:
            time.sleep(1)
            if self.task_manager.is_empty():
                # Nothing left to do: stop the producers first.
                for worker in self.downloader_worker:
                    worker.exit()
                for worker in self.parser_worker:
                    worker.exit()

                # Poll until every resulter thread has finished. exit() is not
                # called on the resulters here, so they must stop on their own
                # (NOTE(review): confirm how Resulter decides to stop).
                resulter_not_alive = False
                while not resulter_not_alive:
                    resulter_not_alive = True
                    time.sleep(1)
                    for worker in self.resulter_worker:
                        # Stays True only if no resulter is alive.
                        resulter_not_alive &= not worker.is_alive()
                return