Code Example #1
File: crawler.py    Project: wax8280/MiniCat
class Crawler:
    def __init__(self,
                 to_download_q: PriorityQueue,
                 downloader_parser_q: PriorityQueue,
                 result_q: Queue,
                 parser_worker_count,
                 downloader_worker_count,
                 resulter_worker_count,
                 speed=None,
                 session=None):
        self.parser_worker_count = int(parser_worker_count)
        self.downloader_worker_count = int(downloader_worker_count)
        self.resulter_worker_count = int(resulter_worker_count)
        self.downloader_worker = []
        self.parser_worker = []
        self.resulter_worker = []
        self.log = Log("Crawler")

        self.to_download_q = to_download_q
        self.downloader_parser_q = downloader_parser_q
        self.result_q = result_q

        if speed is not None:
            TaskManager.download_wait = 1 / speed

        self.task_manager = TaskManager(self.to_download_q)
        # Avoid a shared mutable default argument: create a fresh session per instance
        self.session = session if session is not None else requests.session()
        self.lock = LOCK

        self.task_manager_thread = Thread(target=self.task_manager.run)

    def start(self):
        self.task_manager_thread.start()

        for i in range(self.downloader_worker_count):
            _worker = Downloader(
                self.to_download_q,
                self.downloader_parser_q,
                self.result_q,
                "Downloader {}".format(i),
                self.session,
            )
            self.downloader_worker.append(_worker)
            self.log.log_it("启动 Downloader {}".format(i), 'INFO')
            _worker.start()

        for i in range(self.parser_worker_count):
            _worker = Parser(self.to_download_q, self.downloader_parser_q,
                             self.result_q, "Parser {}".format(i))
            self.parser_worker.append(_worker)
            self.log.log_it("启动 Parser {}".format(i), 'INFO')
            _worker.start()

        for i in range(self.resulter_worker_count):
            _worker = Resulter(self.to_download_q, self.downloader_parser_q,
                               self.result_q, "Resulter {}".format(i))
            self.resulter_worker.append(_worker)
            self.log.log_it("启动 Resulter {}".format(i), 'INFO')
            _worker.start()

        while True:
            time.sleep(1)
            # Once every registered task has finished, run the shutdown sequence
            if self.task_manager.is_empty():
                for worker in self.downloader_worker:
                    worker.exit()
                for worker in self.parser_worker:
                    worker.exit()

                # Wait until every Resulter thread has drained its queue and died
                resulter_not_alive = False
                while not resulter_not_alive:
                    resulter_not_alive = True
                    time.sleep(1)
                    for worker in self.resulter_worker:
                        resulter_not_alive &= not worker.is_alive()

                for worker in self.resulter_worker:
                    worker.exit()

                self.task_manager.exit()
                # Reset the shared flag so a later run can start fresh
                TaskManager.ALLDONE = False
                return
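
For orientation, here is a minimal usage sketch of the Crawler class above. The queue wiring follows the type annotations; the seed-task fields, callbacks, and import path are assumptions inferred from the worker snippets below, not the MiniCat API.

from queue import PriorityQueue, Queue

from crawler import Crawler  # assumed import path


def my_parser(task):
    # Must return a (task_with_parsed_data, new_tasks) pair; no follow-ups here
    return task, None


def my_resulter(task):
    # Consume the downloaded response, if any
    if task['response'] is not None:
        print(task['tid'], len(task['response'].text))


to_download_q = PriorityQueue()
downloader_parser_q = PriorityQueue()
result_q = Queue()

# Seed one task; 'meta' is forwarded to session.request() as keyword arguments
to_download_q.put({
    'tid': 'seed-0',               # unique task id, used by TaskManager
    'url': 'https://example.com',  # illustrative URL
    'method': 'GET',
    'meta': {'timeout': 10},
    'parser': my_parser,
    'resulter': my_resulter,
})

crawler = Crawler(to_download_q, downloader_parser_q, result_q,
                  parser_worker_count=1, downloader_worker_count=2,
                  resulter_worker_count=1,
                  speed=5)  # speed=5 -> at most one request every 0.2 s
crawler.start()

One caveat: queue.PriorityQueue orders items with <, and plain dicts are not comparable, so in practice MiniCat's tasks presumably carry a comparable priority; bare dicts as sketched here only work while the queue never holds two items at once.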
Code Example #2
File: crawler.py    Project: wax8280/MiniCat
class Resulter(Thread):
    def __init__(self, to_download_q: PriorityQueue,
                 downloader_parser_q: PriorityQueue, result_q: Queue,
                 name: str):
        super().__init__(name=name)
        self.result_q = result_q
        self.downloader_parser_q = downloader_parser_q
        self.to_download_q = to_download_q

        self._exit = False
        self.log = Log(self.name)

    def exit(self):
        self._exit = True

    def result(self):
        # Wake any Downloaders blocked on COND before polling the result queue
        with COND:
            COND.notify_all()

        try:
            task = self.result_q.get_nowait()
        except Empty:
            # Sleep briefly so an empty queue does not busy-spin this loop
            time.sleep(0.1)
            return

        try:
            self.log.log_it("正在处理{}".format(task['tid']))
            task['resulter'](task)
        except RetryDownload:
            self.log.log_it("RetryDownload Exception.Task{}".format(task),
                            'INFO')
            retry(task, self.to_download_q)
            return
        except RetryDownloadEnForceNodelay:
            self.log.log_it(
                "RetryDownloadEnForce Exception.Task{}".format(task), 'INFO')
            self.to_download_q.put(task)
            return
        except RetryDownloadNodelay:
            self.log.log_it(
                "RetryDownloadNodelay Exception.Task{}".format(task), 'INFO')
            retry_nodelay(task, self.to_download_q)
            return

        except RetryParse:
            self.log.log_it("RetryParse Exception.Task{}".format(task), 'INFO')
            retry(task, self.downloader_parser_q)
            return
        except RetryParseEnForceNodelay:
            self.log.log_it("RetryParse Exception.Task{}".format(task), 'INFO')
            self.downloader_parser_q.put(task)
            return
        except RetryParseNodelay:
            self.log.log_it("RetryParseNodelay Exception.Task{}".format(task),
                            'INFO')
            retry_nodelay(task, self.downloader_parser_q)
            return
        except RetryResult:
            self.log.log_it("RetryResult Exception.Task{}".format(task),
                            'INFO')
            retry(task, self.result_q)
            return
        except RetryResultEnForceNodelay:
            self.log.log_it("RetryResultEnForce Exception.Task{}".format(task),
                            'INFO')
            self.result_q.put(task)
            return
        except RetryResultNodelay:
            self.log.log_it("RetryResultNodelay Exception.Task{}".format(task),
                            'INFO')
            retry_nodelay(task, self.result_q)
            return

        except Exception as e:
            traceback.print_exc()
            self.log.log_it(
                "Resulter function error. Error: {}. Task: {}".format(str(e), task), 'WARN')
            retry(task, self.result_q)
            return

    def run(self):
        # Run until all tasks are done and both queues are drained; note that
        # unlike Downloader and Parser, this loop never checks the _exit flag
        while (not TaskManager.ALLDONE) or (not self.result_q.empty()) or (
                not self.to_download_q.empty()):
            self.result()
        self.log.logd("Exit")
Code Example #3
File: crawler.py    Project: wax8280/MiniCat
class Parser(Thread):
    def __init__(self, to_download_q: PriorityQueue,
                 downloader_parser_q: PriorityQueue, result_q: Queue,
                 name: str):
        super().__init__(name=name)
        self.downloader_parser_q = downloader_parser_q
        self.to_download_q = to_download_q
        self.result_q = result_q

        self._exit = False
        self.log = Log(self.name)

    def exit(self):
        self._exit = True

    def parser(self):
        # Wake the waiting Downloaders once here first; otherwise a deadlock can occur
        with COND:
            COND.notify_all()
        try:
            task = self.downloader_parser_q.get_nowait()
        except Empty:
            # Sleep briefly so an empty queue does not busy-spin this loop
            time.sleep(0.1)
            with COND:
                COND.notify_all()
            return

        try:
            task_with_parsed_data, tasks = task['parser'](task)
            if tasks:
                if not isinstance(tasks, list):
                    tasks = [tasks]
                self.log.log_it("获取新任务{}个。".format(len(tasks)), 'INFO')
                for each_task in tasks:
                    # 注册新任务
                    TaskManager.register(each_task['tid'])
                    # 放入队列
                    self.to_download_q.put(each_task)

        # Handle the various retry exceptions raised from crawler scripts
        except RetryDownload:
            self.log.log_it("RetryDownload Exception.Task{}".format(task),
                            'INFO')
            retry(task, self.to_download_q)
            return
        except RetryDownloadEnForce:
            self.log.log_it(
                "RetryDownloadEnForce Exception.Task{}".format(task), 'INFO')
            self.to_download_q.put(task)
            return
        except RetryParse:
            self.log.log_it("RetryParse Exception.Task{}".format(task), 'INFO')
            retry(task, self.downloader_parser_q)
            return
        except RetryParseEnForce:
            self.log.log_it("RetryParse Exception.Task{}".format(task), 'INFO')
            self.downloader_parser_q.put(task)
            return
        except Exception as e:
            self.log.log_it("解析错误。错误信息:{}。Task:{}".format(str(e), task),
                            'WARN')
            traceback.print_exc()
            return

        # Unregister the task here in the Parser
        TaskManager.unregister(task['tid'])
        return task_with_parsed_data

    def run(self):
        while not self._exit:
            task_with_parsed_data = self.parser()

            # Put the parsed task on the Resulter queue
            if task_with_parsed_data:
                self.result_q.put(task_with_parsed_data)

        self.log.logd("Exit")
Code Example #4
File: crawler.py    Project: wax8280/MiniCat
class Downloader(Thread):
    def __init__(self,
                 to_download_q: PriorityQueue,
                 downloader_parser_q: PriorityQueue,
                 result_q: Queue,
                 name: str,
                 session=None):
        super().__init__(name=name)
        self.to_download_q = to_download_q
        self.downloader_parser_q = downloader_parser_q
        self.result_q = result_q
        # Avoid a shared mutable default argument: create a fresh session per instance
        self.session = session if session is not None else requests.session()

        self._exit = False

        self.log = Log(self.name)

    def exit(self):
        self._exit = True

    def request(self):
        response = None

        # Throttle: enforce the minimum interval between downloads
        if time.time() - TaskManager.last_download_time < TaskManager.download_wait:
            time.sleep(TaskManager.download_wait / 4)
            return

        try:
            # Fetch a task
            task = self.to_download_q.get_nowait()
            # Register it with the TaskManager
            TaskManager.register(task['tid'])

        except Empty:
            self.log.log_it(
                "Scheduler-to-Downloader queue is empty; {} is waiting.".format(
                    self.name), 'DEBUG')
            # Block until a Parser or Resulter wakes us up
            with COND:
                COND.wait()
                self.log.log_it(
                    "Downloader-to-Parser queue is not empty; {} woke up.".format(
                        self.name), 'DEBUG')
            return

        self.log.log_it("请求 {}".format(task['url']), 'INFO')
        try:
            # 记录下时间
            TaskManager.mark_download_time()
            # 网络请求
            response = self.session.request(task['method'], task['url'],
                                            **task.get('meta', {}))
        except Exception as e:
            traceback.print_exc()
            self.log.log_it(
                "网络请求错误。错误信息:{} URL:{} Response:{}".format(
                    str(e), task['url'], response), 'INFO')
            # 重试
            retry(task, self.to_download_q)
            return

        # requests.Response is falsy for non-2xx status codes, so failed
        # responses are forwarded to the Parsers as None
        if response:
            task['response'] = response
        else:
            task['response'] = None

        # Queue the task for the Parsers
        self.downloader_parser_q.put(task)

    def run(self):
        while not self._exit:
            self.request()
        self.log.logd("Exit")