Example #1
0
 def run(self):
     """Run the clock thread body.

     Marks the clock thread as ``ThreadSta.Work`` and calls
     ``check_date()``. If anything raises, the failure is logged together
     with its traceback and the clock status is set to ``ThreadSta.Error``.
     """
     try:
         self.spiderManager.Clock_Sta = ThreadSta.Work
         self.check_date()
     except Exception:
         # Function-scope import: this handler is the only user of traceback.
         import traceback

         # Append the traceback so the cause of the failure is not lost.
         report_logger.error("Clock_Thread 线程出错\n" + traceback.format_exc())
         self.spiderManager.Clock_Sta = ThreadSta.Error
 def run(self):
     """Run the monitor thread body.

     Marks the monitor thread as ``ThreadSta.Work`` and delegates to
     ``_run()``. If ``_run()`` raises, the monitor status is set to
     ``ThreadSta.Error`` and the failure is logged with its traceback.
     """
     self.spiderManager.Monitor_Sta = ThreadSta.Work
     try:
         self._run()
     except Exception:
         # Function-scope import: this handler is the only user of traceback.
         import traceback

         self.spiderManager.Monitor_Sta = ThreadSta.Error
         # Append the traceback so the cause of the failure is not lost.
         report_logger.error("MonitorThread 出错\n" + traceback.format_exc())
Example #3
0
 def run(self):
     """Run the parse thread body.

     Flags the parse thread as ``ThreadSta.Work`` and hands control to
     ``_run()``. Any exception is logged with its full traceback, after
     which the parse status becomes ``ThreadSta.Error``.
     """
     self.spiderManager.Parse_Sta = ThreadSta.Work
     try:
         self._run()
     except Exception:
         failure_trace = traceback.format_exc()
         report_logger.error("ParseThread:\n" + failure_trace)
         self.spiderManager.Parse_Sta = ThreadSta.Error
 def run(self):
     """Run the fetch thread body.

     Marks the fetch thread as ``ThreadSta.Work`` and delegates to
     ``_run()``. If ``_run()`` raises, the failure is logged with its
     traceback and the fetch status is set to ``ThreadSta.Error``.
     """
     self.spiderManager.Fetch_Sta = ThreadSta.Work
     try:
         self._run()
     except Exception:
         # Function-scope import: this handler is the only user of traceback.
         import traceback

         # Log one pre-formatted string. Passing the exception as an extra
         # positional argument to a message without a placeholder makes
         # %-style loggers fail while formatting, losing the record.
         report_logger.error("FetchThread:\n" + traceback.format_exc())
         self.spiderManager.Fetch_Sta = ThreadSta.Error
 def run(self):
     """Run the task-allotment thread body.

     Marks the thread as ``ThreadSta.Work`` and delegates to ``_run()``.
     If ``_run()`` raises, the failure is logged with its traceback, the
     fetch queue is flagged invalid (``is_valid = False``) and the
     task-allot status is set to ``ThreadSta.Error``.
     """
     self.spiderManager.TaskAllot_Sta = ThreadSta.Work
     try:
         self._run()
     except Exception:
         # Function-scope import: this handler is the only user of traceback.
         import traceback

         # Append the traceback so the cause of the failure is not lost.
         report_logger.error("TaskAllotThread 出错\n" + traceback.format_exc())
         self.spiderManager.fetch_queue.is_valid = False
         self.spiderManager.TaskAllot_Sta = ThreadSta.Error
Example #6
0
 def check_thread_error(self):
     """Report whether any tracked thread status equals ``ThreadSta.Error``.

     The fetch, parse, save, clock, task-allot and monitor status
     attributes are inspected in that order.

     Returns:
         True, after logging a single error message, when at least one
         status is ``ThreadSta.Error``; False otherwise.
     """
     statuses = (
         self.Fetch_Sta,
         self.Parse_Sta,
         self.Save_Sta,
         self.Clock_Sta,
         self.TaskAllot_Sta,
         self.Monitor_Sta,
     )
     if not any(status == ThreadSta.Error for status in statuses):
         return False
     report_logger.error("运行报错,且退出")
     return True
Example #7
0
    def start_work_and_wait_done(self,
                                 fetcher_num=10,
                                 parser_num=1,
                                 monitor_time=5):
        """
        Start crawling and block until the work is done or a thread fails.

        Builds the clock, monitor, task-allot, fetch, parse and save
        threads, starts them all, then polls once per second until either
        ``is_all_task_done()`` or the manager's ``check_thread_error()``
        is true. At that point ``finish_all_threads()`` is called and the
        fetch, parse, save and monitor threads are joined.

        :param fetcher_num: fetch worker count passed to the fetch thread
        :param parser_num: parse worker count passed to the parse thread
        :param monitor_time: report interval passed to the monitor thread
        """
        manager = self.spiderManager

        # Clock thread.
        clock_worker = ClockThread(self.schedule, manager)
        # Monitor thread.
        monitor_worker = MonitorThread(self.client, manager,
                                       sleeptime=monitor_time)
        # Task scheduling thread.
        allot_worker = TaskAllot(self.client, self.schedule, manager)

        # Fetch thread: the proxy-aware variant is used when proxy is set.
        if self.proxy:
            print("启用 代理模式")
            fetch_cls = FetchThreadProxy
        else:
            fetch_cls = FetchThread
        fetch_worker = fetch_cls(self.client, self.fetcher, fetcher_num,
                                 manager)

        # Parse thread.
        parse_worker = ParseThread(self.client, self.parser, parser_num,
                                   manager)
        # Save thread.
        save_worker = SaveThread(self.saver, manager)

        start_order = (clock_worker, allot_worker, fetch_worker,
                       parse_worker, save_worker, monitor_worker)
        for worker in start_order:
            worker.start()

        # Poll until all tasks are done or some thread reported an error.
        while not (self.is_all_task_done() or manager.check_thread_error()):
            time.sleep(1)
        manager.finish_all_threads()

        # Only these four threads are waited on; the clock and task-allot
        # threads are not joined here.
        for worker in (fetch_worker, parse_worker, save_worker,
                       monitor_worker):
            worker.join()
        report_logger.error("任务全部完成")
    def _run(self):
        """
        Periodically log pipeline progress and detect a stalled crawl.

        Each cycle sleeps for ``_sleeptime``, reads the counters of the
        fetch, parse and save queues, and logs them together with the
        change since the previous cycle and the accumulated run time.

        Outside debug mode (``DEBUG is False``), a cycle in which none of
        the three second counters changed increments ``ErrorCount``. Once
        it reaches ``_IdleTime`` the monitor status is set to
        ``ThreadSta.Error``, an error is logged and the loop ends. The
        loop also ends when the monitor status is ``ThreadSta.Finish``.
        """
        while True:
            time.sleep(self._sleeptime)
            self._init_time += self._sleeptime
            minutes, seconds = divmod(self._init_time, 60)
            hours, minutes = divmod(minutes, 60)

            # get_count() yields a pair; presumably (received, finished).
            fetch_total, fetch_done = self.spiderManager.fetch_queue.get_count()
            parse_total, parse_done = self.spiderManager.parse_queue.get_count()
            save_total, save_done = self.spiderManager.save_queue.get_count()

            # Progress made since the previous report.
            fetch_delta = fetch_done - self._last_fetch_num
            parse_delta = parse_done - self._last_parse_num
            save_delta = save_done - self._last_save_num

            report = " ".join((
                "idle_task_num=%d" % self.client.get_idle_tasks_size(),
                "fetch=(%d/%d,%d)" % (fetch_done, fetch_total, fetch_delta),
                "parse=(%d/%d,%d)" % (parse_done, parse_total, parse_delta),
                "save=(%d/%d,%d)" % (save_done, save_total, save_delta),
                "times=%02d:%02d:%02d" % (hours, minutes, seconds),
            ))

            self._last_fetch_num = fetch_done
            self._last_parse_num = parse_done
            self._last_save_num = save_done

            montion_logger.debug(report)

            stalled = self.DEBUG is False and all(
                delta == 0
                for delta in (fetch_delta, parse_delta, save_delta))
            if not stalled:
                self.ErrorCount = 0
            else:
                self.ErrorCount += 1
                if self.ErrorCount >= self._IdleTime:
                    # Too many consecutive cycles without progress.
                    self.ErrorCount = 0
                    self.spiderManager.Monitor_Sta = ThreadSta.Error
                    report_logger.error("MonitorThread 长时间无进展")
                    return

            if self.spiderManager.Monitor_Sta == ThreadSta.Finish:
                return