def run(self):
    """Run the clock check and record the outcome on the spider manager.

    Sets ``spiderManager.Clock_Sta`` to ``ThreadSta.Work`` and calls
    ``check_date()``. If either step raises an ``Exception``, the error is
    logged together with its traceback and ``Clock_Sta`` is set to
    ``ThreadSta.Error``. The exception is not re-raised.
    """
    # Function-scope import so this method does not depend on a module-level
    # ``import traceback`` being present.
    import traceback

    try:
        self.spiderManager.Clock_Sta = ThreadSta.Work
        self.check_date()
    except Exception:
        # Attach the traceback: the fixed message alone gives no hint of
        # what actually failed inside check_date().
        report_logger.error("Clock_Thread 线程出错\n" + traceback.format_exc())
        self.spiderManager.Clock_Sta = ThreadSta.Error
def run(self):
    """Run the monitor loop and record the outcome on the spider manager.

    Sets ``spiderManager.Monitor_Sta`` to ``ThreadSta.Work`` and delegates to
    ``_run()``. If ``_run()`` raises an ``Exception``, ``Monitor_Sta`` is set
    to ``ThreadSta.Error`` and the error is logged together with its
    traceback. The exception is not re-raised.
    """
    # Function-scope import so this method does not depend on a module-level
    # ``import traceback`` being present.
    import traceback

    self.spiderManager.Monitor_Sta = ThreadSta.Work
    try:
        self._run()
    except Exception:
        # Flag the failure first so observers of Monitor_Sta see it even if
        # logging itself is slow.
        self.spiderManager.Monitor_Sta = ThreadSta.Error
        # Attach the traceback: the fixed message alone gives no hint of
        # what actually failed inside _run().
        report_logger.error("MonitorThread 出错\n" + traceback.format_exc())
def run(self):
    """Run the parse loop and record the outcome on the spider manager.

    ``spiderManager.Parse_Sta`` is set to ``ThreadSta.Work`` before
    ``_run()`` is called. Any ``Exception`` raised by ``_run()`` is logged
    with its full traceback and ``Parse_Sta`` becomes ``ThreadSta.Error``;
    the exception is not propagated to the caller.
    """
    self.spiderManager.Parse_Sta = ThreadSta.Work
    try:
        self._run()
    except Exception:
        # Capture the traceback text of the exception being handled.
        failure_details = traceback.format_exc()
        report_logger.error("ParseThread:\n" + failure_details)
        self.spiderManager.Parse_Sta = ThreadSta.Error
def run(self):
    """Run the fetch loop and record the outcome on the spider manager.

    Sets ``spiderManager.Fetch_Sta`` to ``ThreadSta.Work`` and delegates to
    ``_run()``. If ``_run()`` raises an ``Exception``, the exception is
    logged and ``Fetch_Sta`` is set to ``ThreadSta.Error``. The exception is
    not re-raised.
    """
    self.spiderManager.Fetch_Sta = ThreadSta.Work
    try:
        self._run()
    except Exception as exc:
        # The exception is passed as a logging argument, so the message needs
        # a matching placeholder; without one, %-formatting of the record
        # fails and the error never reaches the log.
        # NOTE(review): assumes report_logger follows the stdlib ``logging``
        # %-style argument convention - confirm.
        report_logger.error("FetchThread %s", exc)
        self.spiderManager.Fetch_Sta = ThreadSta.Error
def run(self):
    """Run the task-allocation loop and record the outcome on the manager.

    Sets ``spiderManager.TaskAllot_Sta`` to ``ThreadSta.Work`` and delegates
    to ``_run()``. If ``_run()`` raises an ``Exception``, the error is logged
    together with its traceback, ``spiderManager.fetch_queue.is_valid`` is
    set to ``False`` and ``TaskAllot_Sta`` is set to ``ThreadSta.Error``.
    The exception is not re-raised.
    """
    # Function-scope import so this method does not depend on a module-level
    # ``import traceback`` being present.
    import traceback

    self.spiderManager.TaskAllot_Sta = ThreadSta.Work
    try:
        self._run()
    except Exception:
        # Attach the traceback: the fixed message alone gives no hint of
        # what actually failed inside _run().
        report_logger.error("TaskAllotThread 出错\n" + traceback.format_exc())
        # Mark the fetch queue invalid before publishing the Error state.
        self.spiderManager.fetch_queue.is_valid = False
        self.spiderManager.TaskAllot_Sta = ThreadSta.Error
def check_thread_error(self):
    """Tell whether any tracked thread state equals ``ThreadSta.Error``.

    The fetch, parse, save, clock, task-allot and monitor states are checked
    in that order. When one of them is in the error state, a single error
    message is logged and ``True`` is returned; otherwise ``False``.
    """
    tracked_states = (
        self.Fetch_Sta,
        self.Parse_Sta,
        self.Save_Sta,
        self.Clock_Sta,
        self.TaskAllot_Sta,
        self.Monitor_Sta,
    )
    # any() stops at the first state that compares equal to Error.
    if not any(state == ThreadSta.Error for state in tracked_states):
        return False
    report_logger.error("运行报错,且退出")
    return True
def start_work_and_wait_done(self, fetcher_num=10, parser_num=1, monitor_time=5):
    """
    Start crawling and block until the work is done or a thread fails.

    :param fetcher_num: number of fetch threads
    :param parser_num: number of parser processes
    :param monitor_time: reporting interval of the monitor thread
    """
    # Clock thread.
    clock_worker = ClockThread(self.schedule, self.spiderManager)
    # Monitor thread.
    monitor_worker = MonitorThread(
        self.client, self.spiderManager, sleeptime=monitor_time
    )
    # Task-scheduling thread.
    allot_worker = TaskAllot(self.client, self.schedule, self.spiderManager)

    # Fetch thread: the proxy variant is used when self.proxy is truthy.
    proxy_enabled = bool(self.proxy)
    if proxy_enabled:
        print("启用 代理模式")
    fetch_cls = FetchThreadProxy if proxy_enabled else FetchThread
    fetch_worker = fetch_cls(
        self.client, self.fetcher, fetcher_num, self.spiderManager
    )

    # Parse thread.
    parse_worker = ParseThread(
        self.client, self.parser, parser_num, self.spiderManager
    )
    # Save thread.
    save_worker = SaveThread(self.saver, self.spiderManager)

    startup_order = (
        clock_worker,
        allot_worker,
        fetch_worker,
        parse_worker,
        save_worker,
        monitor_worker,
    )
    for worker in startup_order:
        worker.start()

    # Poll once per second until every task is done or some thread has
    # reported an error, then ask the manager to finish all threads.
    while not (
        self.is_all_task_done() or self.spiderManager.check_thread_error()
    ):
        time.sleep(1)
    self.spiderManager.finish_all_threads()

    # Only these four threads are joined; the clock and task-scheduling
    # threads are started above but not waited for here.
    for worker in (fetch_worker, parse_worker, save_worker, monitor_worker):
        worker.join()

    report_logger.error("任务全部完成")
def _run(self):
    """Periodically log queue progress and detect a stalled crawl.

    Each cycle sleeps ``self._sleeptime`` seconds, adds that interval to the
    accumulated run time, reads the counters of the fetch, parse and save
    queues and writes a one-line status message to the monitor logger.

    When ``self.DEBUG`` is ``False`` and none of the three counters advanced
    since the previous cycle, ``self.ErrorCount`` is incremented; once it
    reaches ``self._IdleTime`` the monitor state is set to
    ``ThreadSta.Error``, an error is logged and the loop ends. Any other
    cycle resets ``self.ErrorCount``. The loop also ends when the monitor
    state equals ``ThreadSta.Finish``.
    """
    while True:
        time.sleep(self._sleeptime)

        # Run time is accumulated from the sleep interval.
        self._init_time += self._sleeptime
        minutes, seconds = divmod(self._init_time, 60)
        hours, minutes = divmod(minutes, 60)

        # get_count() yields a pair; presumably (received, finished) going by
        # how the values are reported below - verify against the queue class.
        fetch_recv, fetch_done = self.spiderManager.fetch_queue.get_count()
        parse_recv, parse_done = self.spiderManager.parse_queue.get_count()
        save_recv, save_done = self.spiderManager.save_queue.get_count()

        # Progress made since the previous cycle.
        fetch_delta = fetch_done - self._last_fetch_num
        parse_delta = parse_done - self._last_parse_num
        save_delta = save_done - self._last_save_num

        status_line = "".join((
            "idle_task_num=%d " % self.client.get_idle_tasks_size(),
            "fetch=(%d/%d,%d) " % (fetch_done, fetch_recv, fetch_delta),
            "parse=(%d/%d,%d) " % (parse_done, parse_recv, parse_delta),
            "save=(%d/%d,%d) " % (save_done, save_recv, save_delta),
            "times=%02d:%02d:%02d" % (hours, minutes, seconds),
        ))

        # Remember the current counters for the next cycle's comparison.
        self._last_fetch_num, self._last_parse_num, self._last_save_num = (
            fetch_done,
            parse_done,
            save_done,
        )

        montion_logger.debug(status_line)

        stalled = self.DEBUG is False and all(
            delta == 0 for delta in (fetch_delta, parse_delta, save_delta)
        )
        if stalled:
            self.ErrorCount += 1
            if self.ErrorCount >= self._IdleTime:
                # Too many consecutive cycles without progress: give up.
                self.ErrorCount = 0
                self.spiderManager.Monitor_Sta = ThreadSta.Error
                report_logger.error("MonitorThread 长时间无进展")
                break
        else:
            self.ErrorCount = 0

        if self.spiderManager.Monitor_Sta == ThreadSta.Finish:
            break