def all_thread_is_done(self): for i in range(3): # 降低偶然性, 因为各个环节不是并发的,很有可能当时状态为假,但检测下一条时该状态为真。一次检测很有可能遇到这种偶然性 # 检测 collector 状态 if ( self._collector.is_collector_task() or self._collector.get_requests_count() > 0 ): return False # 检测 parser_control 状态 for parser_control in self._parser_controls: if not parser_control.is_not_task(): return False # 检测 item_buffer 状态 if ( self._item_buffer.get_items_count() > 0 or self._item_buffer.is_adding_to_db() ): return False # 检测 request_buffer 状态 if ( self._request_buffer.get_requests_count() > 0 or self._request_buffer.is_adding_to_db() ): return False tools.delay_time(1) return True
def run(self): if not self.is_reach_next_spider_time(): return self._start() while True: if self.all_thread_is_done(): if not self._is_notify_end: self.spider_end() # 跑完一轮 self.record_spider_state( spider_type=1, state=1, spider_end_time=tools.get_current_date(), batch_interval=self._batch_interval, ) self._is_notify_end = True if self._auto_stop_when_spider_done: self._stop_all_thread() break else: self._is_notify_end = False self.check_task_status() tools.delay_time(1) # 1秒钟检查一次爬虫状态
def run(self): while not self._thread_stop: try: self.__add_request_to_db() except Exception as e: log.exception(e) tools.delay_time(1)
def run(self): if not self._parsers: # 不是add_parser 模式 self._parsers.append(self) self._start() while True: if self.all_thread_is_done(): self._stop_all_thread() break tools.delay_time(1) # 1秒钟检查一次爬虫状态 self.delete_tables([self._table_folder + "*"])
def all_thread_is_done(self): for i in range(3): # 降低偶然性, 因为各个环节不是并发的,很有可能当时状态为假,但检测下一条时该状态为真。一次检测很有可能遇到这种偶然性 # 检测 parser_control 状态 for parser_control in self._parser_controls: if not parser_control.is_not_task(): return False # 检测 任务队列 状态 if not self._memory_db.empty(): return False tools.delay_time(1) return True
def run(self): """ @summary: 重写run方法 检查mysql中的任务是否做完, 做完停止 --------- --------- @result: """ try: self.create_batch_record_table() if not self._parsers: # 不是add_parser 模式 self._parsers.append(self) self._start() while True: if ( self.task_is_done() and self.all_thread_is_done() ): # redis全部的任务已经做完 并且mysql中的任务已经做完(检查各个线程all_thread_is_done,防止任务没做完,就更新任务状态,导致程序结束的情况) if not self._is_notify_end: self.spider_end() self.record_spider_state( spider_type=2, state=1, batch_date=self._batch_date_cache, spider_end_time=tools.get_current_date(), batch_interval=self._batch_interval, ) self._is_notify_end = True if self._auto_stop_when_spider_done: self._stop_all_thread() break else: self._is_notify_end = False self.check_task_status() tools.delay_time(10) # 10秒钟检查一次爬虫状态 except Exception as e: msg = "《%s》主线程异常 爬虫结束 exception: %s" % (self._batch_name, e) log.error(msg) self.send_msg(msg) os._exit(137) # 使退出码为35072 方便爬虫管理器重启
def check_batch(self, is_first_check=False): """ @summary: 检查批次是否完成 --------- @param: is_first_check 是否为首次检查,若首次检查,且检查结果为批次已完成,则不发送批次完成消息。因为之前发送过了 --------- @result: 完成返回True 否则False """ sql = 'select date_format(batch_date, "{date_format}"), total_count, done_count from {batch_record_table} order by id desc limit 1'.format( date_format=self._date_format.replace(":%M", ":%i"), batch_record_table=self._batch_record_table, ) batch_info = self._mysqldb.find(sql) # (('2018-08-19', 49686, 0),) if batch_info: batch_date, total_count, done_count = batch_info[0] now_date = datetime.datetime.now() last_batch_date = datetime.datetime.strptime( batch_date, self._date_format) time_difference = now_date - last_batch_date if total_count == done_count and time_difference < datetime.timedelta( days=self._batch_interval): # 若在本批次内,再次检查任务表是否有新增任务 # # 改成查询任务表 看是否真的没任务了,因为batch_record表里边的数量可能没来得及更新 task_count = self.__get_task_state_count() total_count = task_count.get("total_count") done_count = task_count.get("done_count") if total_count == done_count: # 检查相关联的爬虫是否完成 releated_spider_is_done = self.related_spider_is_done() if releated_spider_is_done == False: msg = "《{}》本批次未完成, 正在等待依赖爬虫 {} 结束. 批次时间 {} 批次进度 {}/{}".format( self._batch_name, self._related_batch_record or self._related_task_tables, batch_date, done_count, total_count, ) log.info(msg) # 检查是否超时 超时发出报警 if time_difference >= datetime.timedelta( days=self._batch_interval): # 已经超时 if (not self._last_send_msg_time or now_date - self._last_send_msg_time >= self._send_msg_interval): self._last_send_msg_time = now_date self.send_msg(msg, level="error") return False elif releated_spider_is_done == True: # 更新is_done 状态 self.update_is_done() else: self.update_is_done() msg = "《{}》本批次完成 批次时间 {} 共处理 {} 条任务".format( self._batch_name, batch_date, done_count) log.info(msg) if not is_first_check: self.send_msg(msg) # 判断下一批次是否到 if time_difference >= datetime.timedelta( days=self._batch_interval): msg = "《{}》下一批次开始".format(self._batch_name) log.info(msg) self.send_msg(msg) # 初始化任务表状态 if self.init_task() != False: # 更新失败返回False 其他返回True/None # 初始化属性 self.init_property() is_success = ( self.record_batch() ) # 有可能插入不成功,但是任务表已经重置了,不过由于当前时间为下一批次的时间,检查批次是否结束时不会检查任务表,所以下次执行时仍然会重置 if is_success: log.info( "插入新批次记录成功 1分钟后开始下发任务") # 防止work批次时间没来得及更新 tools.delay_time(60) return False # 下一批次开始 else: return True # 下一批次不开始。先不派发任务,因为批次表新批次插入失败了,需要插入成功后再派发任务 else: log.info("《{}》下次批次时间未到".format(self._batch_name)) if not is_first_check: self.send_msg("《{}》下次批次时间未到".format(self._batch_name)) return True else: if time_difference >= datetime.timedelta( days=self._batch_interval): # 已经超时 time_out = time_difference - datetime.timedelta( days=self._batch_interval) time_out_pretty = tools.format_seconds( time_out.total_seconds()) msg = "《{}》本批次已超时{} 批次时间 {}, 批次进度 {}/{}".format( self._batch_name, time_out_pretty, batch_date, done_count, total_count, ) if self._batch_interval >= 1: msg += ", 期望时间{}天".format(self._batch_interval) else: msg += ", 期望时间{}小时".format(self._batch_interval * 24) result = self.get_deal_speed( total_count=total_count, done_count=done_count, last_batch_date=last_batch_date, ) if result: deal_speed, need_time, overflow_time, calculate_speed_time = ( result) msg += ", 任务处理速度于{}统计, 约 {}条/小时, 预计还需 {}".format( calculate_speed_time, deal_speed, tools.format_seconds(need_time), ) if overflow_time > 0: msg += ", 该批次预计总超时 {}, 请及时处理".format( tools.format_seconds(overflow_time)) log.info(msg) if (not self._last_send_msg_time or now_date - self._last_send_msg_time >= self._send_msg_interval): self._last_send_msg_time = now_date self.send_msg(msg, level="error") else: # 未超时 remaining_time = ( datetime.timedelta(days=self._batch_interval) - time_difference) remaining_time_pretty = tools.format_seconds( remaining_time.total_seconds()) if self._batch_interval >= 1: msg = "《{}》本批次正在进行, 批次时间 {}, 批次进度 {}/{}, 期望时间{}天, 剩余{}".format( self._batch_name, batch_date, done_count, total_count, self._batch_interval, remaining_time_pretty, ) else: msg = "《{}》本批次正在进行, 批次时间 {}, 批次进度 {}/{}, 期望时间{}小时, 剩余{}".format( self._batch_name, batch_date, done_count, total_count, self._batch_interval * 24, remaining_time_pretty, ) result = self.get_deal_speed( total_count=total_count, done_count=done_count, last_batch_date=last_batch_date, ) if result: deal_speed, need_time, overflow_time, calculate_speed_time = ( result) msg += ", 任务处理速度于{}统计, 约 {}条/小时, 预计还需 {}".format( calculate_speed_time, deal_speed, tools.format_seconds(need_time), ) if overflow_time > 0: msg += ", 该批次可能会超时 {}, 请及时处理".format( tools.format_seconds(overflow_time)) # 发送警报 if (not self._last_send_msg_time or now_date - self._last_send_msg_time >= self._send_msg_interval): self._last_send_msg_time = now_date self.send_msg(msg, level="error") elif overflow_time < 0: msg += ", 该批次预计提前 {} 完成".format( tools.format_seconds(-overflow_time)) log.info(msg) else: # 插入batch_date self.record_batch() # 初始化任务表状态 可能有产生任务的代码 self.init_task() return False
def run(self): while not self._thread_stop: self.flush() tools.delay_time(0.5) self.close()