def spider_end(self):
    self.record_end_time()

    if self._end_callback:
        self._end_callback()

    for parser in self._parsers:
        parser.close()
        parser.end_callback()

    # Calculate the total crawl duration
    data = self._redisdb.hget(
        self._tab_spider_time, SPIDER_START_TIME_KEY, is_pop=True
    )
    if data:
        begin_timestamp = int(data)
        spend_time = tools.get_current_timestamp() - begin_timestamp
        msg = "《%s》爬虫结束,耗时 %s" % (
            self._spider_name,
            tools.format_seconds(spend_time),
        )
        log.info(msg)

        if self._send_run_time:
            self.send_msg(msg)

    if not self._auto_stop_when_spider_done:
        log.info("爬虫不自动结束, 等待下一轮任务...")
    else:
        self.delete_tables(self._tab_spider_status)
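# Standalone sketch (not part of the class above) of the "read once and remove"
# access to the start timestamp. spider_end() relies on the wrapper's
# hget(..., is_pop=True), which presumably deletes the field after reading (an
# assumption); with plain redis-py the same effect is an hget followed by hdel.
# The client, key and field names below are illustrative.
import redis


def pop_start_time(redis_client: redis.Redis, key: str, field: str):
    """Read the recorded start timestamp and delete it, returning None if absent."""
    data = redis_client.hget(key, field)
    if data is not None:
        redis_client.hdel(key, field)
        return int(data)
    return None


# started_at = pop_start_time(redis.Redis(), "spider_time", "spider_start_time")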
def is_reach_next_spider_time(self):
    if not self._batch_interval:
        return True

    last_spider_end_time = self._redisdb.hget(
        self._tab_spider_time, SPIDER_END_TIME_KEY
    )
    if last_spider_end_time:
        last_spider_end_time = int(last_spider_end_time)
        current_timestamp = tools.get_current_timestamp()
        time_interval = current_timestamp - last_spider_end_time

        if time_interval < self._batch_interval * 86400:
            log.info(
                "上次运行结束时间为 {} 与当前时间间隔 为 {}, 小于规定的抓取时间间隔 {}。爬虫不执行,退出~".format(
                    tools.timestamp_to_date(last_spider_end_time),
                    tools.format_seconds(time_interval),
                    tools.format_seconds(self._batch_interval * 86400),
                )
            )
            return False

    return True
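# Standalone sketch of the gating logic above: the batch interval is expressed in
# days (fractions allowed, e.g. 1 / 24 for one hour) and compared against the
# seconds elapsed since the last recorded end time. Stdlib only, illustrative names.
import time


def reached_next_run_time(last_end_timestamp: int, batch_interval_days: float) -> bool:
    """Return True once at least batch_interval_days have passed since last_end_timestamp."""
    elapsed = time.time() - last_end_timestamp
    return elapsed >= batch_interval_days * 86400  # 86400 seconds per day


# reached_next_run_time(last_end_timestamp=1700000000, batch_interval_days=7)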
def check_task_status(self):
    """
    Check the task status and send alerts when something looks wrong
    """
    # Check at most once per minute
    now_time = time.time()
    if now_time - self._last_check_task_status_time > 60:
        self._last_check_task_status_time = now_time
    else:
        return

    # Check the task state in redis; if the task count has not changed for 20
    # consecutive minutes (a parser may be stuck), send an alert
    task_count = self._redisdb.zget_count(self._tab_requests)

    if task_count:
        if task_count != self._last_task_count:
            self._last_task_count = task_count
            self._redisdb.hset(
                self._tab_spider_time,
                SPIDER_LAST_TASK_COUNT_RECORD_TIME_KEY,
                tools.get_current_timestamp(),
            )  # Multiple processes would send duplicate alerts, so the last record time is kept in redis
        else:
            # Check whether the stall has lasted more than 20 minutes
            lua = """
                local key = KEYS[1]
                local field = ARGV[1]
                local current_timestamp = ARGV[2]

                -- read the recorded timestamp
                local last_timestamp = redis.call('hget', key, field)
                if last_timestamp and current_timestamp - last_timestamp >= 1200 then
                    return current_timestamp - last_timestamp -- return the stall duration in seconds
                end

                if not last_timestamp then
                    redis.call('hset', key, field, current_timestamp)
                end

                return 0
            """
            redis_obj = self._redisdb.get_redis_obj()
            cmd = redis_obj.register_script(lua)
            overtime = cmd(
                keys=[self._tab_spider_time],
                args=[
                    SPIDER_LAST_TASK_COUNT_RECORD_TIME_KEY,
                    tools.get_current_timestamp(),
                ],
            )

            if overtime:
                # Send an alert
                msg = "《{}》爬虫任务停滞 {},请检查爬虫是否正常".format(
                    self._spider_name, tools.format_seconds(overtime)
                )
                log.error(msg)
                self.send_msg(
                    msg,
                    level="error",
                    message_prefix="《{}》爬虫任务停滞".format(self._spider_name),
                )

    else:
        self._last_task_count = 0

    # Check the failed task count; alert when it exceeds 1000 (setting.WARNING_FAILED_COUNT)
    failed_count = self._redisdb.zget_count(self._tab_failed_requests)
    if failed_count > setting.WARNING_FAILED_COUNT:
        # Send an alert
        msg = "《%s》爬虫当前失败任务 %s, 请检查爬虫是否正常" % (self._spider_name, failed_count)
        log.error(msg)
        self.send_msg(
            msg,
            level="error",
            message_prefix="《%s》爬虫当前失败任务数预警" % (self._spider_name),
        )

    # parser_control keeps realtime counts of handled and failed tasks; alert when
    # failures exceed 10 and failed / handled >= 0.5
    failed_task_count, success_task_count = PaserControl.get_task_status_count()
    total_count = success_task_count + failed_task_count
    if total_count > 0:
        task_success_rate = success_task_count / total_count
        if task_success_rate < 0.5:
            # Send an alert
            msg = "《%s》爬虫当前任务成功数%s, 失败数%s, 成功率 %.2f, 请检查爬虫是否正常" % (
                self._spider_name,
                success_task_count,
                failed_task_count,
                task_success_rate,
            )
            log.error(msg)
            # Track when the last message was sent and alert if more than an hour has
            # passed (multi-process, so duplicate alerts must be considered)
            self.send_msg(
                msg,
                level="error",
                message_prefix="《%s》爬虫当前任务成功率" % (self._spider_name),
            )
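# Standalone sketch of the stall-detection pattern used above: the Lua script runs
# atomically inside Redis, so several worker processes consult the same "last change"
# timestamp instead of each keeping one in memory. Client construction, key and field
# names are illustrative assumptions; the 1200-second threshold mirrors the code above.
import time

import redis

STALL_LUA = """
local key = KEYS[1]
local field = ARGV[1]
local current_timestamp = tonumber(ARGV[2])
local threshold = tonumber(ARGV[3])
local last_timestamp = tonumber(redis.call('hget', key, field))
if last_timestamp and current_timestamp - last_timestamp >= threshold then
    return current_timestamp - last_timestamp  -- stall duration in seconds
end
if not last_timestamp then
    redis.call('hset', key, field, current_timestamp)
end
return 0
"""


def stalled_seconds(client: redis.Redis, key: str, field: str, threshold: int = 1200) -> int:
    """Return how long the watched value has been unchanged once it exceeds threshold, else 0."""
    script = client.register_script(STALL_LUA)
    return script(keys=[key], args=[field, int(time.time()), threshold])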
def check_batch(self, is_first_check=False):
    """
    @summary: Check whether the current batch is finished
    ---------
    @param: is_first_check whether this is the first check; if it is and the batch
        turns out to be finished, the "batch finished" message is not sent again,
        because it was already sent before
    ---------
    @result: True if the batch is finished, otherwise False
    """
    sql = 'select date_format(batch_date, "{date_format}"), total_count, done_count from {batch_record_table} order by id desc limit 1'.format(
        date_format=self._date_format.replace(":%M", ":%i"),  # MySQL date_format uses %i for minutes
        batch_record_table=self._batch_record_table,
    )
    batch_info = self._mysqldb.find(sql)  # e.g. (('2018-08-19', 49686, 0),)

    if batch_info:
        batch_date, total_count, done_count = batch_info[0]

        now_date = datetime.datetime.now()
        last_batch_date = datetime.datetime.strptime(batch_date, self._date_format)
        time_difference = now_date - last_batch_date

        if total_count == done_count and time_difference < datetime.timedelta(
            days=self._batch_interval
        ):  # still within the current batch; re-check the task table for newly added tasks
            # Query the task table to confirm there really are no tasks left, because
            # the counts in the batch_record table may not have been updated yet
            task_count = self.__get_task_state_count()

            total_count = task_count.get("total_count")
            done_count = task_count.get("done_count")

            if total_count == done_count:
                # Check whether the related spiders are finished
                related_spider_is_done = self.related_spider_is_done()
                if related_spider_is_done is False:
                    msg = "《{}》本批次未完成, 正在等待依赖爬虫 {} 结束. 批次时间 {} 批次进度 {}/{}".format(
                        self._batch_name,
                        self._related_batch_record or self._related_task_tables,
                        batch_date,
                        done_count,
                        total_count,
                    )
                    log.info(msg)
                    # Check for timeout; send an alert if the batch has already timed out
                    if time_difference >= datetime.timedelta(
                        days=self._batch_interval
                    ):  # already timed out
                        if (
                            not self._last_send_msg_time
                            or now_date - self._last_send_msg_time
                            >= self._send_msg_interval
                        ):
                            self._last_send_msg_time = now_date
                            self.send_msg(msg, level="error")

                    return False

                elif related_spider_is_done is True:
                    # Update the is_done state
                    self.update_is_done()

            else:
                self.update_is_done()

            msg = "《{}》本批次完成 批次时间 {} 共处理 {} 条任务".format(
                self._batch_name, batch_date, done_count
            )
            log.info(msg)
            if not is_first_check:
                self.send_msg(msg)

            # Check whether it is time for the next batch
            if time_difference >= datetime.timedelta(days=self._batch_interval):
                msg = "《{}》下一批次开始".format(self._batch_name)
                log.info(msg)
                self.send_msg(msg)

                # Reset the task table state
                if self.init_task() is not False:  # returns False on update failure, otherwise True/None
                    # Reset batch related properties
                    self.init_property()

                    # The insert may fail even though the task table has already been
                    # reset; since the current time already belongs to the next batch,
                    # the batch check will not look at the task table, so the reset
                    # happens again on the next run
                    is_success = self.record_batch()

                    if is_success:
                        log.info(
                            "插入新批次记录成功 1分钟后开始下发任务"
                        )  # give workers time to pick up the new batch date
                        tools.delay_time(60)

                        return False  # next batch starts

                    else:
                        # Do not start the next batch: inserting the new batch record
                        # failed, and tasks should only be distributed after a
                        # successful insert
                        return True

            else:
                log.info("《{}》下次批次时间未到".format(self._batch_name))
                if not is_first_check:
                    self.send_msg("《{}》下次批次时间未到".format(self._batch_name))
                return True

        else:
            if time_difference >= datetime.timedelta(
                days=self._batch_interval
            ):  # already timed out
                time_out = time_difference - datetime.timedelta(
                    days=self._batch_interval
                )
                time_out_pretty = tools.format_seconds(time_out.total_seconds())

                msg = "《{}》本批次已超时{} 批次时间 {}, 批次进度 {}/{}".format(
                    self._batch_name,
                    time_out_pretty,
                    batch_date,
                    done_count,
                    total_count,
                )
                if self._batch_interval >= 1:
                    msg += ", 期望时间{}天".format(self._batch_interval)
                else:
                    msg += ", 期望时间{}小时".format(self._batch_interval * 24)

                result = self.get_deal_speed(
                    total_count=total_count,
                    done_count=done_count,
                    last_batch_date=last_batch_date,
                )
                if result:
                    deal_speed, need_time, overflow_time, calculate_speed_time = result
                    msg += ", 任务处理速度于{}统计, 约 {}条/小时, 预计还需 {}".format(
                        calculate_speed_time,
                        deal_speed,
                        tools.format_seconds(need_time),
                    )

                    if overflow_time > 0:
                        msg += ", 该批次预计总超时 {}, 请及时处理".format(
                            tools.format_seconds(overflow_time)
                        )

                log.info(msg)

                if (
                    not self._last_send_msg_time
                    or now_date - self._last_send_msg_time >= self._send_msg_interval
                ):
                    self._last_send_msg_time = now_date
                    self.send_msg(msg, level="error")

            else:  # not timed out yet
                remaining_time = (
                    datetime.timedelta(days=self._batch_interval) - time_difference
                )
                remaining_time_pretty = tools.format_seconds(
                    remaining_time.total_seconds()
                )

                if self._batch_interval >= 1:
                    msg = "《{}》本批次正在进行, 批次时间 {}, 批次进度 {}/{}, 期望时间{}天, 剩余{}".format(
                        self._batch_name,
                        batch_date,
                        done_count,
                        total_count,
                        self._batch_interval,
                        remaining_time_pretty,
                    )
                else:
                    msg = "《{}》本批次正在进行, 批次时间 {}, 批次进度 {}/{}, 期望时间{}小时, 剩余{}".format(
                        self._batch_name,
                        batch_date,
                        done_count,
                        total_count,
                        self._batch_interval * 24,
                        remaining_time_pretty,
                    )

                result = self.get_deal_speed(
                    total_count=total_count,
                    done_count=done_count,
                    last_batch_date=last_batch_date,
                )
                if result:
                    deal_speed, need_time, overflow_time, calculate_speed_time = result
                    msg += ", 任务处理速度于{}统计, 约 {}条/小时, 预计还需 {}".format(
                        calculate_speed_time,
                        deal_speed,
                        tools.format_seconds(need_time),
                    )

                    if overflow_time > 0:
                        msg += ", 该批次可能会超时 {}, 请及时处理".format(
                            tools.format_seconds(overflow_time)
                        )
                        # Send an alert
                        if (
                            not self._last_send_msg_time
                            or now_date - self._last_send_msg_time
                            >= self._send_msg_interval
                        ):
                            self._last_send_msg_time = now_date
                            self.send_msg(msg, level="error")

                    elif overflow_time < 0:
                        msg += ", 该批次预计提前 {} 完成".format(
                            tools.format_seconds(-overflow_time)
                        )

                log.info(msg)

    else:
        # Insert the batch_date record
        self.record_batch()

        # Initialize the task table state; there may be code that generates tasks
        self.init_task()

    return False
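# Illustrative sketch (not the actual get_deal_speed used above) of the progress
# estimate behind those messages: throughput is derived from tasks done since the
# batch started, and the projected finish time is compared with the batch window to
# decide whether the batch is likely to overflow. Names and return shape are assumptions.
import datetime


def estimate_batch_progress(total_count, done_count, batch_start, batch_interval_days):
    now = datetime.datetime.now()
    elapsed_hours = max((now - batch_start).total_seconds() / 3600, 1e-6)
    deal_speed = done_count / elapsed_hours  # tasks per hour
    if deal_speed <= 0:
        return None
    need_seconds = (total_count - done_count) / deal_speed * 3600
    window_left = (
        batch_start + datetime.timedelta(days=batch_interval_days) - now
    ).total_seconds()
    overflow_seconds = need_seconds - window_left  # > 0 means the batch may time out
    return deal_speed, need_seconds, overflow_seconds


# estimate_batch_progress(49686, 20000, datetime.datetime(2018, 8, 19), 7)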