def record_end_time(self):
    # Record the end time of this batch run
    if self._batch_interval:
        current_timestamp = tools.get_current_timestamp()
        self._redisdb.hset(
            self._tab_spider_time, SPIDER_END_TIME_KEY, current_timestamp
        )
def spider_end(self):
    self.record_end_time()

    if self._end_callback:
        self._end_callback()

    for parser in self._parsers:
        parser.close()
        parser.end_callback()

    # Calculate the total crawl duration
    data = self._redisdb.hget(
        self._tab_spider_time, SPIDER_START_TIME_KEY, is_pop=True
    )
    if data:
        begin_timestamp = int(data)
        spend_time = tools.get_current_timestamp() - begin_timestamp

        msg = "《%s》spider finished, elapsed %s" % (
            self._spider_name,
            tools.format_seconds(spend_time),
        )
        log.info(msg)
        if self._send_run_time:
            self.send_msg(msg)

    if not self._auto_stop_when_spider_done:
        log.info("Spider does not stop automatically; waiting for the next batch of tasks...")
    else:
        self.delete_tables(self._tab_spider_status)
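# spider_end reads the recorded start time with hget(..., is_pop=True), which
# fetches the hash field and deletes it in one step. A minimal standalone sketch
# of that get-and-delete semantics, assuming plain redis-py, a local Redis
# instance, and hypothetical key names:

import time

import redis

_r = redis.Redis()  # assumption: local Redis on the default port

def _hget_pop(key, field):
    # Read a hash field and delete it atomically via a MULTI/EXEC pipeline,
    # mirroring what hget(..., is_pop=True) is expected to do.
    pipe = _r.pipeline()
    pipe.hget(key, field)
    pipe.hdel(key, field)
    value, _ = pipe.execute()
    return value

# Usage: compute the elapsed time the same way spider_end does.
_r.hset("demo:spider_time", "start_time", int(time.time()) - 75)
_start = _hget_pop("demo:spider_time", "start_time")
if _start:
    print("spider ran for %s seconds" % (int(time.time()) - int(_start)))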
def __input_data(self):
    if len(self._todo_requests) >= self._request_count:
        return

    # Report node status: priority 0 means idle (no task in hand)
    self._db.zadd(self._tab_spider_status, self._spider_mark, 0)

    request_count = self._request_count  # default allocation

    # Dynamically allocate requests based on the number of idle nodes
    spider_wait_count = self._db.zget_count(
        self._tab_spider_status, priority_min=0, priority_max=0
    )
    if spider_wait_count:
        # Number of pending tasks
        task_count = self._db.zget_count(self._tab_requests)
        # Allocation = task count / idle node count + 1, capped at self._request_count
        request_count = task_count // spider_wait_count + 1
        request_count = (
            request_count
            if request_count <= self._request_count
            else self._request_count
        )

    if not request_count:
        return

    # Fetch tasks: normal tasks plus tasks whose previous claim has timed out
    current_timestamp = tools.get_current_timestamp()
    priority_max = current_timestamp - setting.REQUEST_TIME_OUT
    requests_list = self._db.zrangebyscore_set_score(
        self._tab_requests,
        priority_min="-inf",
        priority_max=priority_max,
        score=current_timestamp,
        count=request_count,
    )

    if requests_list:
        self._is_collector_task = True
        # The fetched tasks have been re-scored with the current timestamp to mark
        # them as in progress. Finished tasks are removed in request_buffer;
        # unfinished ones become claimable again once the timeout has passed.

        # Report node status: priority 1 means busy (task in hand)
        self._db.zadd(self._tab_spider_status, self._spider_mark, 1)

        # Store the requests
        self.__put_requests(requests_list)
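# __input_data depends on zrangebyscore_set_score to fetch tasks and re-score
# them in one atomic step. A rough standalone sketch of that claim pattern,
# assuming plain redis-py and hypothetical key names (a non-atomic fetch
# followed by a separate zadd would let two collectors claim the same task):

import time

import redis

_r = redis.Redis()  # assumption: local Redis on the default port

# Atomically fetch up to `count` members scored at or below `max_score`
# and re-score each of them to `new_score`.
_claim_tasks = _r.register_script("""
    local members = redis.call('zrangebyscore', KEYS[1], '-inf', ARGV[1], 'limit', 0, ARGV[2])
    for _, member in ipairs(members) do
        redis.call('zadd', KEYS[1], ARGV[3], member)
    end
    return members
""")

# Usage: claim normal tasks plus tasks whose previous claim has timed out,
# marking them in progress by setting their score to the current timestamp.
_now = int(time.time())
_claimed = _claim_tasks(keys=["demo:requests"], args=[_now - 600, 10, _now])
print("claimed %d tasks" % len(_claimed))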
def spider_begin(self):
    """
    @summary: Started via start_monitor_task; this function and spider_end do not
              run in the same process, so variables cannot be shared between them
    ---------
    ---------
    @result:
    """
    if self._begin_callback:
        self._begin_callback()

    # Record the start time (only once; later calls keep the original value)
    if not self._redisdb.hexists(self._tab_spider_time, SPIDER_START_TIME_KEY):
        current_timestamp = tools.get_current_timestamp()
        self._redisdb.hset(
            self._tab_spider_time, SPIDER_START_TIME_KEY, current_timestamp
        )
def is_reach_next_spider_time(self):
    if not self._batch_interval:
        return True

    last_spider_end_time = self._redisdb.hget(
        self._tab_spider_time, SPIDER_END_TIME_KEY
    )
    if last_spider_end_time:
        last_spider_end_time = int(last_spider_end_time)
        current_timestamp = tools.get_current_timestamp()
        time_interval = current_timestamp - last_spider_end_time

        if time_interval < self._batch_interval * 86400:
            log.info(
                "Last run ended at {}; only {} has elapsed, which is less than the "
                "configured batch interval {}. Spider will not run, exiting".format(
                    tools.timestamp_to_date(last_spider_end_time),
                    tools.format_seconds(time_interval),
                    tools.format_seconds(self._batch_interval * 86400),
                )
            )
            return False

    return True
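# The gate above compares the elapsed time against self._batch_interval * 86400,
# i.e. the batch interval is expressed in days. The same arithmetic as a
# standalone sketch with hypothetical names:

import time

def _reached_next_batch(last_end_time, batch_interval_days, now=None):
    # True once at least batch_interval_days have passed since last_end_time.
    now = now or time.time()
    return (now - last_end_time) >= batch_interval_days * 86400

# Usage: a 2-day interval with the last run ending 1.5 days ago is too early:
# 129600s elapsed < 172800s required, so this prints False.
print(_reached_next_batch(time.time() - int(1.5 * 86400), 2))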
def check_task_status(self):
    """
    Check task status and send alerts when needed
    """
    # Check once per minute
    now_time = time.time()
    if now_time - self._last_check_task_status_time > 60:
        self._last_check_task_status_time = now_time
    else:
        return

    # Check the task count in redis. If it has not changed for 20 consecutive
    # minutes, the parser may be stuck, so send an alert.
    task_count = self._redisdb.zget_count(self._tab_requests)

    if task_count:
        if task_count != self._last_task_count:
            self._last_task_count = task_count
            self._redisdb.hset(
                self._tab_spider_time,
                SPIDER_LAST_TASK_COUNT_RECORD_TIME_KEY,
                tools.get_current_timestamp(),
            )  # Multiple processes would send duplicate alerts; use redis to record the last check time
        else:
            # Check whether the stall has lasted more than 20 minutes
            lua = """
                local key = KEYS[1]
                local field = ARGV[1]
                local current_timestamp = ARGV[2]

                -- Fetch the last recorded timestamp
                local last_timestamp = redis.call('hget', key, field)
                if last_timestamp and current_timestamp - last_timestamp >= 1200 then
                    return current_timestamp - last_timestamp -- return the stall duration in seconds
                end

                if not last_timestamp then
                    redis.call('hset', key, field, current_timestamp)
                end

                return 0
            """
            redis_obj = self._redisdb.get_redis_obj()
            cmd = redis_obj.register_script(lua)
            overtime = cmd(
                keys=[self._tab_spider_time],
                args=[
                    SPIDER_LAST_TASK_COUNT_RECORD_TIME_KEY,
                    tools.get_current_timestamp(),
                ],
            )

            if overtime:
                # Send an alert
                msg = "《{}》spider tasks have stalled for {}; please check whether the spider is healthy".format(
                    self._spider_name, tools.format_seconds(overtime)
                )
                log.error(msg)
                self.send_msg(
                    msg,
                    level="error",
                    message_prefix="《{}》spider tasks stalled".format(self._spider_name),
                )
    else:
        self._last_task_count = 0

    # Check the failed task count; alert when it exceeds setting.WARNING_FAILED_COUNT
    failed_count = self._redisdb.zget_count(self._tab_failed_requests)
    if failed_count > setting.WARNING_FAILED_COUNT:
        # Send an alert
        msg = "《%s》spider currently has %s failed tasks; please check whether the spider is healthy" % (
            self._spider_name,
            failed_count,
        )
        log.error(msg)
        self.send_msg(
            msg,
            level="error",
            message_prefix="《%s》spider failed task count warning" % (self._spider_name),
        )

    # parser_control tracks completed and failed task counts in real time;
    # alert when the success rate falls below 0.5
    failed_task_count, success_task_count = PaserControl.get_task_status_count()
    total_count = success_task_count + failed_task_count
    if total_count > 0:
        task_success_rate = success_task_count / total_count
        if task_success_rate < 0.5:
            # Send an alert
            msg = "《%s》spider task success count %s, failure count %s, success rate %.2f; please check whether the spider is healthy" % (
                self._spider_name,
                success_task_count,
                failed_task_count,
                task_success_rate,
            )
            log.error(msg)
            # Track the time of the last alert and only alert again after more than
            # an hour (this runs in multiple processes, so avoid duplicate alerts)
            self.send_msg(
                msg,
                level="error",
                message_prefix="《%s》spider task success rate" % (self._spider_name),
            )
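# The stall check runs its read-compare-write inside a Lua script, so the whole
# step is atomic: when several spider processes call check_task_status at the
# same time, only the first one writes the baseline timestamp and all of them
# see a consistent stall duration. A trimmed standalone version of the same
# script, assuming plain redis-py and hypothetical key names:

import time

import redis

_r = redis.Redis()  # assumption: local Redis on the default port

_stall_check = _r.register_script("""
    local last = redis.call('hget', KEYS[1], ARGV[1])
    if last and ARGV[2] - last >= tonumber(ARGV[3]) then
        return ARGV[2] - last  -- stall duration in seconds
    end
    if not last then
        redis.call('hset', KEYS[1], ARGV[1], ARGV[2])
    end
    return 0
""")

# Usage: returns 0 on the first call (baseline recorded), and the stall
# duration once 1200 seconds have passed without the baseline being reset.
_overtime = _stall_check(
    keys=["demo:spider_time"],
    args=["last_task_count_record_time", int(time.time()), 1200],
)
if _overtime:
    print("tasks stalled for %d seconds" % _overtime)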