def run(self):
    """
    Drive the spider until all worker threads report completion.

    Returns immediately when ``is_reach_next_spider_time()`` is falsy.
    Otherwise starts the spider and polls once per second. When every thread
    is done, the end-of-round notification and state record are emitted once
    (guarded by ``self._is_notify_end``); unless ``self._keep_alive`` is set,
    the threads are then stopped and the loop exits. Exceptions raised inside
    one polling round are logged and the loop keeps going.
    """
    if not self.is_reach_next_spider_time():
        return

    self._start()

    while True:
        try:
            if not self.all_thread_is_done():
                # Work is still in flight: re-arm the end notification.
                self._is_notify_end = False
            else:
                if not self._is_notify_end:
                    # A full round has finished; notify and record it once.
                    self.spider_end()
                    end_state = dict(
                        spider_type=1,
                        state=1,
                        spider_end_time=tools.get_current_date(),
                        batch_interval=self._batch_interval,
                    )
                    self.record_spider_state(**end_state)
                    self._is_notify_end = True

                if not self._keep_alive:
                    self._stop_all_thread()
                    break

            self.check_task_status()
        except Exception as e:
            log.exception(e)

        # Check the spider state once per second.
        tools.delay_time(1)
def __add_item_to_db(self, items, update_items, requests, callbacks, items_fingerprints):
    """
    Export one batch of buffered data and run its follow-up actions.

    Steps, in order: optional de-duplication of ``items``, grouping through
    ``__pick_items``, batch insert, batch update, callbacks, removal of the
    finished requests and, last, registration of the item fingerprints in the
    dedup store.

    Args:
        items: items to insert.
        update_items: items exported with ``is_update=True``.
        requests: finished requests to remove from ``self._table_request``.
        callbacks: zero-argument callables; the list is drained in place.
        items_fingerprints: fingerprints handed to the dedup step together
            with ``items``.

    Fingerprints are registered only when at least one export ran and every
    export of this batch reported success. Previously only the result of the
    last export was considered, so an earlier failure could be masked.
    ``self._is_adding_to_db`` is True for the duration of the call and is
    always reset, even when a step raises.
    """
    export_results = []
    self._is_adding_to_db = True

    try:
        # De-duplicate
        if setting.ITEM_FILTER_ENABLE:
            items, items_fingerprints = self.__dedup_items(items, items_fingerprints)

        # Sort items into per-table groups
        items_dict = self.__pick_items(items)
        update_items_dict = self.__pick_items(update_items, is_update_item=True)

        # Batch insert
        while items_dict:
            tab_item, datas = items_dict.popitem()

            log.debug(
                """ -------------- item 批量入库 -------------- 表名: %s datas: %s """
                % (tab_item, tools.dumps_json(datas, indent=16))
            )

            export_results.append(self.__export_to_db(tab_item, datas))

        # Batch update
        while update_items_dict:
            tab_item, datas = update_items_dict.popitem()

            log.debug(
                """ -------------- item 批量更新 -------------- 表名: %s datas: %s """
                % (tab_item, tools.dumps_json(datas, indent=16))
            )

            update_keys = self._item_update_keys.get(tab_item)
            export_results.append(
                self.__export_to_db(
                    tab_item, datas, is_update=True, update_keys=update_keys
                )
            )

        # Run callbacks; a failing callback is logged and does not stop the rest
        while callbacks:
            try:
                callback = callbacks.pop(0)
                callback()
            except Exception as e:
                log.exception(e)

        # Remove the requests that have been processed
        if requests:
            self._db.zrem(self._table_request, requests)

        # Register fingerprints only if nothing in this batch failed to export
        export_success = bool(export_results) and all(export_results)
        if export_success and setting.ITEM_FILTER_ENABLE:
            if items_fingerprints:
                self.__class__.dedup.add(items_fingerprints, skip_check=True)
    finally:
        self._is_adding_to_db = False
def get_user(self, block=True) -> Optional[GuestUser]:
    """
    Fetch a user from the pool.

    Args:
        block: when no user id is available, produce users (via ``run``) and
            retry instead of returning.

    Returns:
        A ``GuestUser`` built from the stored record, or None when no user id
        is available and ``block`` is falsy.

    Any exception is logged and the lookup is retried after one second.
    """
    while True:
        try:
            user_id = self._get_user_id()

            if not user_id:
                if not block:
                    return None

                self._keep_alive = False
                with RedisLock(
                    key=self._tab_user_pool, lock_timeout=3600, wait_timeout=0
                ) as _lock:
                    # Only the lock holder produces users; everyone retries.
                    if _lock.locked:
                        self.run()
                continue

            user_str = self._redisdb.hget(self._tab_user_pool, user_id)
            if not user_str:
                # The record is gone (possibly removed by another spider), so
                # refresh the locally cached user ids and try again.
                self._load_users_id()
                continue

            # NOTE(review): eval() executes whatever is stored in Redis;
            # confirm the pool content is trusted.
            return GuestUser(**eval(user_str))
        except Exception as e:
            log.exception(e)
            tools.delay_time(1)
def put_failed_request(self, request, table=None):
    """
    Store a failed request in a sorted set, scored by its priority.

    Args:
        request: object exposing ``to_dict`` and ``priority`` attributes.
        table: destination key; falls back to ``self._table_failed_request``.

    Errors are logged, never raised.
    """
    try:
        payload = request.to_dict
        destination = table or self._table_failed_request
        self._db.zadd(destination, payload, request.priority)
    except Exception as e:
        log.exception(e)
def run(self):
    """
    Produce users until the pool holds at least ``self._min_users``.

    While the pool is short, ``login()`` is called and a truthy result is
    added with ``add_user``; login errors are logged. Once the pool is large
    enough the loop either sleeps ten seconds (``self._keep_alive``) or exits.
    Unexpected errors are logged and retried after one second.
    """
    while True:
        try:
            online_count = self._redisdb.hget_count(self._tab_user_pool)
            shortfall = self._min_users - online_count

            if shortfall <= 0:
                log.debug("当前user数为 {} 数量足够 暂不生产".format(online_count))

                if not self._keep_alive:
                    break
                tools.delay_time(10)
                continue

            log.info(
                "当前在线user数为 {} 小于 {}, 生产user".format(
                    online_count, self._min_users
                )
            )
            try:
                new_user = self.login()
                if new_user:
                    self.add_user(new_user)
            except Exception as e:
                log.exception(e)

        except Exception as e:
            log.exception(e)
            tools.delay_time(1)
def handel_exception(self, e):
    """
    Handle an exception by logging it.

    @param e: the exception to log
    @return: None
    """
    log.exception(e)
def start_monitor_task(self, *args, **kws):
    """
    Dispatch new tasks whenever the pending task count in Redis runs low.

    Returns immediately when ``is_reach_next_spider_time()`` is falsy. Each
    round compares the size of the request table with
    ``self._min_task_count`` and calls ``distribute_task(*args, **kws)`` when
    it is below that threshold. The loop runs once if
    ``self._auto_stop_when_spider_done`` is set, otherwise it repeats every
    ``self._check_task_interval`` seconds. Errors in a round are logged.
    """
    if not self.is_reach_next_spider_time():
        return

    self._auto_start_requests = False
    redisdb = RedisDB()

    # Not in add_parser mode: the spider acts as its own parser.
    if not self._parsers:
        self._parsers.append(self)

    while True:
        try:
            # Look up how many tasks are still waiting in Redis.
            request_table = setting.TAB_REQUSETS.format(redis_key=self._redis_key)
            pending_count = redisdb.zget_count(request_table)

            if pending_count >= self._min_task_count:
                log.info("redis 中尚有%s条积压任务,暂时不派发新任务" % pending_count)
            else:
                # Backlog is low: make start requests.
                self.distribute_task(*args, **kws)
        except Exception as e:
            log.exception(e)

        if self._auto_stop_when_spider_done:
            break

        time.sleep(self._check_task_interval)
def __add_request_to_db(self):
    """
    Flush buffered requests to the request table and delete finished ones.

    Entries of ``self._requests_deque`` are either callables (collected and
    run after the requests are stored) or request objects. Requests are
    written in batches of more than ``MAX_URL_COUNT`` entries, optionally
    skipping those the dedup store already knows. Afterwards the entries of
    ``self._del_requests_deque`` are removed from the table, except for any
    request written during this call.

    Every request written by this call is remembered, not just the last
    batch, so a request added in an earlier batch cannot be deleted again.
    ``self._is_adding_to_db`` is reset even when a step raises.
    """
    request_list = []
    prioritys = []
    callbacks = []
    # All requests written in this call, across every batch.
    added_requests = set()

    try:
        while self._requests_deque:
            request = self._requests_deque.popleft()
            self._is_adding_to_db = True

            if callable(request):
                # A callback. Mind closures: bind loop values as default
                # arguments (``def test(xxx=xxx): ...``) so the callback does
                # not see the value from the last loop iteration.
                callbacks.append(request)
                continue

            priority = request.priority

            # Skip the request when dedup is requested and it is already known.
            if (
                request.filter_repeat
                and setting.REQUEST_FILTER_ENABLE
                and not self.__class__.dedup.add(request.fingerprint)
            ):
                log.debug("request已存在  url = %s" % request.url)
                continue

            request_str = str(request.to_dict)
            request_list.append(request_str)
            prioritys.append(priority)
            added_requests.add(request_str)

            if len(request_list) > MAX_URL_COUNT:
                self._db.zadd(self._table_request, request_list, prioritys)
                request_list = []
                prioritys = []

        # Store the remaining partial batch
        if request_list:
            self._db.zadd(self._table_request, request_list, prioritys)

        # Run callbacks; a failing callback is logged and does not stop the rest
        for callback in callbacks:
            try:
                callback()
            except Exception as e:
                log.exception(e)

        # Delete finished tasks
        if self._del_requests_deque:
            request_done_list = []
            while self._del_requests_deque:
                request_done_list.append(self._del_requests_deque.popleft())

            # Keep anything written above out of the delete list, otherwise a
            # request that was just added could be removed right away.
            request_done_list = list(set(request_done_list) - added_requests)

            if request_done_list:
                self._db.zrem(self._table_request, request_done_list)
    finally:
        self._is_adding_to_db = False
def run(self):
    """
    Flush buffered requests once per second until the thread is stopped.

    Errors raised by a flush are logged and the loop continues.
    """
    while True:
        if self._thread_stop:
            break

        try:
            self.__add_request_to_db()
        except Exception as e:
            log.exception(e)

        tools.delay_time(1)
def run(self):
    """
    Keep pulling input data until ``self._thread_stop`` becomes true.

    The stop flag is cleared on entry. A failing round is logged and followed
    by a 0.1 second pause; after every round ``self._is_collector_task`` is
    set back to False.
    """
    self._thread_stop = False

    while True:
        if self._thread_stop:
            break

        try:
            self.__input_data()
        except Exception as e:
            log.exception(e)
            # Brief back-off after a failure.
            time.sleep(0.1)

        self._is_collector_task = False
def run(self):
    """
    Report the node heartbeat and pull input data until stopped.

    Each round is followed by resetting ``self._is_collector_task`` and a
    sleep of ``self._interval`` seconds. Errors are logged, not raised.
    """
    while True:
        if self._thread_stop:
            break

        try:
            self.__report_node_heartbeat()
            self.__input_data()
        except Exception as e:
            log.exception(e)

        self._is_collector_task = False

        time.sleep(self._interval)
def run(self):
    """
    Produce cookies until the pool holds at least ``self._min_cookies``.

    While the pool is short, ``create_cookie()`` is called and a truthy result
    is stored with ``add_cookies``. Once the pool is large enough, a
    ``"<timestamp>:<count>"`` snapshot kept under
    ``self._tab_cookie_pool_last_count`` is used to detect an idle pool: if
    the snapshot is older than 60 seconds and the count has not changed,
    production stops. Otherwise the loop sleeps ten seconds
    (``self._keep_alive``) or exits. Unexpected errors are logged and retried
    after one second.
    """
    while True:
        try:
            current_count = self._redisdb.lget_count(self._tab_cookie_pool)
            shortfall = self._min_cookies - current_count

            if shortfall > 0:
                log.info(
                    "当前cookie数为 {} 小于 {}, 生产cookie".format(
                        current_count, self._min_cookies
                    )
                )
                try:
                    cookies = self.create_cookie()
                    if cookies:
                        self.add_cookies(cookies)
                except Exception as e:
                    log.exception(e)
            else:
                log.info("当前cookie数为 {} 数量足够 暂不生产".format(current_count))

                # If the pool size did not change within the last minute, the
                # spiders are considered idle and production stops.
                snapshot = self._redisdb.strget(self._tab_cookie_pool_last_count)
                if snapshot:
                    raw_time, raw_count = snapshot.split(":")
                    last_time = float(raw_time)
                    last_count = int(raw_count)

                    if time.time() - last_time > 60:
                        if current_count == last_count:
                            log.info("近一分钟,cookie池数量无变化,判定爬虫未使用,退出生产")
                            break
                        self._redisdb.strset(
                            self._tab_cookie_pool_last_count,
                            "{}:{}".format(time.time(), current_count),
                        )
                else:
                    self._redisdb.strset(
                        self._tab_cookie_pool_last_count,
                        "{}:{}".format(time.time(), current_count),
                    )

                if not self._keep_alive:
                    break
                log.info("sleep 10")
                tools.delay_time(10)

        except Exception as e:
            log.exception(e)
            tools.delay_time(1)
def run(self):
    """
    @summary: Overridden run method. Polls until both the batch tasks
    (``task_is_done``) and all worker threads are finished, then stops.
    ---------
    ---------
    @result: None. If the main loop setup or the loop itself raises, the
    error is logged, a message is sent and the process exits with code 137.
    """
    try:
        self.create_batch_record_table()

        # Not in add_parser mode: the spider acts as its own parser.
        if not self._parsers:
            self._parsers.append(self)

        self._start()

        while True:
            try:
                self.heartbeat()
                # Both conditions are required: checking the threads as well
                # prevents the batch from being marked finished (and the
                # program from ending) while work is still in progress.
                if self.task_is_done() and self.all_thread_is_done():
                    if not self._is_notify_end:
                        self.spider_end()
                        self.record_spider_state(
                            spider_type=2,
                            state=1,
                            batch_date=self._batch_date_cache,
                            spider_end_time=tools.get_current_date(),
                            batch_interval=self._batch_interval,
                        )

                        self._is_notify_end = True

                    if not self._keep_alive:
                        self._stop_all_thread()
                        break
                else:
                    self._is_notify_end = False

                self.check_task_status()

            except Exception as e:
                log.exception(e)

            # Check the spider state every 10 seconds.
            tools.delay_time(10)

    except Exception as e:
        msg = "《%s》主线程异常 爬虫结束 exception: %s" % (self._batch_name, e)
        log.error(msg)
        # The prefix uses a printf-style placeholder, so it must be filled
        # with ``%``; ``str.format`` would leave the literal "%s" in place.
        self.send_msg(
            msg,
            level="error",
            message_prefix="《%s》爬虫异常结束" % self._batch_name,
        )

        # Exit code 137 corresponds to wait status 35072 (137 << 8);
        # presumably the spider manager uses it to restart the spider.
        os._exit(137)
def run(self):
    """
    Log in every user returned by ``_load_user()`` and add them to the pool.

    Only the holder of the Redis lock on ``self._tab_user_pool`` performs the
    logins. Each user is tried up to ``self._login_retry_times + 1`` times:

    - a truthy login result is stored with ``add_user``;
    - a falsy result, or running out of retries, hands the user that failed
      to ``handle_login_failed_user``;
    - ``NotImplementedError`` from ``login`` is logged and ends the process.

    Afterwards the loop sleeps ten seconds (``self._keep_alive``) or exits.
    Other errors are logged; the outer handler retries after one second.
    """
    while True:
        try:
            try:
                with RedisLock(
                    key=self._tab_user_pool, lock_timeout=3600, wait_timeout=0
                ) as _lock:
                    if _lock.locked:
                        for user in self._load_user():
                            retry_times = 0
                            while retry_times <= self._login_retry_times:
                                try:
                                    # Keep the login result separate from
                                    # ``user`` so the failure handler still
                                    # receives the user that failed.
                                    login_user = self.login(user)
                                    if login_user:
                                        self.add_user(login_user)
                                    else:
                                        self.handle_login_failed_user(user)
                                    break
                                except NotImplementedError:
                                    log.error(
                                        f"{self.__class__.__name__} must be implementation login method!"
                                    )
                                    os._exit(0)
                                except Exception as e:
                                    self.handel_exception(e)
                                    log.debug(
                                        f"login failed, user: {user} retry_times: {retry_times}"
                                    )
                                    retry_times += 1
                            else:
                                # Every attempt raised: give up on this user.
                                self.handle_login_failed_user(user)

                        now_user_count = self._redisdb.hget_count(
                            self._tab_user_pool
                        )
                        log.info("当前在线user数为 {}".format(now_user_count))

            except Exception as e:
                log.exception(e)

            if self._keep_alive:
                tools.delay_time(10)
            else:
                break

        except Exception as e:
            log.exception(e)
            tools.delay_time(1)
def run(self):
    """
    Take requests from the collector and process them until stopped.

    The stop flag is cleared on entry. When no request is available, a
    "waiting" message is logged once (tracked by ``self.is_show_tip``) and the
    loop polls again. Errors are logged, not raised.
    """
    self._thread_stop = False

    while True:
        if self._thread_stop:
            break

        try:
            request = self._collector.get_request()

            if request:
                self.is_show_tip = False
                self.deal_request(request)
                continue

            # Nothing to do: show the waiting tip only once.
            if not self.is_show_tip:
                log.debug("等待任务...")
                self.is_show_tip = True
        except Exception as e:
            log.exception(e)
def flush(self):
    """
    Drain ``self._items_queue`` and export its content in batches.

    Each queue entry is classified as a callback (callable), an
    ``UpdateItem``, an ``Item`` (whose fingerprint is also collected when
    ``setting.ITEM_FILTER_ENABLE`` is on) or, failing all of those, a finished
    request. A batch is exported once ``UPLOAD_BATCH_MAX_SIZE`` entries have
    been read; whatever remains is exported at the end. Errors are logged.
    """
    try:
        items, update_items, requests = [], [], []
        callbacks, items_fingerprints = [], []
        pending = 0

        while not self._items_queue.empty():
            entry = self._items_queue.get_nowait()
            pending += 1

            # Classify the entry
            if callable(entry):
                callbacks.append(entry)
            elif isinstance(entry, UpdateItem):
                update_items.append(entry)
            elif isinstance(entry, Item):
                items.append(entry)
                if setting.ITEM_FILTER_ENABLE:
                    items_fingerprints.append(entry.fingerprint)
            else:
                # Anything else is a request stored in Redis
                requests.append(entry)

            if pending < UPLOAD_BATCH_MAX_SIZE:
                continue

            # The batch is full: export it and start over with fresh buffers.
            self.__add_item_to_db(
                items, update_items, requests, callbacks, items_fingerprints
            )
            items, update_items, requests = [], [], []
            callbacks, items_fingerprints = [], []
            pending = 0

        if pending:
            self.__add_item_to_db(
                items, update_items, requests, callbacks, items_fingerprints
            )

    except Exception as e:
        log.exception(e)
def get_cookie(self, wait_when_null=True) -> User:
    """
    Rotate the cookie pool and return the next cookie as a ``User``.

    Args:
        wait_when_null: when the pool is empty, call ``login()`` and retry
            instead of returning.

    Returns:
        A ``User`` built from the stored record, or None when the pool is
        empty and ``wait_when_null`` is falsy.

    Any exception is logged and the lookup is retried after one second.
    """
    while True:
        try:
            raw_cookie = self._redisdb.rpoplpush(self._tab_cookie_pool)

            if raw_cookie:
                # NOTE(review): eval() executes whatever is stored in Redis;
                # confirm the pool content is trusted.
                cookie_fields = eval(raw_cookie)
                return User(**cookie_fields)

            if wait_when_null:
                log.info("暂无cookie 生产中...")
                self.login()
                continue

            return None
        except Exception as e:
            log.exception(e)
            tools.delay_time(1)
def run(self):
    """
    Run the spider once, then delete its Redis tables.

    Polls once per second until every thread is done, stops the threads and
    finally deletes all tables matching ``self._redis_key + "*"``. Errors in
    a polling round are logged.
    """
    # Not in add_parser mode: the spider acts as its own parser.
    if not self._parsers:
        self._parsers.append(self)

    self._start()

    finished = False
    while not finished:
        try:
            if self.all_thread_is_done():
                self._stop_all_thread()
                finished = True
        except Exception as e:
            log.exception(e)

        if not finished:
            # Check the spider state once per second.
            tools.delay_time(1)

    self.delete_tables([self._redis_key + "*"])
def __put_requests(self, requests_list):
    """
    Parse serialized requests and queue them in ``self._todo_requests``.

    Each queued entry is a dict with the parsed ``request_obj`` and the
    original string under ``request_redis``. Entries that fail to parse are
    logged and skipped.
    """
    for raw_request in requests_list:
        try:
            # NOTE(review): eval() executes the stored string; confirm the
            # request table content is trusted.
            entry = {
                "request_obj": Request.from_dict(eval(raw_request)),
                "request_redis": raw_request,
            }
        except Exception as e:
            log.exception(
                """ error %s request %s """
                % (e, raw_request)
            )
        else:
            self._todo_requests.put(entry)
def run(self):
    """
    Take requests from the in-memory queue and process them until stopped.

    When no request is available, a "waiting" message is logged once (tracked
    by ``self.is_show_tip``) and the loop sleeps one second. A failing round
    is logged and followed by a three second pause.
    """
    while True:
        if self._thread_stop:
            break

        try:
            request = self._memory_db.get()

            if request:
                self.is_show_tip = False
                self.deal_request(request)
                continue

            # Nothing to do: show the waiting tip only once, then back off.
            if not self.is_show_tip:
                log.debug("等待任务...")
                self.is_show_tip = True
            time.sleep(1)
        except Exception as e:
            log.exception(e)
            time.sleep(3)
def run(self):
    """
    Take requests from the in-memory queue and process them until stopped.

    Each fetched entry is passed to ``deal_requests`` wrapped in a
    one-element list. When nothing is available, a "waiting" message is logged
    once (tracked by ``self.is_show_tip``), the loop sleeps one second and
    ``self._wait_task_time`` is incremented. Errors are logged, not raised.
    """
    while True:
        if self._thread_stop:
            break

        try:
            request = self._memory_db.get()

            if request:
                self.is_show_tip = False
                self.deal_requests([request])
                continue

            # Nothing to do: show the waiting tip only once, then back off.
            if not self.is_show_tip:
                log.debug("parser 等待任务 ...")
                self.is_show_tip = True

            time.sleep(1)
            self._wait_task_time += 1
        except Exception as e:
            log.exception(e)
def get_cookie(self, wait_when_null=True):
    """
    Rotate the cookie pool and return the next cookie.

    Args:
        wait_when_null: when the pool is empty, produce a cookie (via ``run``
            with ``_keep_alive`` off and ``_min_cookies`` set to 1) and retry
            instead of returning.

    Returns:
        The evaluated cookie record, or ``{}`` when the pool is empty and
        ``wait_when_null`` is falsy.

    Any exception is logged and the lookup is retried after one second.
    """
    while True:
        try:
            cookie_info = self._redisdb.rpoplpush(self._tab_cookie_pool)

            if cookie_info:
                # NOTE(review): eval() executes whatever is stored in Redis;
                # confirm the pool content is trusted.
                return eval(cookie_info)

            if not wait_when_null:
                return {}

            log.info("暂无cookie 生产中...")
            self._keep_alive = False
            self._min_cookies = 1
            with RedisLock(
                key=self._tab_cookie_pool, lock_timeout=3600, wait_timeout=5
            ) as _lock:
                # Only the lock holder produces cookies; everyone retries.
                if _lock.locked:
                    self.run()
        except Exception as e:
            log.exception(e)
            tools.delay_time(1)
def run(self):
    """
    Fetch request batches from the collector and process them until stopped.

    Up to ``setting.SPIDER_TASK_COUNT`` requests are requested per round.
    When nothing is available, a "waiting" message is logged once (tracked by
    ``self.is_show_tip``), the loop sleeps one second and
    ``self._wait_task_time`` is incremented. Errors are logged, not raised.
    """
    while True:
        if self._thread_stop:
            break

        try:
            batch = self._collector.get_requests(setting.SPIDER_TASK_COUNT)

            if batch:
                self.is_show_tip = False
                self.deal_requests(batch)
                continue

            # Nothing to do: show the waiting tip only once, then back off.
            if not self.is_show_tip:
                log.info("parser 等待任务 ...")
                self.is_show_tip = True

            time.sleep(1)
            self._wait_task_time += 1
        except Exception as e:
            log.exception(e)
def save_items(self, table, items: List[Dict]) -> bool:
    """
    Save a batch of items.

    Args:
        table: table (collection) name
        items: the data, [{},{},...]

    Returns:
        True when the batch was saved, False when saving raised. On False the
        batch is not added to the dedup store, so it can be exported again.
    """
    try:
        inserted = self.to_db.add_batch(coll_name=table, datas=items)
        total = len(items)
        duplicates = total - inserted
        log.info(
            "共导出 %s 条数据到 %s,  新增 %s条, 重复 %s 条"
            % (total, table, inserted, duplicates)
        )
    except Exception as e:
        log.exception(e)
        return False
    else:
        return True
def run(self):
    """
    Run the spider in memory: start workers, dispatch tasks, wait, shut down.

    ``self._thread_count`` parser controls are started together with the item
    buffer, then the tasks are distributed. The loop polls once per second;
    when every thread is done it stops the parser controls and the item
    buffer, closes the webdriver pool if one exists, and exits. Afterwards the
    end callback runs, ``self._started`` is cleared and metrics are closed.
    """
    self.start_callback()

    for _ in range(self._thread_count):
        control = AirSpiderParserControl(self._memory_db, self._item_buffer)
        control.add_parser(self)
        control.start()
        self._parser_controls.append(control)

    self._item_buffer.start()

    self.distribute_task()

    finished = False
    while not finished:
        try:
            if self.all_thread_is_done():
                # Stop the parser controls
                for control in self._parser_controls:
                    control.stop()

                # Stop the item buffer
                self._item_buffer.stop()

                # Close the webdriver pool
                if Request.webdriver_pool:
                    Request.webdriver_pool.close()

                log.info("无任务,爬虫结束")
                finished = True
        except Exception as e:
            log.exception(e)

        if not finished:
            # Check the spider state once per second.
            tools.delay_time(1)

    self.end_callback()
    # Allows the thread to be started again
    self._started.clear()
    # Shut down metrics reporting
    metrics.close()
def get_user(self, block=True) -> Optional[NormalUser]:
    """
    Fetch a user from the pool.

    Args:
        block: when no user id is available, run the login loop (``run`` with
            ``_keep_alive`` off) and retry instead of returning.

    Returns:
        A ``NormalUser`` built from the stored record, or None when no user
        id is available and ``block`` is falsy.

    Any exception is logged and the lookup is retried after one second.
    """
    while True:
        try:
            user_id = self._get_user_id()

            if not user_id:
                if not block:
                    return None

                self._keep_alive = False
                self.run()
                continue

            user_str = self._redisdb.hget(self._tab_user_pool, user_id)
            if not user_str:
                # The record is gone (possibly removed by another spider), so
                # refresh the locally cached user ids and try again.
                self._load_users_id()
                continue

            # NOTE(review): eval() executes whatever is stored in Redis;
            # confirm the pool content is trusted.
            return NormalUser(**eval(user_str))
        except Exception as e:
            log.exception(e)
            tools.delay_time(1)
def reput_failed_requests_to_requests(self):
    """
    Move failed requests back into the request buffer.

    Repeatedly fetches failed requests until none are returned; each one gets
    ``retry_times`` reset to 0, is rebuilt with ``Request.from_dict`` and put
    into ``self._request_buffer``. The buffer is flushed at the end and the
    number of re-queued requests is logged. Errors in a round are logged and
    the loop continues.
    """
    log.debug("正在重置失败的requests...")
    requeued = 0

    while True:
        try:
            failed_batch = self.get_failed_requests()
            if not failed_batch:
                break

            for failed in failed_batch:
                failed["retry_times"] = 0
                self._request_buffer.put_request(Request.from_dict(failed))
                requeued += 1
        except Exception as e:
            log.exception(e)

    self._request_buffer.flush()

    log.debug("重置%s条失败requests为待抓取requests" % requeued)
def emit(self, point=None, force=False):
    """
    Queue a point and write the pending points when a flush is due.

    1. Add the new point, if any, to the pending queue.
    2. A flush is due when ``force`` is set, the queue holds at least
       ``self.max_points`` entries, or more than ``self.emit_interval``
       seconds have passed since the last flush.
    3. Update the last-flush timestamp after a write attempt.

    :param point: the point to queue; falsy values are ignored
    :param force: force all points to be submitted, default False
    :return: None
    """
    if point:
        self.pending_points.put(point)

    # A flush is triggered by: forcing, queue size, or elapsed time.
    flush_due = (
        force
        or self.pending_points.qsize() >= self.max_points
        or time.time() - self.last_emit_ts > self.emit_interval
    )
    if not flush_due:
        return

    # Collect and write under the lock so one thread at a time does the work.
    with self.lock:
        ready_points = self._get_ready_emit(force=force)

        if ready_points:
            try:
                self.influxdb.write_points(
                    ready_points,
                    batch_size=self.batch_size,
                    time_precision=self.time_precision,
                    retention_policy=self.retention_policy,
                )
            except Exception:
                log.exception("error writing points")

            self.last_emit_ts = time.time()
def login(self):
    """
    Log the account in and store its cookies.

    When it is not yet time to log in, the method waits five seconds and
    reports failure without recording a status. Otherwise ``create_cookie()``
    must return a non-empty dict; the login time and cookies are then saved
    and a success status is recorded. Any error is logged, reported through
    ``send_msg`` and recorded as a failed login.

    @return: 1 on success, 0 on failure
    """
    try:
        # Pre-check
        if not self.is_time_to_login():
            log.info("此账号尚未到登陆时间: {}".format(self.username))
            time.sleep(5)
            return 0

        cookies = self.create_cookie()
        if not cookies:
            raise Exception("登陆失败 未获取到合法cookie")
        if not isinstance(cookies, dict):
            raise Exception("cookie 必须为字典格式")

        # Save the cookies
        self.set_login_time()
        self.set_cookies(cookies)
        log.info("登录成功 {}".format(self.username))
        self.record_user_status(LimitTimesUserStatus.LOGIN_SUCCESS)
        return 1

    except Exception as e:
        log.exception(e)
        prefix = f"{self.SITE_NAME} {self.username} 账号登陆异常"
        send_msg(
            msg=f"{self.SITE_NAME} {self.username} 账号登陆异常 exception: {str(e)}",
            level="error",
            message_prefix=prefix,
        )

    # Only reached after the except branch above.
    log.info("登录失败 {}".format(self.username))
    self.record_user_status(LimitTimesUserStatus.LOGIN_FALIED)
    return 0
def update_items(self, table, items: List[Dict], update_keys: Tuple = ()) -> bool:
    """
    Update a batch of items.

    Args:
        table: table (collection) name
        items: the data, [{},{},...]
        update_keys: fields to update, e.g. ("title", "publish_time").
            When empty (the default), every key of the first item is used.
            The default used to be the ``typing.Tuple`` object itself, which
            was truthy and got passed on as the column list.

    Returns:
        True when the batch was exported, False when exporting raised
        (including an empty ``items`` with no ``update_keys``). On False the
        batch is not added to the dedup store, so it can be exported again.
    """
    try:
        add_count = self.to_db.add_batch(
            coll_name=table,
            datas=items,
            update_columns=update_keys or list(items[0].keys()),
        )
        datas_size = len(items)
        # Everything that was not newly added counts as an update.
        update_count = datas_size - add_count
        msg = "共导出 %s 条数据到 %s,  新增 %s 条, 更新 %s 条" % (
            datas_size,
            table,
            add_count,
            update_count,
        )
        if update_keys:
            msg += " 更新字段为 {}".format(update_keys)
        log.info(msg)

        return True
    except Exception as e:
        log.exception(e)
        return False