def update_items(self, table, items: List[Dict], update_keys: Tuple = ()) -> bool:
    """
    Update data

    Args:
        table: table name
        items: data, [{}, {}, ...]
        update_keys: columns to update, e.g. ("title", "publish_time")

    Returns:
        Whether the update succeeded: True / False
        If False, this batch is not written to the dedup store, so it can be stored again later
    """
    update_count = self.to_db.add_batch(
        table=table,
        items=items,
        update_columns=update_keys or list(items[0].keys()),
    )
    if update_count:
        msg = "Updated %s rows in %s" % (update_count, table)
        if update_keys:
            msg += ", updated columns: {}".format(update_keys)
        log.info(msg)

    return update_count is not None
def run(self):
    with RedisLock(
        key=self._tab_cookie_pool, lock_timeout=3600, wait_timeout=100
    ) as _lock:
        if _lock.locked:
            user_infos = self.get_user_info()
            if not isinstance(user_infos, Iterable):
                raise ValueError("get_user_info must return an iterable")

            if not user_infos:
                log.info("No available users")

            for username, password in user_infos:
                for i in range(self._login_retry_times):
                    try:
                        cookie = self.create_cookie(username, password)
                        if cookie:
                            self.save_cookie(username, cookie)
                        else:
                            self.handle_login_failed_user(username, password)
                        break
                    except Exception as e:
                        self.handel_exception(e)
                else:
                    self.handle_login_failed_user(username, password)
def acquire(self):
    start = time.time()
    while True:
        # Try to acquire the lock
        if self.redis_conn.setnx(self.lock_key, time.time()):
            self.redis_conn.expire(self.lock_key, self.timeout)
            self.locked = True
            break
        else:
            # Bug fix: if the locking process was killed before expire() ran,
            # the lock would live forever; delete keys that have no TTL
            if self.redis_conn.ttl(self.lock_key) < 0:
                self.redis_conn.delete(self.lock_key)

        if self.wait_timeout > 0:
            if time.time() - start > self.wait_timeout:
                log.info("Failed to acquire lock")
                break
        else:
            # Do not wait
            break

        if self.break_wait():
            log.info("break_wait triggered; no longer waiting for the lock")
            break

        log.debug("Waiting for lock: {} wait: {}".format(self, time.time() - start))
        if self.wait_timeout > 10:
            time.sleep(5)
        else:
            time.sleep(1)

    return
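# A minimal usage sketch (not from the source): acquire() above is normally
# reached through the context-manager protocol, mirroring how the cookie/user
# pool run() methods use RedisLock elsewhere in this codebase. The key name
# below is an assumption for illustration.
with RedisLock(key="lock:my_task", lock_timeout=3600, wait_timeout=100) as _lock:
    if _lock.locked:
        pass  # do the work that must not run concurrently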
def __dedup_items(self, items, items_fingerprints):
    """
    Deduplicate items
    @param items:
    @param items_fingerprints:
    @return: the deduplicated items and items_fingerprints
    """
    if not items:
        return items, items_fingerprints

    is_exists = self.__class__.dedup.get(items_fingerprints)
    is_exists = is_exists if isinstance(is_exists, list) else [is_exists]

    dedup_items = []
    dedup_items_fingerprints = []
    items_count = dedup_items_count = dup_items_count = 0

    while is_exists:
        item = items.pop(0)
        items_fingerprint = items_fingerprints.pop(0)
        is_exist = is_exists.pop(0)

        items_count += 1

        if not is_exist:
            dedup_items.append(item)
            dedup_items_fingerprints.append(items_fingerprint)
            dedup_items_count += 1
        else:
            dup_items_count += 1

    log.info(
        "{} items pending, {} duplicates, {} items actually to be stored".format(
            items_count, dup_items_count, dedup_items_count
        )
    )

    return dedup_items, dedup_items_fingerprints
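# Illustrative contract for dedup.get, inferred from the handling above: given
# N fingerprints it returns N parallel flags, truthy meaning "already seen".
# The literal fingerprints and return value below are invented for the example.
is_exists = dedup.get(["fp_a", "fp_b", "fp_c"])  # e.g. [1, 0, 1]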
def update_items(self, tab_item, items_data, update_keys=()):
    """
    @summary: Update data
    ---------
    @param tab_item: name of the items table in redis
    @param items_data: [item.to_dict] data
    @param update_keys: columns to update
    ---------
    @result:
    """
    to_table = tools.get_info(tab_item, ":s_(.*?)_item", fetch_one=True)
    sql, datas = tools.make_batch_sql(
        to_table,
        items_data,
        update_columns=update_keys or list(items_data[0].keys()),
    )
    update_count = self.to_db.add_batch(sql, datas)
    if update_count is None:
        log.error("Failed to update data in table %s" % to_table)
    else:
        msg = "Updated %s rows in %s" % (update_count // 2, to_table)
        if update_keys:
            msg += ", updated columns: {}".format(update_keys)
        log.info(msg)

    return update_count is not None
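# Why update_count // 2 above: with MySQL's INSERT ... ON DUPLICATE KEY UPDATE,
# the affected-rows count is 2 for every row that gets updated (and 1 for a
# fresh insert), so halving approximates the number of updated rows. A sketch
# of the kind of statement make_batch_sql presumably builds (table and column
# names are assumptions, not taken from the source):
#
#   INSERT INTO news (id, title) VALUES (%s, %s)
#       ON DUPLICATE KEY UPDATE title = VALUES(title)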
def run(self):
    for i in range(self._thread_count):
        parser_control = AirSpiderParserControl(self._memory_db, self._item_buffer)
        parser_control.add_parser(self)
        parser_control.start()
        self._parser_controls.append(parser_control)

    self._item_buffer.start()

    self.distribute_task()

    while True:
        if self.all_thread_is_done():
            # Stop the parser controls
            for parser_control in self._parser_controls:
                parser_control.stop()

            # Close the item buffer
            self._item_buffer.stop()

            # Close the webdriver
            if Request.webdriver_pool:
                Request.webdriver_pool.close()

            log.info("No tasks left; spider finished")
            break
def run(self):
    while True:
        try:
            now_user_count = self._redisdb.hget_count(self._tab_user_pool)
            need_user_count = self._min_users - now_user_count

            if need_user_count > 0:
                log.info(
                    "Current online user count {} is below {}; producing users".format(
                        now_user_count, self._min_users
                    )
                )
                try:
                    user = self.login()
                    if user:
                        self.add_user(user)
                except Exception as e:
                    log.exception(e)
            else:
                log.debug(
                    "Current user count {} is sufficient; not producing".format(
                        now_user_count
                    )
                )

            if self._keep_alive:
                tools.delay_time(10)
            else:
                break
        except Exception as e:
            log.exception(e)
            tools.delay_time(1)
def start_monitor_task(self, *args, **kws):
    if not self.is_reach_next_spider_time():
        return

    self._auto_start_requests = False

    redisdb = RedisDB()

    if not self._parsers:  # not in add_parser mode
        self._parsers.append(self)

    while True:
        try:
            # Check whether redis still has pending tasks
            tab_requests = setting.TAB_REQUSETS.format(redis_key=self._redis_key)
            todo_task_count = redisdb.zget_count(tab_requests)

            if todo_task_count < self._min_task_count:  # add tasks
                # make start requests
                self.distribute_task(*args, **kws)
            else:
                log.info(
                    "redis still has %s backlogged tasks; not distributing new ones for now"
                    % todo_task_count
                )
        except Exception as e:
            log.exception(e)

        if self._auto_stop_when_spider_done:
            break

        time.sleep(self._check_task_interval)
def update_items(self, table, items: List[Dict], update_keys: Tuple = ()) -> bool:
    """
    Update data

    Args:
        table: table name
        items: data, [{}, {}, ...]
        update_keys: columns to update, e.g. ("title", "publish_time")

    Returns:
        Whether the update succeeded: True / False
        If False, this batch is not written to the dedup store, so it can be stored again later
    """
    sql, datas = tools.make_batch_sql(
        table, items, update_columns=update_keys or list(items[0].keys())
    )
    update_count = self.to_db.add_batch(sql, datas)
    if update_count is None:
        log.error("Failed to update data in table %s" % table)
    else:
        msg = "Updated %s rows in %s" % (update_count // 2, table)
        if update_keys:
            msg += ", updated columns: {}".format(update_keys)
        log.info(msg)

    return update_count is not None
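# A caller-side sketch (illustrative; the pipeline instance, table name, and
# row values are assumptions): update only the listed columns for rows that
# already exist in the target table.
pipeline.update_items(
    "news",
    [{"id": 1, "title": "t1", "publish_time": "2023-01-01"}],
    update_keys=("title", "publish_time"),
)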
def spider_end(self):
    self.record_end_time()

    if self._end_callback:
        self._end_callback()

    for parser in self._parsers:
        parser.close()
        parser.end_callback()

    # Compute the crawl duration
    data = self._redisdb.hget(
        self._tab_spider_time, SPIDER_START_TIME_KEY, is_pop=True
    )
    if data:
        begin_timestamp = int(data)
        spand_time = tools.get_current_timestamp() - begin_timestamp
        msg = "Spider <%s> finished, took %s" % (
            self._spider_name,
            tools.format_seconds(spand_time),
        )
        log.info(msg)

        if self._send_run_time:
            self.send_msg(msg)

    if not self._auto_stop_when_spider_done:
        log.info("Spider does not stop automatically; waiting for the next round of tasks...")
    else:
        self.delete_tables(self._tab_spider_status)
def distribute_task(self):
    """
    @summary: Distribute tasks and store the returned requests
    ---------
    ---------
    @result:
    """
    self._is_distributed_task = False

    for parser in self._parsers:
        requests = parser.__start_requests()
        if requests and not isinstance(requests, Iterable):
            raise Exception(
                "%s.%s must return an iterable" % (parser.name, "start_requests")
            )

        result_type = 1
        for request in requests or []:
            if isinstance(request, Request):
                request.parser_name = request.parser_name or parser.name
                self._request_buffer.put_request(request)

                self._is_distributed_task = True
                result_type = 1
            elif isinstance(request, Item):
                self._item_buffer.put_item(request)
                result_type = 2
            elif callable(request):
                # a callable request may be a function that updates the database
                if result_type == 1:
                    self._request_buffer.put_request(request)
                else:
                    self._item_buffer.put_item(request)

    self._request_buffer.flush()
    self._item_buffer.flush()

    if self._is_distributed_task:  # only announce spider start when there are tasks
        # begin
        self.spider_begin()
        self.record_spider_state(
            spider_type=1,
            state=0,
            batch_date=tools.get_current_date(),
            spider_start_time=tools.get_current_date(),
            batch_interval=self._batch_interval,
        )

        # Reset the "no task" notice flag to False
        self._is_show_not_task = False

    elif not self._is_show_not_task:  # no tasks, and no "no task" notice sent yet
        # Send the "no task" message
        msg = "<%s> start_requests added no tasks" % self._spider_name
        log.info(msg)
        # self.send_msg(msg)

        self._is_show_not_task = True
def db_tip(self):
    msg = ""
    if setting.ADD_ITEM_TO_MYSQL:
        msg += "items are automatically saved to mysql "
    if setting.ADD_ITEM_TO_REDIS:
        msg += "items are automatically saved to redis "
    if not msg:
        log.warning("*** Please check whether items are being stored !!!")
    else:
        log.info(msg)
def __input_data(self):
    current_timestamp = tools.get_current_timestamp()
    if len(self._todo_requests) >= self._request_count:
        return

    request_count = self._request_count  # initial value

    # Count the nodes with a recent heartbeat
    spider_count = self._db.zget_count(
        self._tab_spider_status,
        priority_min=current_timestamp - (self._interval + 10),
        priority_max=current_timestamp,
    )
    # Allocate requests dynamically according to the number of alive nodes
    if spider_count:
        # Total task count
        task_count = self._db.zget_count(self._tab_requests)
        # Dynamically allocated count = task count / alive node count + 1
        request_count = task_count // spider_count + 1
        request_count = (
            request_count
            if request_count <= self._request_count
            else self._request_count
        )

    if not request_count:
        return

    # If no other node is running and this is the first fetch, reset lost tasks
    if self._first_get_task and spider_count <= 1:
        datas = self._db.zrangebyscore_set_score(
            self._tab_requests,
            priority_min=current_timestamp,
            priority_max=current_timestamp + setting.REQUEST_LOST_TIMEOUT,
            score=300,
            count=None,
        )
        self._first_get_task = False
        lose_count = len(datas)
        if lose_count:
            log.info("Finished resetting lost tasks, {} in total".format(lose_count))

    # Fetch tasks: only take tasks scored up to the current timestamp, and bump
    # their score to current_timestamp + setting.REQUEST_LOST_TIMEOUT to mark
    # them as in flight
    requests_list = self._db.zrangebyscore_set_score(
        self._tab_requests,
        priority_min="-inf",
        priority_max=current_timestamp,
        score=current_timestamp + setting.REQUEST_LOST_TIMEOUT,
        count=request_count,
    )

    if requests_list:
        self._is_collector_task = True
        # Store the requests
        self.__put_requests(requests_list)
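# Worked example of the allocation above (numbers invented): with
# task_count = 100 queued tasks and spider_count = 3 heartbeating nodes,
# request_count = 100 // 3 + 1 = 34, which is then capped at
# self._request_count, so each node takes roughly an equal share per poll.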
def _reconnect(self):
    # Check the connection state; reconnect automatically when the database
    # restarts or a timeout setting drops the connection
    retry_count = 0
    while True:
        try:
            retry_count += 1
            log.error(f"redis connection lost, reconnecting {retry_count}")
            if self.get_connect():
                log.info("redis connected successfully")
                return True
        except (ConnectionError, TimeoutError) as e:
            log.error(f"connection failed e: {e}")

        time.sleep(2)
def delete_tables(self, delete_tables_list):
    if isinstance(delete_tables_list, bool):
        delete_tables_list = [self._redis_key + "*"]
    elif not isinstance(delete_tables_list, (list, tuple)):
        delete_tables_list = [delete_tables_list]

    redis = RedisDB()
    for delete_tab in delete_tables_list:
        if delete_tab == "*":
            delete_tab = self._redis_key + "*"

        tables = redis.getkeys(delete_tab)
        for table in tables:
            log.info("Deleting table %s" % table)
            redis.clear(table)
def delete_tables(self, delete_tables_list):
    if isinstance(delete_tables_list, bool):
        delete_tables_list = [self._redis_key + "*"]
    elif not isinstance(delete_tables_list, (list, tuple)):
        delete_tables_list = [delete_tables_list]

    redis = RedisDB()
    for delete_tab in delete_tables_list:
        if not delete_tab.startswith(self._redis_key):
            delete_tab = self._redis_key + delete_tab

        tables = redis.getkeys(delete_tab)
        for table in tables:
            if table != self._tab_spider_time:
                log.info("Deleting key %s" % table)
                redis.clear(table)
def run(self):
    while True:
        try:
            try:
                with RedisLock(
                    key=self._tab_user_pool, lock_timeout=3600, wait_timeout=0
                ) as _lock:
                    if _lock.locked:
                        for user in self._load_user():
                            retry_times = 0
                            while retry_times <= self._login_retry_times:
                                try:
                                    user = self.login(user)
                                    if user:
                                        self.add_user(user)
                                    else:
                                        self.handle_login_failed_user(user)
                                    break
                                except NotImplementedError:
                                    log.error(
                                        f"{self.__class__.__name__} must implement the login method!"
                                    )
                                    os._exit(0)
                                except Exception as e:
                                    self.handel_exception(e)
                                log.debug(
                                    f"login failed, user: {user} retry_times: {retry_times}"
                                )
                                retry_times += 1
                            else:
                                self.handle_login_failed_user(user)

                        now_user_count = self._redisdb.hget_count(
                            self._tab_user_pool
                        )
                        log.info("Current online user count: {}".format(now_user_count))
            except Exception as e:
                log.exception(e)

            if self._keep_alive:
                tools.delay_time(10)
            else:
                break
        except Exception as e:
            log.exception(e)
            tools.delay_time(1)
def __add_task(self):
    # Run the parsers' start_requests
    self.spider_begin()  # for a spider that never stops on its own, this only runs once here
    self.record_spider_state(
        spider_type=1,
        state=0,
        batch_date=tools.get_current_date(),
        spider_start_time=tools.get_current_date(),
        batch_interval=self._batch_interval,
    )

    # Check whether the task pool still holds tasks; if so, resume crawling them
    todo_task_count = self._collector.get_requests_count()
    if todo_task_count:
        log.info(
            "Found %s pending tasks; not distributing new ones. Resuming from where the last run stopped"
            % todo_task_count
        )
    else:
        for parser in self._parsers:
            results = parser.start_requests()
            # Requests go to the request buffer, which stores them in one place
            if results and not isinstance(results, Iterable):
                raise Exception(
                    "%s.%s must return an iterable" % (parser.name, "start_requests")
                )

            result_type = 1
            for result in results or []:
                if isinstance(result, Request):
                    result.parser_name = result.parser_name or parser.name
                    self._request_buffer.put_request(result)
                    result_type = 1
                elif isinstance(result, Item):
                    self._item_buffer.put_item(result)
                    result_type = 2
                elif callable(result):
                    # a callable request may be a function that updates the database
                    if result_type == 1:
                        self._request_buffer.put_request(result)
                    else:
                        self._item_buffer.put_item(result)
                else:
                    raise TypeError(
                        "start_requests yielded a result of the wrong type; expected Request, Item or a callback, but got type: {}".format(
                            type(result)
                        )
                    )

        self._request_buffer.flush()
        self._item_buffer.flush()
def run(self):
    self.distribute_task()

    for i in range(self._thread_count):
        parser_control = AirSpiderParserControl(self._memory_db)
        parser_control.add_parser(self)
        parser_control.start()
        self._parser_controls.append(parser_control)

    while True:
        if self.all_thread_is_done():
            # Stop the parser controls
            for parser_control in self._parser_controls:
                parser_control.stop()

            log.info("No tasks left; spider finished")
            break
def get_cookie(self, wait_when_null=True) -> User:
    while True:
        try:
            user_cookie = self._redisdb.rpoplpush(self._tab_cookie_pool)
            if not user_cookie and wait_when_null:
                log.info("No cookies yet, producing...")
                self.login()
                continue

            if user_cookie:
                # eval assumes the pool only holds data this code serialized itself
                user_cookie = eval(user_cookie)
                return User(**user_cookie)

            return None
        except Exception as e:
            log.exception(e)
            tools.delay_time(1)
def reset_task(self, heartbeat_interval=10):
    """
    Reset lost tasks
    Returns:

    """
    if self.have_alive_spider(heartbeat_interval=heartbeat_interval):
        current_timestamp = tools.get_current_timestamp()
        datas = self._redisdb.zrangebyscore_set_score(
            self._tab_requests,
            priority_min=current_timestamp,
            priority_max=current_timestamp + setting.REQUEST_LOST_TIMEOUT,
            score=300,
            count=None,
        )
        lose_count = len(datas)
        if lose_count:
            log.info("Finished resetting lost tasks, {} in total".format(lose_count))
def get_cookie(self, wait_when_null=True):
    while True:
        try:
            cookie_info = self._redisdb.rpoplpush(self._tab_cookie_pool)
            if not cookie_info and wait_when_null:
                log.info("No cookies yet, producing...")
                self._keep_alive = False
                self._min_cookies = 1
                with RedisLock(
                    key=self._tab_cookie_pool, lock_timeout=3600, wait_timeout=5
                ) as _lock:
                    if _lock.locked:
                        self.run()
                continue

            return eval(cookie_info) if cookie_info else {}
        except Exception as e:
            log.exception(e)
            tools.delay_time(1)
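# The eval() above assumes the pool only ever contains dicts serialized by this
# code. A stricter deserialization sketch, assuming the producer stored cookies
# as JSON via json.dumps (which is not what the source shows; named here as an
# alternative, not the library's behavior):
import json

cookie_info = '{"session": "abc123"}'  # sample payload, invented for the example
cookies = json.loads(cookie_info) if cookie_info else {}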
def save_items(self, table, items: List[Dict]) -> bool:
    """
    Save data

    Args:
        table: table name
        items: data, [{}, {}, ...]

    Returns:
        Whether the save succeeded: True / False
        If False, this batch is not written to the dedup store, so it can be stored again later
    """
    add_count = self.to_db.add_batch(table=table, datas=items)
    datas_size = len(items)
    if add_count is not None:
        log.info(
            "Exported %s items to %s, %s duplicates"
            % (datas_size, table, datas_size - add_count)
        )

    return add_count is not None
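# A caller-side sketch (illustrative; the pipeline instance, table name, and
# rows are assumptions): per the docstring, a False return keeps the batch out
# of the dedup store so it can be submitted again later.
ok = pipeline.save_items("news", [{"title": "t1"}, {"title": "t2"}])
if not ok:
    pass  # batch stays eligible for re-insertion instead of being marked as seen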
def run(self):
    while not self._thread_stop:
        try:
            requests = self._memory_db.get()
            if not requests:
                if not self.is_show_tip:
                    log.info("parser waiting for tasks ...")
                    self.is_show_tip = True

                time.sleep(1)
                self._wait_task_time += 1
                continue

            self.is_show_tip = False
            self.deal_requests([requests])
        except Exception as e:
            log.exception(e)
def run(self):
    while True:
        try:
            now_cookie_count = self._redisdb.lget_count(self._tab_cookie_pool)
            need_cookie_count = self._min_cookies - now_cookie_count

            if need_cookie_count > 0:
                log.info(
                    "Current cookie count {} is below {}; producing cookies".format(
                        now_cookie_count, self._min_cookies
                    )
                )
                try:
                    cookies = self.create_cookie()
                    if cookies:
                        self.add_cookies(cookies)
                except Exception as e:
                    log.exception(e)
            else:
                log.info(
                    "Current cookie count {} is sufficient; not producing".format(
                        now_cookie_count
                    )
                )

            # If the pool size has not changed for about a minute, assume the
            # spider no longer uses it and exit
            last_count_info = self._redisdb.strget(self._tab_cookie_pool_last_count)
            if not last_count_info:
                self._redisdb.strset(
                    self._tab_cookie_pool_last_count,
                    "{}:{}".format(time.time(), now_cookie_count),
                )
            else:
                last_time, last_count = last_count_info.split(":")
                last_time = float(last_time)
                last_count = int(last_count)

                if time.time() - last_time > 60:
                    if now_cookie_count == last_count:
                        log.info(
                            "Cookie pool size unchanged for about a minute; assuming the spider is idle, stopping production"
                        )
                        break
                    else:
                        self._redisdb.strset(
                            self._tab_cookie_pool_last_count,
                            "{}:{}".format(time.time(), now_cookie_count),
                        )

            if self._keep_alive:
                log.info("sleep 10")
                tools.delay_time(10)
            else:
                break
        except Exception as e:
            log.exception(e)
            tools.delay_time(1)
def prolong_life(self):
    """
    Extend the lock's expiry time
    :return:
    """
    spend_time = 0
    while not self.stop_prolong_life:
        expire = self.redis_conn.ttl(self.lock_key)
        if expire < 0:  # the key does not exist
            time.sleep(1)
            continue

        self.redis_conn.expire(self.lock_key, expire + 5)  # extend by 5 seconds
        time.sleep(expire)  # wakes up again 5 seconds before the extended expiry
        spend_time += expire
        if self.lock_timeout and spend_time > self.lock_timeout:
            log.info("Lock timed out; releasing")
            self.redis_conn.delete(self.lock_key)
            break
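# A minimal usage sketch (assumption: prolong_life is meant to run beside the
# work the lock protects; the thread wiring below is illustrative, not taken
# from the source):
import threading

watchdog = threading.Thread(target=lock.prolong_life, daemon=True)
watchdog.start()
# ... do the locked work ...
lock.stop_prolong_life = True  # lets the watchdog loop exit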
def spider_end(self):
    self.record_end_time()

    if self._end_callback:
        self._end_callback()

    for parser in self._parsers:
        if not self._keep_alive:
            parser.close()
        parser.end_callback()

    if not self._keep_alive:
        # Close the webdriver
        if Request.webdriver_pool:
            Request.webdriver_pool.close()

        # Close metrics reporting
        metrics.close()
    else:
        metrics.flush()

    # Compute the crawl duration
    data = self._redisdb.hget(
        self._tab_spider_time, SPIDER_START_TIME_KEY, is_pop=True
    )
    if data:
        begin_timestamp = int(data)
        spand_time = tools.get_current_timestamp() - begin_timestamp
        msg = "Spider <%s> finished, took %s" % (
            self._spider_name,
            tools.format_seconds(spand_time),
        )
        log.info(msg)

        self.send_msg(msg)

    if self._keep_alive:
        log.info("Spider does not stop automatically; waiting for the next round of tasks...")
    else:
        self.delete_tables(self._tab_spider_status)
def save_items(self, table, items: List[Dict]) -> bool:
    """
    Save data

    Args:
        table: table name
        items: data, [{}, {}, ...]

    Returns:
        Whether the save succeeded: True / False
        If False, this batch is not written to the dedup store, so it can be stored again later
    """
    try:
        add_count = self.to_db.add_batch(coll_name=table, datas=items)
        datas_size = len(items)
        log.info(
            "Exported %s items to %s, %s new, %s duplicates"
            % (datas_size, table, add_count, datas_size - add_count)
        )
        return True
    except Exception as e:
        log.exception(e)
        return False
def acquire(self):
    start = time.time()
    while True:
        # Try to acquire the lock; set with nx + ex is atomic, so the key can
        # never be left without a TTL (unlike the earlier setnx + expire pair)
        if self.redis_conn.set(self.lock_key, time.time(), nx=True, ex=5):
            self.locked = True
            break

        if self.wait_timeout > 0:
            if time.time() - start > self.wait_timeout:
                log.info("Failed to acquire lock")
                break
        else:
            break

        log.debug("Waiting for lock: {} wait: {}".format(self, time.time() - start))
        if self.wait_timeout > 10:
            time.sleep(5)
        else:
            time.sleep(1)

    return
def run(self):
    while not self._thread_stop:
        try:
            requests = self._collector.get_requests(setting.SPIDER_TASK_COUNT)
            if not requests:
                if not self.is_show_tip:
                    log.info("parser waiting for tasks ...")
                    self.is_show_tip = True

                time.sleep(1)
                self._wait_task_time += 1
                continue

            self.is_show_tip = False
            self.deal_requests(requests)
        except Exception as e:
            log.exception(e)