def put_failed_request(self, request, table=None):
    try:
        request_dict = request.to_dict
        self._db.zadd(
            table or self._table_failed_request, request_dict, request.priority
        )
    except Exception as e:
        log.exception(e)
def start_monitor_task(self, *args, **kws):
    if not self.is_reach_next_spider_time():
        return

    self._auto_start_requests = False
    redisdb = RedisDB()

    if not self._parsers:  # not in add_parser mode
        self._parsers.append(self)

    while True:
        try:
            # check how many tasks are pending in redis
            tab_requests = setting.TAB_REQUSETS.format(
                table_folder=self._table_folder
            )
            todo_task_count = redisdb.zget_count(tab_requests)

            if todo_task_count < self._min_task_count:  # add tasks
                # make start requests
                self.distribute_task(*args, **kws)
            else:
                log.info(
                    "redis still has %s backlogged tasks; not distributing new tasks for now"
                    % todo_task_count
                )

        except Exception as e:
            log.exception(e)

        if self._auto_stop_when_spider_done:
            break

        time.sleep(self._check_task_interval)
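# A standalone sketch of the low-watermark pattern start_monitor_task uses:
# poll the queue size and only top it up when it falls below a threshold.
# All names here (monitor, queue_size, top_up, MIN_TASK_COUNT, CHECK_INTERVAL)
# are illustrative assumptions, not framework API.
import time

MIN_TASK_COUNT = 100   # mirrors self._min_task_count
CHECK_INTERVAL = 5     # mirrors self._check_task_interval, in seconds

def monitor(queue_size, top_up, should_stop=lambda: False):
    """queue_size() -> int and top_up() -> None are supplied by the caller."""
    while not should_stop():
        if queue_size() < MIN_TASK_COUNT:
            top_up()  # corresponds to distribute_task(*args, **kws)
        time.sleep(CHECK_INTERVAL)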
def __add_item_to_db(self, items, update_items, requests, callbacks, items_fingerprints):
    export_success = False
    self._is_adding_to_db = True

    # deduplicate
    if setting.ITEM_FILTER_ENABLE:
        items, items_fingerprints = self.__dedup_items(items, items_fingerprints)

    # sort items by target table
    items_dict = self.__pick_items(items)
    update_items_dict = self.__pick_items(update_items, is_update_item=True)

    # batch-insert items
    while items_dict:
        tab_item, datas = items_dict.popitem()
        log.debug(
            """
            -------------- batch insert items --------------
            table: %s
            datas: %s
            """
            % (tab_item, tools.dumps_json(datas, indent=16))
        )
        export_success = self.__export_to_db(tab_item, datas)

    # batch-update items
    while update_items_dict:
        tab_item, datas = update_items_dict.popitem()
        log.debug(
            """
            -------------- batch update items --------------
            table: %s
            datas: %s
            """
            % (tab_item, tools.dumps_json(datas, indent=16))
        )
        update_keys = self._item_update_keys.get(tab_item)
        export_success = self.__export_to_db(
            tab_item, datas, is_update=True, update_keys=update_keys
        )

    # run callbacks
    while callbacks:
        try:
            callback = callbacks.pop(0)
            callback()
        except Exception as e:
            log.exception(e)

    # delete finished requests
    if requests:
        self._db.zrem(self._table_request, requests)

    # persist fingerprints to the dedup store
    if export_success and setting.ITEM_FILTER_ENABLE:
        if items_fingerprints:
            self.__class__.dedup.add(items_fingerprints, skip_check=True)

    self._is_adding_to_db = False
def __add_request_to_db(self):
    request_list = []
    prioritys = []
    callbacks = []

    while self._requests_deque:
        request = self._requests_deque.popleft()
        self._is_adding_to_db = True

        if callable(request):  # a function
            # Note: watch out for closures. A closure callback should be
            # written as
            #     def test(xxx=xxx):
            #         ...  # business logic using xxx
            # so that xxx is not bound to the loop's final value after the
            # loop ends (see the sketch after this function).
            callbacks.append(request)
            continue

        priority = request.priority

        # skip if deduplication is on and the request already exists
        if (
            request.filter_repeat
            and setting.REQUEST_FILTER_ENABLE
            and not self.__class__.dedup.add(request.fingerprint)
        ):
            log.debug("request already exists url = %s" % request.url)
            continue
        else:
            request_list.append(str(request.to_dict))
            prioritys.append(priority)

        if len(request_list) > MAX_URL_COUNT:
            self._db.zadd(self._table_request, request_list, prioritys)
            request_list = []
            prioritys = []

    # flush the remainder
    if request_list:
        self._db.zadd(self._table_request, request_list, prioritys)

    # run callbacks
    for callback in callbacks:
        try:
            callback()
        except Exception as e:
            log.exception(e)

    # delete finished tasks
    if self._del_requests_deque:
        request_done_list = []
        while self._del_requests_deque:
            request_done_list.append(self._del_requests_deque.popleft())

        # exclude requests still in request_list, otherwise a
        # just-added request could be deleted
        request_done_list = list(set(request_done_list) - set(request_list))
        if request_done_list:
            self._db.zrem(self._table_request, request_done_list)

    self._is_adding_to_db = False
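# A minimal, self-contained demo of the late-binding pitfall the closure
# comment above warns about; the names (make_callbacks, value) are
# illustrative only.
def make_callbacks():
    callbacks = []
    for value in range(3):
        # Late binding: every callback sees value == 2 once the loop ends.
        callbacks.append(lambda: print(value))
        # Binding via a default argument freezes the current value instead:
        # callbacks.append(lambda value=value: print(value))
    return callbacks

for cb in make_callbacks():
    cb()  # prints 2, 2, 2 with the late-binding version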
def run(self):
    while not self._thread_stop:
        try:
            self.__add_request_to_db()
        except Exception as e:
            log.exception(e)

        tools.delay_time(1)
def run(self):
    while not self._thread_stop:
        try:
            self.__input_data()
        except Exception as e:
            log.exception(e)

        self._is_collector_task = False
        time.sleep(self._interval)
def flush(self):
    try:
        items = []
        update_items = []
        requests = []
        callbacks = []
        items_fingerprints = []
        data_count = 0

        while not self._items_queue.empty():
            data = self._items_queue.get_nowait()
            data_count += 1

            # classify data
            if callable(data):
                callbacks.append(data)
            elif isinstance(data, UpdateItem):
                update_items.append(data)
            elif isinstance(data, Item):
                items.append(data)
                if setting.ITEM_FILTER_ENABLE:
                    items_fingerprints.append(data.fingerprint)
            else:  # request-redis
                requests.append(data)

            if data_count >= UPLOAD_BATCH_MAX_SIZE:
                self.__add_item_to_db(
                    items, update_items, requests, callbacks, items_fingerprints
                )

                items = []
                update_items = []
                requests = []
                callbacks = []
                items_fingerprints = []
                data_count = 0

        if data_count:
            self.__add_item_to_db(
                items, update_items, requests, callbacks, items_fingerprints
            )

    except Exception as e:
        log.exception(e)
def __put_requests(self, requests_list):
    for request in requests_list:
        try:
            request_dict = {
                # eval assumes the serialized request string is trusted
                "request_obj": Request.from_dict(eval(request)),
                "request_redis": request,
            }
        except Exception as e:
            log.exception(
                """
                error %s
                request %s
                """
                % (e, request)
            )
            request_dict = None

        if request_dict:
            self._todo_requests.append(request_dict)
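# Side note, not framework code: eval() above executes arbitrary Python from
# the serialized string. If the stored request dicts contain only literals,
# ast.literal_eval is a stricter alternative for this deserialization step.
import ast

serialized = "{'url': 'https://example.com', 'priority': 1}"  # illustrative data
request_dict = ast.literal_eval(serialized)  # raises ValueError on non-literals
assert request_dict["url"] == "https://example.com"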
def run(self):
    while not self._thread_stop:
        try:
            request = self._memory_db.get()
            if not request:
                if not self.is_show_tip:
                    log.info("parser waiting for tasks ...")
                    self.is_show_tip = True

                time.sleep(1)
                self._wait_task_time += 1
                continue

            self.is_show_tip = False
            self.deal_requests([request])

        except Exception as e:
            log.exception(e)
def run(self):
    while not self._thread_stop:
        try:
            requests = self._collector.get_requests(setting.PARSER_TASK_COUNT)
            if not requests:
                if not self.is_show_tip:
                    log.info("parser waiting for tasks ...")
                    self.is_show_tip = True
                    # log.info('parser waiting for tasks {}...'.format(tools.format_seconds(self._wait_task_time)))

                time.sleep(1)
                self._wait_task_time += 1
                continue

            self.is_show_tip = False
            self.deal_requests(requests)

        except Exception as e:
            log.exception(e)
def export(self, from_table, to_table, auto_update=False, batch_count=100):
    """
    @summary: export data from redis items to a relational database such as mysql/oracle
              from_table and to_table must share the same schema
    ---------
    @param from_table:
    @param to_table:
    @param auto_update: whether to update automatically when the data already exists; default no
    ---------
    @result:
    """
    total_count = 0
    while True:
        datas = []
        try:
            datas = self._redisdb.sget(from_table, count=batch_count, is_pop=False)
            if not datas:
                log.info(
                    """
                    \r%s -> %s exported %s records in total"""
                    % (from_table, to_table, total_count)
                )
                break

            json_datas = [eval(data) for data in datas]
            sql, json_datas = tools.make_batch_sql(to_table, json_datas, auto_update)
            if self._to_db.add_batch(sql, json_datas):
                total_count += len(json_datas)
                self._redisdb.srem(from_table, datas)

        except Exception as e:
            log.exception(e)
            log.error(datas)
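# Hypothetical usage of export(); the class name ExportData and the table
# names below are assumptions for illustration, not taken from the source.
exporter = ExportData()
exporter.export(
    from_table="news:news_item",   # redis set holding serialized items
    to_table="news_item",          # mysql table with the same schema
    auto_update=True,              # update rows that already exist
    batch_count=100,
)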
def start_monitor_task(self):
    """
    @summary: monitor task status
    ---------
    ---------
    @result:
    """
    if not self._parsers:  # not multi-template mode; register self as the template
        self._is_more_parsers = False
        self._parsers.append(self)

    elif len(self._parsers) <= 1:
        self._is_more_parsers = False

    self.create_batch_record_table()

    # add tasks
    for parser in self._parsers:
        parser.add_task()

    is_first_check = True
    while True:
        try:
            if self.check_batch(is_first_check):  # this batch is finished
                if not self._auto_stop_when_spider_done:
                    is_first_check = True
                    log.info("all spider tasks are done; not auto-stopping, waiting for new tasks...")
                    time.sleep(self._check_task_interval)
                    continue
                else:
                    break

            is_first_check = False

            # check pending tasks in redis; if fewer than _min_task_count, fetch from mysql
            tab_requests = setting.TAB_REQUSETS.format(table_folder=self._table_folder)
            todo_task_count = self._redisdb.zget_count(tab_requests)

            tasks = []
            if todo_task_count < self._min_task_count:  # fetch tasks from mysql
                # update the task-done counters in the batch table
                self.update_task_done_count()

                log.info("redis has only %s tasks left; fetching more from mysql" % todo_task_count)
                tasks = self.get_todo_task_from_mysql()
                if not tasks:  # tasks with state 0 are done; check whether state-2 tasks were lost
                    if todo_task_count == 0:
                        # redis has no pending tasks, so mysql tasks with state 2 are lost and must be redone
                        lose_task_count = self.get_lose_task_count()
                        if not lose_task_count:
                            time.sleep(self._check_task_interval)
                            continue

                        elif lose_task_count > self._task_limit * 5:
                            # too many lost tasks; reset them all, otherwise waiting for redis to
                            # drain before fetching each batch of lost tasks would be too slow
                            log.info("resetting lost tasks to pending, {} in total".format(lose_task_count))

                            # reset in-progress tasks to pending
                            if self.reset_lose_task_from_mysql():
                                log.info("lost tasks reset successfully")
                            else:
                                log.info("failed to reset lost tasks")

                            continue

                        else:  # few lost tasks; fetch them directly
                            log.info(
                                "fetching lost tasks: {} in total, taking {}".format(
                                    lose_task_count,
                                    self._task_limit
                                    if self._task_limit <= lose_task_count
                                    else lose_task_count,
                                )
                            )
                            tasks = self.get_doing_task_from_mysql()

                else:
                    log.info("fetched %s pending tasks from mysql" % len(tasks))

            else:
                log.info("redis still has %s backlogged tasks; not distributing new tasks for now" % todo_task_count)

            if not tasks:
                if todo_task_count >= self._min_task_count:
                    # log.info('tasks in progress, %s left in redis' % todo_task_count)
                    pass
                else:
                    log.info("no pending tasks in mysql; %s tasks left in redis" % todo_task_count)
            else:
                # make start requests
                self.distribute_task(tasks)
                log.info("tasks added to redis successfully")

        except Exception as e:
            log.exception(e)

        time.sleep(self._check_task_interval)
def deal_requests(self, requests):
    for request in requests:
        response = None

        request_redis = request["request_redis"]
        request = request["request_obj"]

        del_request_redis_after_item_to_db = False
        del_request_redis_after_request_to_db = False

        for parser in self._parsers:
            if parser.name == request.parser_name:
                used_download_midware_enable = False
                try:
                    # count the document as scheduled for download
                    self.record_download_status(
                        PaserControl.DOWNLOAD_TOTAL, parser.name
                    )

                    # handle the request
                    if request.auto_request:
                        request_temp = None
                        if request.download_midware:
                            download_midware = (
                                request.download_midware
                                if callable(request.download_midware)
                                else tools.get_method(parser, request.download_midware)
                            )
                            request_temp = download_midware(request)
                        elif request.download_midware != False:
                            request_temp = parser.download_midware(request)

                        if request_temp:
                            if not isinstance(request_temp, Request):
                                raise Exception(
                                    "download_midware need return a request, but received type: {}".format(
                                        type(request_temp)
                                    )
                                )
                            used_download_midware_enable = True
                            response = (
                                request_temp.get_response()
                                if not setting.RESPONSE_CACHED_USED
                                else request_temp.get_response_from_cached(save_cached=False)
                            )
                        else:
                            response = (
                                request.get_response()
                                if not setting.RESPONSE_CACHED_USED
                                else request.get_response_from_cached(save_cached=False)
                            )

                        if response is None:
                            raise Exception(
                                "connection timed out url: %s"
                                % (request.url or request_temp.url)
                            )
                    else:
                        response = None

                    if request.callback:  # a parser callback is given, use it
                        callback_parser = (
                            request.callback
                            if callable(request.callback)
                            else tools.get_method(parser, request.callback)
                        )
                        results = callback_parser(request, response)
                    else:  # otherwise fall back to the default parser
                        results = parser.parser(request, response)

                    if results and not isinstance(results, Iterable):
                        raise Exception(
                            "%s.%s must return an iterable"
                            % (parser.name, request.callback or "parser")
                        )

                    # marks what the previous result was
                    result_type = 0  # 0 / 1 / 2 (initial / request / item)

                    # decide whether each result is a request or an item
                    for result in results or []:
                        if isinstance(result, Request):
                            result_type = 1
                            # fill in the request's parser_name
                            result.parser_name = result.parser_name or parser.name

                            # synchronous or asynchronous callback?
                            if result.request_sync:  # synchronous
                                request_dict = {
                                    "request_obj": result,
                                    "request_redis": None,
                                }
                                requests.append(request_dict)
                            else:  # asynchronous
                                # persist the next_request
                                self._request_buffer.put_request(result)
                                del_request_redis_after_request_to_db = True

                        elif isinstance(result, Item):
                            result_type = 2
                            # persist the item
                            self._item_buffer.put_item(result)
                            # the in-progress request must be deleted afterwards
                            del_request_redis_after_item_to_db = True

                        elif callable(result):  # result is a callable taking no arguments
                            if result_type == 2:
                                # item callback; run after all buffered items are persisted
                                self._item_buffer.put_item(result)
                                del_request_redis_after_item_to_db = True
                            else:  # result_type == 1
                                # request callback; run after all buffered requests are
                                # persisted. some parsers may return a callback directly
                                self._request_buffer.put_request(result)
                                del_request_redis_after_request_to_db = True

                        # else:
                        #     raise TypeError('Expect Request, Item or callback func, but get type: {}'.format(type(result)))

                except Exception as e:
                    exception_type = (
                        str(type(e)).replace("<class '", "").replace("'>", "")
                    )
                    if exception_type.startswith("requests"):
                        # count the document as failed to download
                        self.record_download_status(
                            PaserControl.DOWNLOAD_EXCEPTION, parser.name
                        )
                    else:
                        # count a parser exception
                        self.record_download_status(
                            PaserControl.PAESERS_EXCEPTION, parser.name
                        )

                    if setting.LOG_LEVEL == "DEBUG":  # only log in debug mode; timeout tracebacks are too verbose
                        log.exception(e)

                    log.error(
                        """
                        -------------- %s.%s error -------------
                        error          %s
                        response       %s
                        deal request   %s
                        """
                        % (
                            parser.name,
                            (
                                request.callback
                                and callable(request.callback)
                                and getattr(request.callback, "__name__")
                                or request.callback
                            )
                            or "parser",
                            str(e),
                            response,
                            tools.dumps_json(request.to_dict, indent=28)
                            if setting.LOG_LEVEL == "DEBUG"
                            else request,
                        )
                    )

                    request.error_msg = "%s: %s" % (exception_type, e)
                    request.response = str(response)

                    if "Invalid URL" in str(e):
                        request.is_abandoned = True

                    requests = parser.exception_request(request, response) or [request]
                    if not isinstance(requests, Iterable):
                        raise Exception(
                            "%s.%s must return an iterable" % (parser.name, "exception_request")
                        )

                    for request in requests:
                        if callable(request):
                            self._request_buffer.put_request(request)
                            continue

                        if not isinstance(request, Request):
                            raise Exception("exception_request must return a request")

                        if (
                            request.retry_times + 1 > setting.PARSER_MAX_RETRY_TIMES
                            or request.is_abandoned
                        ):
                            self.__class__._failed_task_count += 1  # count the failed task

                            # failed_request may return a request or a func
                            results = parser.failed_request(request, response) or [request]
                            if not isinstance(results, Iterable):
                                raise Exception(
                                    "%s.%s must return an iterable" % (parser.name, "failed_request")
                                )

                            for result in results:
                                if isinstance(result, Request):
                                    if setting.SAVE_FAILED_REQUEST:
                                        if used_download_midware_enable:
                                            # strip the attributes added by download_midware
                                            original_request = (
                                                Request.from_dict(eval(request_redis))
                                                if request_redis
                                                else result
                                            )
                                            original_request.error_msg = request.error_msg
                                            original_request.response = request.response

                                            self._request_buffer.put_failed_request(original_request)
                                        else:
                                            self._request_buffer.put_failed_request(result)

                                elif callable(result):
                                    self._request_buffer.put_request(result)

                                elif isinstance(result, Item):
                                    self._item_buffer.put_item(result)

                            del_request_redis_after_request_to_db = True

                        else:
                            # re-queue the request for another attempt
                            request.retry_times += 1
                            request.filter_repeat = False
                            log.info(
                                """
                                re-queued, waiting to retry
                                url                 %s
                                retry times         %s
                                max allowed retries %s"""
                                % (
                                    request.url,
                                    request.retry_times,
                                    setting.PARSER_MAX_RETRY_TIMES,
                                )
                            )
                            if used_download_midware_enable:
                                # strip the attributes added by download_midware; use the original request
                                original_request = (
                                    Request.from_dict(eval(request_redis))
                                    if request_redis
                                    else request
                                )
                                if hasattr(request, "error_msg"):
                                    original_request.error_msg = request.error_msg
                                if hasattr(request, "response"):
                                    original_request.response = request.response
                                original_request.retry_times = request.retry_times
                                original_request.filter_repeat = request.filter_repeat

                                self._request_buffer.put_request(original_request)
                            else:
                                self._request_buffer.put_request(request)

                            del_request_redis_after_request_to_db = True

                else:
                    # count the document as downloaded successfully
                    self.record_download_status(
                        PaserControl.DOWNLOAD_SUCCESS, parser.name
                    )
                    # count the successful task
                    self.__class__._success_task_count += 1

                    # cache the successfully downloaded document
                    if setting.RESPONSE_CACHED_ENABLE:
                        request.save_cached(
                            response=response,
                            expire_time=setting.RESPONSE_CACHED_EXPIRE_TIME,
                        )

                break

        # delete the in-progress request; item takes priority
        if request_redis:
            if del_request_redis_after_item_to_db:
                self._item_buffer.put_item(request_redis)
            elif del_request_redis_after_request_to_db:
                self._request_buffer.put_del_request(request_redis)
            else:
                self._request_buffer.put_del_request(request_redis)

        if setting.PARSER_SLEEP_TIME:
            time.sleep(setting.PARSER_SLEEP_TIME)
def deal_requests(self, requests):
    for request in requests:
        response = None

        for parser in self._parsers:
            if parser.name == request.parser_name:
                try:
                    # count the document as scheduled for download
                    self.record_download_status(
                        PaserControl.DOWNLOAD_TOTAL, parser.name
                    )

                    # handle the request
                    if request.auto_request:
                        request_temp = None
                        if request.download_midware:
                            download_midware = (
                                request.download_midware
                                if callable(request.download_midware)
                                else tools.get_method(parser, request.download_midware)
                            )
                            request_temp = download_midware(request)
                        elif request.download_midware != False:
                            request_temp = parser.download_midware(request)

                        if request_temp:
                            if not isinstance(request_temp, Request):
                                raise Exception(
                                    "download_midware need return a request, but received type: {}".format(
                                        type(request_temp)
                                    )
                                )
                            request = request_temp

                        response = (
                            request.get_response()
                            if not setting.RESPONSE_CACHED_USED
                            else request.get_response_from_cached(save_cached=False)
                        )
                    else:
                        response = None

                    if request.callback:  # a parser callback is given, use it
                        callback_parser = (
                            request.callback
                            if callable(request.callback)
                            else tools.get_method(parser, request.callback)
                        )
                        results = callback_parser(request, response)
                    else:  # otherwise fall back to the default parser
                        results = parser.parser(request, response)

                    if results and not isinstance(results, Iterable):
                        raise Exception(
                            "%s.%s must return an iterable"
                            % (parser.name, request.callback or "parser")
                        )

                    # decide whether each result is a request or an item
                    for result in results or []:
                        if isinstance(result, Request):
                            # fill in the request's parser_name
                            result.parser_name = result.parser_name or parser.name

                            # synchronous or asynchronous callback?
                            if result.request_sync:  # synchronous
                                requests.append(result)
                            else:  # asynchronous
                                # persist the next_request
                                self._memory_db.add(result)

                except Exception as e:
                    exception_type = (
                        str(type(e)).replace("<class '", "").replace("'>", "")
                    )
                    if exception_type.startswith("requests"):
                        # count the document as failed to download
                        self.record_download_status(
                            PaserControl.DOWNLOAD_EXCEPTION, parser.name
                        )
                    else:
                        # count a parser exception
                        self.record_download_status(
                            PaserControl.PAESERS_EXCEPTION, parser.name
                        )

                    if setting.LOG_LEVEL == "DEBUG":  # only log in debug mode; timeout tracebacks are too verbose
                        log.exception(e)

                    log.error(
                        """
                        -------------- %s.%s error -------------
                        error          %s
                        response       %s
                        deal request   %s
                        """
                        % (
                            parser.name,
                            (
                                request.callback
                                and callable(request.callback)
                                and getattr(request.callback, "__name__")
                                or request.callback
                            )
                            or "parser",
                            str(e),
                            response,
                            tools.dumps_json(request.to_dict, indent=28)
                            if setting.LOG_LEVEL == "DEBUG"
                            else request,
                        )
                    )

                    request.error_msg = "%s: %s" % (exception_type, e)
                    request.response = str(response)

                    if "Invalid URL" in str(e):
                        request.is_abandoned = True

                    requests = parser.exception_request(request, response) or [request]
                    if not isinstance(requests, Iterable):
                        raise Exception(
                            "%s.%s must return an iterable" % (parser.name, "exception_request")
                        )

                    for request in requests:
                        if not isinstance(request, Request):
                            raise Exception("exception_request must return a request")

                        if (
                            request.retry_times + 1 > setting.PARSER_MAX_RETRY_TIMES
                            or request.is_abandoned
                        ):
                            self.__class__._failed_task_count += 1  # count the failed task

                            # failed_request may return a request or a func
                            results = parser.failed_request(request, response) or [request]
                            if not isinstance(results, Iterable):
                                raise Exception(
                                    "%s.%s must return an iterable" % (parser.name, "failed_request")
                                )

                            log.info(
                                """
                                task exceeded the max retry count, discarded
                                url                 %s
                                retry times         %s
                                max allowed retries %s"""
                                % (
                                    request.url,
                                    request.retry_times,
                                    setting.PARSER_MAX_RETRY_TIMES,
                                )
                            )
                        else:
                            # re-queue the request for another attempt
                            request.retry_times += 1
                            request.filter_repeat = False
                            log.info(
                                """
                                re-queued, waiting to retry
                                url                 %s
                                retry times         %s
                                max allowed retries %s"""
                                % (
                                    request.url,
                                    request.retry_times,
                                    setting.PARSER_MAX_RETRY_TIMES,
                                )
                            )
                            self._memory_db.add(request)

                else:
                    # count the document as downloaded successfully
                    self.record_download_status(
                        PaserControl.DOWNLOAD_SUCCESS, parser.name
                    )
                    # count the successful task
                    self.__class__._success_task_count += 1

                    # cache the successfully downloaded document
                    if setting.RESPONSE_CACHED_ENABLE:
                        request.save_cached(
                            response=response,
                            expire_time=setting.RESPONSE_CACHED_EXPIRE_TIME,
                        )

                break

        if setting.PARSER_SLEEP_TIME:
            time.sleep(setting.PARSER_SLEEP_TIME)
def flush(self):
    try:
        self.__add_request_to_db()
    except Exception as e:
        log.exception(e)