def __dedup_items(self, items, items_fingerprints):
    """
    Deduplicate items
    @param items:
    @param items_fingerprints:
    @return: the deduplicated items and items_fingerprints
    """
    if not items:
        return items, items_fingerprints

    is_exists = self.__class__.dedup.get(items_fingerprints)
    is_exists = is_exists if isinstance(is_exists, list) else [is_exists]

    dedup_items = []
    dedup_items_fingerprints = []
    items_count = dedup_items_count = dup_items_count = 0

    while is_exists:
        item = items.pop(0)
        items_fingerprint = items_fingerprints.pop(0)
        is_exist = is_exists.pop(0)

        items_count += 1

        if not is_exist:
            dedup_items.append(item)
            dedup_items_fingerprints.append(items_fingerprint)
            dedup_items_count += 1
        else:
            dup_items_count += 1

    log.info(
        "{} items pending, {} duplicates, {} items will actually be stored".format(
            items_count, dup_items_count, dedup_items_count
        )
    )

    return dedup_items, dedup_items_fingerprints
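# A standalone sketch of the dedup contract assumed above: the class-level
# dedup filter's get() returns one existence flag per fingerprint, in order,
# and items with a falsy flag are kept. FakeDedup is hypothetical and only
# mirrors the shape of the real filter's API.
class FakeDedup:
    def __init__(self, seen):
        self._seen = set(seen)

    def get(self, fingerprints):
        return [fp in self._seen for fp in fingerprints]  # one flag per fingerprint

dedup = FakeDedup(seen={"fp1"})
items = [{"id": 1}, {"id": 2}]
flags = dedup.get(["fp1", "fp2"])
kept = [item for item, seen in zip(items, flags) if not seen]
print(kept)  # [{'id': 2}] -- only the unseen item survives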
def spider_end(self):
    self.record_end_time()

    if self._end_callback:
        self._end_callback()

    for parser in self._parsers:
        parser.close()
        parser.end_callback()

    # calculate the total crawl time
    data = self._redisdb.hget(
        self._tab_spider_time, SPIDER_START_TIME_KEY, is_pop=True
    )
    if data:
        begin_timestamp = int(data)
        spend_time = tools.get_current_timestamp() - begin_timestamp
        msg = "《%s》spider finished, took %s" % (
            self._spider_name,
            tools.format_seconds(spend_time),
        )
        log.info(msg)
        if self._send_run_time:
            self.send_msg(msg)

    if not self._auto_stop_when_spider_done:
        log.info("Spider does not stop automatically; waiting for the next round of tasks...")
    else:
        self.delete_tables(self._tab_spider_status)
def distribute_task(self, *args, **kws):
    """
    @summary: distribute tasks and store the returned requests
    ---------
    @param tasks:
    ---------
    @result:
    """
    self._is_distributed_task = False

    for parser in self._parsers:
        requests = parser.start_requests(*args, **kws)
        if requests and not isinstance(requests, Iterable):
            raise Exception(
                "%s.%s must return an iterable" % (parser.name, "start_requests")
            )

        result_type = 1
        for request in requests or []:
            if isinstance(request, Request):
                request.parser_name = request.parser_name or parser.name
                self._request_buffer.put_request(request)

                self._is_distributed_task = True
                result_type = 1

            elif isinstance(request, Item):
                self._item_buffer.put_item(request)
                result_type = 2

            elif callable(request):  # a callable request may be a function that updates the database
                if result_type == 1:
                    self._request_buffer.put_request(request)
                else:
                    self._item_buffer.put_item(request)

        self._request_buffer.flush()
        self._item_buffer.flush()

    if self._is_distributed_task:  # only announce spider start when there are tasks
        # begin
        self.spider_begin()
        self.record_spider_state(
            spider_type=1,
            state=0,
            batch_date=tools.get_current_date(),
            spider_start_time=tools.get_current_date(),
            batch_interval=self._batch_interval,
        )

        # reset the "no task" notice flag
        self._is_show_not_task = False

    elif not self._is_show_not_task:  # no tasks and the "no task" notice has not been sent yet
        # send the "no task" message
        msg = "《%s》start_requests added no tasks" % (self._spider_name)
        log.info(msg)
        # self.send_msg(msg)
        self._is_show_not_task = True
def start_monitor_task(self, *args, **kws):
    if not self.is_reach_next_spider_time():
        return

    self._auto_start_requests = False
    redisdb = RedisDB()

    if not self._parsers:  # not in add_parser mode
        self._parsers.append(self)

    while True:
        try:
            # check whether redis still has tasks
            tab_requests = setting.TAB_REQUSETS.format(
                table_folder=self._table_folder
            )
            todo_task_count = redisdb.zget_count(tab_requests)

            if todo_task_count < self._min_task_count:  # add tasks
                # make start requests
                self.distribute_task(*args, **kws)
            else:
                log.info(
                    "redis still has %s backlogged tasks; not distributing new tasks for now"
                    % todo_task_count
                )

        except Exception as e:
            log.exception(e)

        if self._auto_stop_when_spider_done:
            break

        time.sleep(self._check_task_interval)
def update_items(self, tab_item, items_data, update_keys=()):
    """
    @summary:
    ---------
    @param tab_item: name of the items table in redis
    @param items_data: [item.to_dict] data
    @param update_keys: columns to update
    ---------
    @result:
    """
    to_table = tools.get_info(tab_item, ":s_(.*?)_item", fetch_one=True)

    sql, datas = tools.make_batch_sql(
        to_table,
        items_data,
        update_columns=update_keys or list(items_data[0].keys()),
    )
    update_count = self._to_db.add_batch(sql, datas)
    if update_count is None:
        log.error("Failed to update data in table %s" % (to_table,))
    else:
        # MySQL counts a row touched by ON DUPLICATE KEY UPDATE as 2 affected
        # rows, hence the division by 2
        msg = "Updated %s rows in %s" % (update_count // 2, to_table)
        if update_keys:
            msg += ", updated columns: {}".format(update_keys)
        log.info(msg)

    return update_count is not None
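# A hedged sketch of the batch upsert SQL that tools.make_batch_sql is assumed
# to build for update_items above (the real helper lives elsewhere in the
# framework; quoting and column order may differ). It also shows why an
# updated row counts twice in MySQL's affected-row total.
def make_batch_upsert_sql_sketch(table, items_data, update_columns):
    columns = list(items_data[0].keys())
    placeholders = ", ".join(["%s"] * len(columns))
    assignments = ", ".join("{0}=values({0})".format(c) for c in update_columns)
    sql = "insert into {} ({}) values ({}) on duplicate key update {}".format(
        table, ", ".join(columns), placeholders, assignments
    )
    datas = [[item[c] for c in columns] for item in items_data]
    return sql, datas

sql, datas = make_batch_upsert_sql_sketch(
    "book", [{"id": 1, "title": "t"}], update_columns=["title"]
)
print(sql)  # insert into book (id, title) values (%s, %s) on duplicate key update title=values(title)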
def db_tip(self):
    msg = "\n"
    if setting.ADD_ITEM_TO_MYSQL:
        msg += "items are automatically stored to mysql\n"
    if setting.ADD_ITEM_TO_REDIS:
        msg += "items are automatically stored to redis\n"
    if msg == "\n":
        log.warning("*** Please check whether items are being stored !!!")
    else:
        log.info(msg)
def delete_tables(self, delete_tables_list):
    if isinstance(delete_tables_list, bool):
        delete_tables_list = [self._table_folder + "*"]
    elif not isinstance(delete_tables_list, (list, tuple)):
        delete_tables_list = [delete_tables_list]

    redis = RedisDB()
    for delete_tab in delete_tables_list:
        if delete_tab == "*":
            delete_tab = self._table_folder + "*"

        tables = redis.getkeys(delete_tab)
        for table in tables:
            log.info("Deleting table %s" % table)
            redis.clear(table)
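# A standalone sketch of the argument normalization delete_tables performs,
# extracted from the branches above so it can be exercised without redis;
# the table names are hypothetical.
def normalize_delete_tables(delete_tables_list, table_folder="test"):
    if isinstance(delete_tables_list, bool):
        delete_tables_list = [table_folder + "*"]
    elif not isinstance(delete_tables_list, (list, tuple)):
        delete_tables_list = [delete_tables_list]
    return [table_folder + "*" if tab == "*" else tab for tab in delete_tables_list]

print(normalize_delete_tables(True))               # ['test*']
print(normalize_delete_tables("*"))                # ['test*']
print(normalize_delete_tables("test:z_requests"))  # ['test:z_requests']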
def __init__(self, ip_ports=None, db=None, user_pass=None, url=None, decode_responses=True):
    # these may be changed in setting at runtime, so the defaults must be
    # loaded here instead of being bound as default argument values
    if ip_ports is None:
        ip_ports = setting.REDISDB_IP_PORTS
    if db is None:
        db = setting.REDISDB_DB
    if user_pass is None:
        user_pass = setting.REDISDB_USER_PASS

    self._is_redis_cluster = False

    try:
        if not url:
            ip_ports = (
                ip_ports if isinstance(ip_ports, list) else ip_ports.split(",")
            )
            if len(ip_ports) > 1:
                pass  # multi-node lists are handled by the cluster/sentinel-aware variant of this constructor
            else:
                ip, port = ip_ports[0].split(":")  # redis defaults to port 6379
                self._redis = redis.Redis(
                    host=ip,
                    port=port,
                    db=db,
                    password=user_pass,
                    decode_responses=decode_responses,
                )
        else:
            self._redis = redis.from_url(url, decode_responses=decode_responses)
    except Exception:
        raise
    else:
        if not url:
            log.info("Connected to redis %s db%s" % (ip_ports, db))
        else:
            log.info("Connected to redis %s" % (url,))

    self._ip_ports = ip_ports
    self._db = db
    self._user_pass = user_pass
    self._url = url
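# Usage sketch for the constructor above; the addresses and password are
# hypothetical, and a reachable redis server is assumed.
db = RedisDB(ip_ports="localhost:6379", db=0, user_pass=None)
db = RedisDB(url="redis://:password@localhost:6379/0")
db = RedisDB()  # falls back to setting.REDISDB_IP_PORTS / REDISDB_DB / REDISDB_USER_PASS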
def __add_task(self):
    # invoke the parsers' start_requests
    self.spider_begin()  # for spiders that do not auto-stop, this runs only once
    self.record_spider_state(
        spider_type=1,
        state=0,
        batch_date=tools.get_current_date(),
        spider_start_time=tools.get_current_date(),
        batch_interval=self._batch_interval,
    )

    # check whether the task pool still has tasks; if so, resume crawling
    todo_task_count = self._collector.get_requests_count()
    if todo_task_count:
        log.info(
            "Found %s pending tasks; not distributing new tasks. Resuming from where the last run stopped"
            % todo_task_count
        )
    else:
        for parser in self._parsers:
            results = parser.start_requests(
                *self._parser_args, **self._parser_kwargs
            )
            # add requests to the request queue; the queue stores them in batches
            if results and not isinstance(results, Iterable):
                raise Exception(
                    "%s.%s must return an iterable" % (parser.name, "start_requests")
                )

            result_type = 1
            for result in results or []:
                if isinstance(result, Request):
                    result.parser_name = result.parser_name or parser.name
                    self._request_buffer.put_request(result)
                    result_type = 1

                elif isinstance(result, Item):
                    self._item_buffer.put_item(result)
                    result_type = 2

                elif callable(result):  # a callable result may be a function that updates the database
                    if result_type == 1:
                        self._request_buffer.put_request(result)
                    else:
                        self._item_buffer.put_item(result)
                else:
                    raise TypeError(
                        "start_requests yielded a result of the wrong type; expected Request, Item or a callback func, but got type: {}".format(
                            type(result)
                        )
                    )

        self._request_buffer.flush()
        self._item_buffer.flush()
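# A standalone sketch of the dispatch rule above: the type of the previous
# result decides where a bare callable goes (request buffer after a Request,
# item buffer after an Item). Buffers and results are simulated with plain
# lists and string stand-ins.
request_buffer, item_buffer = [], []
results = ["request", "item", lambda: "runs after buffered items are stored"]

result_type = 1
for result in results:
    if result == "request":
        request_buffer.append(result)
        result_type = 1
    elif result == "item":
        item_buffer.append(result)
        result_type = 2
    elif callable(result):
        (request_buffer if result_type == 1 else item_buffer).append(result)

print(len(request_buffer), len(item_buffer))  # 1 2 -- the callable followed an item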
def run(self):
    while not self._thread_stop:
        try:
            request = self._memory_db.get()
            if not request:
                if not self.is_show_tip:
                    log.info("parser waiting for tasks ...")
                    self.is_show_tip = True

                time.sleep(1)
                self._wait_task_time += 1
                continue

            self.is_show_tip = False
            self.deal_requests([request])

        except Exception as e:
            log.exception(e)
def __export_to_db(self, tab_item, datas, is_update=False, update_keys=()):
    export_success = False

    # instrumentation & validation
    to_table = tools.get_info(tab_item, ":s_(.*?)_item", fetch_one=True)
    item_name = to_table + "_item"
    self.check_datas(table=to_table, datas=datas)

    if setting.ADD_ITEM_TO_MYSQL:  # items need to be stored to mysql
        if isinstance(setting.ADD_ITEM_TO_MYSQL, (list, tuple)):
            for item in setting.ADD_ITEM_TO_MYSQL:
                if item in item_name:
                    export_success = (
                        self._export_data.export_items(tab_item, datas)
                        if not is_update
                        else self._export_data.update_items(
                            tab_item, datas, update_keys=update_keys
                        )
                    )
                    break
        else:
            export_success = (
                self._export_data.export_items(tab_item, datas)
                if not is_update
                else self._export_data.update_items(
                    tab_item, datas, update_keys=update_keys
                )
            )

    if setting.ADD_ITEM_TO_REDIS:
        if isinstance(setting.ADD_ITEM_TO_REDIS, (list, tuple)):
            for item in setting.ADD_ITEM_TO_REDIS:
                if item in item_name:
                    self._db.sadd(tab_item, datas)
                    export_success = True
                    log.info("Exported %s rows to redis %s" % (len(datas), tab_item))
                    break
        else:
            self._db.sadd(tab_item, datas)
            export_success = True
            log.info("Exported %s rows to redis %s" % (len(datas), tab_item))

    return export_success
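# Routing sketch for the export switchboard above: ADD_ITEM_TO_MYSQL /
# ADD_ITEM_TO_REDIS may be a bool ("export everything") or a list of
# substrings matched against "<table>_item". The names below are illustrative.
def should_export(setting_value, item_name):
    if isinstance(setting_value, (list, tuple)):
        return any(pattern in item_name for pattern in setting_value)
    return bool(setting_value)

print(should_export(True, "book_item"))                 # True
print(should_export(["book", "author"], "book_item"))   # True
print(should_export(["comment"], "book_item"))          # False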
def run(self):
    while not self._thread_stop:
        try:
            requests = self._collector.get_requests(setting.PARSER_TASK_COUNT)
            if not requests:
                if not self.is_show_tip:
                    log.info("parser waiting for tasks ...")
                    self.is_show_tip = True

                # log.info('parser waiting for tasks {}...'.format(tools.format_seconds(self._wait_task_time)))
                time.sleep(1)
                self._wait_task_time += 1
                continue

            self.is_show_tip = False
            self.deal_requests(requests)

        except Exception as e:
            log.exception(e)
def export_items(self, tab_item, items_data):
    """
    @summary:
    ---------
    @param tab_item: name of the items table in redis
    @param items_data: [item.to_dict] data
    ---------
    @result:
    """
    to_table = tools.get_info(tab_item, ":s_(.*?)_item", fetch_one=True)
    sql, datas = tools.make_batch_sql(to_table, items_data)
    add_count = self._to_db.add_batch(sql, datas)
    datas_size = len(datas)
    if add_count is None:
        log.error("Failed to export data to table %s" % (to_table,))
    else:
        log.info(
            "Exported %s rows to %s, %s duplicates"
            % (datas_size, to_table, datas_size - add_count)
        )

    return add_count is not None
def is_reach_next_spider_time(self):
    if not self._batch_interval:
        return True

    last_spider_end_time = self._redisdb.hget(
        self._tab_spider_time, SPIDER_END_TIME_KEY
    )
    if last_spider_end_time:
        last_spider_end_time = int(last_spider_end_time)
        current_timestamp = tools.get_current_timestamp()
        time_interval = current_timestamp - last_spider_end_time

        if time_interval < self._batch_interval * 86400:
            log.info(
                "Last run finished at {}, {} ago, which is less than the configured batch interval {}. Spider will not run; exiting".format(
                    tools.timestamp_to_date(last_spider_end_time),
                    tools.format_seconds(time_interval),
                    tools.format_seconds(self._batch_interval * 86400),
                )
            )
            return False

    return True
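# The batch interval is given in days and compared in seconds, hence the
# 86400 factor above; fractional values yield sub-day intervals. Illustrative
# numbers only:
for days in (7, 1, 1 / 24):
    print("{} day(s) -> {} seconds".format(days, days * 86400))
# 7 day(s)      -> 604800 seconds (weekly)
# 1 day(s)      -> 86400 seconds  (daily)
# ~0.0417 day(s) -> 3600.0 seconds (hourly)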
def export_all(
    self,
    tables,
    auto_update=False,
    batch_count=100,
    every_table_per_export_callback=None,
):
    """
    @summary: export all items
    ---------
    @param tables: e.g. "qidian" exports every items table under qidian;
        the table names in the database must follow the naming convention,
        e.g. qidian:comment:s_qidian_book_comment_dynamic_item is exported
        to qidian_book_comment_dynamic
    @param auto_update: whether to update automatically
    @param batch_count: number of rows exported per batch
    @param every_table_per_export_callback: callback invoked before each export,
        used to tweak to_table, auto_update, batch_count for specific tables, e.g.:
        def every_table_per_export_callback(to_table, auto_update, batch_count):
            if to_table == 'xxx':
                auto_update = True
            return to_table, auto_update, batch_count
    ---------
    @result:
    """
    tables = (
        self._redisdb.getkeys(tables + "*_item")
        if not isinstance(tables, list)
        else tables
    )
    if not tables:
        log.info("No table data")

    for table in tables:
        from_table = table
        to_table = tools.get_info(str(from_table), ":s_(.*?)_item", fetch_one=True)

        if callable(every_table_per_export_callback):
            to_table, auto_update, batch_count = every_table_per_export_callback(
                to_table, auto_update, batch_count
            )

        log.info("Exporting %s -> %s" % (from_table, to_table))
        self.export(from_table, to_table, auto_update, batch_count)
def get_response_from_cached(self, save_cached=True):
    """
    Get the response from the cache
    Note:
        Attributes that will be empty:
            - raw: urllib3.response.HTTPResponse
            - connection: requests.adapters.HTTPAdapter
            - history
        Attributes whose meaning changes:
            - request: changed from requests' request to this framework's Request
    @param save_cached: when there is no cache the page is downloaded directly;
        whether to save the result to the cache afterwards
    @return:
    """
    response_dict = self._cache_db.strget(self._cached_table_folder)
    if not response_dict:
        log.info("No cached response; downloading again")
        response_obj = self.get_response(save_cached=save_cached)
    else:
        response_dict = eval(response_dict)  # the cache stores a dict literal
        response_obj = Response.from_dict(response_dict)
    return response_obj
def export(self, from_table, to_table, auto_update=False, batch_count=100):
    """
    @summary: export data from the redis items table to a relational database
        such as mysql/oracle; from_table and to_table must have identical schemas
    ---------
    @param from_table:
    @param to_table:
    @param auto_update: whether to update automatically when the data already exists; defaults to no
    ---------
    @result:
    """
    total_count = 0
    while True:
        datas = []
        try:
            datas = self._redisdb.sget(from_table, count=batch_count, is_pop=False)
            if not datas:
                log.info(
                    "%s -> %s exported %s rows in total"
                    % (from_table, to_table, total_count)
                )
                break

            json_datas = [eval(data) for data in datas]  # each entry is a dict literal
            sql, json_datas = tools.make_batch_sql(to_table, json_datas, auto_update)
            if self._to_db.add_batch(sql, json_datas):
                total_count += len(json_datas)
                self._redisdb.srem(from_table, datas)

        except Exception as e:
            log.exception(e)
            log.error(datas)
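# Usage sketch for export, following the table naming convention documented in
# export_all above; the exporter class name ExportData is assumed from context.
exporter = ExportData()
exporter.export(
    from_table="qidian:comment:s_qidian_book_comment_dynamic_item",
    to_table="qidian_book_comment_dynamic",
    auto_update=False,
    batch_count=100,
)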
def __init__(
    self,
    table_folder=None,
    parser_count=None,
    begin_callback=None,
    end_callback=None,
    delete_tabs=(),
    process_num=None,
    auto_stop_when_spider_done=None,
    auto_start_requests=None,
    send_run_time=True,
    batch_interval=0,
    *parser_args,
    **parser_kwargs
):
    """
    @summary: scheduler
    ---------
    @param table_folder: redis folder where the spider's requests and items are stored
    @param parser_count: thread count; defaults to the value in the config file
    @param begin_callback: callback invoked when the spider starts
    @param end_callback: callback invoked when the spider ends
    @param delete_tabs: tables deleted on spider startup; tuple/bool/string, regex supported
    @param process_num: process count
    @param auto_stop_when_spider_done: whether to stop automatically after crawling
        finishes or keep waiting for tasks; defaults to stopping automatically
    @param auto_start_requests: whether the spider adds tasks automatically
    @param send_run_time: whether to send the run time
    @param batch_interval: crawl interval in days, defaults to 0; on repeated launches
        the spider only starts when the time since the first crawl finished exceeds this interval
    @param *parser_args: positional args passed to the parsers' start_requests, tuple()
    @param **parser_kwargs: keyword args passed to the parsers' start_requests, dict()
    ---------
    @result:
    """
    super(Scheduler, self).__init__()

    for key, value in self.__class__.__custom_setting__.items():
        setattr(setting, key, value)

    self._table_folder = table_folder or setting.TABLE_FOLDER
    if not self._table_folder:
        raise Exception(
            """
            table_folder is the redis folder for requests and items and must not be empty.
            Configure it in setting, e.g. TABLE_FOLDER = 'test',
            or pass it when instantiating the spider, e.g. TestSpider(table_folder='test')
            """
        )

    self._request_buffer = RequestBuffer(self._table_folder)
    self._item_buffer = ItemBuffer(self._table_folder)
    self._collector = Collector(self._table_folder, process_num)

    self._parsers = []
    self._parser_controls = []
    self._parser_control_obj = PaserControl

    self._parser_args = parser_args
    self._parser_kwargs = parser_kwargs

    self._auto_stop_when_spider_done = (
        auto_stop_when_spider_done
        if auto_stop_when_spider_done is not None
        else setting.AUTO_STOP_WHEN_SPIDER_DONE
    )
    self._auto_start_requests = (
        auto_start_requests
        if auto_start_requests is not None
        else setting.PARSER_AUTO_START_REQUESTS
    )
    self._send_run_time = send_run_time
    self._batch_interval = batch_interval

    self._begin_callback = (
        begin_callback
        if begin_callback
        else lambda: log.info("\n********** spider begin **********")
    )
    self._end_callback = (
        end_callback
        if end_callback
        else lambda: log.info("\n********** spider end **********")
    )

    self._parser_count = setting.PARSER_COUNT if not parser_count else parser_count

    self._spider_name = self._table_folder
    self._project_name = self._table_folder.split(":")[0]

    self._tab_spider_time = setting.TAB_SPIDER_TIME.format(
        table_folder=self._table_folder
    )
    self._tab_spider_status = setting.TAB_SPIDER_STATUS.format(
        table_folder=self._table_folder
    )
    self._tab_requests = setting.TAB_REQUSETS.format(table_folder=self._table_folder)
    self._tab_failed_requests = setting.TAB_FAILED_REQUSETS.format(
        table_folder=self._table_folder
    )

    self._is_notify_end = False  # whether the end notification has been sent
    self._last_task_count = 0  # task count seen on the last check

    self._redisdb = RedisDB()

    self._project_total_state_table = "{}_total_state".format(self._project_name)
    self._is_exist_project_total_state_table = False

    # Request cache settings
    Request.cached_table_folder = self._table_folder
    Request.cached_expire_time = setting.RESPONSE_CACHED_EXPIRE_TIME

    delete_tabs = delete_tabs or setting.DELETE_TABS
    if delete_tabs:
        self.delete_tables(delete_tabs)

    self._last_check_task_status_time = 0
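# A hedged construction sketch for the scheduler above; the table_folder,
# callbacks and option values are hypothetical, and a configured redis
# connection (setting.REDISDB_*) is assumed.
scheduler = Scheduler(
    table_folder="myproject:myspider",  # project name is the part before ":"
    parser_count=4,                     # 4 parser-control threads
    begin_callback=lambda: log.info("spider begin"),
    end_callback=lambda: log.info("spider end"),
    delete_tabs=True,                   # wipe all myproject:myspider* keys on startup
    batch_interval=1,                   # run at most once per day
)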
def _start(self):
    if self._auto_start_requests:
        # lock the task-adding section to prevent multiple processes from adding duplicate tasks
        with RedisLock(
            key=self._spider_name,
            timeout=3600,
            wait_timeout=60,
            redis_uri="redis://:{password}@{host_post}/{db}".format(
                password=setting.REDISDB_USER_PASS,
                host_post=setting.REDISDB_IP_PORTS,
                db=setting.REDISDB_DB,
            ),
        ) as lock:
            if lock.locked:
                # invoke the parsers' start_requests
                self.spider_begin()  # for spiders that do not auto-stop, this runs only once
                self.record_spider_state(
                    spider_type=1,
                    state=0,
                    batch_date=tools.get_current_date(),
                    spider_start_time=tools.get_current_date(),
                    batch_interval=self._batch_interval,
                )

                # check whether the task pool still has tasks; if so, resume crawling
                todo_task_count = self._collector.get_requests_count()
                if todo_task_count:
                    log.info(
                        "Found %s pending tasks; not distributing new tasks. Resuming from where the last run stopped"
                        % todo_task_count
                    )
                else:
                    for parser in self._parsers:
                        results = parser.start_requests(
                            *self._parser_args, **self._parser_kwargs
                        )
                        # add requests to the request queue; the queue stores them in batches
                        if results and not isinstance(results, Iterable):
                            raise Exception(
                                "%s.%s must return an iterable"
                                % (parser.name, "start_requests")
                            )

                        result_type = 1
                        for result in results or []:
                            if isinstance(result, Request):
                                result.parser_name = (
                                    result.parser_name or parser.name
                                )
                                self._request_buffer.put_request(result)
                                result_type = 1

                            elif isinstance(result, Item):
                                self._item_buffer.put_item(result)
                                result_type = 2

                            elif callable(result):  # a callable result may be a function that updates the database
                                if result_type == 1:
                                    self._request_buffer.put_request(result)
                                else:
                                    self._item_buffer.put_item(result)
                            else:
                                raise TypeError(
                                    "start_requests yielded a result of the wrong type; expected Request, Item or a callback func, but got type: {}".format(
                                        type(result)
                                    )
                                )

                    self._request_buffer.flush()
                    self._item_buffer.flush()

    # start the collector
    self._collector.start()

    # start the parser controls
    for i in range(self._parser_count):
        parser_control = self._parser_control_obj(
            self._collector,
            self._table_folder,
            self._request_buffer,
            self._item_buffer,
        )

        for parser in self._parsers:
            parser_control.add_parser(parser)

        parser_control.start()
        self._parser_controls.append(parser_control)

    # start the request_buffer
    self._request_buffer.start()

    # start the item_buffer
    self._item_buffer.start()
def deal_requests(self, requests):
    for request in requests:
        response = None

        for parser in self._parsers:
            if parser.name == request.parser_name:
                try:
                    # count the documents to download
                    self.record_download_status(
                        PaserControl.DOWNLOAD_TOTAL, parser.name
                    )

                    # handle the request
                    if request.auto_request:
                        request_temp = None
                        if request.download_midware:
                            download_midware = (
                                request.download_midware
                                if callable(request.download_midware)
                                else tools.get_method(
                                    parser, request.download_midware
                                )
                            )
                            request_temp = download_midware(request)
                        elif request.download_midware is not False:
                            request_temp = parser.download_midware(request)

                        if request_temp:
                            if not isinstance(request_temp, Request):
                                raise Exception(
                                    "download_midware needs to return a request, but received type: {}".format(
                                        type(request_temp)
                                    )
                                )
                            request = request_temp

                        response = (
                            request.get_response()
                            if not setting.RESPONSE_CACHED_USED
                            else request.get_response_from_cached(save_cached=False)
                        )
                    else:
                        response = None

                    if request.callback:  # use the parser's callback if one is given
                        callback_parser = (
                            request.callback
                            if callable(request.callback)
                            else tools.get_method(parser, request.callback)
                        )
                        results = callback_parser(request, response)
                    else:  # otherwise fall back to the default parser
                        results = parser.parser(request, response)

                    if results and not isinstance(results, Iterable):
                        raise Exception(
                            "%s.%s must return an iterable"
                            % (parser.name, request.callback or "parser")
                        )

                    # decide whether each result is a request or an item
                    for result in results or []:
                        if isinstance(result, Request):
                            # fill in the request's parser_name
                            result.parser_name = result.parser_name or parser.name

                            # synchronous or asynchronous callback
                            if result.request_sync:  # synchronous
                                requests.append(result)
                            else:  # asynchronous
                                # store the next_request
                                self._memory_db.add(result)

                except Exception as e:
                    exception_type = (
                        str(type(e)).replace("<class '", "").replace("'>", "")
                    )
                    if exception_type.startswith("requests"):
                        # count the documents that failed to download
                        self.record_download_status(
                            PaserControl.DOWNLOAD_EXCEPTION, parser.name
                        )
                    else:
                        # count parser exceptions
                        self.record_download_status(
                            PaserControl.PAESERS_EXCEPTION, parser.name
                        )

                    if setting.LOG_LEVEL == "DEBUG":  # only log in debug mode; timeout tracebacks are too verbose
                        log.exception(e)

                    log.error(
                        """
                        -------------- %s.%s error -------------
                        error          %s
                        response       %s
                        deal request   %s
                        """
                        % (
                            parser.name,
                            (
                                request.callback
                                and callable(request.callback)
                                and getattr(request.callback, "__name__")
                                or request.callback
                            )
                            or "parser",
                            str(e),
                            response,
                            tools.dumps_json(request.to_dict, indent=28)
                            if setting.LOG_LEVEL == "DEBUG"
                            else request,
                        )
                    )

                    request.error_msg = "%s: %s" % (exception_type, e)
                    request.response = str(response)

                    if "Invalid URL" in str(e):
                        request.is_abandoned = True

                    requests = parser.exception_request(request, response) or [
                        request
                    ]
                    if not isinstance(requests, Iterable):
                        raise Exception(
                            "%s.%s must return an iterable"
                            % (parser.name, "exception_request")
                        )
                    for request in requests:
                        if not isinstance(request, Request):
                            raise Exception("exception_request needs to return a request")

                        if (
                            request.retry_times + 1 > setting.PARSER_MAX_RETRY_TIMES
                            or request.is_abandoned
                        ):
                            self.__class__._failed_task_count += 1  # count failed tasks

                            # failed_request may return a request or func
                            results = parser.failed_request(request, response) or [
                                request
                            ]
                            if not isinstance(results, Iterable):
                                raise Exception(
                                    "%s.%s must return an iterable"
                                    % (parser.name, "failed_request")
                                )

                            log.info(
                                """
                                Task exceeded the maximum retry count; dropped
                                url     %s
                                retries %s
                                maximum allowed retries %s
                                """
                                % (
                                    request.url,
                                    request.retry_times,
                                    setting.PARSER_MAX_RETRY_TIMES,
                                )
                            )

                        else:
                            # re-queue the request for another attempt
                            request.retry_times += 1
                            request.filter_repeat = False
                            log.info(
                                """
                                Re-queued, waiting to retry
                                url     %s
                                retries %s
                                maximum allowed retries %s
                                """
                                % (
                                    request.url,
                                    request.retry_times,
                                    setting.PARSER_MAX_RETRY_TIMES,
                                )
                            )
                            self._memory_db.add(request)

                else:
                    # count the documents downloaded successfully
                    self.record_download_status(
                        PaserControl.DOWNLOAD_SUCCESS, parser.name
                    )
                    # count successful tasks
                    self.__class__._success_task_count += 1

                    # cache the successfully downloaded document
                    if setting.RESPONSE_CACHED_ENABLE:
                        request.save_cached(
                            response=response,
                            expire_time=setting.RESPONSE_CACHED_EXPIRE_TIME,
                        )

                break

        if setting.PARSER_SLEEP_TIME:
            time.sleep(setting.PARSER_SLEEP_TIME)
def task_is_done(self):
    """
    @summary: check whether all tasks are done, refreshing the batch date at the
        same time (this must not crash; if it does, the batch date stops being refreshed)
    ---------
    ---------
    @result: True / False (done / not done)
    """
    is_done = False

    # read the task state from the batch record table
    sql = 'select date_format(batch_date, "{date_format}"), total_count, done_count, is_done from {batch_record_table} order by id desc limit 1'.format(
        date_format=self._date_format.replace(":%M", ":%i"),
        batch_record_table=self._batch_record_table,
    )
    batch_info = self._mysqldb.find(sql)
    if batch_info is None:
        raise Exception("Failed to query batch info")

    if batch_info:
        # refresh self._batch_date_cache in case a new batch has already started
        # while self._batch_date_cache still holds the old batch date
        self._batch_date_cache, total_count, done_count, is_done = batch_info[0]

        log.info(
            "《%s》 batch date %s, batch progress %s/%s, done flag %d"
            % (
                self._batch_name,
                self._batch_date_cache,
                done_count,
                total_count,
                is_done,
            )
        )
        os.environ["batch_date"] = self._batch_date_cache  # refresh the batch date used by BatchParser

        if is_done:  # re-check the task table for unfinished tasks; if any exist, is_done becomes False
            # this query is slow, so lock it to keep multiple processes from running it at once
            with RedisLock(
                key=self._spider_name,
                timeout=3600,
                wait_timeout=0,
                redis_uri="redis://:{password}@{host_post}/{db}".format(
                    password=setting.REDISDB_USER_PASS,
                    host_post=setting.REDISDB_IP_PORTS,
                    db=setting.REDISDB_DB,
                ),
            ) as lock:
                if lock.locked:
                    log.info("Batch record is marked done; checking the task table for unfinished tasks")
                    sql = "select 1 from %s where (%s = 0 or %s=2)%s limit 1" % (
                        self._task_table,
                        self._task_state,
                        self._task_state,
                        self._task_condition_prefix_and,
                    )
                    tasks = self._mysqldb.find(sql)  # [(1,)] / []
                    if tasks:
                        log.info("Found unfinished tasks in the task table; waiting for them to be distributed")
                        is_done = False

                        # update the batch_record table's is_done flag to reduce task-table queries
                        sql = 'update {batch_record_table} set is_done = 0 where batch_date = "{batch_date}"'.format(
                            batch_record_table=self._batch_record_table,
                            batch_date=self._batch_date_cache,
                        )
                        self._mysqldb.update(sql)
                    else:
                        log.info("All tasks in the task table are done; spider finished")
                else:
                    log.info(
                        "Batch record is marked done; another spider process is checking the task table, "
                        "so this process skips the check and keeps waiting"
                    )
                    is_done = False

    return is_done
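# Why the ":%M" -> ":%i" replacement above: Python's strftime uses %M for
# minutes, while MySQL's date_format() uses %i (%M is the month name in
# MySQL). A quick illustration with a hypothetical date format:
python_fmt = "%Y-%m-%d %H:%M"
mysql_fmt = python_fmt.replace(":%M", ":%i")
print(mysql_fmt)  # %Y-%m-%d %H:%i -- safe to embed in MySQL's date_format()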
def check_batch(self, is_first_check=False):
    """
    @summary: check whether the batch is done
    ---------
    @param is_first_check: whether this is the first check; if so and the batch
        turns out to be done, the batch-done message is not sent, because it
        was already sent before
    ---------
    @result: True if done, otherwise False
    """
    sql = 'select date_format(batch_date, "{date_format}"), total_count, done_count from {batch_record_table} order by id desc limit 1'.format(
        date_format=self._date_format.replace(":%M", ":%i"),
        batch_record_table=self._batch_record_table,
    )
    batch_info = self._mysqldb.find(sql)  # (('2018-08-19', 49686, 0),)

    if batch_info:
        batch_date, total_count, done_count = batch_info[0]

        now_date = datetime.datetime.now()
        last_batch_date = datetime.datetime.strptime(batch_date, self._date_format)
        time_difference = now_date - last_batch_date

        if total_count == done_count and time_difference < datetime.timedelta(
            days=self._batch_interval
        ):  # still within this batch; re-check the task table for newly added tasks
            # query the task table to see whether there really are no tasks left,
            # because the counts in batch_record may not have been refreshed yet
            task_count = self.__get_task_state_count()

            total_count = task_count.get("total_count")
            done_count = task_count.get("done_count")

        if total_count == done_count:
            # check whether the related spiders are done
            related_spider_is_done = self.related_spider_is_done()
            if related_spider_is_done is False:
                msg = "《{}》batch not done; waiting for the dependent spider {} to finish. Batch date {}, batch progress {}/{}".format(
                    self._batch_name,
                    self._related_batch_record or self._related_task_tables,
                    batch_date,
                    done_count,
                    total_count,
                )
                log.info(msg)
                # alert if the batch has already timed out
                if time_difference >= datetime.timedelta(days=self._batch_interval):
                    if (
                        not self._last_send_msg_time
                        or now_date - self._last_send_msg_time >= self._send_msg_interval
                    ):
                        self._last_send_msg_time = now_date
                        self.send_msg(msg, level="error")

                return False

            elif related_spider_is_done is True:
                # update the is_done flag
                self.update_is_done()

            else:
                self.update_is_done()

            msg = "《{}》batch done. Batch date {}, {} tasks processed".format(
                self._batch_name, batch_date, done_count
            )
            log.info(msg)
            if not is_first_check:
                self.send_msg(msg)

            # check whether the next batch is due
            if time_difference >= datetime.timedelta(days=self._batch_interval):
                msg = "《{}》next batch starts".format(self._batch_name)
                log.info(msg)
                self.send_msg(msg)

                # reset the task table state
                if self.init_task() is not False:  # returns False on update failure, True/None otherwise
                    # reset the properties
                    self.init_property()

                    # the insert may fail even though the task table was already reset;
                    # since the current time now belongs to the next batch, the batch-done
                    # check skips the task table, so the reset simply happens again next run
                    is_success = self.record_batch()

                    if is_success:
                        log.info(
                            "New batch record inserted; task distribution starts in 1 minute"
                        )  # give workers time to pick up the new batch date
                        tools.delay_time(60)

                    return False  # the next batch starts

                else:
                    # the next batch does not start; tasks are not distributed until
                    # the new batch record is inserted successfully
                    return True

            else:
                log.info("《{}》next batch is not due yet".format(self._batch_name))
                if not is_first_check:
                    self.send_msg("《{}》next batch is not due yet".format(self._batch_name))
                return True

        else:
            if time_difference >= datetime.timedelta(days=self._batch_interval):  # already timed out
                time_out = time_difference - datetime.timedelta(
                    days=self._batch_interval
                )
                time_out_pretty = tools.format_seconds(time_out.total_seconds())

                msg = "《{}》batch has timed out by {}. Batch date {}, batch progress {}/{}".format(
                    self._batch_name,
                    time_out_pretty,
                    batch_date,
                    done_count,
                    total_count,
                )
                if self._batch_interval >= 1:
                    msg += ", expected duration {} day(s)".format(self._batch_interval)
                else:
                    msg += ", expected duration {} hour(s)".format(
                        self._batch_interval * 24
                    )

                result = self.get_deal_speed(
                    total_count=total_count,
                    done_count=done_count,
                    last_batch_date=last_batch_date,
                )
                if result:
                    deal_speed, need_time, overflow_time, calculate_speed_time = result
                    msg += ", processing speed measured over {}: about {} tasks/hour, an estimated {} still needed".format(
                        calculate_speed_time,
                        deal_speed,
                        tools.format_seconds(need_time),
                    )

                    if overflow_time > 0:
                        msg += ", the batch is expected to overrun by {} in total; please handle it promptly".format(
                            tools.format_seconds(overflow_time)
                        )

                log.info(msg)

                if (
                    not self._last_send_msg_time
                    or now_date - self._last_send_msg_time >= self._send_msg_interval
                ):
                    self._last_send_msg_time = now_date
                    self.send_msg(msg, level="error")

            else:  # not timed out yet
                remaining_time = (
                    datetime.timedelta(days=self._batch_interval) - time_difference
                )
                remaining_time_pretty = tools.format_seconds(
                    remaining_time.total_seconds()
                )

                if self._batch_interval >= 1:
                    msg = "《{}》batch in progress. Batch date {}, batch progress {}/{}, expected duration {} day(s), {} remaining".format(
                        self._batch_name,
                        batch_date,
                        done_count,
                        total_count,
                        self._batch_interval,
                        remaining_time_pretty,
                    )
                else:
                    msg = "《{}》batch in progress. Batch date {}, batch progress {}/{}, expected duration {} hour(s), {} remaining".format(
                        self._batch_name,
                        batch_date,
                        done_count,
                        total_count,
                        self._batch_interval * 24,
                        remaining_time_pretty,
                    )

                result = self.get_deal_speed(
                    total_count=total_count,
                    done_count=done_count,
                    last_batch_date=last_batch_date,
                )
                if result:
                    deal_speed, need_time, overflow_time, calculate_speed_time = result
                    msg += ", processing speed measured over {}: about {} tasks/hour, an estimated {} still needed".format(
                        calculate_speed_time,
                        deal_speed,
                        tools.format_seconds(need_time),
                    )

                    if overflow_time > 0:
                        msg += ", the batch may overrun by {}; please handle it promptly".format(
                            tools.format_seconds(overflow_time)
                        )
                        # send the alert
                        if (
                            not self._last_send_msg_time
                            or now_date - self._last_send_msg_time
                            >= self._send_msg_interval
                        ):
                            self._last_send_msg_time = now_date
                            self.send_msg(msg, level="error")
                    elif overflow_time < 0:
                        msg += ", the batch is expected to finish {} early".format(
                            tools.format_seconds(-overflow_time)
                        )

                log.info(msg)

    else:
        # insert the batch_date
        self.record_batch()

        # reset the task table state; this may run task-generating code
        self.init_task()

    return False
def start_monitor_task(self):
    """
    @summary: monitor the task state
    ---------
    ---------
    @result:
    """
    if not self._parsers:  # not in multi-template mode; register self as the template
        self._is_more_parsers = False
        self._parsers.append(self)
    elif len(self._parsers) <= 1:
        self._is_more_parsers = False

    self.create_batch_record_table()

    # add tasks
    for parser in self._parsers:
        parser.add_task()

    is_first_check = True
    while True:
        try:
            if self.check_batch(is_first_check):  # this batch is done
                if not self._auto_stop_when_spider_done:
                    is_first_check = True
                    log.info("All spider tasks are done; not stopping automatically, waiting for new tasks...")
                    time.sleep(self._check_task_interval)
                    continue
                else:
                    break

            is_first_check = False

            # check whether redis still has tasks; fetch from mysql when below _min_task_count
            tab_requests = setting.TAB_REQUSETS.format(
                table_folder=self._table_folder
            )
            todo_task_count = self._redisdb.zget_count(tab_requests)

            tasks = []
            if todo_task_count < self._min_task_count:  # fetch tasks from mysql
                # refresh the task counts in the batch table
                self.update_task_done_count()

                log.info(
                    "Only %s tasks left in redis; fetching more from mysql"
                    % todo_task_count
                )
                tasks = self.get_todo_task_from_mysql()
                if not tasks:  # state-0 tasks are all done; check whether state-2 tasks were lost
                    if todo_task_count == 0:
                        # no pending tasks in redis, so state-2 rows in mysql are
                        # lost tasks and must be redone
                        lose_task_count = self.get_lose_task_count()
                        if not lose_task_count:
                            time.sleep(self._check_task_interval)
                            continue
                        elif lose_task_count > self._task_limit * 5:
                            # too many lost tasks; reset them all at once, since waiting
                            # for redis to drain before fetching each lost batch would be too slow
                            log.info(
                                "Resetting {} lost tasks to pending".format(lose_task_count)
                            )
                            # reset in-progress tasks to pending
                            if self.reset_lose_task_from_mysql():
                                log.info("Lost tasks reset successfully")
                            else:
                                log.info("Failed to reset lost tasks")
                            continue
                        else:  # few lost tasks; fetch them directly
                            log.info(
                                "Fetching lost tasks: {} in total, taking {}".format(
                                    lose_task_count,
                                    self._task_limit
                                    if self._task_limit <= lose_task_count
                                    else lose_task_count,
                                )
                            )
                            tasks = self.get_doing_task_from_mysql()
                else:
                    log.info("Fetched %s pending tasks from mysql" % len(tasks))
            else:
                log.info(
                    "redis still has %s backlogged tasks; not distributing new tasks for now"
                    % todo_task_count
                )

            if not tasks:
                if todo_task_count >= self._min_task_count:
                    # log.info('tasks in progress; %s left in redis' % todo_task_count)
                    pass
                else:
                    log.info(
                        "No pending tasks in mysql; %s tasks left in redis"
                        % todo_task_count
                    )
            else:
                # make start requests
                self.distribute_task(tasks)
                log.info("Tasks added to redis successfully")

        except Exception as e:
            log.exception(e)

        time.sleep(self._check_task_interval)
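# Task-state convention inferred from the checks above (the exact values live
# elsewhere in the framework; treat this legend as an assumption):
#   0 -> pending (not yet distributed)
#   2 -> in progress; with an empty redis queue these rows are "lost" tasks
# A tiny standalone sketch of the lost-task decision above:
def classify_lost_tasks(lose_task_count, task_limit):
    if not lose_task_count:
        return "wait"
    if lose_task_count > task_limit * 5:
        return "reset all to pending"
    return "fetch directly"

print(classify_lost_tasks(0, 1000))      # wait
print(classify_lost_tasks(10000, 1000))  # reset all to pending
print(classify_lost_tasks(200, 1000))    # fetch directly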
def deal_requests(self, requests):
    for request in requests:
        response = None

        request_redis = request["request_redis"]
        request = request["request_obj"]

        del_request_redis_after_item_to_db = False
        del_request_redis_after_request_to_db = False

        for parser in self._parsers:
            if parser.name == request.parser_name:
                used_download_midware_enable = False
                try:
                    # count the documents to download
                    self.record_download_status(
                        PaserControl.DOWNLOAD_TOTAL, parser.name
                    )

                    # handle the request
                    if request.auto_request:
                        request_temp = None
                        if request.download_midware:
                            download_midware = (
                                request.download_midware
                                if callable(request.download_midware)
                                else tools.get_method(
                                    parser, request.download_midware
                                )
                            )
                            request_temp = download_midware(request)
                        elif request.download_midware is not False:
                            request_temp = parser.download_midware(request)

                        if request_temp:
                            if not isinstance(request_temp, Request):
                                raise Exception(
                                    "download_midware needs to return a request, but received type: {}".format(
                                        type(request_temp)
                                    )
                                )
                            used_download_midware_enable = True
                            response = (
                                request_temp.get_response()
                                if not setting.RESPONSE_CACHED_USED
                                else request_temp.get_response_from_cached(
                                    save_cached=False
                                )
                            )
                        else:
                            response = (
                                request.get_response()
                                if not setting.RESPONSE_CACHED_USED
                                else request.get_response_from_cached(
                                    save_cached=False
                                )
                            )

                        if response is None:
                            raise Exception(
                                "Connection timed out, url: %s"
                                % (request.url or request_temp.url)
                            )

                    else:
                        response = None

                    if request.callback:  # use the parser's callback if one is given
                        callback_parser = (
                            request.callback
                            if callable(request.callback)
                            else tools.get_method(parser, request.callback)
                        )
                        results = callback_parser(request, response)
                    else:  # otherwise fall back to the default parser
                        results = parser.parser(request, response)

                    if results and not isinstance(results, Iterable):
                        raise Exception(
                            "%s.%s must return an iterable"
                            % (parser.name, request.callback or "parser")
                        )

                    # type of the previous result: 0/1/2 (initial / request / item)
                    result_type = 0

                    # decide whether each result is a request or an item
                    for result in results or []:
                        if isinstance(result, Request):
                            result_type = 1

                            # fill in the request's parser_name
                            result.parser_name = result.parser_name or parser.name

                            # synchronous or asynchronous callback
                            if result.request_sync:  # synchronous
                                request_dict = {
                                    "request_obj": result,
                                    "request_redis": None,
                                }
                                requests.append(request_dict)
                            else:  # asynchronous
                                # store the next_request
                                self._request_buffer.put_request(result)
                                del_request_redis_after_request_to_db = True

                        elif isinstance(result, Item):
                            result_type = 2

                            # store the item
                            self._item_buffer.put_item(result)

                            # the in-progress request must be deleted afterwards
                            del_request_redis_after_item_to_db = True

                        elif callable(result):  # result is a no-arg callable
                            if result_type == 2:
                                # item callback: run after all buffered items are stored
                                self._item_buffer.put_item(result)
                                del_request_redis_after_item_to_db = True
                            else:  # result_type == 1
                                # request callback: run after all buffered requests are
                                # stored; some parsers may yield a callback directly
                                self._request_buffer.put_request(result)
                                del_request_redis_after_request_to_db = True

                        # else:
                        #     raise TypeError('Expected Request, Item or a callback func, but got type: {}'.format(type(result)))

                except Exception as e:
                    exception_type = (
                        str(type(e)).replace("<class '", "").replace("'>", "")
                    )
                    if exception_type.startswith("requests"):
                        # count the documents that failed to download
                        self.record_download_status(
                            PaserControl.DOWNLOAD_EXCEPTION, parser.name
                        )
                    else:
                        # count parser exceptions
                        self.record_download_status(
                            PaserControl.PAESERS_EXCEPTION, parser.name
                        )

                    if setting.LOG_LEVEL == "DEBUG":  # only log in debug mode; timeout tracebacks are too verbose
                        log.exception(e)

                    log.error(
                        """
                        -------------- %s.%s error -------------
                        error          %s
                        response       %s
                        deal request   %s
                        """
                        % (
                            parser.name,
                            (
                                request.callback
                                and callable(request.callback)
                                and getattr(request.callback, "__name__")
                                or request.callback
                            )
                            or "parser",
                            str(e),
                            response,
                            tools.dumps_json(request.to_dict, indent=28)
                            if setting.LOG_LEVEL == "DEBUG"
                            else request,
                        )
                    )

                    request.error_msg = "%s: %s" % (exception_type, e)
                    request.response = str(response)

                    if "Invalid URL" in str(e):
                        request.is_abandoned = True

                    requests = parser.exception_request(request, response) or [
                        request
                    ]
                    if not isinstance(requests, Iterable):
                        raise Exception(
                            "%s.%s must return an iterable"
                            % (parser.name, "exception_request")
                        )
                    for request in requests:
                        if callable(request):
                            self._request_buffer.put_request(request)
                            continue

                        if not isinstance(request, Request):
                            raise Exception("exception_request needs to return a request")

                        if (
                            request.retry_times + 1 > setting.PARSER_MAX_RETRY_TIMES
                            or request.is_abandoned
                        ):
                            self.__class__._failed_task_count += 1  # count failed tasks

                            # failed_request may return a request or func
                            results = parser.failed_request(request, response) or [
                                request
                            ]
                            if not isinstance(results, Iterable):
                                raise Exception(
                                    "%s.%s must return an iterable"
                                    % (parser.name, "failed_request")
                                )

                            for result in results:
                                if isinstance(result, Request):
                                    if setting.SAVE_FAILED_REQUEST:
                                        if used_download_midware_enable:
                                            # strip the attributes added by download_midware
                                            original_request = (
                                                Request.from_dict(eval(request_redis))
                                                if request_redis
                                                else result
                                            )
                                            original_request.error_msg = (
                                                request.error_msg
                                            )
                                            original_request.response = (
                                                request.response
                                            )

                                            self._request_buffer.put_failed_request(
                                                original_request
                                            )
                                        else:
                                            self._request_buffer.put_failed_request(
                                                result
                                            )

                                elif callable(result):
                                    self._request_buffer.put_request(result)

                                elif isinstance(result, Item):
                                    self._item_buffer.put_item(result)

                            del_request_redis_after_request_to_db = True

                        else:
                            # re-queue the request for another attempt
                            request.retry_times += 1
                            request.filter_repeat = False
                            log.info(
                                """
                                Re-queued, waiting to retry
                                url     %s
                                retries %s
                                maximum allowed retries %s
                                """
                                % (
                                    request.url,
                                    request.retry_times,
                                    setting.PARSER_MAX_RETRY_TIMES,
                                )
                            )

                            if used_download_midware_enable:
                                # strip the attributes added by download_midware; use the original request
                                original_request = (
                                    Request.from_dict(eval(request_redis))
                                    if request_redis
                                    else request
                                )
                                if hasattr(request, "error_msg"):
                                    original_request.error_msg = request.error_msg
                                if hasattr(request, "response"):
                                    original_request.response = request.response
                                original_request.retry_times = request.retry_times
                                original_request.filter_repeat = (
                                    request.filter_repeat
                                )

                                self._request_buffer.put_request(original_request)
                            else:
                                self._request_buffer.put_request(request)

                            del_request_redis_after_request_to_db = True

                else:
                    # count the documents downloaded successfully
                    self.record_download_status(
                        PaserControl.DOWNLOAD_SUCCESS, parser.name
                    )
                    # count successful tasks
                    self.__class__._success_task_count += 1

                    # cache the successfully downloaded document
                    if setting.RESPONSE_CACHED_ENABLE:
                        request.save_cached(
                            response=response,
                            expire_time=setting.RESPONSE_CACHED_EXPIRE_TIME,
                        )

                break

        # delete the in-progress request; item storage takes precedence
        if request_redis:
            if del_request_redis_after_item_to_db:
                self._item_buffer.put_item(request_redis)

            elif del_request_redis_after_request_to_db:
                self._request_buffer.put_del_request(request_redis)

            else:
                self._request_buffer.put_del_request(request_redis)

        if setting.PARSER_SLEEP_TIME:
            time.sleep(setting.PARSER_SLEEP_TIME)
def __init__(
    self,
    ip_ports=None,
    db=None,
    user_pass=None,
    url=None,
    decode_responses=True,
    service_name=None,
    **kwargs
):
    """
    A wrapper around redis
    Args:
        ip_ports: ip:port; multiple nodes may be given as a list or comma-separated,
            e.g. ip1:port1,ip2:port2 or ["ip1:port1", "ip2:port2"]
        db:
        user_pass:
        url:
        decode_responses:
        service_name: used with redis sentinel mode
    """
    # these may be changed in setting at runtime, so the defaults must be
    # loaded here instead of being bound as default argument values
    if ip_ports is None:
        ip_ports = setting.REDISDB_IP_PORTS
    if db is None:
        db = setting.REDISDB_DB
    if user_pass is None:
        user_pass = setting.REDISDB_USER_PASS
    if service_name is None:
        service_name = setting.REDISDB_SERVICE_NAME

    self._is_redis_cluster = False

    try:
        if not url:
            ip_ports = (
                ip_ports if isinstance(ip_ports, list) else ip_ports.split(",")
            )

            if len(ip_ports) > 1:
                startup_nodes = []
                for ip_port in ip_ports:
                    ip, port = ip_port.split(":")
                    startup_nodes.append({"host": ip, "port": port})

                if service_name:
                    log.info("Using redis sentinel mode")
                    hosts = [(node["host"], node["port"]) for node in startup_nodes]
                    sentinel = Sentinel(hosts, socket_timeout=0.5, **kwargs)
                    self._redis = sentinel.master_for(
                        service_name,
                        password=user_pass,
                        db=db,
                        redis_class=redis.Redis,
                        **kwargs
                    )
                else:
                    log.info("Using redis cluster mode")
                    self._redis = StrictRedisCluster(
                        startup_nodes=startup_nodes,
                        decode_responses=decode_responses,
                        password=user_pass,
                        **kwargs
                    )

                self._is_redis_cluster = True
            else:
                ip, port = ip_ports[0].split(":")
                self._redis = redis.Redis(
                    host=ip,
                    port=port,
                    db=db,
                    password=user_pass,
                    decode_responses=decode_responses,
                    **kwargs
                )
        else:
            self._redis = redis.from_url(url, decode_responses=decode_responses)
    except Exception:
        raise
    else:
        if not url:
            log.info("Connected to redis %s db%s" % (ip_ports, db))
        else:
            log.info("Connected to redis %s" % (url,))

    self._ip_ports = ip_ports
    self._db = db
    self._user_pass = user_pass
    self._url = url
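# Connection-mode selection sketch for the constructor above; the addresses
# and the sentinel service name are hypothetical, and reachable servers are
# assumed.
db = RedisDB(ip_ports="localhost:6379")  # single node
db = RedisDB(ip_ports="10.0.0.1:7000,10.0.0.2:7000")  # several nodes, no service_name -> cluster
db = RedisDB(ip_ports="10.0.0.1:26379,10.0.0.2:26379", service_name="mymaster")  # sentinel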