def start_monitor_task(self, *args, **kws):
    if not self.is_reach_next_spider_time():
        return

    self._auto_start_requests = False
    redisdb = RedisDB()

    if not self._parsers:  # not in add_parser mode
        self._parsers.append(self)

    while True:
        try:
            # check whether redis still has pending tasks
            tab_requests = setting.TAB_REQUSETS.format(
                table_folder=self._table_folder)
            todo_task_count = redisdb.zget_count(tab_requests)

            if todo_task_count < self._min_task_count:  # add tasks
                # make start requests
                self.distribute_task(*args, **kws)
            else:
                log.info("redis still has %s backlogged tasks; not distributing new tasks for now" % todo_task_count)

        except Exception as e:
            log.exception(e)

        if self._auto_stop_when_spider_done:
            break

        time.sleep(self._check_task_interval)
def __init__(self, table_folder, process_num=None):
    """
    @summary:
    ---------
    @param table_folder:
    @param process_num: process number
    ---------
    @result:
    """
    super(Collector, self).__init__()

    self._db = RedisDB()
    self._thread_stop = False
    self._todo_requests = collections.deque()

    self._tab_requests = setting.TAB_REQUSETS.format(
        table_folder=table_folder)
    self._tab_spider_status = setting.TAB_SPIDER_STATUS.format(
        table_folder=table_folder)

    self._spider_mark = LOCAL_HOST_IP + ("_%s" % process_num if process_num else "_0")

    self._interval = setting.COLLECTOR_SLEEP_TIME
    self._request_count = setting.COLLECTOR_TASK_COUNT
    self._is_collector_task = False

    self._db.clear(self._tab_spider_status)
def __init__(self, table_folder): if not hasattr(self, "_table_item"): super(ItemBuffer, self).__init__() self._thread_stop = False self._is_adding_to_db = False self._table_folder = table_folder self._items_queue = Queue(maxsize=MAX_ITEM_COUNT) self._db = RedisDB() self._table_item = setting.TAB_ITEM self._table_request = setting.TAB_REQUSETS.format( table_folder=table_folder) self._item_tables = { # 'xxx_item': {'tab_item': 'xxx:xxx_item'} # 记录item名与redis中item名对应关系 } self._item_update_keys = { # 'xxx:xxx_item': ['id', 'name'...] # 记录redis中item名与需要更新的key对应关系 } self._export_data = ExportData( ) if setting.ADD_ITEM_TO_MYSQL else None self.db_tip()
def __init__(self, table_folder):
    super(HandleFailedRequests, self).__init__()
    self._table_folder = table_folder

    self._redisdb = RedisDB()
    self._request_buffer = RequestBuffer(self._table_folder)

    self._table_failed_request = setting.TAB_FAILED_REQUSETS.format(
        table_folder=table_folder)
def delete_tables(self, delete_tables_list):
    if isinstance(delete_tables_list, bool):
        delete_tables_list = [self._table_folder + "*"]
    elif not isinstance(delete_tables_list, (list, tuple)):
        delete_tables_list = [delete_tables_list]

    redis = RedisDB()
    for delete_tab in delete_tables_list:
        if delete_tab == "*":
            delete_tab = self._table_folder + "*"

        tables = redis.getkeys(delete_tab)
        for table in tables:
            log.info("deleting table %s" % table)
            redis.clear(table)
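# Illustrative calls for delete_tables above (the key patterns are hypothetical;
# the method itself accepts a bool, a single pattern string, or a list/tuple):
#
#   self.delete_tables(True)                                 # drop every "<table_folder>*" key
#   self.delete_tables("*")                                  # same as above
#   self.delete_tables("news:*_requests")                    # drop keys matching one pattern
#   self.delete_tables(["news:z_requests", "news:h_items"])  # drop an explicit list of keys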
def __init__(self, table_folder): if not hasattr(self, "_requests_deque"): super(RequestBuffer, self).__init__() self._thread_stop = False self._is_adding_to_db = False self._requests_deque = collections.deque() self._del_requests_deque = collections.deque() self._db = RedisDB() self._table_request = setting.TAB_REQUSETS.format( table_folder=table_folder) self._table_failed_request = setting.TAB_FAILED_REQUSETS.format( table_folder=table_folder)
def _start(self):
    # start the request_buffer
    self._request_buffer.start()
    # start the item_buffer
    self._item_buffer.start()
    # start the collector
    self._collector.start()
    # start the parser controls
    for i in range(self._parser_count):
        parser_control = self._parser_control_obj(
            self._collector,
            self._table_folder,
            self._request_buffer,
            self._item_buffer,
        )

        for parser in self._parsers:
            parser_control.add_parser(parser)

        parser_control.start()
        self._parser_controls.append(parser_control)

    # distribute tasks last, since it can take a long time
    if setting.RETRY_FAILED_REQUESTS:
        # re-put failed tasks; this is atomic, so no lock is needed
        handle_failed_requests = HandleFailedRequests(self._table_folder)
        handle_failed_requests.reput_failed_requests_to_requests()

    # distribute new tasks
    if self._auto_start_requests:  # auto distribute
        if self.wait_lock:
            # lock the task-adding section to keep multiple processes from adding duplicate tasks
            with RedisLock(
                key=self._spider_name,
                timeout=3600,
                wait_timeout=60,
                redis_cli=RedisDB().get_redis_obj(),
            ) as lock:
                if lock.locked:
                    self.__add_task()
        else:
            self.__add_task()
def __init__(self, name: str, expire_time: int, expire_time_record_key=None):
    if not name:
        raise ValueError("name can't be None")
    if not expire_time:
        raise ValueError("please set expire_time, unit is seconds")

    if not self.__class__.redis_db:
        self.__class__.redis_db = RedisDB()

    self.name = name
    self.expire_time = expire_time
    self.expire_time_record_key = expire_time_record_key

    self.record_expire_time()

    self.del_expire_key()
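# Construction sketch for the expiring filter above. The filter's own class name
# is not visible in this snippet; RequestBuffer.__init__ further down shows the
# same parameters being routed through Dedup. The name and expire_time values
# here are hypothetical:
#
#   dedup = Dedup(
#       filter_type=Dedup.ExpireFilter,
#       name="news",            # redis key namespace for the dedup records
#       expire_time=86400,      # dedup records expire after one day
#       to_md5=False,
#   )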
def check_filter_capacity(self):
    """
    Check the filter state; if the current filter is at capacity, load a new one.
    @return:
    """
    if (not self._check_capacity_time
            or time.time() - self._check_capacity_time > 1800):
        # with self._thread_lock:
        with RedisLock(
            key="ScalableBloomFilter",
            timeout=300,
            wait_timeout=300,
            redis_cli=RedisDB().get_redis_obj(),
        ) as lock:
            # global lock: only one process actually creates the new filter at a time;
            # once it is done, the other processes simply append the freshly created filter
            if lock.locked:
                while True:
                    if self.filters[-1].is_at_capacity:
                        self.filters.append(self.create_filter())
                    else:
                        break

                self._check_capacity_time = time.time()
def __init__(self, table_folder): if not hasattr(self, "_requests_deque"): super(RequestBuffer, self).__init__() self._thread_stop = False self._is_adding_to_db = False self._requests_deque = collections.deque() self._del_requests_deque = collections.deque() self._db = RedisDB() self._table_request = setting.TAB_REQUSETS.format( table_folder=table_folder) self._table_failed_request = setting.TAB_FAILED_REQUSETS.format( table_folder=table_folder) if not self.__class__.dedup and setting.REQUEST_FILTER_ENABLE: self.__class__.dedup = Dedup( filter_type=Dedup.ExpireFilter, name=table_folder, expire_time=2592000, to_md5=False, ) # 过期时间为一个月
def __init__(
    self,
    table_folder=None,
    parser_count=None,
    begin_callback=None,
    end_callback=None,
    delete_tabs=(),
    process_num=None,
    auto_stop_when_spider_done=None,
    auto_start_requests=None,
    send_run_time=True,
    batch_interval=0,
    *parser_args,
    **parser_kwargs
):
    """
    @summary: scheduler
    ---------
    @param table_folder: redis folder where the spider's requests and items are stored
    @param parser_count: number of threads; defaults to the value in the settings file
    @param begin_callback: callback invoked when the spider starts
    @param end_callback: callback invoked when the spider ends
    @param delete_tabs: tables to delete when the spider starts, type: tuple/bool/string. Regex supported
    @param process_num: number of processes
    @param auto_stop_when_spider_done: whether to stop automatically after crawling finishes or keep waiting for tasks; defaults to stopping automatically
    @param auto_start_requests: whether the spider adds start tasks automatically
    @param send_run_time: send the run time
    @param batch_interval: crawl interval, default 0, in days. When launched repeatedly, the spider only starts if the time since the end of the first crawl exceeds this interval
    @param *parser_args: positional args passed to the parser's start_requests, tuple()
    @param **parser_kwargs: keyword args passed to the parser's start_requests, dict()
    ---------
    @result:
    """
    super(Scheduler, self).__init__()

    for key, value in self.__class__.__custom_setting__.items():
        setattr(setting, key, value)

    self._table_folder = table_folder or setting.TABLE_FOLDER
    if not self._table_folder:
        raise Exception(
            """
            table_folder is the redis folder that holds requests and items; it must not be empty.
            Configure it in the settings, e.g. TABLE_FOLDER = 'test',
            or pass it when the spider is created, e.g. TestSpider(table_folder='test')
            """
        )

    # use the resolved folder so the setting.TABLE_FOLDER fallback actually takes effect
    self._request_buffer = RequestBuffer(self._table_folder)
    self._item_buffer = ItemBuffer(self._table_folder)
    self._collector = Collector(self._table_folder, process_num)
    self._parsers = []
    self._parser_controls = []
    self._parser_control_obj = PaserControl
    self._parser_args = parser_args
    self._parser_kwargs = parser_kwargs

    self._auto_stop_when_spider_done = (
        auto_stop_when_spider_done
        if auto_stop_when_spider_done is not None
        else setting.AUTO_STOP_WHEN_SPIDER_DONE
    )
    self._auto_start_requests = (
        auto_start_requests
        if auto_start_requests is not None
        else setting.PARSER_AUTO_START_REQUESTS
    )
    self._send_run_time = send_run_time
    self._batch_interval = batch_interval

    self._begin_callback = (
        begin_callback
        if begin_callback
        else lambda: log.info("\n********** spider begin **********")
    )
    self._end_callback = (
        end_callback
        if end_callback
        else lambda: log.info("\n********** spider end **********")
    )

    self._parser_count = setting.PARSER_COUNT if not parser_count else parser_count

    self._spider_name = self._table_folder
    self._project_name = self._table_folder.split(":")[0]

    self._tab_spider_time = setting.TAB_SPIDER_TIME.format(
        table_folder=self._table_folder
    )
    self._tab_spider_status = setting.TAB_SPIDER_STATUS.format(
        table_folder=self._table_folder
    )
    self._tab_requests = setting.TAB_REQUSETS.format(table_folder=self._table_folder)
    self._tab_failed_requests = setting.TAB_FAILED_REQUSETS.format(
        table_folder=self._table_folder
    )

    self._is_notify_end = False  # whether the end notification has already been sent
    self._last_task_count = 0  # most recent task count
    self._redisdb = RedisDB()

    self._project_total_state_table = "{}_total_state".format(self._project_name)
    self._is_exist_project_total_state_table = False

    # Request cache settings
    Request.cached_table_folder = self._table_folder
    Request.cached_expire_time = setting.RESPONSE_CACHED_EXPIRE_TIME

    delete_tabs = delete_tabs or setting.DELETE_TABS
    if delete_tabs:
        self.delete_tables(delete_tabs)

    self._last_check_task_status_time = 0
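# Minimal construction sketch for the Scheduler above. TestSpider is the subclass
# name used in the error message inside __init__; the argument values here are
# hypothetical and only keyword arguments documented in the docstring are used:
#
#   spider = TestSpider(
#       table_folder="test:news",      # redis folder for requests/items
#       parser_count=10,               # parser threads
#       delete_tabs=True,              # drop "test:news*" tables on startup
#       batch_interval=1,              # at most one full crawl per day
#   )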
def __init__(self,
             task_table,
             batch_record_table,
             batch_name,
             batch_interval,
             task_keys,
             task_state="state",
             min_task_count=10000,
             check_task_interval=5,
             task_limit=10000,
             related_table_folder=None,
             related_batch_record=None,
             task_condition="",
             task_order_by="",
             table_folder=None,
             parser_count=None,
             begin_callback=None,
             end_callback=None,
             delete_tabs=(),
             process_num=None,
             auto_stop_when_spider_done=None,
             send_run_time=False,
             *parser_args,
             **parser_kwargs):
    """
    @summary: batch spider
    Prerequisites:
    1. A task table is required. It must contain an id column and a task-state column such as state.
       If a parser_name column is provided, tasks are dispatched only to the matching parser;
       otherwise they are dispatched to every parser. Other columns can be added freely to carry
       whatever parameters the spider needs.
       Reference DDL:
        CREATE TABLE `table_name` (
          `id` int(11) NOT NULL AUTO_INCREMENT,
          `param` varchar(1000) DEFAULT NULL COMMENT 'parameters the spider needs to crawl the data',
          `state` int(11) DEFAULT NULL COMMENT 'task state',
          `parser_name` varchar(255) DEFAULT NULL COMMENT 'class name of the parser script for this task',
          PRIMARY KEY (`id`),
          UNIQUE KEY `nui` (`param`) USING BTREE
        ) ENGINE=InnoDB AUTO_INCREMENT=2 DEFAULT CHARSET=utf8;

    2. A batch record table is required; it is created automatically if it does not exist.
       Its structure is fixed. Reference DDL:
        CREATE TABLE `xxx_batch_record` (
          `id` int(11) NOT NULL AUTO_INCREMENT,
          `batch_date` date DEFAULT NULL,
          `done_count` int(11) DEFAULT NULL,
          `total_count` int(11) DEFAULT NULL,
          PRIMARY KEY (`id`)
        ) ENGINE=InnoDB AUTO_INCREMENT=1 DEFAULT CHARSET=utf8;
    ---------
    @param task_table: task table in mysql
    @param batch_record_table: batch record table in mysql
    @param batch_name: name of the batch crawler
    @param batch_interval: batch interval in days. For one batch per hour, use 1/24
    @param task_keys: list of task fields to fetch, []. If a specific parser should handle the task, include the parser_name field
    @param task_state: name of the state column in the mysql task table
    @param min_task_count: minimum number of tasks in redis; below this, tasks are pulled from the mysql task table
    @param check_task_interval: interval for checking whether tasks remain
    @param task_limit: number of tasks fetched from the database at a time
    @param table_folder: redis folder where the spider's requests and items are stored
    @param parser_count: number of threads; defaults to the value in the settings file
    @param begin_callback: callback invoked when the spider starts
    @param end_callback: callback invoked when the spider ends
    @param delete_tabs: tables to delete when the spider starts, tuple. Regex supported
    @param process_num: number of processes
    @param auto_stop_when_spider_done: whether to stop automatically after crawling finishes or keep waiting for tasks; defaults to stopping automatically
    @param send_run_time: send the run time
    @param related_table_folder: task table (redis) of another related spider
    @param related_batch_record: batch record table (mysql) of another related spider. Note: avoid cycles such as A -> B & B -> A; cycles can be broken by using related_table_folder instead.
        Configure either related_table_folder or related_batch_record, not both.
        If the related spider is a batch spider, prefer related_batch_record;
        if it is a plain spider with no batch table, use related_table_folder.
    @param task_condition: task condition used to select this spider's tasks from a larger task table, i.e. the clause after WHERE
    @param task_order_by: ordering used when fetching tasks, e.g. id desc
    @param *parser_args: positional args passed to the parser's start_requests, tuple()
    @param **parser_kwargs: keyword args passed to the parser's start_requests, dict()
    ---------
    @result:
    """
    Scheduler.__init__(
        self,
        table_folder=table_folder,
        parser_count=parser_count,
        begin_callback=begin_callback,
        end_callback=end_callback,
        delete_tabs=delete_tabs,
        process_num=process_num,
        auto_stop_when_spider_done=auto_stop_when_spider_done,
        auto_start_requests=False,
        send_run_time=send_run_time,
        batch_interval=batch_interval,
        *parser_args,
        **parser_kwargs)

    self._redisdb = RedisDB()
    self._mysqldb = MysqlDB()

    self._request_buffer = RequestBuffer(self._table_folder)

    self._task_table = task_table  # task table in mysql
    self._batch_record_table = batch_record_table  # batch record table in mysql
    self._batch_name = batch_name  # name of the batch crawler
    self._task_keys = task_keys  # task fields to fetch
    self._task_state = task_state  # name of the state column in the mysql task table
    self._min_task_count = min_task_count  # minimum number of tasks in redis
    self._check_task_interval = check_task_interval
    self._task_limit = task_limit  # number of tasks fetched from mysql at a time
    self._related_task_tables = [
        setting.TAB_REQUSETS.format(table_folder=self._table_folder)
    ]  # the spider's own task table also needs to be checked for pending tasks
    if related_table_folder:
        self._related_task_tables.append(
            setting.TAB_REQUSETS.format(table_folder=related_table_folder))

    self._related_batch_record = related_batch_record
    self._task_condition_prefix_and = task_condition and " and {}".format(
        task_condition)
    self._task_condition_prefix_where = task_condition and " where {}".format(
        task_condition)
    self._task_order_by = task_order_by and " order by {}".format(
        task_order_by)

    self._batch_date_cache = None
    if self._batch_interval >= 1:
        self._date_format = "%Y-%m-%d"
    elif self._batch_interval < 1 and self._batch_interval >= 1 / 24:
        self._date_format = "%Y-%m-%d %H"
    else:
        self._date_format = "%Y-%m-%d %H:%M"

    # alerting
    self._send_msg_interval = datetime.timedelta(hours=1)  # send an alert at most once per hour
    self._last_send_msg_time = None

    self._spider_last_done_time = None  # time when the done-task count was last recorded
    self._spider_last_done_count = 0  # most recent done-task count
    self._spider_deal_speed_cached = None

    self._is_more_parsers = True  # multi-template spider
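# Construction sketch for the batch-spider __init__ above (the concrete subclass
# name and the table/column values are hypothetical; the parameters follow the
# docstring):
#
#   spider = MyBatchSpider(
#       task_table="news_task",                  # mysql task table
#       batch_record_table="news_batch_record",  # mysql batch record table
#       batch_name="news batch spider",
#       batch_interval=7,                        # one batch every 7 days (1/24 would mean hourly)
#       task_keys=["id", "param"],               # columns pulled from the task table
#       task_state="state",
#       task_condition="source = 'sina'",        # appended after WHERE when selecting tasks
#       task_order_by="id desc",
#       table_folder="test:news",
#   )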
def task_is_done(self):
    """
    @summary: check whether the batch is done, and refresh the batch date at the same time
              (must not crash, otherwise the batch date stops being refreshed)
    ---------
    ---------
    @result: True / False (done / not done)
    """
    is_done = False

    # read the task state from the batch record table
    sql = 'select date_format(batch_date, "{date_format}"), total_count, done_count, is_done from {batch_record_table} order by id desc limit 1'.format(
        date_format=self._date_format.replace(":%M", ":%i"),
        batch_record_table=self._batch_record_table,
    )
    batch_info = self._mysqldb.find(sql)
    if batch_info is None:
        raise Exception("failed to query batch info")

    if batch_info:
        self._batch_date_cache, total_count, done_count, is_done = batch_info[
            0]  # refresh self._batch_date_cache in case a new batch has started while the cache still holds the old batch date

        log.info("<%s> batch date %s progress %s/%s done %d" % (
            self._batch_name,
            self._batch_date_cache,
            done_count,
            total_count,
            is_done,
        ))

        os.environ[
            "batch_date"] = self._batch_date_cache  # refresh the batch date used by BatchParser

        if is_done:  # check whether the task table still has unfinished tasks; if so, is_done becomes False
            # this check is expensive, so take a lock to keep multiple processes from querying at once
            with RedisLock(
                key=self._spider_name,
                timeout=3600,
                wait_timeout=0,
                redis_cli=RedisDB().get_redis_obj(),
            ) as lock:
                if lock.locked:
                    log.info("batch record is marked done; checking the task table for unfinished tasks")
                    sql = "select 1 from %s where (%s = 0 or %s=2)%s limit 1" % (
                        self._task_table,
                        self._task_state,
                        self._task_state,
                        self._task_condition_prefix_and,
                    )
                    tasks = self._mysqldb.find(sql)  # [(1,)] / []
                    if tasks:
                        log.info("found unfinished tasks in the task table; waiting for them to be distributed")
                        is_done = False

                        # set is_done back to 0 in the batch record table to reduce further task-table queries
                        sql = 'update {batch_record_table} set is_done = 0 where batch_date = "{batch_date}"'.format(
                            batch_record_table=self._batch_record_table,
                            batch_date=self._batch_date_cache,
                        )
                        self._mysqldb.update(sql)
                    else:
                        log.info("all tasks in the task table are done; spider finished")
                else:
                    log.info("batch record is marked done; another spider process is already checking the task table, so this process skips the check and keeps waiting")
                    is_done = False

    return is_done
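# With the hypothetical values task_table="news_task", task_state="state" and
# task_condition="source = 'sina'", the unfinished-task check above renders as:
#
#   select 1 from news_task where (state = 0 or state=2) and source = 'sina' limit 1
#
# because _task_condition_prefix_and (built in __init__) prepends " and " to the
# configured condition.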
def __init__(self, name):
    self.name = name
    self.count_cached_name = name + "_count_cached"

    if not self.__class__.redis_db:
        self.__class__.redis_db = RedisDB()
def _cache_db(self):
    if not self.__class__.cache_db:
        self.__class__.cache_db = RedisDB()  # .from_url(setting.pika_spider_1_uri)

    return self.__class__.cache_db
def __init__(self):
    self._redisdb = RedisDB()
    self._to_db = MysqlDB()