def delete_tables(self, delete_tables_list):
    if isinstance(delete_tables_list, bool):
        delete_tables_list = [self._redis_key + "*"]
    elif not isinstance(delete_tables_list, (list, tuple)):
        delete_tables_list = [delete_tables_list]

    redis = RedisDB()
    for delete_tab in delete_tables_list:
        if delete_tab == "*":
            delete_tab = self._redis_key + "*"

        tables = redis.getkeys(delete_tab)
        for table in tables:
            if table != self._tab_spider_time:
                log.info("Deleting table %s" % table)
                redis.clear(table)
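# A minimal usage sketch (not from the source) of the three argument forms
# delete_tables accepts; the `scheduler` instance name is hypothetical.
scheduler.delete_tables(True)                   # bool: drop every key under <redis_key>*
scheduler.delete_tables("*")                    # "*": expanded to <redis_key>*
scheduler.delete_tables(["spider:z_requests"])  # list/tuple: drop only the listed keys (patterns supported by getkeys)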
def __init__(self, redis_key):
    """
    @summary:
    ---------
    @param redis_key:
    ---------
    @result:
    """
    super(Collector, self).__init__()

    self._db = RedisDB()

    self._thread_stop = False
    self._todo_requests = Queue(maxsize=setting.COLLECTOR_TASK_COUNT)
    self._tab_requests = setting.TAB_REQUESTS.format(redis_key=redis_key)
    self._is_collector_task = False
def __init__(
    self,
    redis_key,
    page_url=None,
    min_users=1,
    must_contained_keys=(),
    keep_alive=False,
    **kwargs,
):
    """
    @param redis_key: key prefix under which users are stored in redis
    @param page_url: url used to produce users
    @param min_users: minimum number of users to keep
    @param must_contained_keys: keys the cookie must contain; used to verify that the cookie is valid
    @param keep_alive: whether to keep running as a daemon so users can be replenished immediately when short
    ---
    @param kwargs: WebDriver parameters
        load_images: whether to load images
        user_agent: a string, or a zero-argument function returning the user agent
        proxy: xxx.xxx.xxx.xxx:xxxx, or a zero-argument function returning the proxy address
        headless: whether to run in headless mode
        driver_type: CHROME, PHANTOMJS or FIREFOX
        timeout: request timeout
        window_size: window size
        executable_path: browser path; defaults to the system default
    """
    self._redisdb = RedisDB()

    self._tab_user_pool = setting.TAB_USER_POOL.format(
        redis_key=redis_key, user_type="guest"
    )
    self._page_url = page_url
    self._min_users = min_users
    self._must_contained_keys = must_contained_keys
    self._keep_alive = keep_alive
    self._kwargs = kwargs
    self._kwargs.setdefault("load_images", False)
    self._kwargs.setdefault("headless", True)

    self._users_id = []
def __init__(
    self,
    redis_key,
    *,
    table_userbase,
    login_state_key="login_state",
    lock_state_key="lock_state",
    username_key="username",
    password_key="password",
    login_retry_times=1,
    keep_alive=False,
):
    """
    @param redis_key: project name
    @param table_userbase: user table name
    @param login_state_key: login-state column name
    @param lock_state_key: lock-state column name
    @param username_key: username column name
    @param password_key: password column name
    @param login_retry_times: number of retries after a failed login
    @param keep_alive: whether to keep running as a daemon so users can be replenished immediately when short
    """
    self._tab_user_pool = setting.TAB_USER_POOL.format(
        redis_key=redis_key, user_type="normal"
    )
    self._login_retry_times = login_retry_times
    self._table_userbase = table_userbase
    self._login_state_key = login_state_key
    self._lock_state_key = lock_state_key
    self._username_key = username_key
    self._password_key = password_key
    self._keep_alive = keep_alive

    self._users_id = []

    self._redisdb = RedisDB()
    self._mysqldb = MysqlDB()

    self._create_userbase()
def __init__(
    self,
    redis_key,
    page_url=None,
    min_cookies=10000,
    must_contained_keys=(),
    keep_alive=False,
    **kwargs,
):
    """
    @param redis_key: project name
    @param page_url: url used to produce cookies
    @param min_cookies: minimum number of cookies
    @param must_contained_keys: keys the cookie must contain
    @param keep_alive: whether to stay on standby, ready to produce cookies, once the cookie count is sufficient; False means exit when the target is met
    ---
    @param kwargs: WebDriver parameters
        load_images: whether to load images
        user_agent_pool: user-agent pool; None disables it
        proxies_pool: proxy pool; None disables it
        headless: whether to run in headless mode
        driver_type: web driver type
        timeout: request timeout, default 16 s
        window_size: screen resolution (width, height)
    """

    self._redisdb = RedisDB()

    self._tab_cookie_pool = "{}:l_cookie_pool".format(redis_key)
    self._tab_cookie_pool_last_count = "{}:str_cookie_pool_count".format(
        redis_key
    )  # stores the time of the last cookie count, formatted as timestamp:count
    self._page_url = page_url
    self._min_cookies = min_cookies
    self._must_contained_keys = must_contained_keys
    self._keep_alive = keep_alive
    self._kwargs = kwargs
    self._kwargs.setdefault("load_images", False)
    self._kwargs.setdefault("headless", True)
def __init__(self, redis_key):
    if not hasattr(self, "_requests_deque"):
        super(RequestBuffer, self).__init__()

        self._thread_stop = False
        self._is_adding_to_db = False

        self._requests_deque = collections.deque()
        self._del_requests_deque = collections.deque()

        self._db = RedisDB()

        self._table_request = setting.TAB_REQUSETS.format(redis_key=redis_key)
        self._table_failed_request = setting.TAB_FAILED_REQUSETS.format(
            redis_key=redis_key
        )

        if not self.__class__.dedup and setting.REQUEST_FILTER_ENABLE:
            self.__class__.dedup = Dedup(
                name=redis_key, to_md5=False, **setting.REQUEST_FILTER_SETTING
            )  # default expire time is one month
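# A minimal sketch (not from the source) of how the class-level request Dedup
# declared above is queried and updated elsewhere in the framework. The import
# path and the fingerprint values are assumptions; `get` returns one flag per
# fingerprint (truthy if already recorded), `add` records new ones.
from feapder.dedup import Dedup

dedup = Dedup(name="test_spider", to_md5=False)
fingerprints = ["https://example.com/a", "https://example.com/b"]
seen = dedup.get(fingerprints)
dedup.add([fp for fp, s in zip(fingerprints, seen) if not s])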
def __init__(
    self,
    username,
    password,
    max_search_times,
    proxies=None,
    search_interval=0,
    **kwargs,
):
    """
    @param username:
    @param password:
    @param max_search_times:
    @param proxies:
    @param search_interval: interval between calls. Supports a tuple giving a time
        range, e.g. (5, 10) means 5 to 10 seconds, or a plain integer
    """
    self.__dict__.update(kwargs)
    self.username = username
    self.password = password
    self.max_search_times = max_search_times
    self.proxies = proxies
    self.search_interval = search_interval
    self.delay_use = 0  # delayed use, for accounts waiting to be unblocked

    if isinstance(search_interval, (tuple, list)):
        if len(search_interval) != 2:
            raise ValueError(
                "search_interval must be a tuple or list of two values, e.g. (5, 10) for 5 to 10 seconds"
            )

        self.used_for_time_length = (
            search_interval[1] * 5
        )  # time a preemptive spider holds the cookie exclusively; other spiders cannot take it during this period
    else:
        self.used_for_time_length = (
            search_interval * 5
        )  # time a preemptive spider holds the cookie exclusively; other spiders cannot take it during this period

    self.account_info = {
        "login_time": 0,
        "cookies": {},
        "search_times": 0,
        "last_search_time": 0,
        "used_for_spider_name": None,  # in use by one spider only; other spiders may not use it
        "init_search_times_time": 0,  # time the search count was initialized
    }

    if not self.__class__.redisdb:
        self.__class__.redisdb = RedisDB()

    self.sync_account_info_from_redis()

    self.__init_metrics()
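# For illustration only: instantiating such an account with the two accepted
# search_interval forms. `SearchAccount` is a hypothetical stand-in for whatever
# class defines the __init__ above; only the constructor arguments come from it.
acc_fixed = SearchAccount("user_a", "pwd_a", max_search_times=100, search_interval=5)        # fixed 5 s between searches
acc_random = SearchAccount("user_b", "pwd_b", max_search_times=100, search_interval=(5, 10)) # 5-10 s; exclusive-use window = 10 * 5 s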
def __init__(self, name: str, expire_time: int, expire_time_record_key=None):
    if not name:
        raise ValueError("name can't be None")
    if not expire_time:
        raise ValueError("please set expire time, unit is seconds")

    if not self.__class__.redis_db:
        self.__class__.redis_db = RedisDB()

    self.name = name
    self.expire_time = expire_time
    self.expire_time_record_key = expire_time_record_key

    self.record_expire_time()
    self.del_expire_key()
def _start(self):
    # start the request_buffer
    self._request_buffer.start()
    # start the item_buffer
    self._item_buffer.start()
    # start the collector
    self._collector.start()
    # start the parser controls
    for i in range(self._thread_count):
        parser_control = self._parser_control_obj(
            self._collector,
            self._redis_key,
            self._request_buffer,
            self._item_buffer,
        )
        for parser in self._parsers:
            parser_control.add_parser(parser)

        parser_control.start()
        self._parser_controls.append(parser_control)

    # dispatch tasks last, since this may take a while
    if setting.RETRY_FAILED_REQUESTS:
        # reset failed tasks; no lock needed, the operation is atomic
        handle_failed_requests = HandleFailedRequests(self._redis_key)
        handle_failed_requests.reput_failed_requests_to_requests()

    # dispatch new tasks
    if self._auto_start_requests:  # automatic dispatch
        if self.wait_lock:
            # lock the task-adding section to prevent multiple processes from adding duplicate tasks
            with RedisLock(
                key=self._spider_name,
                timeout=3600,
                wait_timeout=60,
                redis_cli=RedisDB().get_redis_obj(),
            ) as lock:
                if lock.locked:
                    self.__add_task()
        else:
            self.__add_task()
class HandleFailedRequests(object):
    """docstring for HandleFailedRequests"""

    def __init__(self, redis_key):
        super(HandleFailedRequests, self).__init__()
        self._redis_key = redis_key

        self._redisdb = RedisDB()
        self._request_buffer = RequestBuffer(self._redis_key)

        self._table_failed_request = setting.TAB_FAILED_REQUSETS.format(
            redis_key=redis_key
        )

    def get_failed_requests(self, count=10000):
        failed_requests = self._redisdb.zget(self._table_failed_request, count=count)
        failed_requests = [eval(failed_request) for failed_request in failed_requests]
        return failed_requests

    def reput_failed_requests_to_requests(self):
        log.debug("Resetting failed requests...")
        total_count = 0
        while True:
            try:
                failed_requests = self.get_failed_requests()
                if not failed_requests:
                    break

                for request in failed_requests:
                    request["retry_times"] = 0
                    request_obj = Request.from_dict(request)
                    self._request_buffer.put_request(request_obj)

                    total_count += 1
            except Exception as e:
                log.exception(e)

        self._request_buffer.flush()

        log.debug("Reset %s failed requests back to pending requests" % total_count)
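# Usage mirrors what Scheduler._start does when setting.RETRY_FAILED_REQUESTS is
# enabled (see the _start snippet above); the redis_key value is a placeholder.
handle_failed_requests = HandleFailedRequests("test_spider")
handle_failed_requests.reput_failed_requests_to_requests()  # move failed requests back into the pending queue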
def check_filter_capacity(self):
    """
    Check the filter's state; if it is full, load a new filter
    @return:
    """
    if (
        not self._check_capacity_time
        or time.time() - self._check_capacity_time > 1800
    ):
        # with self._thread_lock:
        with RedisLock(
            key="ScalableBloomFilter",
            timeout=300,
            wait_timeout=300,
            redis_cli=RedisDB().get_redis_obj(),
        ) as lock:
            # Global lock: only one process actually creates the new filter at a time;
            # once it has been created, the other processes simply append the new filter.
            if lock.locked:
                while True:
                    if self.filters[-1].is_at_capacity:
                        self.filters.append(self.create_filter())
                    else:
                        break

                self._check_capacity_time = time.time()
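# The same RedisLock pattern recurs throughout the framework (task dispatch, batch
# checks, the capacity check above). A minimal sketch of the pattern; the lock key
# and critical-section function are made up, and the RedisLock import path is an
# assumption that may differ between feapder versions.
from feapder.db.redisdb import RedisDB
from feapder.utils.redis_lock import RedisLock  # assumed import path


def do_exclusive_work():
    # placeholder for the critical section
    ...


with RedisLock(
    key="my_exclusive_section",
    timeout=300,        # lock auto-expires after 300 s
    wait_timeout=300,   # wait up to 300 s to acquire it
    redis_cli=RedisDB().get_redis_obj(),
) as lock:
    if lock.locked:     # only the process that actually acquired the lock does the work
        do_exclusive_work()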
class Collector(threading.Thread):
    def __init__(self, redis_key):
        """
        @summary:
        ---------
        @param redis_key:
        ---------
        @result:
        """
        super(Collector, self).__init__()

        self._db = RedisDB()

        self._thread_stop = False
        self._todo_requests = Queue(maxsize=setting.COLLECTOR_TASK_COUNT)
        self._tab_requests = setting.TAB_REQUESTS.format(redis_key=redis_key)
        self._is_collector_task = False

    def run(self):
        self._thread_stop = False
        while not self._thread_stop:
            try:
                self.__input_data()
            except Exception as e:
                log.exception(e)
                time.sleep(0.1)

            self._is_collector_task = False

    def stop(self):
        self._thread_stop = True
        self._started.clear()

    def __input_data(self):
        if setting.COLLECTOR_TASK_COUNT / setting.SPIDER_THREAD_COUNT > 1 and (
            self._todo_requests.qsize() > setting.SPIDER_THREAD_COUNT
            or self._todo_requests.qsize() >= self._todo_requests.maxsize
        ):
            time.sleep(0.1)
            return

        current_timestamp = tools.get_current_timestamp()

        # Fetch tasks: only take tasks scored within the current timestamp, and at the same
        # time reset their scores to current_timestamp + setting.REQUEST_LOST_TIMEOUT
        requests_list = self._db.zrangebyscore_set_score(
            self._tab_requests,
            priority_min="-inf",
            priority_max=current_timestamp,
            score=current_timestamp + setting.REQUEST_LOST_TIMEOUT,
            count=setting.COLLECTOR_TASK_COUNT,
        )

        if requests_list:
            self._is_collector_task = True
            # store the requests
            self.__put_requests(requests_list)
        else:
            time.sleep(0.1)

    def __put_requests(self, requests_list):
        for request in requests_list:
            try:
                request_dict = {
                    "request_obj": Request.from_dict(eval(request)),
                    "request_redis": request,
                }
            except Exception as e:
                log.exception(
                    """
                error %s
                request %s
                """
                    % (e, request)
                )

                request_dict = None

            if request_dict:
                self._todo_requests.put(request_dict)

    def get_request(self):
        try:
            request = self._todo_requests.get(timeout=1)
            return request
        except Empty as e:
            return None

    def get_requests_count(self):
        return (
            self._todo_requests.qsize() or self._db.zget_count(self._tab_requests) or 0
        )

    def is_collector_task(self):
        return self._is_collector_task
from feapder.db.redisdb import RedisDB

db = RedisDB(ip_ports="localhost:6379")
# db.zadd("test", list(range(10)), list(range(10)))
# db.zremrangebyscore("test", 1, 3)
db.zrem("test", [4, 0])
def _cache_db(self):
    if not self.__class__.cache_db:
        self.__class__.cache_db = RedisDB()  # .from_url(setting.pika_spider_1_uri)

    return self.__class__.cache_db
class Collector(threading.Thread): def __init__(self, redis_key): """ @summary: --------- @param redis_key: --------- @result: """ super(Collector, self).__init__() self._db = RedisDB() self._thread_stop = False self._todo_requests = collections.deque() self._tab_requests = setting.TAB_REQUSETS.format(redis_key=redis_key) self._tab_spider_status = setting.TAB_SPIDER_STATUS.format( redis_key=redis_key) self._spider_mark = tools.get_localhost_ip() + f"-{time.time()}" self._interval = setting.COLLECTOR_SLEEP_TIME self._request_count = setting.COLLECTOR_TASK_COUNT self._is_collector_task = False self.__delete_dead_node() def run(self): while not self._thread_stop: try: self.__report_node_heartbeat() self.__input_data() except Exception as e: log.exception(e) self._is_collector_task = False time.sleep(self._interval) def stop(self): self._thread_stop = True def __input_data(self): current_timestamp = tools.get_current_timestamp() if len(self._todo_requests) >= self._request_count: return request_count = self._request_count # 先赋值 # 查询最近有心跳的节点数量 spider_wait_count = self._db.zget_count( self._tab_spider_status, priority_min=current_timestamp - (self._interval + 10), priority_max=current_timestamp, ) # 根据等待节点数量,动态分配request if spider_wait_count: # 任务数量 task_count = self._db.zget_count(self._tab_requests) # 动态分配的数量 = 任务数量 / 休息的节点数量 + 1 request_count = task_count // spider_wait_count + 1 request_count = (request_count if request_count <= self._request_count else self._request_count) if not request_count: return # 取任务,只取当前时间搓以内的任务,同时将任务分数修改为 current_timestamp + setting.REQUEST_TIME_OUT requests_list = self._db.zrangebyscore_set_score( self._tab_requests, priority_min="-inf", priority_max=current_timestamp, score=current_timestamp + setting.REQUEST_TIME_OUT, count=request_count, ) if requests_list: self._is_collector_task = True # 存request self.__put_requests(requests_list) def __report_node_heartbeat(self): """ 汇报节点心跳,以便任务平均分配 """ self._db.zadd(self._tab_spider_status, self._spider_mark, tools.get_current_timestamp()) def __delete_dead_node(self): """ 删除没有心跳的节点信息 """ self._db.zremrangebyscore( self._tab_spider_status, "-inf", tools.get_current_timestamp() - (self._interval + 10), ) def __put_requests(self, requests_list): for request in requests_list: try: request_dict = { "request_obj": Request.from_dict(eval(request)), "request_redis": request, } except Exception as e: log.exception(""" error %s request %s """ % (e, request)) request_dict = None if request_dict: self._todo_requests.append(request_dict) def get_requests(self, count): requests = [] count = count if count <= len(self._todo_requests) else len( self._todo_requests) while count: requests.append(self._todo_requests.popleft()) count -= 1 return requests def get_requests_count(self): return len(self._todo_requests) or self._db.zget_count( self._tab_requests) def is_collector_task(self): return self._is_collector_task
def run():
    while True:
        redisdb = RedisDB()
        try:
            block_ip = redisdb.sget(setting.CAPTCHA_BLOCK_IP_REDIS_KEY)
            if not block_ip:
                log.debug("No blocked ip at the moment")

            for ip in block_ip:
                task = redisdb.hget(setting.CAPTCHA_REDIS_KEY, ip, is_pop=True)
                task = eval(task)
                ua = task.get("ua")
                url = task.get("url")

                with WebDriver(proxy=ip, user_agent=ua) as browser:
                    log.info("Unblocking ip {}, url {}".format(ip, url))
                    browser.get(url)
                    browser.implicitly_wait(5)
                    frame = browser.find_element_by_id("tcaptcha_iframe")
                    browser.switch_to.frame(frame)

                    for i in range(20):
                        for i in range(1000):
                            bg_url = browser.find_element_by_id("slideBg").get_attribute("src")
                            slide_url = browser.find_element_by_id("slideBlock").get_attribute("src")
                            if bg_url and slide_url:
                                break
                        else:
                            log.error("Failed to load the slider")
                            return

                        bg_image = os.path.join(
                            CAPTCHA_PATH, "bg_" + tools.get_md5(bg_url) + ".png"
                        )
                        slide_image = os.path.join(
                            CAPTCHA_PATH, "slider_" + tools.get_md5(slide_url) + ".png"
                        )
                        if tools.download_file(bg_url, bg_image) and tools.download_file(
                            slide_url, slide_image
                        ):
                            # locate the gap
                            x, y = get_gap_center_point(bg_image, slide_image, show=False)
                            # scale
                            x = x * 340 / 680
                            x = x - 27.5 - 30
                            # slide
                            slide_btn = browser.find_element_by_id("tcaptcha_drag_thumb")
                            tracks = track.get_tracks(x)
                            drag_and_drop(browser, slide_btn, tracks)

                            # remove the images
                            os.remove(bg_image)
                            os.remove(slide_image)

                        tools.delay_time(2)
                        if "verify.maoyan.com" not in browser.current_url:
                            log.info("Unblocked successfully")
                            break
                        else:
                            try:
                                browser.find_element_by_css_selector(".tc-action-icon").click()
                            except:
                                pass
                        tools.delay_time(3)

        except Exception as e:
            log.error(e)
def task_is_done(self):
    """
    @summary: Check whether the batch tasks are done, and refresh the batch date at the
              same time (must not crash, otherwise the batch date stops updating)
    ---------
    ---------
    @result: True / False (done / not done)
    """

    is_done = False

    # Check the task state in the batch record table
    sql = 'select date_format(batch_date, "{date_format}"), total_count, done_count, is_done from {batch_record_table} order by id desc limit 1'.format(
        date_format=self._date_format.replace(":%M", ":%i"),
        batch_record_table=self._batch_record_table,
    )
    batch_info = self._mysqldb.find(sql)
    if batch_info is None:
        raise Exception("Failed to query batch info")

    if batch_info:
        self._batch_date_cache, total_count, done_count, is_done = batch_info[
            0
        ]  # refresh self._batch_date_cache, in case a new batch has started while it still holds the old batch date

        log.info(
            "《%s》 batch date %s, batch progress %s/%s, done flag %d"
            % (
                self._batch_name,
                self._batch_date_cache,
                done_count,
                total_count,
                is_done,
            )
        )
        os.environ["batch_date"] = self._batch_date_cache  # refresh the batch date used by BatchParser

        if is_done:  # if the task table still has pending tasks, is_done becomes False
            # This check is expensive, so take a lock to prevent multiple processes from querying at once
            with RedisLock(
                key=self._spider_name,
                timeout=3600,
                wait_timeout=0,
                redis_cli=RedisDB().get_redis_obj(),
            ) as lock:
                if lock.locked:
                    log.info("Batch record is marked done; checking the task table for unfinished tasks")
                    sql = "select 1 from %s where (%s = 0 or %s=2)%s limit 1" % (
                        self._task_table,
                        self._task_state,
                        self._task_state,
                        self._task_condition_prefix_and,
                    )
                    tasks = self._mysqldb.find(sql)  # [(1,)] / []
                    if tasks:
                        log.info("Unfinished tasks found in the task table; waiting for them to be dispatched")
                        is_done = False
                        # Set is_done back to 0 in the batch_record table to reduce future task-table queries
                        sql = 'update {batch_record_table} set is_done = 0 where batch_date = "{batch_date}"'.format(
                            batch_record_table=self._batch_record_table,
                            batch_date=self._batch_date_cache,
                        )
                        self._mysqldb.update(sql)
                    else:
                        log.info("All tasks in the task table are done; the spider will finish")
                else:
                    log.info(
                        "Batch record is marked done, but another spider process is already checking the task table; this process skips the check and keeps waiting"
                    )
                    is_done = False

    return is_done
def _redis_cli(self):
    if self.__class__.__redis_cli is None:
        self.__class__.__redis_cli = RedisDB(url=self.redis_url).get_redis_obj()

    return self.__class__.__redis_cli
# -*- coding: utf-8 -*-
"""
Created on 2021/3/4 11:01 PM
---------
@summary:
---------
@author: Boris
@email: [email protected]
"""

from feapder.db.redisdb import RedisDB

redis = RedisDB(ip_ports="localhost:6379", db=0)

redis.lpush("l_test", 2)
redis.lpush("l_test", 3)

print(redis.lrange("l_test"))
print(redis.lrem("l_test", 2))
print(redis.lrange("l_test"))
class Scheduler(threading.Thread): __custom_setting__ = {} def __init__( self, redis_key=None, thread_count=None, begin_callback=None, end_callback=None, delete_keys=(), auto_stop_when_spider_done=None, auto_start_requests=None, send_run_time=True, batch_interval=0, wait_lock=True, task_table=None, ): """ @summary: 调度器 --------- @param redis_key: 爬虫request及item存放reis中的文件夹 @param thread_count: 线程数,默认为配置文件中的线程数 @param begin_callback: 爬虫开始回调函数 @param end_callback: 爬虫结束回调函数 @param delete_keys: 爬虫启动时删除的key,类型: 元组/bool/string。 支持正则 @param auto_stop_when_spider_done: 爬虫抓取完毕后是否自动结束或等待任务,默认自动结束 @param auto_start_requests: 爬虫是否自动添加任务 @param send_run_time: 发送运行时间 @param batch_interval: 抓取时间间隔 默认为0 天为单位 多次启动时,只有当前时间与第一次抓取结束的时间间隔大于指定的时间间隔时,爬虫才启动 @param wait_lock: 下发任务时否等待锁,若不等待锁,可能会存在多进程同时在下发一样的任务,因此分布式环境下请将该值设置True @param task_table: 任务表, 批次爬虫传递 --------- @result: """ super(Scheduler, self).__init__() for key, value in self.__class__.__custom_setting__.items(): setattr(setting, key, value) log.reload() self._redis_key = redis_key or setting.REDIS_KEY if not self._redis_key: raise Exception( """ redis_key 为redis中存放request与item的目录。不能为空, 可在setting中配置,如 REDIS_KEY = 'test' 或spider初始化时传参, 如 TestSpider(redis_key='test') """ ) self._request_buffer = RequestBuffer(redis_key) self._item_buffer = ItemBuffer(redis_key, task_table) self._collector = Collector(redis_key) self._parsers = [] self._parser_controls = [] self._parser_control_obj = PaserControl self._auto_stop_when_spider_done = ( auto_stop_when_spider_done if auto_stop_when_spider_done is not None else setting.AUTO_STOP_WHEN_SPIDER_DONE ) self._auto_start_requests = ( auto_start_requests if auto_start_requests is not None else setting.SPIDER_AUTO_START_REQUESTS ) self._send_run_time = send_run_time self._batch_interval = batch_interval self._begin_callback = ( begin_callback if begin_callback else lambda: log.info("\n********** feapder begin **********") ) self._end_callback = ( end_callback if end_callback else lambda: log.info("\n********** feapder end **********") ) self._thread_count = ( setting.SPIDER_THREAD_COUNT if not thread_count else thread_count ) self._spider_name = redis_key self._project_name = redis_key.split(":")[0] self._tab_spider_time = setting.TAB_SPIDER_TIME.format(redis_key=redis_key) self._tab_spider_status = setting.TAB_SPIDER_STATUS.format(redis_key=redis_key) self._tab_requests = setting.TAB_REQUSETS.format(redis_key=redis_key) self._tab_failed_requests = setting.TAB_FAILED_REQUSETS.format( redis_key=redis_key ) self._is_notify_end = False # 是否已经通知结束 self._last_task_count = 0 # 最近一次任务数量 self._redisdb = RedisDB() self._project_total_state_table = "{}_total_state".format(self._project_name) self._is_exist_project_total_state_table = False # Request 缓存设置 Request.cached_redis_key = redis_key Request.cached_expire_time = setting.RESPONSE_CACHED_EXPIRE_TIME delete_keys = delete_keys or setting.DELETE_KEYS if delete_keys: self.delete_tables(delete_keys) self._last_check_task_status_time = 0 self.wait_lock = wait_lock def add_parser(self, parser): parser = parser() # parser 实例化 if isinstance(parser, BaseParser): self._parsers.append(parser) else: raise ValueError("类型错误,爬虫需继承feapder.BaseParser或feapder.BatchParser") def run(self): if not self.is_reach_next_spider_time(): return self._start() while True: if self.all_thread_is_done(): if not self._is_notify_end: self.spider_end() # 跑完一轮 self.record_spider_state( spider_type=1, state=1, spider_end_time=tools.get_current_date(), batch_interval=self._batch_interval, ) self._is_notify_end = 
True if self._auto_stop_when_spider_done: self._stop_all_thread() break else: self._is_notify_end = False self.check_task_status() tools.delay_time(1) # 1秒钟检查一次爬虫状态 def __add_task(self): # 启动parser 的 start_requests self.spider_begin() # 不自动结束的爬虫此处只能执行一遍 self.record_spider_state( spider_type=1, state=0, batch_date=tools.get_current_date(), spider_start_time=tools.get_current_date(), batch_interval=self._batch_interval, ) # 判断任务池中属否还有任务,若有接着抓取 todo_task_count = self._collector.get_requests_count() if todo_task_count: log.info("检查到有待做任务 %s 条,不重下发新任务。将接着上回异常终止处继续抓取" % todo_task_count) else: for parser in self._parsers: results = parser.start_requests() # 添加request到请求队列,由请求队列统一入库 if results and not isinstance(results, Iterable): raise Exception("%s.%s返回值必须可迭代" % (parser.name, "start_requests")) result_type = 1 for result in results or []: if isinstance(result, Request): result.parser_name = result.parser_name or parser.name self._request_buffer.put_request(result) result_type = 1 elif isinstance(result, Item): self._item_buffer.put_item(result) result_type = 2 elif callable(result): # callbale的request可能是更新数据库操作的函数 if result_type == 1: self._request_buffer.put_request(result) else: self._item_buffer.put_item(result) else: raise TypeError( "start_requests yield result type error, expect Request、Item、callback func, bug get type: {}".format( type(result) ) ) self._request_buffer.flush() self._item_buffer.flush() def _start(self): # 启动request_buffer self._request_buffer.start() # 启动item_buffer self._item_buffer.start() # 启动collector self._collector.start() # 启动parser control for i in range(self._thread_count): parser_control = self._parser_control_obj( self._collector, self._redis_key, self._request_buffer, self._item_buffer, ) for parser in self._parsers: parser_control.add_parser(parser) parser_control.start() self._parser_controls.append(parser_control) # 下发任务 因为时间可能比较长,放到最后面 if setting.RETRY_FAILED_REQUESTS: # 重设失败的任务, 不用加锁,原子性操作 handle_failed_requests = HandleFailedRequests(self._redis_key) handle_failed_requests.reput_failed_requests_to_requests() # 下发新任务 if self._auto_start_requests: # 自动下发 if self.wait_lock: # 将添加任务处加锁,防止多进程之间添加重复的任务 with RedisLock( key=self._spider_name, timeout=3600, wait_timeout=60, redis_cli=RedisDB().get_redis_obj(), ) as lock: if lock.locked: self.__add_task() else: self.__add_task() def all_thread_is_done(self): for i in range(3): # 降低偶然性, 因为各个环节不是并发的,很有可能当时状态为假,但检测下一条时该状态为真。一次检测很有可能遇到这种偶然性 # 检测 collector 状态 if ( self._collector.is_collector_task() or self._collector.get_requests_count() > 0 ): return False # 检测 parser_control 状态 for parser_control in self._parser_controls: if not parser_control.is_not_task(): return False # 检测 item_buffer 状态 if ( self._item_buffer.get_items_count() > 0 or self._item_buffer.is_adding_to_db() ): return False # 检测 request_buffer 状态 if ( self._request_buffer.get_requests_count() > 0 or self._request_buffer.is_adding_to_db() ): return False tools.delay_time(1) return True @tools.run_safe_model("check_task_status") def check_task_status(self): """ 检查任务状态 预警 """ # 每分钟检查一次 now_time = time.time() if now_time - self._last_check_task_status_time > 60: self._last_check_task_status_time = now_time else: return # 检查redis中任务状态,若连续20分钟内任务数量未发生变化(parser可能卡死),则发出报警信息 task_count = self._redisdb.zget_count(self._tab_requests) if task_count: if task_count != self._last_task_count: self._last_task_count = task_count self._redisdb.hset( self._tab_spider_time, SPIDER_LAST_TASK_COUNT_RECORD_TIME_KEY, tools.get_current_timestamp(), ) # 多进程会重复发消息, 
使用reids记录上次统计时间 else: # 判断时间间隔是否超过20分钟 lua = """ -- local key = KEYS[1] local field = ARGV[1] local current_timestamp = ARGV[2] -- 取值 local last_timestamp = redis.call('hget', KEYS[1], field) if last_timestamp and current_timestamp - last_timestamp >= 1200 then return current_timestamp - last_timestamp -- 返回任务停滞时间 秒 end if not last_timestamp then redis.call('hset', KEYS[1], field, current_timestamp) end return 0 """ redis_obj = self._redisdb.get_redis_obj() cmd = redis_obj.register_script(lua) overtime = cmd( keys=[self._tab_spider_time], args=[ SPIDER_LAST_TASK_COUNT_RECORD_TIME_KEY, tools.get_current_timestamp(), ], ) if overtime: # 发送报警 msg = "《{}》爬虫任务停滞 {},请检查爬虫是否正常".format( self._spider_name, tools.format_seconds(overtime) ) log.error(msg) self.send_msg( msg, level="error", message_prefix="《{}》爬虫任务停滞".format(self._spider_name), ) else: self._last_task_count = 0 # 检查失败任务数量 超过1000 报警, failed_count = self._redisdb.zget_count(self._tab_failed_requests) if failed_count > setting.WARNING_FAILED_COUNT: # 发送报警 msg = "《%s》爬虫当前失败任务 %s, 请检查爬虫是否正常" % (self._spider_name, failed_count) log.error(msg) self.send_msg( msg, level="error", message_prefix="《%s》爬虫当前失败任务数预警" % (self._spider_name), ) # parser_control实时统计已做任务数及失败任务数,若失败数大于10且失败任务数/已做任务数>=0.5 则报警 failed_task_count, success_task_count = PaserControl.get_task_status_count() total_count = success_task_count + failed_task_count if total_count > 0: task_success_rate = success_task_count / total_count if task_success_rate < 0.5: # 发送报警 msg = "《%s》爬虫当前任务成功数%s, 失败数%s, 成功率 %.2f, 请检查爬虫是否正常" % ( self._spider_name, success_task_count, failed_task_count, task_success_rate, ) log.error(msg) # 统计下上次发消息的时间,若时间大于1小时,则报警(此处为多进程,需要考虑别报重复) self.send_msg( msg, level="error", message_prefix="《%s》爬虫当前任务成功率" % (self._spider_name), ) def delete_tables(self, delete_tables_list): if isinstance(delete_tables_list, bool): delete_tables_list = [self._redis_key + "*"] elif not isinstance(delete_tables_list, (list, tuple)): delete_tables_list = [delete_tables_list] redis = RedisDB() for delete_tab in delete_tables_list: if not delete_tab.startswith(self._redis_key): delete_tab = self._redis_key + delete_tab tables = redis.getkeys(delete_tab) for table in tables: if table != self._tab_spider_time: log.info("正在删除key %s" % table) redis.clear(table) def _stop_all_thread(self): self._request_buffer.stop() self._item_buffer.stop() # 停止 collector self._collector.stop() # 停止 parser_controls for parser_control in self._parser_controls: parser_control.stop() def send_msg(self, msg, level="debug", message_prefix=""): if setting.WARNING_LEVEL == "ERROR": if level != "error": return if setting.DINGDING_WARNING_PHONE: keyword = "feapder报警系统\n" tools.dingding_warning(keyword + msg, message_prefix=message_prefix) if setting.EMAIL_RECEIVER: tools.email_warning( msg, message_prefix=message_prefix, title=self._spider_name ) def spider_begin(self): """ @summary: start_monitor_task 方式启动,此函数与spider_end不在同一进程内,变量不可共享 --------- --------- @result: """ if self._begin_callback: self._begin_callback() for parser in self._parsers: parser.start_callback() # 记录开始时间 if not self._redisdb.hexists(self._tab_spider_time, SPIDER_START_TIME_KEY): current_timestamp = tools.get_current_timestamp() self._redisdb.hset( self._tab_spider_time, SPIDER_START_TIME_KEY, current_timestamp ) # 发送消息 self.send_msg("《%s》爬虫开始" % self._spider_name) def spider_end(self): self.record_end_time() if self._end_callback: self._end_callback() for parser in self._parsers: parser.close() parser.end_callback() # 关闭webdirver if 
Request.webdriver_pool: Request.webdriver_pool.close() # 计算抓取时常 data = self._redisdb.hget( self._tab_spider_time, SPIDER_START_TIME_KEY, is_pop=True ) if data: begin_timestamp = int(data) spand_time = tools.get_current_timestamp() - begin_timestamp msg = "《%s》爬虫结束,耗时 %s" % ( self._spider_name, tools.format_seconds(spand_time), ) log.info(msg) if self._send_run_time: self.send_msg(msg) if not self._auto_stop_when_spider_done: log.info("爬虫不自动结束, 等待下一轮任务...") else: self.delete_tables(self._tab_spider_status) def record_end_time(self): # 记录结束时间 if self._batch_interval: current_timestamp = tools.get_current_timestamp() self._redisdb.hset( self._tab_spider_time, SPIDER_END_TIME_KEY, current_timestamp ) def is_reach_next_spider_time(self): if not self._batch_interval: return True last_spider_end_time = self._redisdb.hget( self._tab_spider_time, SPIDER_END_TIME_KEY ) if last_spider_end_time: last_spider_end_time = int(last_spider_end_time) current_timestamp = tools.get_current_timestamp() time_interval = current_timestamp - last_spider_end_time if time_interval < self._batch_interval * 86400: log.info( "上次运行结束时间为 {} 与当前时间间隔 为 {}, 小于规定的抓取时间间隔 {}。爬虫不执行,退出~".format( tools.timestamp_to_date(last_spider_end_time), tools.format_seconds(time_interval), tools.format_seconds(self._batch_interval * 86400), ) ) return False return True def record_spider_state( self, spider_type, state, batch_date=None, spider_start_time=None, spider_end_time=None, batch_interval=None, ): pass
def __init__(self, name, redis_url=None):
    self.name = name
    self.count_cached_name = name + "_count_cached"

    if not self.__class__.redis_db:
        self.__class__.redis_db = RedisDB(url=redis_url)
class Collector(threading.Thread): def __init__(self, redis_key, process_num=None): """ @summary: --------- @param redis_key: @param process_num: 进程编号 --------- @result: """ super(Collector, self).__init__() self._db = RedisDB() self._thread_stop = False self._todo_requests = collections.deque() self._tab_requests = setting.TAB_REQUSETS.format(redis_key=redis_key) self._tab_spider_status = setting.TAB_SPIDER_STATUS.format( redis_key=redis_key) self._spider_mark = LOCAL_HOST_IP + ("_%s" % process_num if process_num else "_0") self._interval = setting.COLLECTOR_SLEEP_TIME self._request_count = setting.COLLECTOR_TASK_COUNT self._is_collector_task = False self._db.clear(self._tab_spider_status) def run(self): while not self._thread_stop: try: self.__input_data() except Exception as e: log.exception(e) self._is_collector_task = False time.sleep(self._interval) def stop(self): self._thread_stop = True def __input_data(self): if len(self._todo_requests) >= self._request_count: return # 汇报节点信息 self._db.zadd(self._tab_spider_status, self._spider_mark, 0) # 未做 request_count = self._request_count # 先赋值 # 根据等待节点数量,动态分配request spider_wait_count = self._db.zget_count(self._tab_spider_status, priority_min=0, priority_max=0) if spider_wait_count: # 任务数量 task_count = self._db.zget_count(self._tab_requests) # 动态分配的数量 = 任务数量 / 休息的节点数量 + 1 request_count = task_count // spider_wait_count + 1 request_count = (request_count if request_count <= self._request_count else self._request_count) if not request_count: return # 取任务,只取当前时间搓以内的任务,同时将任务分数修改为 current_timestamp + setting.REQUEST_TIME_OUT current_timestamp = tools.get_current_timestamp() requests_list = self._db.zrangebyscore_set_score( self._tab_requests, priority_min="-inf", priority_max=current_timestamp, score=current_timestamp + setting.REQUEST_TIME_OUT, count=request_count, ) if requests_list: self._is_collector_task = True # 汇报节点信息 self._db.zadd(self._tab_spider_status, self._spider_mark, 1) # 正在做 # 存request self.__put_requests(requests_list) def __put_requests(self, requests_list): for request in requests_list: try: request_dict = { "request_obj": Request.from_dict(eval(request)), "request_redis": request, } except Exception as e: log.exception(""" error %s request %s """ % (e, request)) request_dict = None if request_dict: self._todo_requests.append(request_dict) def get_requests(self, count): requests = [] count = count if count <= len(self._todo_requests) else len( self._todo_requests) while count: requests.append(self._todo_requests.popleft()) count -= 1 return requests def get_requests_count(self): return len(self._todo_requests) or self._db.zget_count( self._tab_requests) def is_collector_task(self): return self._is_collector_task
class BatchSpider(BatchParser, Scheduler): def __init__( self, task_table, batch_record_table, batch_name, batch_interval, task_keys, task_state="state", min_task_count=10000, check_task_interval=5, task_limit=10000, related_redis_key=None, related_batch_record=None, task_condition="", task_order_by="", redis_key=None, thread_count=None, begin_callback=None, end_callback=None, delete_keys=(), auto_stop_when_spider_done=None, send_run_time=False, ): """ @summary: 批次爬虫 必要条件 1、需有任务表 任务表中必须有id 及 任务状态字段 如 state。如指定parser_name字段,则任务会自动下发到对应的parser下, 否则会下发到所有的parser下。其他字段可根据爬虫需要的参数自行扩充 参考建表语句如下: CREATE TABLE `table_name` ( `id` int(11) NOT NULL AUTO_INCREMENT, `param` varchar(1000) DEFAULT NULL COMMENT '爬虫需要的抓取数据需要的参数', `state` int(11) DEFAULT NULL COMMENT '任务状态', `parser_name` varchar(255) DEFAULT NULL COMMENT '任务解析器的脚本类名', PRIMARY KEY (`id`), UNIQUE KEY `nui` (`param`) USING BTREE ) ENGINE=InnoDB AUTO_INCREMENT=1 DEFAULT CHARSET=utf8; 2、需有批次记录表 不存在自动创建 --------- @param task_table: mysql中的任务表 @param batch_record_table: mysql 中的批次记录表 @param batch_name: 批次采集程序名称 @param batch_interval: 批次间隔 天为单位。 如想一小时一批次,可写成1/24 @param task_keys: 需要获取的任务字段 列表 [] 如需指定解析的parser,则需将parser_name字段取出来。 @param task_state: mysql中任务表的任务状态字段 @param min_task_count: redis 中最少任务数, 少于这个数量会从mysql的任务表取任务 @param check_task_interval: 检查是否还有任务的时间间隔; @param task_limit: 从数据库中取任务的数量 @param redis_key: 任务等数据存放在redis中的key前缀 @param thread_count: 线程数,默认为配置文件中的线程数 @param begin_callback: 爬虫开始回调函数 @param end_callback: 爬虫结束回调函数 @param delete_keys: 爬虫启动时删除的key,类型: 元组/bool/string。 支持正则; 常用于清空任务队列,否则重启时会断点续爬 @param auto_stop_when_spider_done: 爬虫抓取完毕后是否自动结束或等待任务,默认自动结束 @param send_run_time: 发送运行时间 @param related_redis_key: 有关联的其他爬虫任务表(redis)注意:要避免环路 如 A -> B & B -> A 。 @param related_batch_record: 有关联的其他爬虫批次表(mysql)注意:要避免环路 如 A -> B & B -> A 。 related_redis_key 与 related_batch_record 选其一配置即可;用于相关联的爬虫没结束时,本爬虫也不结束 若相关连的爬虫为批次爬虫,推荐以related_batch_record配置, 若相关连的爬虫为普通爬虫,无批次表,可以以related_redis_key配置 @param task_condition: 任务条件 用于从一个大任务表中挑选出数据自己爬虫的任务,即where后的条件语句 @param task_order_by: 取任务时的排序条件 如 id desc --------- @result: """ Scheduler.__init__( self, redis_key=redis_key, thread_count=thread_count, begin_callback=begin_callback, end_callback=end_callback, delete_keys=delete_keys, auto_stop_when_spider_done=auto_stop_when_spider_done, auto_start_requests=False, send_run_time=send_run_time, batch_interval=batch_interval, ) self._redisdb = RedisDB() self._mysqldb = MysqlDB() self._request_buffer = RequestBuffer(self._redis_key) self._task_table = task_table # mysql中的任务表 self._batch_record_table = batch_record_table # mysql 中的批次记录表 self._batch_name = batch_name # 批次采集程序名称 self._task_keys = task_keys # 需要获取的任务字段 self._task_state = task_state # mysql中任务表的state字段名 self._min_task_count = min_task_count # redis 中最少任务数 self._check_task_interval = check_task_interval self._task_limit = task_limit # mysql中一次取的任务数量 self._related_task_tables = [ setting.TAB_REQUSETS.format(redis_key=redis_key) ] # 自己的task表也需要检查是否有任务 if related_redis_key: self._related_task_tables.append( setting.TAB_REQUSETS.format(redis_key=related_redis_key)) self._related_batch_record = related_batch_record self._task_condition = task_condition self._task_condition_prefix_and = task_condition and " and {}".format( task_condition) self._task_condition_prefix_where = task_condition and " where {}".format( task_condition) self._task_order_by = task_order_by and " order by {}".format( task_order_by) self._batch_date_cache = None if self._batch_interval >= 1: self._date_format = "%Y-%m-%d" elif 
self._batch_interval < 1 and self._batch_interval >= 1 / 24: self._date_format = "%Y-%m-%d %H" else: self._date_format = "%Y-%m-%d %H:%M" # 报警相关 self._send_msg_interval = datetime.timedelta(hours=1) # 每隔1小时发送一次报警 self._last_send_msg_time = None self._spider_last_done_time = None # 爬虫最近已做任务数量时间 self._spider_last_done_count = 0 # 爬虫最近已做任务数量 self._spider_deal_speed_cached = None self._is_more_parsers = True # 多模版类爬虫 def init_property(self): """ 每个批次开始时需要重置的属性 @return: """ self._last_send_msg_time = None self._spider_last_done_time = None self._spider_last_done_count = 0 # 爬虫刚开始启动时已做任务数量 def add_parser(self, parser): parser = parser( self._task_table, self._batch_record_table, self._task_state, self._date_format, self._mysqldb, ) # parser 实例化 self._parsers.append(parser) def start_monitor_task(self): """ @summary: 监控任务状态 --------- --------- @result: """ if not self._parsers: # 不是多模版模式, 将自己注入到parsers,自己为模版 self._is_more_parsers = False self._parsers.append(self) elif len(self._parsers) <= 1: self._is_more_parsers = False self.create_batch_record_table() # 添加任务 for parser in self._parsers: parser.add_task() is_first_check = True while True: try: if self.check_batch(is_first_check): # 该批次已经做完 if not self._auto_stop_when_spider_done: is_first_check = True log.info("爬虫所有任务已做完,不自动结束,等待新任务...") time.sleep(self._check_task_interval) continue else: break is_first_check = False # 检查redis中是否有任务 任务小于_min_task_count 则从mysql中取 tab_requests = setting.TAB_REQUSETS.format( redis_key=self._redis_key) todo_task_count = self._redisdb.zget_count(tab_requests) tasks = [] if todo_task_count < self._min_task_count: # 从mysql中取任务 # 更新batch表的任务状态数量 self.update_task_done_count() log.info("redis 中剩余任务%s 数量过小 从mysql中取任务追加" % todo_task_count) tasks = self.get_todo_task_from_mysql() if not tasks: # 状态为0的任务已经做完,需要检查状态为2的任务是否丢失 if (todo_task_count == 0 ): # redis 中无待做任务,此时mysql中状态为2的任务为丢失任务。需重新做 lose_task_count = self.get_lose_task_count() if not lose_task_count: time.sleep(self._check_task_interval) continue elif ( lose_task_count > self._task_limit * 5 ): # 丢失任务太多,直接重置,否则每次等redis任务消耗完再取下一批丢失任务,速度过慢 log.info("正在重置丢失任务为待做 共 {} 条".format( lose_task_count)) # 重置正在做的任务为待做 if self.reset_lose_task_from_mysql(): log.info("重置丢失任务成功") else: log.info("重置丢失任务失败") continue else: # 丢失任务少,直接取 log.info("正在取丢失任务 共 {} 条, 取 {} 条".format( lose_task_count, self._task_limit if self._task_limit <= lose_task_count else lose_task_count, )) tasks = self.get_doing_task_from_mysql() else: log.info("mysql 中取到待做任务 %s 条" % len(tasks)) else: log.info("redis 中尚有%s条积压任务,暂时不派发新任务" % todo_task_count) if not tasks: if todo_task_count >= self._min_task_count: # log.info('任务正在进行 redis中剩余任务 %s' % todo_task_count) pass else: log.info("mysql 中无待做任务 redis中剩余任务 %s" % todo_task_count) else: # make start requests self.distribute_task(tasks) log.info("添加任务到redis成功") except Exception as e: log.exception(e) time.sleep(self._check_task_interval) def create_batch_record_table(self): sql = ( "select table_name from information_schema.tables where table_name like '%s'" % self._batch_record_table) tables_name = self._mysqldb.find(sql) if not tables_name: sql = """ CREATE TABLE `{table_name}` ( `id` int(11) UNSIGNED NOT NULL AUTO_INCREMENT, `batch_date` {batch_date} DEFAULT NULL COMMENT '批次时间', `total_count` int(11) DEFAULT NULL COMMENT '任务总数', `done_count` int(11) DEFAULT NULL COMMENT '完成数 (1,-1)', `fail_count` int(11) DEFAULT NULL COMMENT '失败任务数 (-1)', `interval` float(11) DEFAULT NULL COMMENT '批次间隔', `interval_unit` varchar(20) DEFAULT NULL COMMENT '批次间隔单位 day, hour', 
`create_time` datetime DEFAULT CURRENT_TIMESTAMP COMMENT '批次开始时间', `update_time` datetime DEFAULT CURRENT_TIMESTAMP COMMENT '本条记录更新时间', `is_done` int(11) DEFAULT '0' COMMENT '批次是否完成 0 未完成 1 完成', PRIMARY KEY (`id`) ) ENGINE=InnoDB AUTO_INCREMENT=1 DEFAULT CHARSET=utf8; """.format( table_name=self._batch_record_table, batch_date="date" if self._date_format == "%Y-%m-%d" else "datetime", ) self._mysqldb.execute(sql) def distribute_task(self, tasks): """ @summary: 分发任务 --------- @param tasks: --------- @result: """ if self._is_more_parsers: # 为多模版类爬虫,需要下发指定的parser for task in tasks: for parser in self._parsers: # 寻找task对应的parser if parser.name in task: requests = parser.start_requests(task) if requests and not isinstance(requests, Iterable): raise Exception("%s.%s返回值必须可迭代" % (parser.name, "start_requests")) result_type = 1 for request in requests or []: if isinstance(request, Request): request.parser_name = request.parser_name or parser.name self._request_buffer.put_request(request) result_type = 1 elif isinstance(request, Item): self._item_buffer.put_item(request) result_type = 2 if (self._item_buffer.get_items_count() >= MAX_ITEM_COUNT): self._item_buffer.flush() elif callable( request): # callbale的request可能是更新数据库操作的函数 if result_type == 1: self._request_buffer.put_request(request) else: self._item_buffer.put_item(request) if (self._item_buffer.get_items_count() >= MAX_ITEM_COUNT): self._item_buffer.flush() else: raise TypeError( "start_requests yield result type error, expect Request、Item、callback func, bug get type: {}" .format(type(requests))) break else: # task没对应的parser 则将task下发到所有的parser for task in tasks: for parser in self._parsers: requests = parser.start_requests(task) if requests and not isinstance(requests, Iterable): raise Exception("%s.%s返回值必须可迭代" % (parser.name, "start_requests")) result_type = 1 for request in requests or []: if isinstance(request, Request): request.parser_name = request.parser_name or parser.name self._request_buffer.put_request(request) result_type = 1 elif isinstance(request, Item): self._item_buffer.put_item(request) result_type = 2 if self._item_buffer.get_items_count( ) >= MAX_ITEM_COUNT: self._item_buffer.flush() elif callable( request): # callbale的request可能是更新数据库操作的函数 if result_type == 1: self._request_buffer.put_request(request) else: self._item_buffer.put_item(request) if (self._item_buffer.get_items_count() >= MAX_ITEM_COUNT): self._item_buffer.flush() self._request_buffer.flush() self._item_buffer.flush() def __get_task_state_count(self): sql = "select {state}, count(1) from {task_table}{task_condition} group by {state}".format( state=self._task_state, task_table=self._task_table, task_condition=self._task_condition_prefix_where, ) task_state_count = self._mysqldb.find(sql) task_state = { "total_count": sum(count for state, count in task_state_count), "done_count": sum(count for state, count in task_state_count if state in (1, -1)), "failed_count": sum(count for state, count in task_state_count if state == -1), } return task_state def update_task_done_count(self): """ @summary: 更新批次表中的任务状态 --------- --------- @result: """ task_count = self.__get_task_state_count() # log.info('《%s》 批次进度 %s/%s' % (self._batch_name, done_task_count, total_task_count)) # 更新批次表 sql = "update {} set done_count = {}, total_count = {}, fail_count = {}, update_time = CURRENT_TIME, is_done=0, `interval` = {}, interval_unit = '{}' where batch_date = '{}'".format( self._batch_record_table, task_count.get("done_count"), task_count.get("total_count"), 
task_count.get("failed_count"), self._batch_interval if self._batch_interval >= 1 else self._batch_interval * 24, "day" if self._batch_interval >= 1 else "hour", self.batch_date, ) self._mysqldb.update(sql) def update_is_done(self): sql = "update {} set is_done = 1, update_time = CURRENT_TIME where batch_date = '{}' and is_done = 0".format( self._batch_record_table, self.batch_date) self._mysqldb.update(sql) def get_todo_task_from_mysql(self): """ @summary: 取待做的任务 --------- --------- @result: """ # TODO 分批取数据 每批最大取 1000000个,防止内存占用过大 # 查询任务 sql = "select %s from %s where %s = 0%s%s limit %s" % ( ", ".join(self._task_keys), self._task_table, self._task_state, self._task_condition_prefix_and, self._task_order_by, self._task_limit, ) tasks = self._mysqldb.find(sql) if tasks: # 更新任务状态 for i in range(0, len(tasks), 10000): # 10000 一批量更新 task_ids = str(tuple([task[0] for task in tasks[i:i + 10000] ])).replace(",)", ")") sql = "update %s set %s = 2 where id in %s" % ( self._task_table, self._task_state, task_ids, ) self._mysqldb.update(sql) return tasks def get_doing_task_from_mysql(self): """ @summary: 取正在做的任务 --------- --------- @result: """ # 查询任务 sql = "select %s from %s where %s = 2%s%s limit %s" % ( ", ".join(self._task_keys), self._task_table, self._task_state, self._task_condition_prefix_and, self._task_order_by, self._task_limit, ) tasks = self._mysqldb.find(sql) return tasks def get_lose_task_count(self): sql = 'select date_format(batch_date, "{date_format}"), total_count, done_count from {batch_record_table} order by id desc limit 1'.format( date_format=self._date_format.replace(":%M", ":%i"), batch_record_table=self._batch_record_table, ) batch_info = self._mysqldb.find(sql) # (('2018-08-19', 49686, 0),) batch_date, total_count, done_count = batch_info[0] return total_count - done_count def reset_lose_task_from_mysql(self): """ @summary: 重置丢失任务为待做 --------- --------- @result: """ sql = "update {table} set {state} = 0 where {state} = 2{task_condition}".format( table=self._task_table, state=self._task_state, task_condition=self._task_condition_prefix_and, ) return self._mysqldb.update(sql) def get_deal_speed(self, total_count, done_count, last_batch_date): """ 获取处理速度 @param total_count: 总数量 @param done_count: 做完数量 @param last_batch_date: 批次时间 datetime @return: deal_speed (条/小时), need_time (秒), overflow_time(秒) ( overflow_time < 0 时表示提前多少秒完成 ) 或 None """ if not self._spider_last_done_count: now_date = datetime.datetime.now() self._spider_last_done_count = done_count self._spider_last_done_time = now_date if done_count > self._spider_last_done_count: now_date = datetime.datetime.now() time_interval = (now_date - self._spider_last_done_time).total_seconds() deal_speed = (done_count - self._spider_last_done_count) / time_interval # 条/秒 need_time = (total_count - done_count) / deal_speed # 单位秒 overflow_time = ( (now_date - last_batch_date).total_seconds() + need_time - datetime.timedelta(days=self._batch_interval).total_seconds() ) # 溢出时间 秒 calculate_speed_time = now_date.strftime( "%Y-%m-%d %H:%M:%S") # 统计速度时间 deal_speed = int(deal_speed * 3600) # 条/小时 # 更新最近已做任务数及时间 self._spider_last_done_count = done_count self._spider_last_done_time = now_date self._spider_deal_speed_cached = ( deal_speed, need_time, overflow_time, calculate_speed_time, ) return self._spider_deal_speed_cached def init_task(self): """ @summary: 初始化任务表中的任务, 新一个批次开始时调用。 可能会重写 --------- --------- @result: """ sql = "update {task_table} set {state} = 0 where {state} != -1{task_condition}".format( task_table=self._task_table, 
state=self._task_state, task_condition=self._task_condition_prefix_and, ) return self._mysqldb.update(sql) def check_batch(self, is_first_check=False): """ @summary: 检查批次是否完成 --------- @param: is_first_check 是否为首次检查,若首次检查,且检查结果为批次已完成,则不发送批次完成消息。因为之前发送过了 --------- @result: 完成返回True 否则False """ sql = 'select date_format(batch_date, "{date_format}"), total_count, done_count from {batch_record_table} order by id desc limit 1'.format( date_format=self._date_format.replace(":%M", ":%i"), batch_record_table=self._batch_record_table, ) batch_info = self._mysqldb.find(sql) # (('2018-08-19', 49686, 0),) if batch_info: batch_date, total_count, done_count = batch_info[0] now_date = datetime.datetime.now() last_batch_date = datetime.datetime.strptime( batch_date, self._date_format) time_difference = now_date - last_batch_date if total_count == done_count and time_difference < datetime.timedelta( days=self._batch_interval): # 若在本批次内,再次检查任务表是否有新增任务 # # 改成查询任务表 看是否真的没任务了,因为batch_record表里边的数量可能没来得及更新 task_count = self.__get_task_state_count() total_count = task_count.get("total_count") done_count = task_count.get("done_count") if total_count == done_count: # 检查相关联的爬虫是否完成 releated_spider_is_done = self.related_spider_is_done() if releated_spider_is_done == False: msg = "《{}》本批次未完成, 正在等待依赖爬虫 {} 结束. 批次时间 {} 批次进度 {}/{}".format( self._batch_name, self._related_batch_record or self._related_task_tables, batch_date, done_count, total_count, ) log.info(msg) # 检查是否超时 超时发出报警 if time_difference >= datetime.timedelta( days=self._batch_interval): # 已经超时 if (not self._last_send_msg_time or now_date - self._last_send_msg_time >= self._send_msg_interval): self._last_send_msg_time = now_date self.send_msg(msg, level="error") return False elif releated_spider_is_done == True: # 更新is_done 状态 self.update_is_done() else: self.update_is_done() msg = "《{}》本批次完成 批次时间 {} 共处理 {} 条任务".format( self._batch_name, batch_date, done_count) log.info(msg) if not is_first_check: self.send_msg(msg) # 判断下一批次是否到 if time_difference >= datetime.timedelta( days=self._batch_interval): msg = "《{}》下一批次开始".format(self._batch_name) log.info(msg) self.send_msg(msg) # 初始化任务表状态 if self.init_task() != False: # 更新失败返回False 其他返回True/None # 初始化属性 self.init_property() is_success = ( self.record_batch() ) # 有可能插入不成功,但是任务表已经重置了,不过由于当前时间为下一批次的时间,检查批次是否结束时不会检查任务表,所以下次执行时仍然会重置 if is_success: log.info( "插入新批次记录成功 1分钟后开始下发任务") # 防止work批次时间没来得及更新 tools.delay_time(60) return False # 下一批次开始 else: return True # 下一批次不开始。先不派发任务,因为批次表新批次插入失败了,需要插入成功后再派发任务 else: log.info("《{}》下次批次时间未到".format(self._batch_name)) if not is_first_check: self.send_msg("《{}》下次批次时间未到".format(self._batch_name)) return True else: if time_difference >= datetime.timedelta( days=self._batch_interval): # 已经超时 time_out = time_difference - datetime.timedelta( days=self._batch_interval) time_out_pretty = tools.format_seconds( time_out.total_seconds()) msg = "《{}》本批次已超时{} 批次时间 {}, 批次进度 {}/{}".format( self._batch_name, time_out_pretty, batch_date, done_count, total_count, ) if self._batch_interval >= 1: msg += ", 期望时间{}天".format(self._batch_interval) else: msg += ", 期望时间{}小时".format(self._batch_interval * 24) result = self.get_deal_speed( total_count=total_count, done_count=done_count, last_batch_date=last_batch_date, ) if result: deal_speed, need_time, overflow_time, calculate_speed_time = ( result) msg += ", 任务处理速度于{}统计, 约 {}条/小时, 预计还需 {}".format( calculate_speed_time, deal_speed, tools.format_seconds(need_time), ) if overflow_time > 0: msg += ", 该批次预计总超时 {}, 请及时处理".format( 
tools.format_seconds(overflow_time)) log.info(msg) if (not self._last_send_msg_time or now_date - self._last_send_msg_time >= self._send_msg_interval): self._last_send_msg_time = now_date self.send_msg(msg, level="error") else: # 未超时 remaining_time = ( datetime.timedelta(days=self._batch_interval) - time_difference) remaining_time_pretty = tools.format_seconds( remaining_time.total_seconds()) if self._batch_interval >= 1: msg = "《{}》本批次正在进行, 批次时间 {}, 批次进度 {}/{}, 期望时间{}天, 剩余{}".format( self._batch_name, batch_date, done_count, total_count, self._batch_interval, remaining_time_pretty, ) else: msg = "《{}》本批次正在进行, 批次时间 {}, 批次进度 {}/{}, 期望时间{}小时, 剩余{}".format( self._batch_name, batch_date, done_count, total_count, self._batch_interval * 24, remaining_time_pretty, ) result = self.get_deal_speed( total_count=total_count, done_count=done_count, last_batch_date=last_batch_date, ) if result: deal_speed, need_time, overflow_time, calculate_speed_time = ( result) msg += ", 任务处理速度于{}统计, 约 {}条/小时, 预计还需 {}".format( calculate_speed_time, deal_speed, tools.format_seconds(need_time), ) if overflow_time > 0: msg += ", 该批次可能会超时 {}, 请及时处理".format( tools.format_seconds(overflow_time)) # 发送警报 if (not self._last_send_msg_time or now_date - self._last_send_msg_time >= self._send_msg_interval): self._last_send_msg_time = now_date self.send_msg(msg, level="error") elif overflow_time < 0: msg += ", 该批次预计提前 {} 完成".format( tools.format_seconds(-overflow_time)) log.info(msg) else: # 插入batch_date self.record_batch() # 初始化任务表状态 可能有产生任务的代码 self.init_task() return False def related_spider_is_done(self): """ 相关连的爬虫是否跑完 @return: True / False / None 表示无相关的爬虫 可由自身的total_count 和 done_count 来判断 """ for related_redis_task_table in self._related_task_tables: if self._redisdb.exists_key(related_redis_task_table): return False if self._related_batch_record: sql = "select is_done from {} order by id desc limit 1".format( self._related_batch_record) is_done = self._mysqldb.find(sql) is_done = is_done[0][0] if is_done else None if is_done is None: log.warning("相关联的批次表不存在或无批次信息") return None if not is_done: return False return True def record_batch(self): """ @summary: 记录批次信息(初始化) --------- --------- @result: """ # 查询总任务数 sql = "select count(1) from %s%s" % ( self._task_table, self._task_condition_prefix_where, ) total_task_count = self._mysqldb.find(sql)[0][0] batch_date = tools.get_current_date(self._date_format) sql = ( "insert into %s (batch_date, done_count, total_count, `interval`, interval_unit, create_time) values ('%s', %s, %s, %s, '%s', CURRENT_TIME)" % ( self._batch_record_table, batch_date, 0, total_task_count, self._batch_interval if self._batch_interval >= 1 else self._batch_interval * 24, "day" if self._batch_interval >= 1 else "hour", )) affect_count = self._mysqldb.add(sql) # None / 0 / 1 (1 为成功) if affect_count: # 重置批次日期 self._batch_date_cache = batch_date # 重新刷下self.batch_date 中的 os.environ.get('batch_date') 否则日期还停留在上一个批次 os.environ["batch_date"] = self._batch_date_cache # 爬虫开始 self.spider_begin() self.record_spider_state( spider_type=2, state=0, batch_date=batch_date, spider_start_time=tools.get_current_date(), batch_interval=self._batch_interval, ) else: log.error("插入新批次失败") return affect_count # -------- 批次结束逻辑 ------------ def task_is_done(self): """ @summary: 检查任务状态 是否做完 同时更新批次时间 (不能挂 挂了批次时间就不更新了) --------- --------- @result: True / False (做完 / 未做完) """ is_done = False # 查看批次记录表任务状态 sql = 'select date_format(batch_date, "{date_format}"), total_count, done_count, is_done from {batch_record_table} order by id desc 
limit 1'.format( date_format=self._date_format.replace(":%M", ":%i"), batch_record_table=self._batch_record_table, ) batch_info = self._mysqldb.find(sql) if batch_info is None: raise Exception("查询批次信息失败") if batch_info: self._batch_date_cache, total_count, done_count, is_done = batch_info[ 0] # 更新self._batch_date_cache, 防止新批次已经开始了,但self._batch_date_cache还是原来的批次时间 log.info("《%s》 批次时间%s 批次进度 %s/%s 完成状态 %d" % ( self._batch_name, self._batch_date_cache, done_count, total_count, is_done, )) os.environ[ "batch_date"] = self._batch_date_cache # 更新BatchParser里边的批次时间 if is_done: # 检查任务表中是否有没做的任务 若有则is_done 为 False # 比较耗时 加锁防止多进程同时查询 with RedisLock( key=self._spider_name, timeout=3600, wait_timeout=0, redis_cli=RedisDB().get_redis_obj(), ) as lock: if lock.locked: log.info("批次表标记已完成,正在检查任务表是否有未完成的任务") sql = "select 1 from %s where (%s = 0 or %s=2)%s limit 1" % ( self._task_table, self._task_state, self._task_state, self._task_condition_prefix_and, ) tasks = self._mysqldb.find(sql) # [(1,)] / [] if tasks: log.info("检测到任务表中有未完成任务,等待任务下发") is_done = False # 更新batch_record 表的is_done 状态,减少查询任务表的次数 sql = 'update {batch_record_table} set is_done = 0 where batch_date = "{batch_date}"'.format( batch_record_table=self._batch_record_table, batch_date=self._batch_date_cache, ) self._mysqldb.update(sql) else: log.info("任务表中任务均已完成,爬虫结束") else: log.info("批次表标记已完成,其他爬虫进程正在检查任务表是否有未完成的任务,本进程跳过检查,继续等待") is_done = False return is_done def run(self): """ @summary: 重写run方法 检查mysql中的任务是否做完, 做完停止 --------- --------- @result: """ try: self.create_batch_record_table() if not self._parsers: # 不是add_parser 模式 self._parsers.append(self) self._start() while True: if ( self.task_is_done() and self.all_thread_is_done() ): # redis全部的任务已经做完 并且mysql中的任务已经做完(检查各个线程all_thread_is_done,防止任务没做完,就更新任务状态,导致程序结束的情况) if not self._is_notify_end: self.spider_end() self.record_spider_state( spider_type=2, state=1, batch_date=self._batch_date_cache, spider_end_time=tools.get_current_date(), batch_interval=self._batch_interval, ) self._is_notify_end = True if self._auto_stop_when_spider_done: self._stop_all_thread() break else: self._is_notify_end = False self.check_task_status() tools.delay_time(10) # 10秒钟检查一次爬虫状态 except Exception as e: msg = "《%s》主线程异常 爬虫结束 exception: %s" % (self._batch_name, e) log.error(msg) self.send_msg(msg) os._exit(137) # 使退出码为35072 方便爬虫管理器重启 @classmethod def to_DebugBatchSpider(cls, *args, **kwargs): # DebugBatchSpider 继承 cls DebugBatchSpider.__bases__ = (cls, ) DebugBatchSpider.__name__ = cls.__name__ return DebugBatchSpider(*args, **kwargs)
class ItemBuffer(threading.Thread, Singleton): dedup = None def __init__(self, redis_key, task_table=None): if not hasattr(self, "_table_item"): super(ItemBuffer, self).__init__() self._thread_stop = False self._is_adding_to_db = False self._redis_key = redis_key self._task_table = task_table self._items_queue = Queue(maxsize=MAX_ITEM_COUNT) self._db = RedisDB() self._table_item = setting.TAB_ITEM self._table_request = setting.TAB_REQUSETS.format(redis_key=redis_key) self._item_tables = { # 'xxx_item': {'tab_item': 'xxx:xxx_item'} # 记录item名与redis中item名对应关系 } self._item_update_keys = { # 'xxx:xxx_item': ['id', 'name'...] # 记录redis中item名与需要更新的key对应关系 } self._pipelines = self.load_pipelines() self._have_mysql_pipeline = MYSQL_PIPELINE_PATH in setting.ITEM_PIPELINES self._mysql_pipeline = None if setting.ITEM_FILTER_ENABLE and not self.__class__.dedup: self.__class__.dedup = Dedup(to_md5=False) def load_pipelines(self): pipelines = [] for pipeline_path in setting.ITEM_PIPELINES: module, class_name = pipeline_path.rsplit(".", 1) pipeline_cls = importlib.import_module(module).__getattribute__(class_name) pipeline = pipeline_cls() if not isinstance(pipeline, BasePipeline): raise ValueError(f"{pipeline_path} 需继承 feapder.pipelines.BasePipeline") pipelines.append(pipeline) return pipelines @property def mysql_pipeline(self): if not self._mysql_pipeline: module, class_name = MYSQL_PIPELINE_PATH.rsplit(".", 1) pipeline_cls = importlib.import_module(module).__getattribute__(class_name) self._mysql_pipeline = pipeline_cls() return self._mysql_pipeline def run(self): while not self._thread_stop: self.flush() tools.delay_time(0.5) self.close() def stop(self): self._thread_stop = True def put_item(self, item): if isinstance(item, Item): # 入库前的回调 item.pre_to_db() self._items_queue.put(item) def flush(self): try: items = [] update_items = [] requests = [] callbacks = [] items_fingerprints = [] data_count = 0 while not self._items_queue.empty(): data = self._items_queue.get_nowait() data_count += 1 # data 分类 if callable(data): callbacks.append(data) elif isinstance(data, UpdateItem): update_items.append(data) elif isinstance(data, Item): items.append(data) if setting.ITEM_FILTER_ENABLE: items_fingerprints.append(data.fingerprint) else: # request-redis requests.append(data) if data_count >= UPLOAD_BATCH_MAX_SIZE: self.__add_item_to_db( items, update_items, requests, callbacks, items_fingerprints ) items = [] update_items = [] requests = [] callbacks = [] items_fingerprints = [] data_count = 0 if data_count: self.__add_item_to_db( items, update_items, requests, callbacks, items_fingerprints ) except Exception as e: log.exception(e) def get_items_count(self): return self._items_queue.qsize() def is_adding_to_db(self): return self._is_adding_to_db def __dedup_items(self, items, items_fingerprints): """ 去重 @param items: @param items_fingerprints: @return: 返回去重后的items, items_fingerprints """ if not items: return items, items_fingerprints is_exists = self.__class__.dedup.get(items_fingerprints) is_exists = is_exists if isinstance(is_exists, list) else [is_exists] dedup_items = [] dedup_items_fingerprints = [] items_count = dedup_items_count = dup_items_count = 0 while is_exists: item = items.pop(0) items_fingerprint = items_fingerprints.pop(0) is_exist = is_exists.pop(0) items_count += 1 if not is_exist: dedup_items.append(item) dedup_items_fingerprints.append(items_fingerprint) dedup_items_count += 1 else: dup_items_count += 1 log.info( "待入库数据 {} 条, 重复 {} 条,实际待入库数据 {} 条".format( items_count, dup_items_count, 
dedup_items_count ) ) return dedup_items, dedup_items_fingerprints def __pick_items(self, items, is_update_item=False): """ 将每个表之间的数据分开 拆分后 原items为空 @param items: @param is_update_item: @return: """ datas_dict = { # 'xxx:xxx_item': [{}, {}] redis 中的item名与对应的数据 } while items: item = items.pop(0) # 取item下划线格式的名 # 下划线类的名先从dict中取,没有则现取,然后存入dict。加快下次取的速度 item_name = item.item_name item_table = self._item_tables.get(item_name) if not item_table: item_name_underline = item.name_underline tab_item = self._table_item.format( redis_key=self._redis_key, item_name=item_name_underline ) item_table = {} item_table["tab_item"] = tab_item self._item_tables[item_name] = item_table else: tab_item = item_table.get("tab_item") if tab_item not in datas_dict: datas_dict[tab_item] = [] datas_dict[tab_item].append(item.to_dict) if is_update_item and tab_item not in self._item_update_keys: self._item_update_keys[tab_item] = item.update_key return datas_dict def __export_to_db(self, tab_item, datas, is_update=False, update_keys=()): to_table = tools.get_info(tab_item, ":s_(.*?)_item$", fetch_one=True) # 打点 校验 self.check_datas(table=to_table, datas=datas) for pipeline in self._pipelines: if is_update: if to_table == self._task_table and not isinstance( pipeline, MysqlPipeline ): continue if not pipeline.update_items(to_table, datas, update_keys=update_keys): log.error( f"{pipeline.__class__.__name__} 更新数据失败. table: {to_table} items: {datas}" ) return False else: if not pipeline.save_items(to_table, datas): log.error( f"{pipeline.__class__.__name__} 保存数据失败. table: {to_table} items: {datas}" ) return False # 若是任务表, 且上面的pipeline里没mysql,则需调用mysql更新任务 if not self._have_mysql_pipeline and is_update and to_table == self._task_table: if not self.mysql_pipeline.update_items( to_table, datas, update_keys=update_keys ): log.error( f"{pipeline.__class__.__name__} 更新数据失败. table: {to_table} items: {datas}" ) return False return True def __add_item_to_db( self, items, update_items, requests, callbacks, items_fingerprints ): export_success = False self._is_adding_to_db = True # 去重 if setting.ITEM_FILTER_ENABLE: items, items_fingerprints = self.__dedup_items(items, items_fingerprints) # 分捡 items_dict = self.__pick_items(items) update_items_dict = self.__pick_items(update_items, is_update_item=True) # item批量入库 while items_dict: tab_item, datas = items_dict.popitem() log.debug( """ -------------- item 批量入库 -------------- 表名: %s datas: %s """ % (tab_item, tools.dumps_json(datas, indent=16)) ) export_success = self.__export_to_db(tab_item, datas) # 执行批量update while update_items_dict: tab_item, datas = update_items_dict.popitem() log.debug( """ -------------- item 批量更新 -------------- 表名: %s datas: %s """ % (tab_item, tools.dumps_json(datas, indent=16)) ) update_keys = self._item_update_keys.get(tab_item) export_success = self.__export_to_db( tab_item, datas, is_update=True, update_keys=update_keys ) # 执行回调 while callbacks: try: callback = callbacks.pop(0) callback() except Exception as e: log.exception(e) # 删除做过的request if requests: self._db.zrem(self._table_request, requests) # 去重入库 if export_success and setting.ITEM_FILTER_ENABLE: if items_fingerprints: self.__class__.dedup.add(items_fingerprints, skip_check=True) self._is_adding_to_db = False def check_datas(self, table, datas): """ 打点 记录总条数及每个key情况 @param table: 表名 @param datas: 数据 列表 @return: """ pass def close(self): pass
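# ---------------------------------------------------------------------------
# Illustrative sketch (assumes a configured feapder project: Redis reachable,
# ITEM_PIPELINES set in setting). DemoItem is a hypothetical Item subclass;
# in normal use parsers simply `yield item` and the framework feeds ItemBuffer,
# this only shows the buffer's own flow: put_item() queues, the background
# thread (or an explicit flush()) batches the queue to the pipelines.
# ---------------------------------------------------------------------------
from feapder import Item


class DemoItem(Item):
    def __init__(self):
        super().__init__()
        self.title = None  # field to be stored


item_buffer = ItemBuffer(redis_key="test:demo")
item_buffer.start()              # background thread calls flush() every 0.5s

demo_item = DemoItem()
demo_item.title = "hello"
item_buffer.put_item(demo_item)  # pre_to_db() runs, then the item is queued

item_buffer.flush()              # or wait for the thread to batch it out
item_buffer.stop()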
def __init__( self, task_table, batch_record_table, batch_name, batch_interval, task_keys, task_state="state", min_task_count=10000, check_task_interval=5, task_limit=10000, related_redis_key=None, related_batch_record=None, task_condition="", task_order_by="", redis_key=None, thread_count=None, begin_callback=None, end_callback=None, delete_keys=(), auto_stop_when_spider_done=None, send_run_time=False, ): """ @summary: 批次爬虫 必要条件 1、需有任务表 任务表中必须有id 及 任务状态字段 如 state。如指定parser_name字段,则任务会自动下发到对应的parser下, 否则会下发到所有的parser下。其他字段可根据爬虫需要的参数自行扩充 参考建表语句如下: CREATE TABLE `table_name` ( `id` int(11) NOT NULL AUTO_INCREMENT, `param` varchar(1000) DEFAULT NULL COMMENT '爬虫需要的抓取数据需要的参数', `state` int(11) DEFAULT NULL COMMENT '任务状态', `parser_name` varchar(255) DEFAULT NULL COMMENT '任务解析器的脚本类名', PRIMARY KEY (`id`), UNIQUE KEY `nui` (`param`) USING BTREE ) ENGINE=InnoDB AUTO_INCREMENT=1 DEFAULT CHARSET=utf8; 2、需有批次记录表 不存在自动创建 --------- @param task_table: mysql中的任务表 @param batch_record_table: mysql 中的批次记录表 @param batch_name: 批次采集程序名称 @param batch_interval: 批次间隔 天为单位。 如想一小时一批次,可写成1/24 @param task_keys: 需要获取的任务字段 列表 [] 如需指定解析的parser,则需将parser_name字段取出来。 @param task_state: mysql中任务表的任务状态字段 @param min_task_count: redis 中最少任务数, 少于这个数量会从mysql的任务表取任务 @param check_task_interval: 检查是否还有任务的时间间隔; @param task_limit: 从数据库中取任务的数量 @param redis_key: 任务等数据存放在redis中的key前缀 @param thread_count: 线程数,默认为配置文件中的线程数 @param begin_callback: 爬虫开始回调函数 @param end_callback: 爬虫结束回调函数 @param delete_keys: 爬虫启动时删除的key,类型: 元组/bool/string。 支持正则; 常用于清空任务队列,否则重启时会断点续爬 @param auto_stop_when_spider_done: 爬虫抓取完毕后是否自动结束或等待任务,默认自动结束 @param send_run_time: 发送运行时间 @param related_redis_key: 有关联的其他爬虫任务表(redis)注意:要避免环路 如 A -> B & B -> A 。 @param related_batch_record: 有关联的其他爬虫批次表(mysql)注意:要避免环路 如 A -> B & B -> A 。 related_redis_key 与 related_batch_record 选其一配置即可;用于相关联的爬虫没结束时,本爬虫也不结束 若相关连的爬虫为批次爬虫,推荐以related_batch_record配置, 若相关连的爬虫为普通爬虫,无批次表,可以以related_redis_key配置 @param task_condition: 任务条件 用于从一个大任务表中挑选出数据自己爬虫的任务,即where后的条件语句 @param task_order_by: 取任务时的排序条件 如 id desc --------- @result: """ Scheduler.__init__( self, redis_key=redis_key, thread_count=thread_count, begin_callback=begin_callback, end_callback=end_callback, delete_keys=delete_keys, auto_stop_when_spider_done=auto_stop_when_spider_done, auto_start_requests=False, send_run_time=send_run_time, batch_interval=batch_interval, ) self._redisdb = RedisDB() self._mysqldb = MysqlDB() self._request_buffer = RequestBuffer(self._redis_key) self._task_table = task_table # mysql中的任务表 self._batch_record_table = batch_record_table # mysql 中的批次记录表 self._batch_name = batch_name # 批次采集程序名称 self._task_keys = task_keys # 需要获取的任务字段 self._task_state = task_state # mysql中任务表的state字段名 self._min_task_count = min_task_count # redis 中最少任务数 self._check_task_interval = check_task_interval self._task_limit = task_limit # mysql中一次取的任务数量 self._related_task_tables = [ setting.TAB_REQUSETS.format(redis_key=redis_key) ] # 自己的task表也需要检查是否有任务 if related_redis_key: self._related_task_tables.append( setting.TAB_REQUSETS.format(redis_key=related_redis_key)) self._related_batch_record = related_batch_record self._task_condition = task_condition self._task_condition_prefix_and = task_condition and " and {}".format( task_condition) self._task_condition_prefix_where = task_condition and " where {}".format( task_condition) self._task_order_by = task_order_by and " order by {}".format( task_order_by) self._batch_date_cache = None if self._batch_interval >= 1: self._date_format = "%Y-%m-%d" elif self._batch_interval < 1 and self._batch_interval >= 
1 / 24: self._date_format = "%Y-%m-%d %H" else: self._date_format = "%Y-%m-%d %H:%M" # 报警相关 self._send_msg_interval = datetime.timedelta(hours=1) # 每隔1小时发送一次报警 self._last_send_msg_time = None self._spider_last_done_time = None # 爬虫最近已做任务数量时间 self._spider_last_done_count = 0 # 爬虫最近已做任务数量 self._spider_deal_speed_cached = None self._is_more_parsers = True # 多模版类爬虫
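# ---------------------------------------------------------------------------
# Minimal standalone sketch of the batch-date granularity rule implemented
# above (batch_interval is in days; illustration only, not framework code):
# ---------------------------------------------------------------------------
def batch_date_format(batch_interval: float) -> str:
    if batch_interval >= 1:
        return "%Y-%m-%d"        # e.g. batch_interval=7 -> weekly batch, date only
    elif batch_interval >= 1 / 24:
        return "%Y-%m-%d %H"     # e.g. batch_interval=1/24 -> hourly batches
    else:
        return "%Y-%m-%d %H:%M"  # e.g. batch_interval=1/1440 -> per-minute batches


assert batch_date_format(7) == "%Y-%m-%d"
assert batch_date_format(1 / 24) == "%Y-%m-%d %H"
assert batch_date_format(1 / 1440) == "%Y-%m-%d %H:%M"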
class GuestUserPool(UserPoolInterface): """ 访客用户池 不需要登陆 """ def __init__( self, redis_key, page_url=None, min_users=1, must_contained_keys=(), keep_alive=False, **kwargs, ): """ @param redis_key: user存放在redis中的key前缀 @param page_url: 生产user的url @param min_users: 最小user数 @param must_contained_keys: cookie中必须包含的key,用于校验cookie是否正确 @param keep_alive: 是否保持常驻,以便user不足时立即补充 --- @param kwargs: WebDriver的一些参数 load_images: 是否加载图片 user_agent: 字符串 或 无参函数,返回值为user_agent proxy: xxx.xxx.xxx.xxx:xxxx 或 无参函数,返回值为代理地址 headless: 是否启用无头模式 driver_type: CHROME 或 PHANTOMJS,FIREFOX timeout: 请求超时时间 window_size: # 窗口大小 executable_path: 浏览器路径,默认为默认路径 """ self._redisdb = RedisDB() self._tab_user_pool = setting.TAB_USER_POOL.format(redis_key=redis_key, user_type="guest") self._page_url = page_url self._min_users = min_users self._must_contained_keys = must_contained_keys self._keep_alive = keep_alive self._kwargs = kwargs self._kwargs.setdefault("load_images", False) self._kwargs.setdefault("headless", True) self._users_id = [] def _load_users_id(self): self._users_id = self._redisdb.hkeys(self._tab_user_pool) if self._users_id: random.shuffle(self._users_id) def _get_user_id(self): if not self._users_id: self._load_users_id() if self._users_id: return self._users_id.pop() def login(self) -> Optional[GuestUser]: """ 默认使用webdirver去登录,生产cookie,可以重写 """ with WebDriver(**self._kwargs) as driver: driver.get(self._page_url) cookies = driver.cookies for key in self._must_contained_keys: if key not in cookies: break else: user = GuestUser(user_agent=driver.user_agent, cookies=cookies) return user log.error("获取cookie失败 cookies = {}".format(cookies)) return None def add_user(self, user: GuestUser): log.debug("add {}".format(user)) self._redisdb.hset(self._tab_user_pool, user.user_id, user.to_dict()) def get_user(self, block=True) -> Optional[GuestUser]: """ Args: block: 无用户时是否等待 Returns: """ while True: try: user_id = self._get_user_id() user_str = None if user_id: user_str = self._redisdb.hget(self._tab_user_pool, user_id) # 如果没取到user,可能是其他爬虫将此用户删除了,需要重刷新本地缓存的用户id if not user_str: self._load_users_id() continue if not user_id and block: self._keep_alive = False with RedisLock(key=self._tab_user_pool, lock_timeout=3600, wait_timeout=0) as _lock: if _lock.locked: self.run() continue return user_str and GuestUser(**eval(user_str)) except Exception as e: log.exception(e) tools.delay_time(1) def del_user(self, user_id: str): self._redisdb.hdel(self._tab_user_pool, user_id) self._load_users_id() def run(self): while True: try: now_user_count = self._redisdb.hget_count(self._tab_user_pool) need_user_count = self._min_users - now_user_count if need_user_count > 0: log.info("当前在线user数为 {} 小于 {}, 生产user".format( now_user_count, self._min_users)) try: user = self.login() if user: self.add_user(user) except Exception as e: log.exception(e) else: log.debug("当前user数为 {} 数量足够 暂不生产".format(now_user_count)) if self._keep_alive: tools.delay_time(10) else: break except Exception as e: log.exception(e) tools.delay_time(1)
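# ---------------------------------------------------------------------------
# Illustrative usage sketch for GuestUserPool. The page_url and cookie key are
# hypothetical placeholders; it assumes Redis and a WebDriver-capable
# environment are available. Only methods shown above are used (run, get_user,
# user_id / to_dict on the returned user).
# ---------------------------------------------------------------------------
user_pool = GuestUserPool(
    redis_key="test:demo",
    page_url="https://example.com",      # page whose response sets the cookies
    min_users=2,
    must_contained_keys=("sessionid",),  # cookie key used to validate success
    keep_alive=False,                    # exit once min_users is reached
)
user_pool.run()                          # produce users until the pool is full

guest = user_pool.get_user(block=True)
if guest:
    print(guest.user_id, guest.to_dict())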
def redisdb(self): if not self._redisdb: self._redisdb = RedisDB() return self._redisdb
class RequestBuffer(threading.Thread): dedup = None def __init__(self, redis_key): if not hasattr(self, "_requests_deque"): super(RequestBuffer, self).__init__() self._thread_stop = False self._is_adding_to_db = False self._requests_deque = collections.deque() self._del_requests_deque = collections.deque() self._db = RedisDB() self._table_request = setting.TAB_REQUSETS.format( redis_key=redis_key) self._table_failed_request = setting.TAB_FAILED_REQUSETS.format( redis_key=redis_key) if not self.__class__.dedup and setting.REQUEST_FILTER_ENABLE: self.__class__.dedup = Dedup( name=redis_key, to_md5=False, **setting.REQUEST_FILTER_SETTING) # 默认过期时间为一个月 def run(self): self._thread_stop = False while not self._thread_stop: try: self.__add_request_to_db() except Exception as e: log.exception(e) tools.delay_time(1) def stop(self): self._thread_stop = True self._started.clear() def put_request(self, request): self._requests_deque.append(request) if self.get_requests_count() > MAX_URL_COUNT: # 超过最大缓存,主动调用 self.flush() def put_del_request(self, request): self._del_requests_deque.append(request) def put_failed_request(self, request, table=None): try: request_dict = request.to_dict self._db.zadd(table or self._table_failed_request, request_dict, request.priority) except Exception as e: log.exception(e) def flush(self): try: self.__add_request_to_db() except Exception as e: log.exception(e) def get_requests_count(self): return len(self._requests_deque) def is_adding_to_db(self): return self._is_adding_to_db def __add_request_to_db(self): request_list = [] prioritys = [] callbacks = [] while self._requests_deque: request = self._requests_deque.popleft() self._is_adding_to_db = True if callable(request): # 函数 # 注意:应该考虑闭包情况。闭包情况可写成 # def test(xxx = xxx): # # TODO 业务逻辑 使用 xxx # 这么写不会导致xxx为循环结束后的最后一个值 callbacks.append(request) continue priority = request.priority # 如果需要去重并且库中已重复 则continue if (request.filter_repeat and setting.REQUEST_FILTER_ENABLE and not self.__class__.dedup.add(request.fingerprint)): log.debug("request已存在 url = %s" % request.url) continue else: request_list.append(str(request.to_dict)) prioritys.append(priority) if len(request_list) > MAX_URL_COUNT: self._db.zadd(self._table_request, request_list, prioritys) request_list = [] prioritys = [] # 入库 if request_list: self._db.zadd(self._table_request, request_list, prioritys) # 执行回调 for callback in callbacks: try: callback() except Exception as e: log.exception(e) # 删除已做任务 if self._del_requests_deque: request_done_list = [] while self._del_requests_deque: request_done_list.append(self._del_requests_deque.popleft()) # 去掉request_list中的requests, 否则可能会将刚添加的request删除 request_done_list = list( set(request_done_list) - set(request_list)) if request_done_list: self._db.zrem(self._table_request, request_done_list) self._is_adding_to_db = False
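# ---------------------------------------------------------------------------
# Illustrative sketch (assumes Redis is reachable and the REQUEST_FILTER_*
# settings are configured; passing priority as a constructor argument is an
# assumption). Requests are buffered in memory, then written to the redis
# priority zset in one zadd; duplicates are dropped by Dedup when enabled.
# ---------------------------------------------------------------------------
from feapder import Request

request_buffer = RequestBuffer(redis_key="test:demo")
request_buffer.start()                   # background flushing thread

request_buffer.put_request(Request("https://example.com/page/1", priority=1))
request_buffer.put_request(Request("https://example.com/page/2", priority=2))
request_buffer.flush()                   # force an immediate write
request_buffer.stop()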
def __init__( self, redis_key=None, thread_count=None, begin_callback=None, end_callback=None, delete_keys=(), auto_stop_when_spider_done=None, auto_start_requests=None, send_run_time=True, batch_interval=0, wait_lock=True, task_table=None, ): """ @summary: 调度器 --------- @param redis_key: 爬虫request及item存放reis中的文件夹 @param thread_count: 线程数,默认为配置文件中的线程数 @param begin_callback: 爬虫开始回调函数 @param end_callback: 爬虫结束回调函数 @param delete_keys: 爬虫启动时删除的key,类型: 元组/bool/string。 支持正则 @param auto_stop_when_spider_done: 爬虫抓取完毕后是否自动结束或等待任务,默认自动结束 @param auto_start_requests: 爬虫是否自动添加任务 @param send_run_time: 发送运行时间 @param batch_interval: 抓取时间间隔 默认为0 天为单位 多次启动时,只有当前时间与第一次抓取结束的时间间隔大于指定的时间间隔时,爬虫才启动 @param wait_lock: 下发任务时否等待锁,若不等待锁,可能会存在多进程同时在下发一样的任务,因此分布式环境下请将该值设置True @param task_table: 任务表, 批次爬虫传递 --------- @result: """ super(Scheduler, self).__init__() for key, value in self.__class__.__custom_setting__.items(): setattr(setting, key, value) log.reload() self._redis_key = redis_key or setting.REDIS_KEY if not self._redis_key: raise Exception( """ redis_key 为redis中存放request与item的目录。不能为空, 可在setting中配置,如 REDIS_KEY = 'test' 或spider初始化时传参, 如 TestSpider(redis_key='test') """ ) self._request_buffer = RequestBuffer(redis_key) self._item_buffer = ItemBuffer(redis_key, task_table) self._collector = Collector(redis_key) self._parsers = [] self._parser_controls = [] self._parser_control_obj = PaserControl self._auto_stop_when_spider_done = ( auto_stop_when_spider_done if auto_stop_when_spider_done is not None else setting.AUTO_STOP_WHEN_SPIDER_DONE ) self._auto_start_requests = ( auto_start_requests if auto_start_requests is not None else setting.SPIDER_AUTO_START_REQUESTS ) self._send_run_time = send_run_time self._batch_interval = batch_interval self._begin_callback = ( begin_callback if begin_callback else lambda: log.info("\n********** feapder begin **********") ) self._end_callback = ( end_callback if end_callback else lambda: log.info("\n********** feapder end **********") ) self._thread_count = ( setting.SPIDER_THREAD_COUNT if not thread_count else thread_count ) self._spider_name = redis_key self._project_name = redis_key.split(":")[0] self._tab_spider_time = setting.TAB_SPIDER_TIME.format(redis_key=redis_key) self._tab_spider_status = setting.TAB_SPIDER_STATUS.format(redis_key=redis_key) self._tab_requests = setting.TAB_REQUSETS.format(redis_key=redis_key) self._tab_failed_requests = setting.TAB_FAILED_REQUSETS.format( redis_key=redis_key ) self._is_notify_end = False # 是否已经通知结束 self._last_task_count = 0 # 最近一次任务数量 self._redisdb = RedisDB() self._project_total_state_table = "{}_total_state".format(self._project_name) self._is_exist_project_total_state_table = False # Request 缓存设置 Request.cached_redis_key = redis_key Request.cached_expire_time = setting.RESPONSE_CACHED_EXPIRE_TIME delete_keys = delete_keys or setting.DELETE_KEYS if delete_keys: self.delete_tables(delete_keys) self._last_check_task_status_time = 0 self.wait_lock = wait_lock
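# ---------------------------------------------------------------------------
# Illustrative sketch of how __custom_setting__ feeds the setattr(setting, ...)
# loop above: keys declared on a Scheduler subclass override the global setting
# module before buffers and collector are created. DemoSpider and its values
# are placeholders; it assumes feapder.Spider is a Scheduler subclass, as in
# feapder.
# ---------------------------------------------------------------------------
import feapder


class DemoSpider(feapder.Spider):
    __custom_setting__ = dict(
        SPIDER_THREAD_COUNT=8,     # overrides setting.SPIDER_THREAD_COUNT
        ITEM_FILTER_ENABLE=False,  # overrides setting.ITEM_FILTER_ENABLE
    )

    def start_requests(self):
        yield feapder.Request("https://example.com")


if __name__ == "__main__":
    DemoSpider(redis_key="test:demo").start()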
from feapder.db.redisdb import RedisDB
import time

db = RedisDB.from_url("redis://localhost:6379")
db.clear("test")
# db = RedisDB(ip_ports="172.25.21.4:26379,172.25.21.5:26379,172.25.21.6:26379", db=0, user_pass=None)

for i in range(20):
    result = db.zadd("test", list(range(10)), list(range(10)))
    print(result)
    time.sleep(3)

# db.zremrangebyscore("test", 1, 3)
# db.zrem("test", [4, 0])
# print(db.zget("test", 10))