Example #1
    def start_monitor_task(self, *args, **kws):
        if not self.is_reach_next_spider_time():
            return

        self._auto_start_requests = False
        redisdb = RedisDB()

        if not self._parsers:  # not in add_parser mode
            self._parsers.append(self)

        while True:
            try:
                # check whether redis has pending tasks
                tab_requests = setting.TAB_REQUSETS.format(
                    table_folder=self._table_folder)
                todo_task_count = redisdb.zget_count(tab_requests)

                if todo_task_count < self._min_task_count:  # add tasks
                    # make start requests
                    self.distribute_task(*args, **kws)

                else:
                    log.info("redis 中尚有%s条积压任务,暂时不派发新任务" % todo_task_count)

            except Exception as e:
                log.exception(e)

            if self._auto_stop_when_spider_done:
                break

            time.sleep(self._check_task_interval)
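A note on the pattern above: the loop never inspects request contents, it only watches the backlog size in the request zset and tops it up when it runs low. A minimal standalone sketch of the same idea using plain redis-py (ZCARD stands in for zget_count; the threshold and interval names are placeholders, not the framework's API):

import time

import redis  # third-party: redis-py

MIN_TASK_COUNT = 1000  # placeholder for self._min_task_count
CHECK_INTERVAL = 60    # placeholder for self._check_task_interval

def monitor_loop(r: redis.Redis, tab_requests: str):
    """Dispatch new tasks only when the redis backlog runs low."""
    while True:
        todo_task_count = r.zcard(tab_requests)  # size of the request zset
        if todo_task_count < MIN_TASK_COUNT:
            pass  # distribute_task(...) would be called here
        else:
            print("redis still has %s backlogged tasks" % todo_task_count)
        time.sleep(CHECK_INTERVAL)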
Example #2
    def __init__(self, table_folder, process_num=None):
        """
        @summary:
        ---------
        @param table_folder:
        @param process_num: process number
        ---------
        @result:
        """

        super(Collector, self).__init__()
        self._db = RedisDB()

        self._thread_stop = False

        self._todo_requests = collections.deque()

        self._tab_requests = setting.TAB_REQUSETS.format(
            table_folder=table_folder)
        self._tab_spider_status = setting.TAB_SPIDER_STATUS.format(
            table_folder=table_folder)

        self._spider_mark = LOCAL_HOST_IP + ("_%s" % process_num
                                             if process_num else "_0")

        self._interval = setting.COLLECTOR_SLEEP_TIME
        self._request_count = setting.COLLECTOR_TASK_COUNT
        self._is_collector_task = False

        self._db.clear(self._tab_spider_status)
Example #3
    def __init__(self, table_folder):
        if not hasattr(self, "_table_item"):
            super(ItemBuffer, self).__init__()

            self._thread_stop = False
            self._is_adding_to_db = False
            self._table_folder = table_folder

            self._items_queue = Queue(maxsize=MAX_ITEM_COUNT)
            self._db = RedisDB()

            self._table_item = setting.TAB_ITEM
            self._table_request = setting.TAB_REQUSETS.format(
                table_folder=table_folder)

            self._item_tables = {
                # 'xxx_item': {'tab_item': 'xxx:xxx_item'} # maps an item name to its item table name in redis
            }

            self._item_update_keys = {
                # 'xxx:xxx_item': ['id', 'name'...] # maps a redis item name to the keys that need updating
            }

            self._export_data = ExportData() if setting.ADD_ITEM_TO_MYSQL else None

            self.db_tip()
Example #4
    def __init__(self, table_folder):
        super(HandleFailedRequests, self).__init__()
        self._table_folder = table_folder

        self._redisdb = RedisDB()
        self._request_buffer = RequestBuffer(self._table_folder)

        self._table_failed_request = setting.TAB_FAILED_REQUSETS.format(
            table_folder=table_folder)
Example #5
    def delete_tables(self, delete_tables_list):
        if isinstance(delete_tables_list, bool):
            delete_tables_list = [self._table_folder + "*"]
        elif not isinstance(delete_tables_list, (list, tuple)):
            delete_tables_list = [delete_tables_list]

        redis = RedisDB()
        for delete_tab in delete_tables_list:
            if delete_tab == "*":
                delete_tab = self._table_folder + "*"

            tables = redis.getkeys(delete_tab)
            for table in tables:
                log.info("正在删除表 %s" % table)
                redis.clear(table)
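redis.getkeys(delete_tab) evidently resolves the glob pattern to concrete keys. With plain redis-py, the same wildcard delete can be sketched with scan_iter, which iterates matches incrementally instead of blocking the server the way KEYS can (the function below is an illustration, not the framework's API):

import redis  # third-party: redis-py

def delete_tables(r: redis.Redis, pattern: str):
    """Delete every key matching a glob pattern, e.g. 'test*'."""
    for key in r.scan_iter(match=pattern):
        print("deleting table %s" % key)
        r.delete(key)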
Example #6
    def __init__(self, table_folder):
        if not hasattr(self, "_requests_deque"):
            super(RequestBuffer, self).__init__()

            self._thread_stop = False
            self._is_adding_to_db = False

            self._requests_deque = collections.deque()
            self._del_requests_deque = collections.deque()
            self._db = RedisDB()

            self._table_request = setting.TAB_REQUSETS.format(
                table_folder=table_folder)
            self._table_failed_request = setting.TAB_FAILED_REQUSETS.format(
                table_folder=table_folder)
Example #7
    def _start(self):
        # start the request_buffer
        self._request_buffer.start()
        # start the item_buffer
        self._item_buffer.start()
        # start the collector
        self._collector.start()

        # start the parser controls
        for i in range(self._parser_count):
            parser_control = self._parser_control_obj(
                self._collector,
                self._table_folder,
                self._request_buffer,
                self._item_buffer,
            )

            for parser in self._parsers:
                parser_control.add_parser(parser)

            parser_control.start()
            self._parser_controls.append(parser_control)

        # dispatch tasks last, since this step may take a while
        if setting.RETRY_FAILED_REQUESTS:
            # requeue failed tasks; no lock needed, the operation is atomic
            handle_failed_requests = HandleFailedRequests(self._table_folder)
            handle_failed_requests.reput_failed_requests_to_requests()

        # dispatch new tasks
        if self._auto_start_requests:  # auto dispatch
            if self.wait_lock:
                # lock around task insertion to keep multiple processes from adding duplicate tasks
                with RedisLock(
                        key=self._spider_name,
                        timeout=3600,
                        wait_timeout=60,
                        redis_cli=RedisDB().get_redis_obj(),
                ) as lock:
                    if lock.locked:
                        self.__add_task()
            else:
                self.__add_task()
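RedisLock's implementation isn't shown here, but the guarded __add_task pattern can be sketched with redis-py's atomic SET NX EX primitive. This is a simplified stand-in, not the framework's lock: the key name and timeout are placeholders, and a production lock would also verify its own token before releasing.

import redis  # third-party: redis-py

def add_task_once(r: redis.Redis, spider_name: str, add_task):
    # SET ... NX EX acquires the lock atomically: it succeeds only if
    # no other process currently holds the key
    acquired = r.set("lock:%s" % spider_name, "1", nx=True, ex=3600)
    if acquired:
        try:
            add_task()  # only the lock holder seeds start requests
        finally:
            r.delete("lock:%s" % spider_name)  # release the lock
    # processes that fail to acquire simply skip adding tasks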
Example #8
    def __init__(self,
                 name: str,
                 expire_time: int,
                 expire_time_record_key=None):
        if not name:
            raise ValueError("name cant't be None")
        if not expire_time:
            raise ValueError("please set expire time, units is seconds")

        if not self.__class__.redis_db:
            self.__class__.redis_db = RedisDB()

        self.name = name
        self.expire_time = expire_time
        self.expire_time_record_key = expire_time_record_key

        self.record_expire_time()

        self.del_expire_key()
Example #9
    def check_filter_capacity(self):
        """
        Check the filter's state; if it is full, load a new filter.
        @return:
        """
        if (not self._check_capacity_time
                or time.time() - self._check_capacity_time > 1800):
            # with self._thread_lock:
            with RedisLock(
                    key="ScalableBloomFilter",
                    timeout=300,
                    wait_timeout=300,
                    redis_cli=RedisDB().get_redis_obj(),
            ) as lock:  # global lock: only one process actually creates the new filter at a time; once it finishes, the other processes simply append the filter it just created
                if lock.locked:
                    while True:
                        if self.filters[-1].is_at_capacity:
                            self.filters.append(self.create_filter())
                        else:
                            break

                    self._check_capacity_time = time.time()
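Two gates keep this cheap: a local 30-minute timer (1800 seconds) skips most calls outright, and the global RedisLock makes filter creation single-writer while late arrivals just pick up the filter the winner created. The growth rule itself is a simple loop; a minimal standalone sketch, assuming filter objects that expose is_at_capacity as above:

class _ToyFilter:
    """Stand-in filter that reports full after two inserts."""

    def __init__(self):
        self.items = set()

    @property
    def is_at_capacity(self):
        return len(self.items) >= 2

class ScalableFilter:
    def __init__(self):
        self.filters = [_ToyFilter()]

    def grow_if_full(self):
        # keep appending fresh filters until the newest one has room,
        # mirroring the while-loop in check_filter_capacity above
        while self.filters[-1].is_at_capacity:
            self.filters.append(_ToyFilter())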
Example #10
    def __init__(self, table_folder):
        if not hasattr(self, "_requests_deque"):
            super(RequestBuffer, self).__init__()

            self._thread_stop = False
            self._is_adding_to_db = False

            self._requests_deque = collections.deque()
            self._del_requests_deque = collections.deque()
            self._db = RedisDB()

            self._table_request = setting.TAB_REQUSETS.format(
                table_folder=table_folder)
            self._table_failed_request = setting.TAB_FAILED_REQUSETS.format(
                table_folder=table_folder)

            if not self.__class__.dedup and setting.REQUEST_FILTER_ENABLE:
                self.__class__.dedup = Dedup(
                    filter_type=Dedup.ExpireFilter,
                    name=table_folder,
                    expire_time=2592000,
                    to_md5=False,
                )  # expire time: one month (30 * 24 * 3600 = 2,592,000 seconds)
Example #11
    def __init__(
        self,
        table_folder=None,
        parser_count=None,
        begin_callback=None,
        end_callback=None,
        delete_tabs=(),
        process_num=None,
        auto_stop_when_spider_done=None,
        auto_start_requests=None,
        send_run_time=True,
        batch_interval=0,
        *parser_args,
        **parser_kwargs
    ):
        """
        @summary: 调度器
        ---------
        @param table_folder: 爬虫request及item存放reis中的文件夹
        @param parser_count: 线程数,默认为配置文件中的线程数
        @param begin_callback: 爬虫开始回调函数
        @param end_callback: 爬虫结束回调函数
        @param delete_tabs: 爬虫启动时删除的表,类型: 元组/bool/string。 支持正则
        @param process_num: 进程数
        @param auto_stop_when_spider_done: 爬虫抓取完毕后是否自动结束或等待任务,默认自动结束
        @param auto_start_requests: 爬虫是否自动添加任务
        @param send_run_time: 发送运行时间
        @param batch_interval: 抓取时间间隔 默认为0 天为单位 多次启动时,只有当前时间与第一次抓取结束的时间间隔大于指定的时间间隔时,爬虫才启动

        @param *parser_args: 传给parser下start_requests的参数, tuple()
        @param **parser_kwargs: 传给parser下start_requests的参数, dict()
        ---------
        @result:
        """

        super(Scheduler, self).__init__()

        for key, value in self.__class__.__custom_setting__.items():
            setattr(setting, key, value)

        self._table_folder = table_folder or setting.TABLE_FOLDER
        if not self._table_folder:
            raise Exception(
                """
                table_folder is the redis folder where requests and items are stored. It cannot be empty.
                Configure it in setting, e.g. TABLE_FOLDER = 'test',
                or pass it when initializing the spider, e.g. TestSpider(table_folder='test')
                """
            )

        self._request_buffer = RequestBuffer(table_folder)
        self._item_buffer = ItemBuffer(table_folder)

        self._collector = Collector(table_folder, process_num)
        self._parsers = []
        self._parser_controls = []
        self._parser_control_obj = PaserControl

        self._parser_args = parser_args
        self._parser_kwargs = parser_kwargs
        self._auto_stop_when_spider_done = (
            auto_stop_when_spider_done
            if auto_stop_when_spider_done is not None
            else setting.AUTO_STOP_WHEN_SPIDER_DONE
        )
        self._auto_start_requests = (
            auto_start_requests
            if auto_start_requests is not None
            else setting.PARSER_AUTO_START_REQUESTS
        )
        self._send_run_time = send_run_time
        self._batch_interval = batch_interval

        self._begin_callback = (
            begin_callback
            if begin_callback
            else lambda: log.info("\n********** spider begin **********")
        )
        self._end_callback = (
            end_callback
            if end_callback
            else lambda: log.info("\n********** spider end **********")
        )

        self._parser_count = setting.PARSER_COUNT if not parser_count else parser_count

        self._spider_name = table_folder
        self._project_name = table_folder.split(":")[0]

        self._tab_spider_time = setting.TAB_SPIDER_TIME.format(
            table_folder=table_folder
        )
        self._tab_spider_status = setting.TAB_SPIDER_STATUS.format(
            table_folder=table_folder
        )
        self._tab_requests = setting.TAB_REQUSETS.format(table_folder=table_folder)
        self._tab_failed_requests = setting.TAB_FAILED_REQUSETS.format(
            table_folder=table_folder
        )

        self._is_notify_end = False  # whether the end notification has been sent
        self._last_task_count = 0  # most recent task count
        self._redisdb = RedisDB()

        self._project_total_state_table = "{}_total_state".format(self._project_name)
        self._is_exist_project_total_state_table = False

        # Request cache settings
        Request.cached_table_folder = table_folder
        Request.cached_expire_time = setting.RESPONSE_CACHED_EXPIRE_TIME

        delete_tabs = delete_tabs or setting.DELETE_TABS
        if delete_tabs:
            self.delete_tables(delete_tabs)

        self._last_check_task_status_time = 0
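One detail worth calling out in this __init__ is the three-way fallback used for auto_stop_when_spider_done and auto_start_requests: only a true None falls back to the setting, so an explicit False is honored. A minimal check of the distinction (AUTO_STOP_DEFAULT is a stand-in for setting.AUTO_STOP_WHEN_SPIDER_DONE):

AUTO_STOP_DEFAULT = True  # stand-in for setting.AUTO_STOP_WHEN_SPIDER_DONE

def resolve(auto_stop_when_spider_done=None):
    # `is not None` keeps an explicit False from being swallowed; a plain
    # `value or default` fallback would wrongly replace False with True
    return (auto_stop_when_spider_done
            if auto_stop_when_spider_done is not None
            else AUTO_STOP_DEFAULT)

assert resolve() is True        # no argument -> setting default
assert resolve(False) is False  # explicit False is honored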
Example #12
    def __init__(self,
                 task_table,
                 batch_record_table,
                 batch_name,
                 batch_interval,
                 task_keys,
                 task_state="state",
                 min_task_count=10000,
                 check_task_interval=5,
                 task_limit=10000,
                 related_table_folder=None,
                 related_batch_record=None,
                 task_condition="",
                 task_order_by="",
                 table_folder=None,
                 parser_count=None,
                 begin_callback=None,
                 end_callback=None,
                 delete_tabs=(),
                 process_num=None,
                 auto_stop_when_spider_done=None,
                 send_run_time=False,
                 *parser_args,
                 **parser_kwargs):
        """
        @summary: 批次爬虫
        必要条件
        1、需有任务表
            任务表中必须有id 及 任务状态字段 如 state。如指定parser_name字段,则任务会自动下发到对应的parser下, 否则会下发到所有的parser下。其他字段可根据爬虫需要的参数自行扩充

            参考建表语句如下:
            CREATE TABLE `table_name` (
              `id` int(11) NOT NULL AUTO_INCREMENT,
              `param` varchar(1000) DEFAULT NULL COMMENT '爬虫需要的抓取数据需要的参数',
              `state` int(11) DEFAULT NULL COMMENT '任务状态',
              `parser_name` varchar(255) DEFAULT NULL COMMENT '任务解析器的脚本类名',
              PRIMARY KEY (`id`),
              UNIQUE KEY `nui` (`param`) USING BTREE
            ) ENGINE=InnoDB AUTO_INCREMENT=2 DEFAULT CHARSET=utf8;

        2、需有批次记录表 不存在自动创建

            此表节结构固定,参考建表语句如下:
            CREATE TABLE `xxx_batch_record` (
              `id` int(11) NOT NULL AUTO_INCREMENT,
              `batch_date` date DEFAULT NULL,
              `done_count` int(11) DEFAULT NULL,
              `total_count` int(11) DEFAULT NULL,
              PRIMARY KEY (`id`)
            ) ENGINE=InnoDB AUTO_INCREMENT=1 DEFAULT CHARSET=utf8;

        ---------
        @param task_table: mysql中的任务表
        @param batch_record_table: mysql 中的批次记录表
        @param batch_name: 批次采集程序名称
        @param batch_interval: 批次间隔 天为单位。 如想一小时一批次,可写成1/24
        @param task_keys: 需要获取的任务字段 列表 [] 如需指定解析的parser,则需将parser_name字段取出来。
        @param task_state: mysql中任务表的state字段名
        @param min_task_count: redis 中最少任务数, 少于这个数量会从mysql的任务表取任务
        @param check_task_interval: 检查是否还有任务的时间间隔;
        @param task_limit: 数据库中取任务的数量
        @param table_folder: 爬虫request及item存放reis中的文件夹
        @param parser_count: 线程数,默认为配置文件中的线程数
        @param begin_callback: 爬虫开始回调函数
        @param end_callback: 爬虫结束回调函数
        @param delete_tabs: 爬虫启动时删除的表,元组类型。 支持正则
        @param process_num: 进程数
        @param auto_stop_when_spider_done: 爬虫抓取完毕后是否自动结束或等待任务,默认自动结束
        @param send_run_time: 发送运行时间
        @param related_table_folder: 有关联的其他爬虫任务表(redis)
        @param related_batch_record: 有关联的其他爬虫批次表(mysql)注意:要避免环路 如 A -> B & B -> A 。 环路可用related_table_folder指定
            related_table_folder 与 related_batch_record 选其一配置即可。
            若相关连的爬虫为批次爬虫,推荐以related_batch_record配置,
            若相关连的爬虫为普通爬虫,无批次表,可以以related_table_folder配置
        @param task_condition: 任务条件 用于从一个大任务表中挑选出数据自己爬虫的任务,及where后的条件语句
        @param task_order_by: 取任务时的排序条件 如 id desc


        @param *parser_args: 传给parser下start_requests的参数, tuple()
        @param **parser_kwargs: 传给parser下start_requests的参数, dict()
        ---------
        @result:
        """
        Scheduler.__init__(
            self,
            table_folder=table_folder,
            parser_count=parser_count,
            begin_callback=begin_callback,
            end_callback=end_callback,
            delete_tabs=delete_tabs,
            process_num=process_num,
            auto_stop_when_spider_done=auto_stop_when_spider_done,
            auto_start_requests=False,
            send_run_time=send_run_time,
            batch_interval=batch_interval,
            *parser_args,
            **parser_kwargs)

        self._redisdb = RedisDB()
        self._mysqldb = MysqlDB()

        self._request_buffer = RequestBuffer(self._table_folder)

        self._task_table = task_table  # task table in mysql
        self._batch_record_table = batch_record_table  # batch record table in mysql
        self._batch_name = batch_name  # name of the batch crawl program
        self._task_keys = task_keys  # task fields to fetch

        self._task_state = task_state  # name of the state field in the mysql task table
        self._min_task_count = min_task_count  # minimum task count in redis
        self._check_task_interval = check_task_interval
        self._task_limit = task_limit  # number of tasks fetched from mysql at a time
        self._related_task_tables = [
            setting.TAB_REQUSETS.format(table_folder=table_folder)
        ]  # our own task table must also be checked for pending tasks
        if related_table_folder:
            self._related_task_tables.append(
                setting.TAB_REQUSETS.format(table_folder=related_table_folder))

        self._related_batch_record = related_batch_record
        self._task_condition_prefix_and = task_condition and " and {}".format(
            task_condition)
        self._task_condition_prefix_where = task_condition and " where {}".format(
            task_condition)
        self._task_order_by = task_order_by and " order by {}".format(
            task_order_by)

        self._batch_date_cache = None
        if self._batch_interval >= 1:
            self._date_format = "%Y-%m-%d"
        elif self._batch_interval < 1 and self._batch_interval >= 1 / 24:
            self._date_format = "%Y-%m-%d %H"
        else:
            self._date_format = "%Y-%m-%d %H:%M"

        # alerting
        self._send_msg_interval = datetime.timedelta(hours=1)  # send an alert at most once per hour
        self._last_send_msg_time = None

        self._spider_last_done_time = None  # time of the spider's most recent done-task count
        self._spider_last_done_count = 0  # the spider's most recent done-task count
        self._spider_deal_speed_cached = None

        self._is_more_parsers = True  # multi-template spider
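The date-format ladder near the end of this __init__ encodes batch granularity: intervals of at least a day key batches by date, hourly-scale intervals (1/24 <= interval < 1) add the hour, and anything finer adds minutes. A quick worked check of the boundaries:

def batch_date_format(batch_interval):
    if batch_interval >= 1:
        return "%Y-%m-%d"
    elif 1 / 24 <= batch_interval < 1:
        return "%Y-%m-%d %H"
    else:
        return "%Y-%m-%d %H:%M"

assert batch_date_format(1) == "%Y-%m-%d"               # daily batches
assert batch_date_format(1 / 24) == "%Y-%m-%d %H"       # hourly batches
assert batch_date_format(1 / 1440) == "%Y-%m-%d %H:%M"  # minute-level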
Example #13
    def task_is_done(self):
        """
        @summary: 检查任务状态 是否做完 同时更新批次时间 (不能挂 挂了批次时间就不更新了)
        ---------
        ---------
        @result: True / False (做完 / 未做完)
        """

        is_done = False

        # check the task status in the batch record table
        sql = 'select date_format(batch_date, "{date_format}"), total_count, done_count, is_done from {batch_record_table} order by id desc limit 1'.format(
            date_format=self._date_format.replace(":%M", ":%i"),
            batch_record_table=self._batch_record_table,
        )

        batch_info = self._mysqldb.find(sql)
        if batch_info is None:
            raise Exception("查询批次信息失败")

        if batch_info:
            # update self._batch_date_cache, in case a new batch has already started
            # but self._batch_date_cache still holds the previous batch time
            self._batch_date_cache, total_count, done_count, is_done = batch_info[0]

            log.info("《%s》 batch time %s, progress %s/%s, done state %d" % (
                self._batch_name,
                self._batch_date_cache,
                done_count,
                total_count,
                is_done,
            ))
            os.environ["batch_date"] = self._batch_date_cache  # update the batch time inside BatchParser

        if is_done:  # check the task table for unfinished tasks; if any exist, set is_done to False
            # this check is slow, so lock to keep multiple processes from querying at once
            with RedisLock(
                    key=self._spider_name,
                    timeout=3600,
                    wait_timeout=0,
                    redis_cli=RedisDB().get_redis_obj(),
            ) as lock:
                if lock.locked:
                    log.info("批次表标记已完成,正在检查任务表是否有未完成的任务")

                    sql = "select 1 from %s where (%s = 0 or %s=2)%s limit 1" % (
                        self._task_table,
                        self._task_state,
                        self._task_state,
                        self._task_condition_prefix_and,
                    )
                    tasks = self._mysqldb.find(sql)  # [(1,)]  / []
                    if tasks:
                        log.info("检测到任务表中有未完成任务,等待任务下发")
                        is_done = False

                        # update the is_done flag in the batch_record table to reduce task-table queries
                        sql = 'update {batch_record_table} set is_done = 0 where batch_date = "{batch_date}"'.format(
                            batch_record_table=self._batch_record_table,
                            batch_date=self._batch_date_cache,
                        )
                        self._mysqldb.update(sql)

                    else:
                        log.info("任务表中任务均已完成,爬虫结束")
                else:
                    log.info("批次表标记已完成,其他爬虫进程正在检查任务表是否有未完成的任务,本进程跳过检查,继续等待")

                    is_done = False

        return is_done
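The .replace(":%M", ":%i") in the SQL above bridges two format dialects: Python's strftime writes minutes as %M, while MySQL's DATE_FORMAT uses %i for minutes (%M in MySQL means the month name). A one-line illustration:

py_format = "%Y-%m-%d %H:%M"  # Python strftime: %M = minutes
mysql_format = py_format.replace(":%M", ":%i")
assert mysql_format == "%Y-%m-%d %H:%i"  # MySQL DATE_FORMAT: %i = minutes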
Example #14
    def __init__(self, name):
        self.name = name
        self.count_cached_name = name + "_count_cached"

        if not self.__class__.redis_db:
            self.__class__.redis_db = RedisDB()
Example #15
    def _cache_db(self):
        # lazily create a single connection shared at class level, so every
        # instance reuses the same RedisDB client
        if not self.__class__.cache_db:
            self.__class__.cache_db = RedisDB()  # .from_url(setting.pika_spider_1_uri)

        return self.__class__.cache_db
Example #16
    def __init__(self):
        self._redisdb = RedisDB()
        self._to_db = MysqlDB()