Esempio n. 1
0
    def start_monitor_task(self, *args, **kws):
        """
        Poll the redis request table and dispatch new tasks when the backlog is low.

        Returns immediately when ``is_reach_next_spider_time()`` is falsy.
        Otherwise ``_auto_start_requests`` is switched off, the object registers
        itself as parser when no parser was added, and the polling loop runs
        once (``_auto_stop_when_spider_done`` truthy) or forever, sleeping
        ``_check_task_interval`` seconds between rounds.

        @param args: positional arguments forwarded to ``distribute_task``
        @param kws: keyword arguments forwarded to ``distribute_task``
        """
        if not self.is_reach_next_spider_time():
            return

        self._auto_start_requests = False
        redis_client = RedisDB()

        # No parser registered, i.e. not running in add_parser mode
        if not self._parsers:
            self._parsers.append(self)

        while True:
            try:
                # Count the tasks still waiting in redis
                request_table = setting.TAB_REQUSETS.format(
                    table_folder=self._table_folder
                )
                pending = redis_client.zget_count(request_table)
                backlog_is_low = pending < self._min_task_count

                if not backlog_is_low:
                    log.info("redis 中尚有%s条积压任务,暂时不派发新任务" % pending)
                else:
                    # make start requests
                    self.distribute_task(*args, **kws)

            except Exception as e:
                # A failing round is only logged; the loop carries on
                log.exception(e)

            if self._auto_stop_when_spider_done:
                break

            time.sleep(self._check_task_interval)
Esempio n. 2
0
    def __init__(self, table_folder):
        """
        Set up the item buffer state; a no-op when ``_table_item`` already exists.

        @param table_folder: stored as ``_table_folder`` and substituted into
            setting.TAB_REQUSETS to build the request table name
        """
        # `_table_item` is only present after a previous initialisation
        if hasattr(self, "_table_item"):
            return

        super(ItemBuffer, self).__init__()

        self._thread_stop = False
        self._is_adding_to_db = False
        self._table_folder = table_folder

        self._items_queue = Queue(maxsize=MAX_ITEM_COUNT)
        self._db = RedisDB()

        self._table_item = setting.TAB_ITEM
        self._table_request = setting.TAB_REQUSETS.format(table_folder=table_folder)

        # Maps an item name to its redis item table, e.g.
        # 'xxx_item': {'tab_item': 'xxx:xxx_item'}
        self._item_tables = {}

        # Maps a redis item table to the keys that need updating, e.g.
        # 'xxx:xxx_item': ['id', 'name'...]
        self._item_update_keys = {}

        if setting.ADD_ITEM_TO_MYSQL:
            self._export_data = ExportData()
        else:
            self._export_data = None

        self.db_tip()
Esempio n. 3
0
    def __init__(self, table_folder, process_num=None):
        """
        @summary: Prepare the collector's redis handle, table names and settings
        ---------
        @param table_folder: substituted into the request and spider-status
            table name templates taken from setting
        @param process_num: process number; a falsy value yields the "_0" suffix
            in the spider mark
        ---------
        @result:
        """

        super(Collector, self).__init__()
        self._db = RedisDB()

        self._thread_stop = False

        self._todo_requests = collections.deque()

        self._tab_requests = setting.TAB_REQUSETS.format(table_folder=table_folder)
        self._tab_spider_status = setting.TAB_SPIDER_STATUS.format(
            table_folder=table_folder
        )

        # Spider mark is "<LOCAL_HOST_IP>_<process number>"
        if process_num:
            mark_suffix = "_%s" % process_num
        else:
            mark_suffix = "_0"
        self._spider_mark = LOCAL_HOST_IP + mark_suffix

        self._interval = setting.COLLECTOR_SLEEP_TIME
        self._request_count = setting.COLLECTOR_TASK_COUNT
        self._is_collector_task = False

        # clear() is invoked on the spider status table at construction time
        self._db.clear(self._tab_spider_status)
    def __init__(self, table_folder):
        """
        Create the redis handle and request buffer used to re-queue failed requests.

        @param table_folder: passed to RequestBuffer and substituted into
            setting.TAB_FAILED_REQUSETS to build the failed-request table name
        """
        super(HandleFailedRequests, self).__init__()
        self._table_folder = table_folder

        failed_table_template = setting.TAB_FAILED_REQUSETS

        self._redisdb = RedisDB()
        self._request_buffer = RequestBuffer(self._table_folder)

        self._table_failed_request = failed_table_template.format(
            table_folder=table_folder
        )
Esempio n. 5
0
    def delete_tables(self, delete_tables_list):
        """
        Clear every redis key returned by ``getkeys`` for the given pattern(s).

        @param delete_tables_list: a bool (any bool selects "<table_folder>*"),
            a single pattern, or a list/tuple of patterns; the pattern "*" is
            expanded to "<table_folder>*"
        """
        if isinstance(delete_tables_list, bool):
            patterns = [self._table_folder + "*"]
        elif isinstance(delete_tables_list, (list, tuple)):
            patterns = delete_tables_list
        else:
            patterns = [delete_tables_list]

        redis_client = RedisDB()
        for pattern in patterns:
            # A bare "*" is restricted to this spider's own folder
            if pattern == "*":
                pattern = self._table_folder + "*"

            for key in redis_client.getkeys(pattern):
                log.info("正在删除表 %s" % key)
                redis_client.clear(key)
Esempio n. 6
0
    def __init__(self, table_folder):
        """
        Initialise the request buffer once; later calls on the same object are no-ops.

        @param table_folder: substituted into the request and failed-request
            table name templates taken from setting
        """
        # `_requests_deque` only exists after a previous initialisation
        if hasattr(self, "_requests_deque"):
            return

        super(RequestBuffer, self).__init__()

        self._thread_stop = False
        self._is_adding_to_db = False

        self._requests_deque = collections.deque()
        self._del_requests_deque = collections.deque()
        self._db = RedisDB()

        self._table_request = setting.TAB_REQUSETS.format(table_folder=table_folder)
        self._table_failed_request = setting.TAB_FAILED_REQUSETS.format(
            table_folder=table_folder
        )
Esempio n. 7
0
    def __init__(self, table_folder):
        """
        Initialise the request buffer once; later calls on the same object are no-ops.

        Also creates the class-level ``dedup`` filter when none exists yet and
        setting.REQUEST_FILTER_ENABLE is truthy.

        @param table_folder: substituted into the request and failed-request
            table name templates taken from setting; also used as the dedup name
        """
        # `_requests_deque` only exists after a previous initialisation
        if hasattr(self, "_requests_deque"):
            return

        super(RequestBuffer, self).__init__()

        self._thread_stop = False
        self._is_adding_to_db = False

        self._requests_deque = collections.deque()
        self._del_requests_deque = collections.deque()
        self._db = RedisDB()

        self._table_request = setting.TAB_REQUSETS.format(table_folder=table_folder)
        self._table_failed_request = setting.TAB_FAILED_REQUSETS.format(
            table_folder=table_folder
        )

        owner = self.__class__
        if owner.dedup or not setting.REQUEST_FILTER_ENABLE:
            return

        # 2592000 seconds = 30 days, i.e. entries expire after one month
        owner.dedup = Dedup(
            filter_type=Dedup.ExpireFilter,
            name=table_folder,
            expire_time=2592000,
            to_md5=False,
        )
Esempio n. 8
0
    def __init__(self, name: str, expire_time: int, expire_time_record_key=None):
        """
        Validate the arguments, make sure the class has a redis handle, then
        record the expire time and delete expired keys.

        @param name: must be truthy
        @param expire_time: expire time in seconds; must be truthy
        @param expire_time_record_key: stored as-is on the instance
        @raise ValueError: when ``name`` or ``expire_time`` is falsy
        """
        if not name:
            raise ValueError("name cant't be None")
        if not expire_time:
            raise ValueError("please set expire time, units is seconds")

        # The redis handle lives on the class and is created on first use
        owner = self.__class__
        if not owner.redis_db:
            owner.redis_db = RedisDB()

        self.name = name
        self.expire_time = expire_time
        self.expire_time_record_key = expire_time_record_key

        self.record_expire_time()
        self.del_expire_key()
Esempio n. 9
0
    def _start(self):
        """
        Start the worker threads, optionally re-queue failed requests, then
        optionally add the start tasks.

        Order: request buffer, item buffer, collector, ``_parser_count`` parser
        controls (each given every registered parser). Task distribution comes
        last because it may take a long time.
        """
        # Start request_buffer, item_buffer and collector
        self._request_buffer.start()
        self._item_buffer.start()
        self._collector.start()

        # Start the parser controls
        for _ in range(self._parser_count):
            control = self._parser_control_obj(
                self._collector,
                self._table_folder,
                self._request_buffer,
                self._item_buffer,
            )

            for parser in self._parsers:
                control.add_parser(parser)

            control.start()
            self._parser_controls.append(control)

        if setting.RETRY_FAILED_REQUESTS:
            # Re-queue the failed tasks; per the original note no lock is needed
            # because the operation is atomic
            failed_handler = HandleFailedRequests(self._table_folder)
            failed_handler.reput_failed_requests_to_requests()

        # Add new tasks only when automatic start is enabled
        if not self._auto_start_requests:
            return

        if not self.wait_lock:
            self.__add_task()
            return

        # Guard task creation with a lock so that several processes do not add
        # the same tasks twice
        with RedisLock(
            key=self._spider_name,
            timeout=3600,
            wait_timeout=60,
            redis_cli=RedisDB().get_redis_obj(),
        ) as lock:
            if lock.locked:
                self.__add_task()
Esempio n. 10
0
    def check_filter_capacity(self):
        """
        Check the filter state and append new filters while the last one is full.

        The check runs when no previous check time is recorded or when more
        than 1800 seconds have passed since it. The check time is only updated
        when the redis lock was obtained.
        @return:
        """
        last_check = self._check_capacity_time
        check_is_due = not last_check or time.time() - last_check > 1800
        if not check_is_due:
            return

        # Global lock: only one process at a time really creates a new filter;
        # the other processes then just append the filter that was created
        with RedisLock(
            key="ScalableBloomFilter",
            timeout=300,
            wait_timeout=300,
            redis_cli=RedisDB().get_redis_obj(),
        ) as lock:
            if not lock.locked:
                return

            while self.filters[-1].is_at_capacity:
                self.filters.append(self.create_filter())

            self._check_capacity_time = time.time()
class HandleFailedRequests(object):
    """Move requests from the failed-request table back into the request queue."""

    def __init__(self, table_folder):
        """
        @param table_folder: passed to RequestBuffer and substituted into
            setting.TAB_FAILED_REQUSETS to build the failed-request table name
        """
        super(HandleFailedRequests, self).__init__()
        self._table_folder = table_folder

        self._redisdb = RedisDB()
        self._request_buffer = RequestBuffer(self._table_folder)

        self._table_failed_request = setting.TAB_FAILED_REQUSETS.format(
            table_folder=table_folder)

    def get_failed_requests(self, count=10000):
        """
        Fetch up to ``count`` failed requests from redis and return them as
        evaluated Python objects (dicts, as consumed by the caller below).

        The previous implementation ignored ``count`` and always requested
        10000 entries. The default is therefore 10000, so calls without
        arguments keep that batch size, while an explicit ``count`` is now
        honoured.

        @param count: maximum number of entries requested from redis
        @return: list with one evaluated object per stored entry
        """
        raw_requests = self._redisdb.zget(self._table_failed_request,
                                          count=count)
        # NOTE(security): eval() executes whatever text is stored in redis.
        # This is only safe while the failed-request table is written
        # exclusively by trusted spider processes.
        return [eval(raw_request) for raw_request in raw_requests]

    def reput_failed_requests_to_requests(self):
        """
        Drain the failed-request table, reset each request's ``retry_times``
        to 0 and put it back into the request buffer, then flush the buffer.
        """
        log.debug("正在重置失败的requests...")
        total_count = 0
        # NOTE(review): the loop ends when a fetch comes back empty, so zget
        # presumably removes the entries it returns - confirm against RedisDB.
        while True:
            failed_requests = self.get_failed_requests()
            if not failed_requests:
                break

            for request in failed_requests:
                request["retry_times"] = 0
                request_obj = Request.from_dict(request)
                self._request_buffer.put_request(request_obj)

                total_count += 1

        self._request_buffer.flush()

        log.debug("重置%s条失败requests为待抓取requests" % total_count)
Esempio n. 12
0
    def __init__(
        self,
        table_folder=None,
        parser_count=None,
        begin_callback=None,
        end_callback=None,
        delete_tabs=(),
        process_num=None,
        auto_stop_when_spider_done=None,
        auto_start_requests=None,
        send_run_time=True,
        batch_interval=0,
        *parser_args,
        **parser_kwargs
    ):
        """
        @summary: Scheduler
        ---------
        @param table_folder: redis folder holding the spider's requests and items; falls back to setting.TABLE_FOLDER
        @param parser_count: number of threads; falls back to setting.PARSER_COUNT
        @param begin_callback: callback invoked when the spider begins; defaults to a log message
        @param end_callback: callback invoked when the spider ends; defaults to a log message
        @param delete_tabs: tables to delete when the spider starts, type: tuple/bool/string, patterns supported; falls back to setting.DELETE_TABS
        @param process_num: process number, forwarded to the Collector
        @param auto_stop_when_spider_done: whether the spider stops once crawling is done or keeps waiting for tasks; falls back to setting.AUTO_STOP_WHEN_SPIDER_DONE
        @param auto_start_requests: whether the spider adds tasks automatically; falls back to setting.PARSER_AUTO_START_REQUESTS
        @param send_run_time: whether to send the run time
        @param batch_interval: crawl interval in days, default 0. On repeated launches the spider only starts when the time since the end of the previous crawl exceeds this interval

        @param *parser_args: arguments passed to the parsers' start_requests, tuple()
        @param **parser_kwargs: arguments passed to the parsers' start_requests, dict()
        ---------
        @result:
        @raise Exception: when neither table_folder nor setting.TABLE_FOLDER is set
        """

        super(Scheduler, self).__init__()

        for key, value in self.__class__.__custom_setting__.items():
            setattr(setting, key, value)

        self._table_folder = table_folder or setting.TABLE_FOLDER
        if not self._table_folder:
            raise Exception(
                """
                table_folder 为redis中存放request与item的目录。不能为空,
                可在setting中配置,如 TABLE_FOLDER = 'test'
                或spider初始化时传参, 如 TestSpider(table_folder='test')
                """
            )

        # Use the resolved folder from here on. The raw argument may be None
        # when the value comes from setting.TABLE_FOLDER, which used to crash
        # on .split(":") and to build table names containing "None".
        table_folder = self._table_folder

        self._request_buffer = RequestBuffer(table_folder)
        self._item_buffer = ItemBuffer(table_folder)

        self._collector = Collector(table_folder, process_num)
        self._parsers = []
        self._parser_controls = []
        self._parser_control_obj = PaserControl

        self._parser_args = parser_args
        self._parser_kwargs = parser_kwargs
        # An explicit argument (even False) wins over the setting
        self._auto_stop_when_spider_done = (
            auto_stop_when_spider_done
            if auto_stop_when_spider_done is not None
            else setting.AUTO_STOP_WHEN_SPIDER_DONE
        )
        self._auto_start_requests = (
            auto_start_requests
            if auto_start_requests is not None
            else setting.PARSER_AUTO_START_REQUESTS
        )
        self._send_run_time = send_run_time
        self._batch_interval = batch_interval

        self._begin_callback = (
            begin_callback
            if begin_callback
            else lambda: log.info("\n********** spider begin **********")
        )
        self._end_callback = (
            end_callback
            if end_callback
            else lambda: log.info("\n********** spider end **********")
        )

        self._parser_count = setting.PARSER_COUNT if not parser_count else parser_count

        self._spider_name = table_folder
        # Project name is the part of the folder before the first ":"
        self._project_name = table_folder.split(":")[0]

        self._tab_spider_time = setting.TAB_SPIDER_TIME.format(
            table_folder=table_folder
        )
        self._tab_spider_status = setting.TAB_SPIDER_STATUS.format(
            table_folder=table_folder
        )
        self._tab_requests = setting.TAB_REQUSETS.format(table_folder=table_folder)
        self._tab_failed_requests = setting.TAB_FAILED_REQUSETS.format(
            table_folder=table_folder
        )

        self._is_notify_end = False  # whether the end has already been notified
        self._last_task_count = 0  # task count seen at the most recent check
        self._redisdb = RedisDB()

        self._project_total_state_table = "{}_total_state".format(self._project_name)
        self._is_exist_project_total_state_table = False

        # Request cache settings
        Request.cached_table_folder = table_folder
        Request.cached_expire_time = setting.RESPONSE_CACHED_EXPIRE_TIME

        delete_tabs = delete_tabs or setting.DELETE_TABS
        if delete_tabs:
            self.delete_tables(delete_tabs)

        self._last_check_task_status_time = 0
Esempio n. 13
0
class Scheduler(threading.Thread):
    __custom_setting__ = {}

    def __init__(
        self,
        table_folder=None,
        parser_count=None,
        begin_callback=None,
        end_callback=None,
        delete_tabs=(),
        process_num=None,
        auto_stop_when_spider_done=None,
        auto_start_requests=None,
        send_run_time=True,
        batch_interval=0,
        *parser_args,
        **parser_kwargs
    ):
        """
        @summary: Scheduler
        ---------
        @param table_folder: redis folder holding the spider's requests and items; falls back to setting.TABLE_FOLDER
        @param parser_count: number of threads; falls back to setting.PARSER_COUNT
        @param begin_callback: callback invoked when the spider begins; defaults to a log message
        @param end_callback: callback invoked when the spider ends; defaults to a log message
        @param delete_tabs: tables to delete when the spider starts, type: tuple/bool/string, patterns supported; falls back to setting.DELETE_TABS
        @param process_num: process number, forwarded to the Collector
        @param auto_stop_when_spider_done: whether the spider stops once crawling is done or keeps waiting for tasks; falls back to setting.AUTO_STOP_WHEN_SPIDER_DONE
        @param auto_start_requests: whether the spider adds tasks automatically; falls back to setting.PARSER_AUTO_START_REQUESTS
        @param send_run_time: whether to send the run time
        @param batch_interval: crawl interval in days, default 0. On repeated launches the spider only starts when the time since the end of the previous crawl exceeds this interval

        @param *parser_args: arguments passed to the parsers' start_requests, tuple()
        @param **parser_kwargs: arguments passed to the parsers' start_requests, dict()
        ---------
        @result:
        @raise Exception: when neither table_folder nor setting.TABLE_FOLDER is set
        """

        super(Scheduler, self).__init__()

        for key, value in self.__class__.__custom_setting__.items():
            setattr(setting, key, value)

        self._table_folder = table_folder or setting.TABLE_FOLDER
        if not self._table_folder:
            raise Exception(
                """
                table_folder 为redis中存放request与item的目录。不能为空,
                可在setting中配置,如 TABLE_FOLDER = 'test'
                或spider初始化时传参, 如 TestSpider(table_folder='test')
                """
            )

        # Use the resolved folder from here on. The raw argument may be None
        # when the value comes from setting.TABLE_FOLDER, which used to crash
        # on .split(":") and to build table names containing "None".
        table_folder = self._table_folder

        self._request_buffer = RequestBuffer(table_folder)
        self._item_buffer = ItemBuffer(table_folder)

        self._collector = Collector(table_folder, process_num)
        self._parsers = []
        self._parser_controls = []
        self._parser_control_obj = PaserControl

        self._parser_args = parser_args
        self._parser_kwargs = parser_kwargs
        # An explicit argument (even False) wins over the setting
        self._auto_stop_when_spider_done = (
            auto_stop_when_spider_done
            if auto_stop_when_spider_done is not None
            else setting.AUTO_STOP_WHEN_SPIDER_DONE
        )
        self._auto_start_requests = (
            auto_start_requests
            if auto_start_requests is not None
            else setting.PARSER_AUTO_START_REQUESTS
        )
        self._send_run_time = send_run_time
        self._batch_interval = batch_interval

        self._begin_callback = (
            begin_callback
            if begin_callback
            else lambda: log.info("\n********** spider begin **********")
        )
        self._end_callback = (
            end_callback
            if end_callback
            else lambda: log.info("\n********** spider end **********")
        )

        self._parser_count = setting.PARSER_COUNT if not parser_count else parser_count

        self._spider_name = table_folder
        # Project name is the part of the folder before the first ":"
        self._project_name = table_folder.split(":")[0]

        self._tab_spider_time = setting.TAB_SPIDER_TIME.format(
            table_folder=table_folder
        )
        self._tab_spider_status = setting.TAB_SPIDER_STATUS.format(
            table_folder=table_folder
        )
        self._tab_requests = setting.TAB_REQUSETS.format(table_folder=table_folder)
        self._tab_failed_requests = setting.TAB_FAILED_REQUSETS.format(
            table_folder=table_folder
        )

        self._is_notify_end = False  # whether the end has already been notified
        self._last_task_count = 0  # task count seen at the most recent check
        self._redisdb = RedisDB()

        self._project_total_state_table = "{}_total_state".format(self._project_name)
        self._is_exist_project_total_state_table = False

        # Request cache settings
        Request.cached_table_folder = table_folder
        Request.cached_expire_time = setting.RESPONSE_CACHED_EXPIRE_TIME

        delete_tabs = delete_tabs or setting.DELETE_TABS
        if delete_tabs:
            self.delete_tables(delete_tabs)

        self._last_check_task_status_time = 0

    def add_parser(self, parser):
        """
        Instantiate ``parser`` and register the instance with this scheduler.

        @param parser: callable (normally a parser class) invoked without arguments
        @raise ValueError: when the created object is not a BaseParse instance
        """
        instance = parser()  # instantiate the parser
        if not isinstance(instance, BaseParse):
            raise ValueError("parser 必须继承spider.core.base_parser.BaseParser")

        self._parsers.append(instance)

    def run(self):
        """
        Thread entry point: start all workers, then poll their state once per
        second until the spider is done (when auto-stop is enabled).

        Does nothing when ``is_reach_next_spider_time()`` is falsy.
        """
        if not self.is_reach_next_spider_time():
            return

        self._start()

        while True:
            if not self.all_thread_is_done():
                # Work showed up again, so the next completion is notified anew
                self._is_notify_end = False
            else:
                if not self._is_notify_end:
                    self.spider_end()  # one round finished
                    self.record_spider_state(
                        spider_type=1,
                        state=1,
                        spider_end_time=tools.get_current_date(),
                        batch_interval=self._batch_interval,
                    )

                    self._is_notify_end = True

                if self._auto_stop_when_spider_done:
                    self._stop_all_thread()
                    break

            self.check_task_status()

            # Check the spider state once per second
            tools.delay_time(1)

    def _start(self):
        """
        Optionally feed the parsers' start requests into the buffers, then
        start the collector, the parser controls and both buffers.

        The start requests are only generated when ``_auto_start_requests`` is
        truthy, the redis lock keyed on the spider name was obtained, and the
        collector reports no pending requests.

        Raises Exception when a parser's ``start_requests`` returns a truthy
        non-iterable, and TypeError when it yields something that is neither a
        Request, an Item nor a callable.
        """
        if self._auto_start_requests:
            # Lock the task-adding section so that several processes do not
            # add the same tasks twice
            with RedisLock(
                key=self._spider_name,
                timeout=3600,
                wait_timeout=60,
                redis_uri="redis://:{password}@{host_post}/{db}".format(
                    password=setting.REDISDB_USER_PASS,
                    host_post=setting.REDISDB_IP_PORTS,
                    db=setting.REDISDB_DB,
                ),
            ) as lock:
                if lock.locked:

                    # Run the parsers' start_requests
                    self.spider_begin()  # for spiders that do not stop automatically this runs only once
                    self.record_spider_state(
                        spider_type=1,
                        state=0,
                        batch_date=tools.get_current_date(),
                        spider_start_time=tools.get_current_date(),
                        batch_interval=self._batch_interval,
                    )

                    # If the task pool still holds tasks, continue with those
                    # instead of generating new ones
                    todo_task_count = self._collector.get_requests_count()
                    if todo_task_count:
                        log.info(
                            "检查到有待做任务 %s 条,不重下发新任务。将接着上回异常终止处继续抓取" % todo_task_count
                        )
                    else:
                        for parser in self._parsers:
                            results = parser.start_requests(
                                *self._parser_args, **self._parser_kwargs
                            )
                            # Requests go to the request buffer, which stores
                            # them in bulk
                            if results and not isinstance(results, Iterable):
                                raise Exception(
                                    "%s.%s返回值必须可迭代" % (parser.name, "start_requests")
                                )

                            # Remembers which buffer the latest Request/Item
                            # went to (1 = request buffer, 2 = item buffer) so
                            # that a following callable joins the same buffer
                            result_type = 1
                            for result in results or []:
                                if isinstance(result, Request):
                                    result.parser_name = (
                                        result.parser_name or parser.name
                                    )
                                    self._request_buffer.put_request(result)
                                    result_type = 1

                                elif isinstance(result, Item):
                                    self._item_buffer.put_item(result)
                                    result_type = 2

                                elif callable(result):  # a callable may be a function that updates the database
                                    if result_type == 1:
                                        self._request_buffer.put_request(result)
                                    else:
                                        self._item_buffer.put_item(result)
                                else:
                                    raise TypeError(
                                        "start_requests yield result type error, expect Request、Item、callback func, bug get type: {}".format(
                                            type(result)
                                        )
                                    )

                            self._request_buffer.flush()
                            self._item_buffer.flush()

        # Start the collector
        self._collector.start()

        # Start the parser controls
        for i in range(self._parser_count):
            parser_control = self._parser_control_obj(
                self._collector,
                self._table_folder,
                self._request_buffer,
                self._item_buffer,
            )

            for parser in self._parsers:
                parser_control.add_parser(parser)

            parser_control.start()
            self._parser_controls.append(parser_control)

        # Start the request_buffer
        self._request_buffer.start()

        # Start the item_buffer
        self._item_buffer.start()

    def all_thread_is_done(self):
        """
        Report whether collector, parser controls and both buffers are idle.

        The state is sampled three times, one second apart, and any busy
        component ends the check with False. The stages do not run in lockstep,
        so a single sample can easily hit a moment where everything only looks
        idle.

        @return: True when all three samples found every component idle
        """
        for _ in range(3):
            # Collector state
            collector = self._collector
            if collector.is_collector_task() or collector.get_requests_count() > 0:
                return False

            # Parser control state
            if any(
                not control.is_not_task() for control in self._parser_controls
            ):
                return False

            # Item buffer state
            item_buffer = self._item_buffer
            if item_buffer.get_items_count() > 0 or item_buffer.is_adding_to_db():
                return False

            # Request buffer state
            request_buffer = self._request_buffer
            if (
                request_buffer.get_requests_count() > 0
                or request_buffer.is_adding_to_db()
            ):
                return False

            tools.delay_time(1)

        return True

    @tools.run_safe_model("check_task_status")
    def check_task_status(self):
        """
        Check the task state and send alerts.

        Runs at most once every 60 seconds and performs three checks:
        a task count in redis that has stopped changing, the number of failed
        requests, and the success rate reported by PaserControl.
        """
        # Check at most once per minute
        now_time = time.time()
        if now_time - self._last_check_task_status_time > 60:
            self._last_check_task_status_time = now_time
        else:
            return

        # Check the task count in redis; if it has not changed for 20 minutes
        # (the parsers may be stuck), send an alert
        task_count = self._redisdb.zget_count(self._tab_requests)

        if task_count:
            if task_count != self._last_task_count:
                self._last_task_count = task_count
                self._redisdb.hset(
                    self._tab_spider_time,
                    SPIDER_LAST_TASK_COUNT_RECORD_TIME_KEY,
                    tools.get_current_timestamp(),
                )  # several processes would send duplicate messages, so the last record time is kept in redis
            else:
                # Check whether the interval exceeds 20 minutes (1200 seconds).
                # The script returns the stall time in seconds, or 0; when no
                # timestamp is stored yet it stores the current one.
                lua = """
                    local key = KEYS[1]
                    local field = ARGV[1]
                    local current_timestamp = ARGV[2]

                    -- 取值
                    local last_timestamp = redis.call('hget', key, field)
                    if last_timestamp and current_timestamp - last_timestamp >= 1200 then
                        return current_timestamp - last_timestamp -- 返回任务停滞时间 秒
                    end

                    if not last_timestamp then
                        redis.call('hset', key, field, current_timestamp)
                    end

                    return 0

                """
                redis_obj = self._redisdb.get_redis_obj()
                cmd = redis_obj.register_script(lua)
                overtime = cmd(
                    keys=[self._tab_spider_time],
                    args=[
                        SPIDER_LAST_TASK_COUNT_RECORD_TIME_KEY,
                        tools.get_current_timestamp(),
                    ],
                )

                if overtime:
                    # Send the alert
                    msg = "《{}》爬虫任务停滞 {},请检查爬虫是否正常".format(
                        self._spider_name, tools.format_seconds(overtime)
                    )
                    log.error(msg)
                    self.send_msg(
                        msg,
                        level="error",
                        message_prefix="《{}》爬虫任务停滞".format(self._spider_name),
                    )

        else:
            self._last_task_count = 0

        # Check the number of failed tasks; alert when it exceeds
        # setting.WARNING_FAILED_COUNT
        failed_count = self._redisdb.zget_count(self._tab_failed_requests)
        if failed_count > setting.WARNING_FAILED_COUNT:
            # Send the alert
            msg = "《%s》爬虫当前失败任务 %s, 请检查爬虫是否正常" % (self._spider_name, failed_count)
            log.error(msg)
            self.send_msg(
                msg,
                level="error",
                message_prefix="《%s》爬虫当前失败任务数预警" % (self._spider_name),
            )

        # PaserControl keeps running counts of finished and failed tasks;
        # alert when at least one task was counted and the success rate is
        # below 0.5
        failed_task_count, success_task_count = PaserControl.get_task_status_count()
        total_count = success_task_count + failed_task_count
        if total_count > 0:
            task_success_rate = success_task_count / total_count
            if task_success_rate < 0.5:
                # Send the alert
                msg = "《%s》爬虫当前任务成功数%s, 失败数%s, 成功率 %.2f, 请检查爬虫是否正常" % (
                    self._spider_name,
                    success_task_count,
                    failed_task_count,
                    task_success_rate,
                )
                log.error(msg)
                # Intent per the original note: alert only when more than one
                # hour has passed since the last message (several processes
                # run this, so duplicates must be avoided)
                self.send_msg(
                    msg,
                    level="error",
                    message_prefix="《%s》爬虫当前任务成功率" % (self._spider_name),
                )

    def delete_tables(self, delete_tables_list):
        """
        Clear the redis keys matching the given pattern(s), except the spider
        time table, which is always kept.

        @param delete_tables_list: a bool (any bool selects "<table_folder>*"),
            a single pattern, or a list/tuple of patterns; the pattern "*" is
            expanded to "<table_folder>*"
        """
        if isinstance(delete_tables_list, bool):
            patterns = [self._table_folder + "*"]
        elif isinstance(delete_tables_list, (list, tuple)):
            patterns = delete_tables_list
        else:
            patterns = [delete_tables_list]

        redis_client = RedisDB()
        for pattern in patterns:
            # A bare "*" is restricted to this spider's own folder
            if pattern == "*":
                pattern = self._table_folder + "*"

            for key in redis_client.getkeys(pattern):
                if key == self._tab_spider_time:
                    continue

                log.info("正在删除表 %s" % key)
                redis_client.clear(key)

    def _stop_all_thread(self):
        """Call ``stop()`` on the buffers, the collector and every parser control."""
        # Same order as before: request buffer, item buffer, collector,
        # then the parser controls
        workers = (
            self._request_buffer,
            self._item_buffer,
            self._collector,
            *self._parser_controls,
        )
        for worker in workers:
            worker.stop()

    def send_msg(self, msg, level="debug", message_prefix=""):
        """
        @summary: Send a DingDing message, with a rate limit of 3600 passed to
        the sender
        ---------
        @param msg: message text
        @param level: accepted for callers' convenience; not forwarded to the sender
        @param message_prefix: forwarded to the sender as ``message_prefix``
        ---------
        @result:
        """

        tools.dingding_warning(
            msg,
            rate_limit=3600,
            message_prefix=message_prefix,
        )

    def spider_begin(self):
        """
        @summary: Run the begin callback and store the start time in redis
        unless one is already stored.
        When launched through start_monitor_task, this function and spider_end
        do not run in the same process, so variables cannot be shared.
        ---------
        ---------
        @result:
        """

        callback = self._begin_callback
        if callback:
            callback()

        # Keep an already recorded start time
        if self._redisdb.hexists(self._tab_spider_time, SPIDER_START_TIME_KEY):
            return

        started_at = tools.get_current_timestamp()
        self._redisdb.hset(self._tab_spider_time, SPIDER_START_TIME_KEY, started_at)

    def spider_end(self):
        """
        Finish a crawl round: record the end time, run the end callback, close
        every parser, report the elapsed time and, for auto-stopping spiders,
        delete the spider status table.
        """
        self.record_end_time()

        callback = self._end_callback
        if callback:
            callback()

        for parser in self._parsers:
            parser.close()
            parser.end_callback()

        # Work out how long the crawl took (the start time is fetched with
        # is_pop=True)
        start_raw = self._redisdb.hget(
            self._tab_spider_time, SPIDER_START_TIME_KEY, is_pop=True
        )
        if start_raw:
            started_at = int(start_raw)
            elapsed = tools.get_current_timestamp() - started_at

            msg = "《%s》爬虫结束,耗时 %s" % (
                self._spider_name,
                tools.format_seconds(elapsed),
            )
            log.info(msg)

            if self._send_run_time:
                self.send_msg(msg)

        if self._auto_stop_when_spider_done:
            self.delete_tables(self._tab_spider_status)
        else:
            log.info("爬虫不自动结束, 等待下一轮任务...")

    def record_end_time(self):
        """Store the current timestamp as end time; skipped when no batch interval is set."""
        if not self._batch_interval:
            return

        ended_at = tools.get_current_timestamp()
        self._redisdb.hset(self._tab_spider_time, SPIDER_END_TIME_KEY, ended_at)

    def is_reach_next_spider_time(self):
        """
        Tell whether enough time has passed since the last recorded end time.

        @return: True when no batch interval is set, when no end time is
            recorded, or when at least ``_batch_interval`` days (86400 seconds
            each) have passed; False otherwise, after logging the reason
        """
        if not self._batch_interval:
            return True

        stored_end_time = self._redisdb.hget(
            self._tab_spider_time, SPIDER_END_TIME_KEY
        )
        if not stored_end_time:
            return True

        ended_at = int(stored_end_time)
        now = tools.get_current_timestamp()
        waited = now - ended_at

        if waited >= self._batch_interval * 86400:
            return True

        log.info(
            "上次运行结束时间为 {} 与当前时间间隔 为 {}, 小于规定的抓取时间间隔 {}。爬虫不执行,退出~".format(
                tools.timestamp_to_date(ended_at),
                tools.format_seconds(waited),
                tools.format_seconds(self._batch_interval * 86400),
            )
        )
        return False

    def record_spider_state(
        self,
        spider_type,
        state,
        batch_date=None,
        spider_start_time=None,
        spider_end_time=None,
        batch_interval=None,
    ):
        """
        Placeholder that does nothing and returns None.

        @param spider_type: unused here
        @param state: unused here
        @param batch_date: unused here
        @param spider_start_time: unused here
        @param spider_end_time: unused here
        @param batch_interval: unused here
        """
        return None
Esempio n. 14
0
class RequestBuffer(threading.Thread, Singleton):
    """Thread that buffers requests in memory and writes them to redis in batches.

    Requests (or callables) are queued with ``put_request``; requests to be
    removed from the request table are queued with ``put_del_request``. The
    thread body, or an explicit ``flush()``, drains both queues.
    """

    # De-duplication filter shared by the class; created by the first instance
    # when setting.REQUEST_FILTER_ENABLE is truthy.
    dedup = None

    def __init__(self, table_folder):
        """
        @param table_folder: value substituted into the request / failed-request
            table name templates; also used as the dedup filter name
        """
        # Initialise only once per object: __init__ may run again on an already
        # initialised instance (presumably because Singleton returns the same
        # object - confirm against its implementation).
        if not hasattr(self, "_requests_deque"):
            super(RequestBuffer, self).__init__()

            self._thread_stop = False
            self._is_adding_to_db = False

            self._requests_deque = collections.deque()
            self._del_requests_deque = collections.deque()
            self._db = RedisDB()

            self._table_request = setting.TAB_REQUSETS.format(
                table_folder=table_folder)
            self._table_failed_request = setting.TAB_FAILED_REQUSETS.format(
                table_folder=table_folder)

            if not self.__class__.dedup and setting.REQUEST_FILTER_ENABLE:
                self.__class__.dedup = Dedup(
                    filter_type=Dedup.ExpireFilter,
                    name=table_folder,
                    expire_time=2592000,
                    to_md5=False,
                )  # expire after one month (2592000 s)

    def run(self):
        """Thread body: drain the buffers repeatedly until ``stop()`` is called."""
        while not self._thread_stop:
            try:
                self.__add_request_to_db()
            except Exception as e:
                log.exception(e)

            tools.delay_time(1)

    def stop(self):
        """Ask the ``run`` loop to exit after its current iteration."""
        self._thread_stop = True

    def put_request(self, request):
        """Queue a request (or a callable) for the next write to redis."""
        self._requests_deque.append(request)

        if self.get_requests_count() > MAX_URL_COUNT:  # buffer is over its limit: write now
            self.flush()

    def put_del_request(self, request):
        """Queue a request for removal from the request table."""
        self._del_requests_deque.append(request)

    def put_failed_request(self, request, table=None):
        """Write a failed request straight to redis; errors are logged, not raised.

        @param request: the failed request; its ``to_dict`` and ``priority`` are stored
        @param table: target table, defaults to the failed-request table
        """
        try:
            request_dict = request.to_dict
            self._db.zadd(table or self._table_failed_request, request_dict,
                          request.priority)
        except Exception as e:
            log.exception(e)

    def flush(self):
        """Drain the buffers immediately; errors are logged, not raised."""
        try:
            self.__add_request_to_db()
        except Exception as e:
            log.exception(e)

    def get_requests_count(self):
        """Return the number of queued requests that are not yet written."""
        return len(self._requests_deque)

    def is_adding_to_db(self):
        """Return True while queued requests are being written."""
        return self._is_adding_to_db

    def __add_request_to_db(self):
        """Write queued requests, run queued callbacks, then remove done requests.

        The ``_is_adding_to_db`` flag is reset in a ``finally`` clause so that a
        failing redis or dedup call cannot leave it stuck at True. The exception
        itself still propagates to the caller, which logs it.
        """
        request_list = []
        prioritys = []
        callbacks = []

        try:
            while self._requests_deque:
                request = self._requests_deque.popleft()
                self._is_adding_to_db = True

                if callable(request):
                    # A function. Mind closures: write them as
                    #     def test(xxx=xxx):
                    #         ...  # business logic using xxx
                    # so that xxx is not the last value of the producing loop.
                    callbacks.append(request)
                    continue

                priority = request.priority

                # Skip the request when filtering is requested and the
                # fingerprint is already known to the dedup filter.
                if (request.filter_repeat and setting.REQUEST_FILTER_ENABLE
                        and not self.__class__.dedup.add(request.fingerprint)):
                    log.debug("request已存在  url = %s" % request.url)
                    continue
                else:
                    request_list.append(str(request.to_dict))
                    prioritys.append(priority)

                # Write intermediate batches so a single zadd stays bounded.
                if len(request_list) > MAX_URL_COUNT:
                    self._db.zadd(self._table_request, request_list, prioritys)
                    request_list = []
                    prioritys = []

            # Write the remaining batch.
            if request_list:
                self._db.zadd(self._table_request, request_list, prioritys)

            # Run the callbacks; one failing callback must not stop the others.
            for callback in callbacks:
                try:
                    callback()
                except Exception as e:
                    log.exception(e)

            # Remove the requests that have been done.
            if self._del_requests_deque:
                request_done_list = []
                while self._del_requests_deque:
                    request_done_list.append(self._del_requests_deque.popleft())

                # Drop entries that were just added, otherwise a freshly added
                # request could be deleted again.
                # NOTE(review): only the last written batch is still held in
                # request_list at this point - confirm that is sufficient.
                request_done_list = list(
                    set(request_done_list) - set(request_list))

                if request_done_list:
                    self._db.zrem(self._table_request, request_done_list)
        finally:
            self._is_adding_to_db = False
Esempio n. 15
0
class ExportData(object):
    """Copies item data from redis into a relational database through MysqlDB.

    Throughout this class ``add_batch`` returning ``None`` is treated as a
    failed write; any other value (including 0) is an affected-row count.
    """

    def __init__(self):
        self._redisdb = RedisDB()
        self._to_db = MysqlDB()

    def export(self, from_table, to_table, auto_update=False, batch_count=100):
        """
        @summary:
        Export item data from redis into a relational database such as mysql/oracle.
        from_table and to_table must have the same structure.
        ---------
        @param from_table: redis table to read from
        @param to_table: database table to write to
        @param auto_update: whether to update rows that already exist, default False
        @param batch_count: number of entries fetched per round
        ---------
        @result:
        """
        total_count = 0

        while True:
            datas = []
            try:
                datas = self._redisdb.sget(from_table,
                                           count=batch_count,
                                           is_pop=False)
                if not datas:
                    log.info("""
                        \r%s -> %s 共导出 %s 条数据""" %
                             (from_table, to_table, total_count))
                    break

                # NOTE(security): eval() executes whatever is stored in redis.
                # Only acceptable while the redis content is fully trusted.
                json_datas = [eval(data) for data in datas]
                sql, json_datas = tools.make_batch_sql(to_table, json_datas,
                                                       auto_update)
                # Compare with None rather than testing truthiness: 0 affected
                # rows (e.g. every row was a duplicate) is still a successful
                # write. Treating it as a failure left the entries in redis and
                # this loop re-read them forever.
                if self._to_db.add_batch(sql, json_datas) is not None:
                    total_count += len(json_datas)
                    self._redisdb.srem(from_table, datas)

            except Exception as e:
                log.exception(e)
                log.error(datas)

    def export_all(
        self,
        tables,
        auto_update=False,
        batch_count=100,
        every_table_per_export_callback=None,
    ):
        """
        @summary: export all items
        ---------
        @param tables: a list of redis tables, or a prefix such as qidian to export
        every key matching ``<prefix>*_item``.
        Table names must follow the naming pattern, e.g. exporting
        qidian:comment:s_qidian_book_comment_dynamic_item imports into qidian_book_comment_dynamic
        @param auto_update: whether to update existing rows
        @param batch_count: number of entries exported per round
        @every_table_per_export_callback: called before each table is exported to adjust
        to_table, auto_update, batch_count for that table only.
        e.g.
            def every_table_per_export_callback(to_table, auto_update, batch_count):
                if to_table == 'xxx':
                    auto_update = True
                return to_table, auto_update, batch_count
        ---------
        @result:
        """
        tables = (self._redisdb.getkeys(tables + "*_item")
                  if not isinstance(tables, list) else tables)
        if not tables:
            log.info("无表数据")
            return

        for table in tables:
            from_table = table
            to_table = tools.get_info(str(from_table),
                                      ":s_(.*?)_item",
                                      fetch_one=True)
            # Start every table from the caller's defaults so that an override
            # returned for one table does not leak into the following tables.
            table_auto_update = auto_update
            table_batch_count = batch_count
            if callable(every_table_per_export_callback):
                to_table, table_auto_update, table_batch_count = every_table_per_export_callback(
                    to_table, auto_update, batch_count)

            log.info("""
                \r正在导出 %s -> %s""" % (from_table, to_table))
            self.export(from_table, to_table, table_auto_update,
                        table_batch_count)

    def export_items(self, tab_item, items_data):
        """
        @summary: insert the given items into the table derived from tab_item
        ---------
        @param tab_item: name of the item table in redis
        @param items_data: [item.to_dict] data
        ---------
        @result: True when the batch insert did not fail
        """

        to_table = tools.get_info(tab_item, ":s_(.*?)_item", fetch_one=True)
        sql, datas = tools.make_batch_sql(to_table, items_data)
        add_count = self._to_db.add_batch(sql, datas)
        datas_size = len(datas)
        if add_count is None:
            log.error("导出数据到表 %s 失败" % (to_table))
        else:
            log.info("共导出 %s 条数据 到 %s, 重复 %s 条" %
                     (datas_size, to_table, datas_size - add_count))

        return add_count is not None

    def update_items(self, tab_item, items_data, update_keys=()):
        """
        @summary: insert-or-update the given items in the table derived from tab_item
        ---------
        @param tab_item: name of the item table in redis
        @param items_data: [item.to_dict] data
        @param update_keys: columns to update; defaults to every key of the first item
        ---------
        @result: True when the batch write did not fail
        """
        to_table = tools.get_info(tab_item, ":s_(.*?)_item", fetch_one=True)
        sql, datas = tools.make_batch_sql(
            to_table,
            items_data,
            update_columns=update_keys or list(items_data[0].keys()),
        )
        update_count = self._to_db.add_batch(sql, datas)
        if update_count is None:
            log.error("更新表 %s 数据失败" % (to_table))
        else:
            # The affected-row count is halved for the message; presumably the
            # database reports 2 rows per updated record - TODO confirm
            # against the SQL built by make_batch_sql.
            msg = "共更新 %s 条数据 到 %s" % (update_count // 2, to_table)
            if update_keys:
                msg += " 更新字段为 {}".format(update_keys)
            log.info(msg)

        return update_count is not None
Esempio n. 16
0
 def __init__(self):
     # Instantiate the two storage clients with their default arguments and
     # keep them on the instance: a RedisDB client and a MysqlDB client.
     self._redisdb = RedisDB()
     self._to_db = MysqlDB()
Esempio n. 17
0
    def __init__(self,
                 task_table,
                 batch_record_table,
                 batch_name,
                 batch_interval,
                 task_keys,
                 task_state="state",
                 min_task_count=10000,
                 check_task_interval=5,
                 task_limit=10000,
                 related_table_folder=None,
                 related_batch_record=None,
                 task_condition="",
                 task_order_by="",
                 table_folder=None,
                 parser_count=None,
                 begin_callback=None,
                 end_callback=None,
                 delete_tabs=(),
                 process_num=None,
                 auto_stop_when_spider_done=None,
                 send_run_time=False,
                 *parser_args,
                 **parser_kwargs):
        """
        @summary: Batch spider
        Prerequisites
        1. A task table is required
            The task table must contain an id column and a task-state column such as state.
            If a parser_name column is specified, each task is dispatched to the matching
            parser; otherwise it is dispatched to every parser. Further columns can be added
            freely for the parameters the spider needs.

            Reference DDL:
            CREATE TABLE `table_name` (
              `id` int(11) NOT NULL AUTO_INCREMENT,
              `param` varchar(1000) DEFAULT NULL COMMENT 'parameters the spider needs to fetch data',
              `state` int(11) DEFAULT NULL COMMENT 'task state',
              `parser_name` varchar(255) DEFAULT NULL COMMENT 'class name of the parser script for the task',
              PRIMARY KEY (`id`),
              UNIQUE KEY `nui` (`param`) USING BTREE
            ) ENGINE=InnoDB AUTO_INCREMENT=2 DEFAULT CHARSET=utf8;

        2. A batch record table is required; it is created automatically when missing

            Its structure is fixed. Reference DDL:
            CREATE TABLE `xxx_batch_record` (
              `id` int(11) NOT NULL AUTO_INCREMENT,
              `batch_date` date DEFAULT NULL,
              `done_count` int(11) DEFAULT NULL,
              `total_count` int(11) DEFAULT NULL,
              PRIMARY KEY (`id`)
            ) ENGINE=InnoDB AUTO_INCREMENT=1 DEFAULT CHARSET=utf8;

        ---------
        @param task_table: task table in mysql
        @param batch_record_table: batch record table in mysql
        @param batch_name: name of the batch collection program
        @param batch_interval: batch interval in days; for one batch per hour write 1/24
        @param task_keys: list of task columns to fetch; include parser_name when tasks must go to a specific parser
        @param task_state: name of the state column in the mysql task table
        @param min_task_count: minimum number of tasks in redis; below it tasks are fetched from the mysql task table
        @param check_task_interval: interval between checks for remaining tasks
        @param task_limit: number of tasks fetched from the database at a time
        @param table_folder: redis folder that holds the spider's requests and items
        @param parser_count: number of threads, defaults to the value in the settings file
        @param begin_callback: callback run when the spider starts
        @param end_callback: callback run when the spider ends
        @param delete_tabs: tables deleted when the spider starts, a tuple; regular expressions are supported
        @param process_num: number of processes
        @param auto_stop_when_spider_done: whether the spider stops by itself when done or waits for tasks; stops by default
        @param send_run_time: whether to send the run time
        @param related_table_folder: task table (redis) of another, related spider
        @param related_batch_record: batch table (mysql) of another, related spider. Avoid cycles such as
            A -> B & B -> A; a cycle can be expressed with related_table_folder instead.
            Configure either related_table_folder or related_batch_record.
            If the related spider is a batch spider, related_batch_record is recommended;
            if it is a plain spider without a batch table, use related_table_folder.
        @param task_condition: task condition used to pick this spider's tasks out of a large task table, i.e. the clause after where
        @param task_order_by: ordering used when fetching tasks, e.g. id desc


        @param *parser_args: arguments passed to the parser's start_requests, tuple()
        @param **parser_kwargs: arguments passed to the parser's start_requests, dict()
        ---------
        @result:
        """
        Scheduler.__init__(
            self,
            table_folder=table_folder,
            parser_count=parser_count,
            begin_callback=begin_callback,
            end_callback=end_callback,
            delete_tabs=delete_tabs,
            process_num=process_num,
            auto_stop_when_spider_done=auto_stop_when_spider_done,
            auto_start_requests=False,
            send_run_time=send_run_time,
            batch_interval=batch_interval,
            *parser_args,
            **parser_kwargs)

        self._redisdb = RedisDB()
        self._mysqldb = MysqlDB()

        self._request_buffer = RequestBuffer(self._table_folder)

        self._task_table = task_table  # task table in mysql
        self._batch_record_table = batch_record_table  # batch record table in mysql
        self._batch_name = batch_name  # name of the batch collection program
        self._task_keys = task_keys  # task columns to fetch

        self._task_state = task_state  # name of the state column in the task table
        self._min_task_count = min_task_count  # minimum number of tasks in redis
        self._check_task_interval = check_task_interval
        self._task_limit = task_limit  # number of tasks fetched from mysql at a time

        # The spider's own request table must be checked for pending tasks too.
        # Build its name from self._table_folder, the value already used for the
        # request buffer above, and not from the raw `table_folder` argument,
        # which is None when the caller omits it.
        self._related_task_tables = [
            setting.TAB_REQUSETS.format(table_folder=self._table_folder)
        ]
        if related_table_folder:
            self._related_task_tables.append(
                setting.TAB_REQUSETS.format(table_folder=related_table_folder))

        self._related_batch_record = related_batch_record
        # Pre-rendered SQL fragments; each stays "" when its option is empty.
        self._task_condition_prefix_and = task_condition and " and {}".format(
            task_condition)
        self._task_condition_prefix_where = task_condition and " where {}".format(
            task_condition)
        self._task_order_by = task_order_by and " order by {}".format(
            task_order_by)

        self._batch_date_cache = None
        # Granularity of the batch date follows the batch interval (in days):
        # one day or more -> day, one hour (1/24) or more -> hour, else minute.
        if self._batch_interval >= 1:
            self._date_format = "%Y-%m-%d"
        elif self._batch_interval >= 1 / 24:
            self._date_format = "%Y-%m-%d %H"
        else:
            self._date_format = "%Y-%m-%d %H:%M"

        # Alerting
        self._send_msg_interval = datetime.timedelta(hours=1)  # send an alert at most once per hour
        self._last_send_msg_time = None

        self._spider_last_done_time = None  # time of the latest done-count sample
        self._spider_last_done_count = 0  # latest sampled number of done tasks
        self._spider_deal_speed_cached = None

        self._is_more_parsers = True  # spider with several parser templates
Esempio n. 18
0
class BatchSpider(BatchParser, Scheduler):
    def __init__(self,
                 task_table,
                 batch_record_table,
                 batch_name,
                 batch_interval,
                 task_keys,
                 task_state="state",
                 min_task_count=10000,
                 check_task_interval=5,
                 task_limit=10000,
                 related_table_folder=None,
                 related_batch_record=None,
                 task_condition="",
                 task_order_by="",
                 table_folder=None,
                 parser_count=None,
                 begin_callback=None,
                 end_callback=None,
                 delete_tabs=(),
                 process_num=None,
                 auto_stop_when_spider_done=None,
                 send_run_time=False,
                 *parser_args,
                 **parser_kwargs):
        """
        @summary: Batch spider
        Prerequisites
        1. A task table is required
            The task table must contain an id column and a task-state column such as state.
            If a parser_name column is specified, each task is dispatched to the matching
            parser; otherwise it is dispatched to every parser. Further columns can be added
            freely for the parameters the spider needs.

            Reference DDL:
            CREATE TABLE `table_name` (
              `id` int(11) NOT NULL AUTO_INCREMENT,
              `param` varchar(1000) DEFAULT NULL COMMENT 'parameters the spider needs to fetch data',
              `state` int(11) DEFAULT NULL COMMENT 'task state',
              `parser_name` varchar(255) DEFAULT NULL COMMENT 'class name of the parser script for the task',
              PRIMARY KEY (`id`),
              UNIQUE KEY `nui` (`param`) USING BTREE
            ) ENGINE=InnoDB AUTO_INCREMENT=2 DEFAULT CHARSET=utf8;

        2. A batch record table is required; it is created automatically when missing

            Its structure is fixed. Reference DDL:
            CREATE TABLE `xxx_batch_record` (
              `id` int(11) NOT NULL AUTO_INCREMENT,
              `batch_date` date DEFAULT NULL,
              `done_count` int(11) DEFAULT NULL,
              `total_count` int(11) DEFAULT NULL,
              PRIMARY KEY (`id`)
            ) ENGINE=InnoDB AUTO_INCREMENT=1 DEFAULT CHARSET=utf8;

        ---------
        @param task_table: task table in mysql
        @param batch_record_table: batch record table in mysql
        @param batch_name: name of the batch collection program
        @param batch_interval: batch interval in days; for one batch per hour write 1/24
        @param task_keys: list of task columns to fetch; include parser_name when tasks must go to a specific parser
        @param task_state: name of the state column in the mysql task table
        @param min_task_count: minimum number of tasks in redis; below it tasks are fetched from the mysql task table
        @param check_task_interval: interval between checks for remaining tasks
        @param task_limit: number of tasks fetched from the database at a time
        @param table_folder: redis folder that holds the spider's requests and items
        @param parser_count: number of threads, defaults to the value in the settings file
        @param begin_callback: callback run when the spider starts
        @param end_callback: callback run when the spider ends
        @param delete_tabs: tables deleted when the spider starts, a tuple; regular expressions are supported
        @param process_num: number of processes
        @param auto_stop_when_spider_done: whether the spider stops by itself when done or waits for tasks; stops by default
        @param send_run_time: whether to send the run time
        @param related_table_folder: task table (redis) of another, related spider
        @param related_batch_record: batch table (mysql) of another, related spider. Avoid cycles such as
            A -> B & B -> A; a cycle can be expressed with related_table_folder instead.
            Configure either related_table_folder or related_batch_record.
            If the related spider is a batch spider, related_batch_record is recommended;
            if it is a plain spider without a batch table, use related_table_folder.
        @param task_condition: task condition used to pick this spider's tasks out of a large task table, i.e. the clause after where
        @param task_order_by: ordering used when fetching tasks, e.g. id desc


        @param *parser_args: arguments passed to the parser's start_requests, tuple()
        @param **parser_kwargs: arguments passed to the parser's start_requests, dict()
        ---------
        @result:
        """
        Scheduler.__init__(
            self,
            table_folder=table_folder,
            parser_count=parser_count,
            begin_callback=begin_callback,
            end_callback=end_callback,
            delete_tabs=delete_tabs,
            process_num=process_num,
            auto_stop_when_spider_done=auto_stop_when_spider_done,
            auto_start_requests=False,
            send_run_time=send_run_time,
            batch_interval=batch_interval,
            *parser_args,
            **parser_kwargs)

        self._redisdb = RedisDB()
        self._mysqldb = MysqlDB()

        self._request_buffer = RequestBuffer(self._table_folder)

        self._task_table = task_table  # task table in mysql
        self._batch_record_table = batch_record_table  # batch record table in mysql
        self._batch_name = batch_name  # name of the batch collection program
        self._task_keys = task_keys  # task columns to fetch

        self._task_state = task_state  # name of the state column in the task table
        self._min_task_count = min_task_count  # minimum number of tasks in redis
        self._check_task_interval = check_task_interval
        self._task_limit = task_limit  # number of tasks fetched from mysql at a time

        # The spider's own request table must be checked for pending tasks too.
        # Build its name from self._table_folder, the value already used for the
        # request buffer above and by start_monitor_task, and not from the raw
        # `table_folder` argument, which is None when the caller omits it.
        self._related_task_tables = [
            setting.TAB_REQUSETS.format(table_folder=self._table_folder)
        ]
        if related_table_folder:
            self._related_task_tables.append(
                setting.TAB_REQUSETS.format(table_folder=related_table_folder))

        self._related_batch_record = related_batch_record
        # Pre-rendered SQL fragments; each stays "" when its option is empty.
        self._task_condition_prefix_and = task_condition and " and {}".format(
            task_condition)
        self._task_condition_prefix_where = task_condition and " where {}".format(
            task_condition)
        self._task_order_by = task_order_by and " order by {}".format(
            task_order_by)

        self._batch_date_cache = None
        # Granularity of the batch date follows the batch interval (in days):
        # one day or more -> day, one hour (1/24) or more -> hour, else minute.
        if self._batch_interval >= 1:
            self._date_format = "%Y-%m-%d"
        elif self._batch_interval >= 1 / 24:
            self._date_format = "%Y-%m-%d %H"
        else:
            self._date_format = "%Y-%m-%d %H:%M"

        # Alerting
        self._send_msg_interval = datetime.timedelta(hours=1)  # send an alert at most once per hour
        self._last_send_msg_time = None

        self._spider_last_done_time = None  # time of the latest done-count sample
        self._spider_last_done_count = 0  # latest sampled number of done tasks
        self._spider_deal_speed_cached = None

        self._is_more_parsers = True  # spider with several parser templates

    def init_property(self):
        """
        Reset the attributes that have to start fresh at the beginning of every batch.
        @return:
        """
        # Number of tasks already done when the spider has just started.
        self._spider_last_done_count = 0
        self._spider_last_done_time = None

        self._last_send_msg_time = None

    def add_parser(self, parser):
        """Instantiate ``parser`` and register the instance in ``self._parsers``.

        @param parser: callable (a parser class) invoked with the task table,
            the batch record table, the state column name and the date format
        """
        parser_instance = parser(
            self._task_table,
            self._batch_record_table,
            self._task_state,
            self._date_format,
        )
        self._parsers.append(parser_instance)

    def start_monitor_task(self):
        """
        @summary: Monitor the task state and keep redis supplied with tasks.

        Makes sure the batch record table exists, calls add_task() on every
        parser, then loops: when check_batch() reports the batch as done the
        loop either ends or waits for new work; otherwise, when redis holds
        fewer than _min_task_count tasks, tasks are fetched from mysql and
        distributed. Exceptions raised inside an iteration are logged and the
        loop goes on.
        ---------
        ---------
        @result:
        """
        if not self._parsers:  # not in multi-template mode: register self as the only template
            self._is_more_parsers = False
            self._parsers.append(self)

        elif len(self._parsers) <= 1:
            self._is_more_parsers = False

        self.create_batch_record_table()

        # Add tasks
        for parser in self._parsers:
            parser.add_task()

        is_first_check = True
        while True:
            try:
                if self.check_batch(is_first_check):  # this batch is finished
                    if not self._auto_stop_when_spider_done:
                        is_first_check = True
                        log.info("爬虫所有任务已做完,不自动结束,等待新任务...")
                        time.sleep(self._check_task_interval)
                        continue
                    else:
                        break

                is_first_check = False

                # Check the tasks left in redis; fewer than _min_task_count means fetching from mysql
                tab_requests = setting.TAB_REQUSETS.format(
                    table_folder=self._table_folder)
                todo_task_count = self._redisdb.zget_count(tab_requests)

                tasks = []
                if todo_task_count < self._min_task_count:  # fetch tasks from mysql
                    # Update the task state counts in the batch table
                    self.update_task_done_count()

                    log.info("redis 中剩余任务%s 数量过小 从mysql中取任务追加" %
                             todo_task_count)
                    tasks = self.get_todo_task_from_mysql()
                    if not tasks:  # no state-0 tasks are left; check whether state-2 tasks were lost

                        if (todo_task_count == 0
                            ):  # redis holds no pending task, so state-2 tasks in mysql are lost and must be redone
                            lose_task_count = self.get_lose_task_count()

                            if not lose_task_count:
                                time.sleep(self._check_task_interval)
                                continue

                            elif (
                                    lose_task_count > self._task_limit * 5
                            ):  # too many lost tasks: reset them at once, fetching them batch by batch after redis drains would be too slow
                                log.info("正在重置丢失任务为待做 共 {} 条".format(
                                    lose_task_count))
                                # Reset the tasks marked as in progress back to pending
                                if self.reset_lose_task_from_mysql():
                                    log.info("重置丢失任务成功")
                                else:
                                    log.info("重置丢失任务失败")

                                continue

                            else:  # few lost tasks: fetch them directly
                                log.info("正在取丢失任务 共 {} 条, 取 {} 条".format(
                                    lose_task_count,
                                    self._task_limit
                                    if self._task_limit <= lose_task_count else
                                    lose_task_count,
                                ))
                                tasks = self.get_doing_task_from_mysql()

                    else:
                        log.info("mysql 中取到待做任务 %s 条" % len(tasks))

                else:
                    log.info("redis 中尚有%s条积压任务,暂时不派发新任务" % todo_task_count)

                if not tasks:
                    if todo_task_count >= self._min_task_count:
                        # Deliberately silent: redis still holds enough tasks (already logged above).
                        pass
                    else:
                        log.info("mysql 中无待做任务 redis中剩余任务 %s" %
                                 todo_task_count)
                else:
                    # make start requests
                    self.distribute_task(tasks)
                    log.info("添加任务到redis成功")

            except Exception as e:
                log.exception(e)

            time.sleep(self._check_task_interval)

    def create_batch_record_table(self):
        """Create the batch record table unless information_schema already lists it.

        The ``batch_date`` column is declared as ``date`` when the date format
        is day-granular and as ``datetime`` otherwise.
        """
        lookup_sql = (
            "select table_name from information_schema.tables where table_name like '%s'"
            % self._batch_record_table)
        if self._mysqldb.find(lookup_sql):
            return

        if self._date_format == "%Y-%m-%d":
            batch_date_type = "date"
        else:
            batch_date_type = "datetime"

        # The literal below is kept exactly as it is sent to the database.
        create_sql = """
                CREATE TABLE `{table_name}` (
                      `id` int(11) UNSIGNED NOT NULL AUTO_INCREMENT,
                      `batch_date` {batch_date} DEFAULT NULL COMMENT '批次时间',
                      `done_count` int(11) DEFAULT NULL COMMENT '完成数 (1,-1)',
                      `total_count` int(11) DEFAULT NULL COMMENT '任务总数',
                      `fail_count` int(11) DEFAULT NULL COMMENT '失败任务数 (-1)',
                      `interval` float(11) DEFAULT NULL COMMENT '批次间隔',
                      `interval_unit` varchar(20) DEFAULT NULL COMMENT '批次间隔单位 day, hour',
                      `create_time` datetime DEFAULT CURRENT_TIMESTAMP COMMENT '批次开始时间',
                      `update_time` datetime DEFAULT CURRENT_TIMESTAMP COMMENT '本条记录更新时间',
                      `is_done` int(11) DEFAULT '0' COMMENT '批次是否完成 0 未完成  1 完成',
                      PRIMARY KEY (`id`)
                    ) ENGINE=InnoDB AUTO_INCREMENT=1 DEFAULT CHARSET=utf8;
            """.format(table_name=self._batch_record_table,
                       batch_date=batch_date_type)

        self._mysqldb.execute(create_sql)

    def distribute_task(self, tasks):
        """
        @summary: Distribute tasks to the parsers and buffer what they produce
        ---------
        @param tasks: iterable of tasks; in multi-parser mode each task is
            handed to the first parser whose name is contained in the task,
            otherwise every task is handed to every parser
        ---------
        @result:
        """
        if self._is_more_parsers:  # multi-template spider: dispatch to the designated parser
            for task in tasks:
                for parser in self._parsers:  # look for the parser that belongs to the task
                    if parser.name in task:
                        self._buffer_start_requests(parser, task,
                                                    reject_unknown=True)
                        break

        else:  # no parser is designated: hand the task to every parser
            for task in tasks:
                for parser in self._parsers:
                    # This path has always ignored unsupported objects
                    # silently; that behaviour is kept.
                    self._buffer_start_requests(parser, task,
                                                reject_unknown=False)

        self._request_buffer.flush()
        self._item_buffer.flush()

    def _buffer_start_requests(self, parser, task, reject_unknown):
        """Run ``parser.start_requests(task)`` and route every result to a buffer.

        Request objects go to the request buffer and Item objects to the item
        buffer. A callable follows whichever kind was seen last (the request
        buffer when none was seen yet).

        @param parser: parser whose start_requests is called
        @param task: task passed to start_requests
        @param reject_unknown: raise TypeError for any other kind of object
            when True, skip it silently when False
        @raise Exception: start_requests returned a truthy, non-iterable value
        @raise TypeError: unsupported object while reject_unknown is True
        """
        requests = parser.start_requests(task)
        if requests and not isinstance(requests, Iterable):
            raise Exception("%s.%s返回值必须可迭代" %
                            (parser.name, "start_requests"))

        result_type = 1  # 1: last result was a Request, 2: last result was an Item
        for request in requests or []:
            if isinstance(request, Request):
                request.parser_name = request.parser_name or parser.name
                self._request_buffer.put_request(request)
                result_type = 1

            elif isinstance(request, Item):
                self._item_buffer.put_item(request)
                result_type = 2

                if self._item_buffer.get_items_count() >= MAX_ITEM_COUNT:
                    self._item_buffer.flush()

            elif callable(request):  # a callable may be a function that updates the database
                if result_type == 1:
                    self._request_buffer.put_request(request)
                else:
                    self._item_buffer.put_item(request)

                    if self._item_buffer.get_items_count() >= MAX_ITEM_COUNT:
                        self._item_buffer.flush()

            elif reject_unknown:
                # Report the type of the offending element, not of the iterable.
                raise TypeError(
                    "start_requests yield result type error, expect Request、Item、callback func, but got type: {}"
                    .format(type(request)))

    def __get_task_state_count(self):
        """Count the tasks in the task table, grouped by their state.

        @return: dict with ``total_count`` (all rows), ``done_count`` (state 1
            or -1) and ``failed_count`` (state -1)
        """
        sql = "select {state}, count(1) from {task_table}{task_condition} group by {state}".format(
            state=self._task_state,
            task_table=self._task_table,
            task_condition=self._task_condition_prefix_where,
        )
        rows = self._mysqldb.find(sql)

        total = 0
        done = 0
        failed = 0
        for state, count in rows:
            total += count
            if state in (1, -1):
                done += count
            if state == -1:
                failed += count

        return {
            "total_count": total,
            "done_count": done,
            "failed_count": failed,
        }

    def update_task_done_count(self):
        """
        @summary: Write the current task counts into the batch table row of this batch
        ---------
        ---------
        @result:
        """
        counts = self.__get_task_state_count()

        # An interval of one day or more is stored in days, a shorter one in hours.
        if self._batch_interval >= 1:
            interval_value = self._batch_interval
            interval_unit = "day"
        else:
            interval_value = self._batch_interval * 24
            interval_unit = "hour"

        # Update the batch table
        sql = "update {} set done_count = {}, total_count = {}, fail_count = {}, update_time = CURRENT_TIME, is_done=0, `interval` = {}, interval_unit = '{}' where batch_date = '{}'".format(
            self._batch_record_table,
            counts.get("done_count"),
            counts.get("total_count"),
            counts.get("failed_count"),
            interval_value,
            interval_unit,
            self.batch_date,
        )
        self._mysqldb.update(sql)

    def update_is_done(self):
        """Set ``is_done = 1`` on this batch's record if it is still 0."""
        table = self._batch_record_table
        mark_done_sql = "update {} set is_done = 1, update_time = CURRENT_TIME where batch_date = '{}' and is_done = 0".format(
            table, self.batch_date)
        self._mysqldb.update(mark_done_sql)

    def get_todo_task_from_mysql(self):
        """
        @summary: Fetch the pending tasks (state 0) and mark them as in progress (state 2)
        ---------
        ---------
        @result: the rows returned by the task query (whatever self._mysqldb.find gives back).
                 The first column of every row is used as the task id when updating the state.
        """
        # TODO fetch in batches of at most 1000000 rows to keep memory usage bounded
        # Query the tasks
        sql = "select %s from %s where %s = 0%s%s limit %s" % (
            ", ".join(self._task_keys),
            self._task_table,
            self._task_state,
            self._task_condition_prefix_and,
            self._task_order_by,
            self._task_limit,
        )
        tasks = self._mysqldb.find(sql)

        if tasks:
            # Update the task state, 10000 ids per statement
            batch_size = 10000
            for start in range(0, len(tasks), batch_size):
                # Build the IN list explicitly. The former
                # str(tuple(ids)).replace(",)", ")") trick also rewrote any id
                # whose repr contains ",)", silently corrupting the id list.
                task_ids = "(%s)" % ", ".join(
                    repr(task[0]) for task in tasks[start:start + batch_size])
                sql = "update %s set %s = 2 where id in %s" % (
                    self._task_table,
                    self._task_state,
                    task_ids,
                )
                self._mysqldb.update(sql)

        return tasks

    def get_doing_task_from_mysql(self):
        """
        @summary: Fetch the tasks that are currently in progress (state 2)
        ---------
        ---------
        @result: the rows returned by the task query
        """
        columns = ", ".join(self._task_keys)

        # Same query shape as the pending-task lookup, but filtered on state 2
        query = "select %s from %s where %s = 2%s%s limit %s" % (
            columns,
            self._task_table,
            self._task_state,
            self._task_condition_prefix_and,
            self._task_order_by,
            self._task_limit,
        )
        return self._mysqldb.find(query)

    def get_lose_task_count(self):
        """
        Return how many tasks of the latest batch are not done yet.

        The value is total_count - done_count of the newest row in the batch
        record table.

        @return: number of unfinished tasks; 0 when the query yields no batch
            row (previously this case raised IndexError / TypeError)
        """
        sql = 'select date_format(batch_date, "{date_format}"), total_count, done_count from {batch_record_table} order by id desc limit 1'.format(
            # MySQL's date_format uses %i for minutes where Python uses %M
            date_format=self._date_format.replace(":%M", ":%i"),
            batch_record_table=self._batch_record_table,
        )
        batch_info = self._mysqldb.find(sql)  # (('2018-08-19', 49686, 0),)
        if not batch_info:
            # No batch recorded: nothing can be lost
            return 0

        _, total_count, done_count = batch_info[0]
        return total_count - done_count

    def reset_lose_task_from_mysql(self):
        """
        @summary: Reset lost tasks (still flagged as in progress, state 2) back to pending (state 0)
        ---------
        ---------
        @result: whatever self._mysqldb.update returns for the statement
        """
        fields = {
            "table": self._task_table,
            "state": self._task_state,
            "task_condition": self._task_condition_prefix_and,
        }
        statement = "update {table} set {state} = 0 where {state} = 2{task_condition}".format(
            **fields)
        return self._mysqldb.update(statement)

    def get_deal_speed(self, total_count, done_count, last_batch_date):
        """
        Estimate the task processing speed.
        @param total_count: total number of tasks
        @param done_count: number of finished tasks
        @param last_batch_date: batch time, datetime
        @return:
            the cached tuple (deal_speed, need_time, overflow_time, calculate_speed_time):
            deal_speed in tasks/hour, need_time and overflow_time in seconds
            (overflow_time < 0 means finishing that many seconds early),
            calculate_speed_time as a "%Y-%m-%d %H:%M:%S" string;
            or whatever the cache held before the first measurement
        """
        if not self._spider_last_done_count:
            # No baseline yet: take the current progress as the starting point
            self._spider_last_done_count = done_count
            self._spider_last_done_time = datetime.datetime.now()

        if not done_count > self._spider_last_done_count:
            # No progress since the last sample: report the cached figures
            return self._spider_deal_speed_cached

        now_date = datetime.datetime.now()

        elapsed_seconds = (now_date - self._spider_last_done_time).total_seconds()
        finished_since_last = done_count - self._spider_last_done_count
        speed_per_second = finished_since_last / elapsed_seconds  # tasks/second
        need_time = (total_count - done_count) / speed_per_second  # seconds

        # Seconds by which the projected finish exceeds the batch interval
        overflow_time = (
            (now_date - last_batch_date).total_seconds() + need_time -
            datetime.timedelta(days=self._batch_interval).total_seconds()
        )
        # Moment at which this speed was measured
        calculate_speed_time = now_date.strftime("%Y-%m-%d %H:%M:%S")

        deal_speed = int(speed_per_second * 3600)  # tasks/hour

        # The current sample becomes the baseline for the next measurement
        self._spider_last_done_count = done_count
        self._spider_last_done_time = now_date

        self._spider_deal_speed_cached = (
            deal_speed,
            need_time,
            overflow_time,
            calculate_speed_time,
        )
        return self._spider_deal_speed_cached

    def check_batch(self, is_first_check=False):
        """
        @summary: Check whether the current batch is finished
        ---------
        @param: is_first_check whether this is the first check. If it is and the batch turns out
                to be finished, the "batch finished" message is not sent, because it was sent before
        ---------
        @result: True if finished, otherwise False.
                 NOTE(review): the code also returns False right after a new batch was recorded,
                 and several paths (batch still running, or init_task() returning False) fall off
                 the end of the function and return None.
        """

        # MySQL's date_format uses %i for minutes where Python uses %M
        sql = 'select date_format(batch_date, "{date_format}"), total_count, done_count from {batch_record_table} order by id desc limit 1'.format(
            date_format=self._date_format.replace(":%M", ":%i"),
            batch_record_table=self._batch_record_table,
        )
        batch_info = self._mysqldb.find(sql)  # (('2018-08-19', 49686, 0),)

        if batch_info:
            batch_date, total_count, done_count = batch_info[0]

            now_date = datetime.datetime.now()
            last_batch_date = datetime.datetime.strptime(
                batch_date, self._date_format)
            time_difference = now_date - last_batch_date

            if total_count == done_count and time_difference < datetime.timedelta(
                    days=self._batch_interval):  # still inside the current batch: re-check the task table for newly added tasks
                # Query the task table itself to see whether there really is nothing left,
                # because the counters in the batch record table may not be up to date yet
                task_count = self.__get_task_state_count()

                total_count = task_count.get("total_count")
                done_count = task_count.get("done_count")

            if total_count == done_count:
                # Check whether the related spiders are finished
                releated_spider_is_done = self.related_spider_is_done()
                if releated_spider_is_done == False:
                    msg = "《{}》本批次未完成, 正在等待依赖爬虫 {} 结束. 批次时间 {} 批次进度 {}/{}".format(
                        self._batch_name,
                        self._related_batch_record
                        or self._related_task_tables,
                        batch_date,
                        done_count,
                        total_count,
                    )
                    log.info(msg)
                    # Check for a timeout and raise an alarm if the batch is overdue
                    if time_difference >= datetime.timedelta(
                            days=self._batch_interval):  # already overdue
                        # Throttle alarms: at most one per self._send_msg_interval
                        if (not self._last_send_msg_time
                                or now_date - self._last_send_msg_time >=
                                self._send_msg_interval):
                            self._last_send_msg_time = now_date
                            self.send_msg(msg, level="error")

                    return False

                elif releated_spider_is_done == True:
                    # Update the is_done state
                    self.update_is_done()

                else:
                    # related_spider_is_done() returned None: this batch is marked done as well
                    self.update_is_done()

                msg = "《{}》本批次完成 批次时间 {} 共处理 {} 条任务".format(
                    self._batch_name, batch_date, done_count)
                log.info(msg)
                if not is_first_check:
                    self.send_msg(msg)

                # Decide whether the next batch is due
                if time_difference >= datetime.timedelta(
                        days=self._batch_interval):
                    msg = "《{}》下一批次开始".format(self._batch_name)
                    log.info(msg)
                    self.send_msg(msg)

                    # Initialise the task table state
                    if self.init_task() != False:  # a failed update returns False, anything else returns True/None
                        # Initialise the properties
                        self.init_property()

                        is_success = (
                            self.record_batch()
                        )  # The insert may fail after the task table was already reset; since the current time already belongs to the next batch, the batch-finished check does not look at the task table, so the next run resets it again
                        if is_success:
                            log.info(
                                "插入新批次记录成功 1分钟后开始下发任务")  # gives workers time to pick up the new batch time
                            tools.delay_time(60)

                            return False  # the next batch starts

                        else:
                            return True  # the next batch does not start. No tasks are dispatched yet because inserting the new batch record failed; dispatch only after the insert succeeds

                else:
                    log.info("《{}》下次批次时间未到".format(self._batch_name))
                    if not is_first_check:
                        self.send_msg("《{}》下次批次时间未到".format(self._batch_name))
                    return True

            else:
                if time_difference >= datetime.timedelta(
                        days=self._batch_interval):  # already overdue
                    time_out = time_difference - datetime.timedelta(
                        days=self._batch_interval)
                    time_out_pretty = tools.format_seconds(
                        time_out.total_seconds())

                    msg = "《{}》本批次已超时{} 批次时间 {}, 批次进度 {}/{}".format(
                        self._batch_name,
                        time_out_pretty,
                        batch_date,
                        done_count,
                        total_count,
                    )
                    # Intervals shorter than one day are reported in hours
                    if self._batch_interval >= 1:
                        msg += ", 期望时间{}天".format(self._batch_interval)
                    else:
                        msg += ", 期望时间{}小时".format(self._batch_interval * 24)

                    result = self.get_deal_speed(
                        total_count=total_count,
                        done_count=done_count,
                        last_batch_date=last_batch_date,
                    )
                    if result:
                        deal_speed, need_time, overflow_time, calculate_speed_time = (
                            result)
                        msg += ", 任务处理速度于{}统计, 约 {}条/小时, 预计还需 {}".format(
                            calculate_speed_time,
                            deal_speed,
                            tools.format_seconds(need_time),
                        )

                        if overflow_time > 0:
                            msg += ", 该批次预计总超时 {}, 请及时处理".format(
                                tools.format_seconds(overflow_time))

                    log.info(msg)

                    # Throttle alarms: at most one per self._send_msg_interval
                    if (not self._last_send_msg_time
                            or now_date - self._last_send_msg_time >=
                            self._send_msg_interval):
                        self._last_send_msg_time = now_date
                        self.send_msg(msg, level="error")

                else:  # not overdue yet
                    remaining_time = (
                        datetime.timedelta(days=self._batch_interval) -
                        time_difference)
                    remaining_time_pretty = tools.format_seconds(
                        remaining_time.total_seconds())

                    # Intervals shorter than one day are reported in hours
                    if self._batch_interval >= 1:
                        msg = "《{}》本批次正在进行, 批次时间 {}, 批次进度 {}/{}, 期望时间{}天, 剩余{}".format(
                            self._batch_name,
                            batch_date,
                            done_count,
                            total_count,
                            self._batch_interval,
                            remaining_time_pretty,
                        )
                    else:
                        msg = "《{}》本批次正在进行, 批次时间 {}, 批次进度 {}/{}, 期望时间{}小时, 剩余{}".format(
                            self._batch_name,
                            batch_date,
                            done_count,
                            total_count,
                            self._batch_interval * 24,
                            remaining_time_pretty,
                        )

                    result = self.get_deal_speed(
                        total_count=total_count,
                        done_count=done_count,
                        last_batch_date=last_batch_date,
                    )
                    if result:
                        deal_speed, need_time, overflow_time, calculate_speed_time = (
                            result)
                        msg += ", 任务处理速度于{}统计, 约 {}条/小时, 预计还需 {}".format(
                            calculate_speed_time,
                            deal_speed,
                            tools.format_seconds(need_time),
                        )

                        if overflow_time > 0:
                            msg += ", 该批次可能会超时 {}, 请及时处理".format(
                                tools.format_seconds(overflow_time))
                            # Send an alarm (throttled by self._send_msg_interval)
                            if (not self._last_send_msg_time
                                    or now_date - self._last_send_msg_time >=
                                    self._send_msg_interval):
                                self._last_send_msg_time = now_date
                                self.send_msg(msg, level="error")

                        elif overflow_time < 0:
                            msg += ", 该批次预计提前 {} 完成".format(
                                tools.format_seconds(-overflow_time))

                    log.info(msg)

        else:
            # No batch record found: insert the batch_date
            self.record_batch()

            # Initialise the task table state; init_task may contain code that generates tasks
            self.init_task()

            return False

    def related_spider_is_done(self):
        """
        Tell whether the spiders this batch depends on have finished.
        @return: True / False / None. None is returned when the related batch record table
                 yields no is_done value; the caller then decides from its own
                 total_count and done_count
        """
        # A related redis task table that still exists means the related spider is not done
        if any(
                self._redisdb.exists_key(table)
                for table in self._related_task_tables):
            return False

        if not self._related_batch_record:
            return True

        # Look at the newest row of the related batch record table
        sql = "select is_done from {} order by id desc limit 1".format(
            self._related_batch_record)
        rows = self._mysqldb.find(sql)
        is_done = rows[0][0] if rows else None

        if is_done is None:
            log.warning("相关联的批次表不存在或无批次信息")
            return None

        return bool(is_done)

    def record_batch(self):
        """
        @summary: Record a new batch (initialisation)
        ---------
        ---------
        @result: the value returned by the insert: None / 0 / 1 (1 means success)
        """

        # Total number of tasks in the task table
        count_sql = "select count(1) from %s%s" % (
            self._task_table,
            self._task_condition_prefix_where,
        )
        total_task_count = self._mysqldb.find(count_sql)[0][0]

        batch_date = tools.get_current_date(self._date_format)

        # Intervals shorter than one day are stored in hours
        if self._batch_interval >= 1:
            interval, interval_unit = self._batch_interval, "day"
        else:
            interval, interval_unit = self._batch_interval * 24, "hour"

        insert_sql = (
            "insert into %s (batch_date, done_count, total_count, `interval`, interval_unit, create_time) values ('%s', %s, %s, %s, '%s', CURRENT_TIME)"
            % (
                self._batch_record_table,
                batch_date,
                0,
                total_task_count,
                interval,
                interval_unit,
            ))

        affect_count = self._mysqldb.add(insert_sql)  # None / 0 / 1 (1 means success)
        if not affect_count:
            log.error("插入新批次失败")
            return affect_count

        # Switch to the new batch date
        self._batch_date_cache = batch_date
        # Refresh os.environ['batch_date'], which self.batch_date reads;
        # otherwise the date would still be that of the previous batch
        os.environ["batch_date"] = self._batch_date_cache

        # The spider begins
        self.spider_begin()
        self.record_spider_state(
            spider_type=2,
            state=0,
            batch_date=batch_date,
            spider_start_time=tools.get_current_date(),
            batch_interval=self._batch_interval,
        )

        return affect_count

    # -------- 批次结束逻辑 ------------

    def task_is_done(self):
        """
        @summary: Check whether the tasks are finished and refresh the cached batch date
                  (must not crash: if it does, the batch date is no longer refreshed)
        ---------
        ---------
        @result: True / False (finished / not finished)
        """

        is_done = False

        # Read the state of the newest row in the batch record table
        # (MySQL's date_format uses %i for minutes where Python uses %M)
        record_sql = 'select date_format(batch_date, "{date_format}"), total_count, done_count, is_done from {batch_record_table} order by id desc limit 1'.format(
            date_format=self._date_format.replace(":%M", ":%i"),
            batch_record_table=self._batch_record_table,
        )
        batch_info = self._mysqldb.find(record_sql)
        if batch_info is None:
            raise Exception("查询批次信息失败")

        if batch_info:
            # Refresh self._batch_date_cache so it does not keep the old batch
            # time after a new batch has started
            latest_batch = batch_info[0]
            self._batch_date_cache, total_count, done_count, is_done = latest_batch

            progress = (
                self._batch_name,
                self._batch_date_cache,
                done_count,
                total_count,
                is_done,
            )
            log.info("《%s》 批次时间%s 批次进度 %s/%s 完成状态 %d" % progress)
            # Propagate the batch time to BatchParser
            os.environ["batch_date"] = self._batch_date_cache

        if not is_done:
            return is_done

        # The batch record says "done": verify against the task table. The check
        # is expensive, so a lock keeps several processes from running it at once
        redis_uri = "redis://:{password}@{host_post}/{db}".format(
            password=setting.REDISDB_USER_PASS,
            host_post=setting.REDISDB_IP_PORTS,
            db=setting.REDISDB_DB,
        )
        with RedisLock(
                key=self._spider_name,
                timeout=3600,
                wait_timeout=0,
                redis_uri=redis_uri,
        ) as lock:
            if not lock.locked:
                log.info("批次表标记已完成,其他爬虫进程正在检查任务表是否有未完成的任务,本进程跳过检查,继续等待")
                is_done = False
            else:
                log.info("批次表标记已完成,正在检查任务表是否有未完成的任务")

                pending_sql = "select 1 from %s where (%s = 0 or %s=2)%s limit 1" % (
                    self._task_table,
                    self._task_state,
                    self._task_state,
                    self._task_condition_prefix_and,
                )
                pending = self._mysqldb.find(pending_sql)  # [(1,)]  / []
                if pending:
                    log.info("检测到任务表中有未完成任务,等待任务下发")
                    is_done = False

                    # Clear is_done in the batch record table so the task table
                    # is queried less often
                    reset_sql = 'update {batch_record_table} set is_done = 0 where batch_date = "{batch_date}"'.format(
                        batch_record_table=self._batch_record_table,
                        batch_date=self._batch_date_cache,
                    )
                    self._mysqldb.update(reset_sql)
                else:
                    log.info("任务表中任务均已完成,爬虫结束")

        return is_done

    def run(self):
        """
        @summary: Overrides run: keep checking whether the tasks in mysql are finished and stop once they are
        ---------
        ---------
        @result: does not return normally on failure: any exception in the main loop is
                 logged, reported through send_msg and the process exits with status 137
        """
        try:
            self.create_batch_record_table()

            if not self._parsers:  # not in add_parser mode
                self._parsers.append(self)

            self._start()

            while True:
                if (
                        self.task_is_done() and self.all_thread_is_done()
                ):  # all redis tasks and all mysql tasks are done (all_thread_is_done guards against ending while worker threads are still busy)
                    if not self._is_notify_end:
                        self.spider_end()
                        self.record_spider_state(
                            spider_type=2,
                            state=1,
                            batch_date=self._batch_date_cache,
                            spider_end_time=tools.get_current_date(),
                            batch_interval=self._batch_interval,
                        )

                        # Notify only once per finished state
                        self._is_notify_end = True

                    if self._auto_stop_when_spider_done:
                        self._stop_all_thread()
                        break
                else:
                    self._is_notify_end = False

                self.check_task_status()
                tools.delay_time(10)  # check the spider state every 10 seconds

        except Exception as e:
            msg = "《%s》主线程异常 爬虫结束 exception: %s" % (self._batch_name, e)
            # log.exception keeps the traceback, which log.error dropped
            log.exception(msg)
            self.send_msg(msg)

            # Exit status 137; presumably seen as 35072 (137 << 8) by the spider
            # manager, which uses it to restart the spider
            os._exit(137)

    @classmethod
    def to_DebugBatchSpider(cls, *args, **kwargs):
        """
        Build a DebugBatchSpider that inherits from the calling class.

        The DebugBatchSpider class itself is modified: its bases become
        (cls, ) and it takes over cls's name. All arguments are forwarded to
        the DebugBatchSpider constructor.
        """
        # Rebase DebugBatchSpider onto cls, then borrow the class name
        DebugBatchSpider.__bases__, DebugBatchSpider.__name__ = (cls, ), cls.__name__
        debug_spider = DebugBatchSpider(*args, **kwargs)
        return debug_spider
Esempio n. 19
0
    def task_is_done(self):
        """
        @summary: Check whether the tasks are finished and refresh the cached batch date
                  (must not crash: if it does, the batch date is no longer refreshed)
        ---------
        ---------
        @result: True / False (finished / not finished)
        """

        is_done = False

        # Read the state of the newest row in the batch record table
        # (MySQL's date_format uses %i for minutes where Python uses %M)
        record_sql = 'select date_format(batch_date, "{date_format}"), total_count, done_count, is_done from {batch_record_table} order by id desc limit 1'.format(
            date_format=self._date_format.replace(":%M", ":%i"),
            batch_record_table=self._batch_record_table,
        )
        batch_info = self._mysqldb.find(record_sql)
        if batch_info is None:
            raise Exception("查询批次信息失败")

        if batch_info:
            # Refresh self._batch_date_cache so it does not keep the old batch
            # time after a new batch has started
            latest_batch = batch_info[0]
            self._batch_date_cache, total_count, done_count, is_done = latest_batch

            progress = (
                self._batch_name,
                self._batch_date_cache,
                done_count,
                total_count,
                is_done,
            )
            log.info("《%s》 批次时间%s 批次进度 %s/%s 完成状态 %d" % progress)
            # Propagate the batch time to BatchParser
            os.environ["batch_date"] = self._batch_date_cache

        if not is_done:
            return is_done

        # The batch record says "done": verify against the task table. The check
        # is expensive, so a lock keeps several processes from running it at once
        with RedisLock(
                key=self._spider_name,
                timeout=3600,
                wait_timeout=0,
                redis_cli=RedisDB().get_redis_obj(),
        ) as lock:
            if not lock.locked:
                log.info("批次表标记已完成,其他爬虫进程正在检查任务表是否有未完成的任务,本进程跳过检查,继续等待")
                is_done = False
            else:
                log.info("批次表标记已完成,正在检查任务表是否有未完成的任务")

                pending_sql = "select 1 from %s where (%s = 0 or %s=2)%s limit 1" % (
                    self._task_table,
                    self._task_state,
                    self._task_state,
                    self._task_condition_prefix_and,
                )
                pending = self._mysqldb.find(pending_sql)  # [(1,)]  / []
                if pending:
                    log.info("检测到任务表中有未完成任务,等待任务下发")
                    is_done = False

                    # Clear is_done in the batch record table so the task table
                    # is queried less often
                    reset_sql = 'update {batch_record_table} set is_done = 0 where batch_date = "{batch_date}"'.format(
                        batch_record_table=self._batch_record_table,
                        batch_date=self._batch_date_cache,
                    )
                    self._mysqldb.update(reset_sql)
                else:
                    log.info("任务表中任务均已完成,爬虫结束")

        return is_done
Esempio n. 20
0
    def __init__(self, name):
        """
        Store the name and derive the name used for the cached count.

        The RedisDB handle is kept on the class (redis_db) and is created only
        if the class does not hold one yet.
        """
        self.name = name
        self.count_cached_name = name + "_count_cached"

        owner = self.__class__
        if not owner.redis_db:
            owner.redis_db = RedisDB()
Esempio n. 21
0
class Collector(threading.Thread):
    """
    Thread that moves pending requests from the redis request table into a
    local queue and reports this node's state in the spider status table.
    """

    def __init__(self, table_folder, process_num=None):
        """
        @summary:
        ---------
        @param table_folder: value substituted into the request / spider status table names
        @param process_num: process number, used to build this node's mark
        ---------
        @result:
        """

        super(Collector, self).__init__()
        self._db = RedisDB()

        self._thread_stop = False

        # Requests fetched from redis and not yet handed out
        self._todo_requests = collections.deque()

        self._tab_requests = setting.TAB_REQUSETS.format(
            table_folder=table_folder)
        self._tab_spider_status = setting.TAB_SPIDER_STATUS.format(
            table_folder=table_folder)

        # Mark of this node: host ip plus process number ("_0" when none is given)
        self._spider_mark = LOCAL_HOST_IP + ("_%s" % process_num
                                             if process_num else "_0")

        self._interval = setting.COLLECTOR_SLEEP_TIME
        self._request_count = setting.COLLECTOR_TASK_COUNT
        self._is_collector_task = False

        self._db.clear(self._tab_spider_status)

    def run(self):
        """Fetch requests every self._interval seconds until stop() is called."""
        while not self._thread_stop:

            try:
                self.__input_data()
            except Exception as e:
                log.exception(e)

            self._is_collector_task = False

            time.sleep(self._interval)

    def stop(self):
        """Ask the thread loop to end after the current iteration."""
        self._thread_stop = True

    def __input_data(self):
        """Fetch a share of the pending requests from redis into the local queue."""
        if len(self._todo_requests) >= self._request_count:
            return

        # Report this node as idle (score 0)
        self._db.zadd(self._tab_spider_status, self._spider_mark, 0)

        request_count = self._request_count
        # Share the tasks out according to the number of waiting nodes
        spider_wait_count = self._db.zget_count(self._tab_spider_status,
                                                priority_min=0,
                                                priority_max=0)
        if spider_wait_count:
            # Number of tasks
            task_count = self._db.zget_count(self._tab_requests)
            # Dynamic share = number of tasks / number of idle nodes + 1
            request_count = task_count // spider_wait_count + 1

        # Never take more than the configured amount
        request_count = min(request_count, self._request_count)

        if not request_count:
            return

        # Fetch the tasks: normal ones plus those whose score shows they timed out.
        # zrangebyscore_set_score is given the current timestamp as the new score,
        # presumably marking the fetched requests as in progress: once done they
        # are removed in request_buffer, otherwise they become eligible again
        # after REQUEST_TIME_OUT
        current_timestamp = tools.get_current_timestamp()
        priority_max = current_timestamp - setting.REQUEST_TIME_OUT
        requests_list = self._db.zrangebyscore_set_score(
            self._tab_requests,
            priority_min="-inf",
            priority_max=priority_max,
            score=current_timestamp,
            count=request_count,
        )

        if not requests_list:
            return

        self._is_collector_task = True

        # Report this node as working (score 1)
        self._db.zadd(self._tab_spider_status, self._spider_mark, 1)

        # Store the requests
        self.__put_requests(requests_list)

    def __put_requests(self, requests_list):
        """Deserialise the redis payloads and queue them; broken ones are logged and skipped."""
        for request in requests_list:
            try:
                # NOTE(security): eval executes whatever is stored in redis; this
                # is only safe while every writer of the request table is trusted
                request_dict = {
                    "request_obj": Request.from_dict(eval(request)),
                    "request_redis": request,
                }
            except Exception as e:
                log.exception("""
                error %s
                request %s
                """ % (e, request))

                request_dict = None

            if request_dict:
                self._todo_requests.append(request_dict)

    def get_requests(self, count):
        """
        Pop up to count requests from the local queue.

        @param count: maximum number of requests wanted; values <= 0 yield an empty
            list (a negative count used to drain the queue and raise IndexError)
        @return: list of request dicts, oldest first
        """
        take = max(0, min(count, len(self._todo_requests)))
        return [self._todo_requests.popleft() for _ in range(take)]

    def get_requests_count(self):
        """Return the local queue length, or the redis request table count when the queue is empty."""
        return len(self._todo_requests) or self._db.zget_count(
            self._tab_requests)

    def is_collector_task(self):
        """Return True when the latest fetch brought in requests and run() has not reset the flag yet."""
        return self._is_collector_task
Esempio n. 22
0
class ItemBuffer(threading.Thread, Singleton):
    """
    Thread that buffers queued data and periodically flushes it to storage.

    ``put_item`` accepts four kinds of data, which ``flush`` sorts apart:
    callables (run after the batch has been exported), ``UpdateItem``
    instances (exported as updates), ``Item`` instances (exported as
    inserts) and anything else, which is treated as a finished request and
    removed from the redis request table.
    """

    def __init__(self, table_folder):
        """
        @param table_folder: value substituted into the
            ``setting.TAB_REQUSETS`` and ``setting.TAB_ITEM`` key templates
        """
        # NOTE(review): presumably Singleton hands back the same object on
        # every construction, so this guard keeps the state from being reset
        # by a second __init__ call - confirm against Singleton.
        if not hasattr(self, "_table_item"):
            super(ItemBuffer, self).__init__()

            self._thread_stop = False
            self._is_adding_to_db = False
            self._table_folder = table_folder

            self._items_queue = Queue(maxsize=MAX_ITEM_COUNT)
            self._db = RedisDB()

            self._table_item = setting.TAB_ITEM
            self._table_request = setting.TAB_REQUSETS.format(
                table_folder=table_folder)

            self._item_tables = {
                # 'xxx_item': {'tab_item': 'xxx:xxx_item'}  # item name -> redis item key
            }

            self._item_update_keys = {
                # 'xxx:xxx_item': ['id', 'name'...]  # redis item key -> keys to update
            }

            # The MySQL exporter is only built when MySQL export is enabled.
            self._export_data = ExportData(
            ) if setting.ADD_ITEM_TO_MYSQL else None

            self.db_tip()

    def db_tip(self):
        """Log which export targets are enabled; warn when none is."""
        msg = "\n"
        if setting.ADD_ITEM_TO_MYSQL:
            msg += "item 自动入mysql\n"
        if setting.ADD_ITEM_TO_REDIS:
            msg += "item 自动入redis\n"
        if msg == "\n":
            log.warning("*** 请注意检查item是否入库 !!!")
        else:
            log.info(msg)

    def run(self):
        """Flush the queue in a loop until ``stop`` is called, then ``close``."""
        while not self._thread_stop:
            self.flush()
            tools.delay_time(0.5)  # presumably a 0.5 s pause between flushes

        self.close()

    def stop(self):
        """Ask the ``run`` loop to finish after its current iteration."""
        self._thread_stop = True

    def put_item(self, item):
        """
        Queue one piece of data for the next flush.

        @param item: an ``Item``, an ``UpdateItem``, a callable, or a
            finished request (see the class docstring)
        """
        self._items_queue.put(item)

    def flush(self):
        """
        Drain the queue and hand its content to storage in batches.

        Data is sorted into buckets by kind; every time
        ``UPLOAD_BATCH_MAX_SIZE`` entries have been collected the buckets are
        exported and reset. Any exception is logged and swallowed so the
        ``run`` loop keeps going.
        """
        try:
            items = []
            update_items = []
            requests = []
            callbacks = []
            items_fingerprints = []
            data_count = 0

            while not self._items_queue.empty():
                data = self._items_queue.get_nowait()
                data_count += 1

                # Sort the data by kind. The order of the checks matters:
                # callables first, then UpdateItem before Item.
                if callable(data):
                    callbacks.append(data)

                elif isinstance(data, UpdateItem):
                    update_items.append(data)

                elif isinstance(data, Item):
                    items.append(data)
                    if setting.ITEM_FILTER_ENABLE:
                        items_fingerprints.append(data.fingerprint)

                else:  # a finished request, to be removed from redis
                    requests.append(data)

                if data_count >= UPLOAD_BATCH_MAX_SIZE:
                    self.__add_item_to_db(items, update_items, requests,
                                          callbacks, items_fingerprints)

                    items = []
                    update_items = []
                    requests = []
                    callbacks = []
                    items_fingerprints = []
                    data_count = 0

            # Export whatever is left in a final, partial batch.
            if data_count:
                self.__add_item_to_db(items, update_items, requests, callbacks,
                                      items_fingerprints)

        except Exception as e:
            log.exception(e)

    def get_items_count(self):
        """Return the number of entries currently waiting in the queue."""
        return self._items_queue.qsize()

    def is_adding_to_db(self):
        """Return True while a batch is being exported."""
        return self._is_adding_to_db

    def __dedup_items(self, items, items_fingerprints):
        """
        Drop the items whose fingerprint is already known to the dedup store.

        NOTE(review): ``dedup`` is not defined in this class body; it has to
        be attached to the class elsewhere before ``ITEM_FILTER_ENABLE`` is
        switched on - confirm.

        @param items: list of items
        @param items_fingerprints: fingerprints, parallel to ``items``
        @return: (items, items_fingerprints) restricted to the entries the
            dedup store reported as not existing
        """
        if not items:
            return items, items_fingerprints

        is_exists = self.__class__.dedup.get(items_fingerprints)
        # A non-list answer is wrapped so it can be iterated like a batch one.
        is_exists = is_exists if isinstance(is_exists, list) else [is_exists]

        dedup_items = []
        dedup_items_fingerprints = []
        dup_items_count = 0

        # zip() pairs every item with its fingerprint and its dedup answer;
        # should the three lengths ever differ, the surplus is ignored.
        for item, items_fingerprint, is_exist in zip(items, items_fingerprints,
                                                     is_exists):
            if is_exist:
                dup_items_count += 1
            else:
                dedup_items.append(item)
                dedup_items_fingerprints.append(items_fingerprint)

        dedup_items_count = len(dedup_items)
        items_count = dedup_items_count + dup_items_count

        log.info("待入库数据 {} 条, 重复 {} 条,实际待入库数据 {} 条".format(
            items_count, dup_items_count, dedup_items_count))

        return dedup_items, dedup_items_fingerprints

    def __pick_items(self, items, is_update_item=False):
        """
        Group items by the redis key of the table they belong to.

        ``items`` is emptied once the grouping is done.

        @param items: list of items to group
        @param is_update_item: when True, also record each table's
            ``update_key`` in ``self._item_update_keys``
        @return: dict mapping redis item key -> list of ``item.to_dict`` values
        """
        datas_dict = {
            # 'xxx:xxx_item': [{}, {}]  # redis item key -> its rows
        }

        for item in items:
            # The redis key of an item name is computed once and then served
            # from self._item_tables to speed up later lookups.
            item_name = item.item_name
            item_table = self._item_tables.get(item_name)
            if not item_table:
                tab_item = self._table_item.format(
                    table_folder=self._table_folder,
                    item_name=item.name_underline)
                self._item_tables[item_name] = {"tab_item": tab_item}
            else:
                tab_item = item_table.get("tab_item")

            # Hook the item runs right before it is exported.
            item.per_to_db()

            datas_dict.setdefault(tab_item, []).append(item.to_dict)

            if is_update_item and tab_item not in self._item_update_keys:
                self._item_update_keys[tab_item] = item.update_key

        # Keep the documented contract: the caller's list ends up empty.
        del items[:]

        return datas_dict

    @staticmethod
    def __is_export_enabled(switch, to_table):
        """
        Tell whether an export setting applies to ``to_table``.

        @param switch: a falsy value (disabled), a list/tuple of substrings
            (enabled for tables whose name contains one of them), or any
            other truthy value (enabled for every table)
        @param to_table: table name the data is destined for
        @return: True when the export should be performed
        """
        if not switch:
            return False
        if isinstance(switch, (list, tuple)):
            return any(name in to_table for name in switch)
        return True

    def __export_to_db(self, tab_item, datas, is_update=False, update_keys=()):
        """
        Export one table's rows to every enabled backend.

        @param tab_item: redis item key of the table
        @param datas: list of rows
        @param is_update: export through ``update_items`` instead of
            ``export_items`` on the MySQL exporter
        @param update_keys: forwarded to ``update_items``
        @return: the MySQL exporter's result, overridden by True once the
            rows were written to redis; False when no backend handled them
        """
        export_success = False

        # Derive the target table name and run the monitoring hook.
        # NOTE(review): if the pattern does not match, to_table may be None
        # and the substring checks would fail - confirm tools.get_info.
        to_table = tools.get_info(tab_item, ":s_(.*?)_item", fetch_one=True)
        self.check_datas(table=to_table, datas=datas)

        # Each backend is used at most once per call, even when several
        # entries of a list-style setting match the table name.
        if self.__is_export_enabled(setting.ADD_ITEM_TO_MYSQL, to_table):
            if is_update:
                export_success = self._export_data.update_items(
                    tab_item, datas, update_keys=update_keys)
            else:
                export_success = self._export_data.export_items(
                    tab_item, datas)

        if self.__is_export_enabled(setting.ADD_ITEM_TO_REDIS, to_table):
            self._db.sadd(tab_item, datas)
            export_success = True
            log.info("共导出 %s 条数据 到redis %s" % (len(datas), tab_item))

        return export_success

    def __add_item_to_db(self, items, update_items, requests, callbacks,
                         items_fingerprints):
        """
        Export one batch: items, update items, callbacks, finished requests.

        Fingerprints are only registered with the dedup store when every
        export of the batch succeeded, so rows that failed to be stored are
        not filtered out as duplicates later. ``_is_adding_to_db`` is reset
        even when an export raises.

        @param items: ``Item`` instances to insert
        @param update_items: ``UpdateItem`` instances to update
        @param requests: finished requests to remove from the request table
        @param callbacks: callables to invoke after the exports
        @param items_fingerprints: fingerprints parallel to ``items``
        """
        self._is_adding_to_db = True
        try:
            # Dedup
            if setting.ITEM_FILTER_ENABLE:
                items, items_fingerprints = self.__dedup_items(
                    items, items_fingerprints)

            # Group by table
            items_dict = self.__pick_items(items)
            update_items_dict = self.__pick_items(update_items,
                                                  is_update_item=True)

            export_results = []

            # Batch insert
            while items_dict:
                tab_item, datas = items_dict.popitem()

                log.debug("""
                -------------- item 批量入库 --------------
                表名: %s
                datas: %s
                    """ % (tab_item, tools.dumps_json(datas, indent=16)))

                export_results.append(self.__export_to_db(tab_item, datas))

            # Batch update
            while update_items_dict:
                tab_item, datas = update_items_dict.popitem()
                log.debug("""
                -------------- item 批量更新 --------------
                表名: %s
                datas: %s
                    """ % (tab_item, tools.dumps_json(datas, indent=16)))

                update_keys = self._item_update_keys.get(tab_item)
                export_results.append(
                    self.__export_to_db(tab_item,
                                        datas,
                                        is_update=True,
                                        update_keys=update_keys))

            # Run the callbacks; a failing one must not stop the others.
            for callback in callbacks:
                try:
                    callback()
                except Exception as e:
                    log.exception(e)

            # Remove the finished requests from the request table.
            if requests:
                self._db.zrem(self._table_request, requests)

            # Register the fingerprints only if something was exported and
            # nothing failed.
            export_success = bool(export_results) and all(export_results)
            if (export_success and setting.ITEM_FILTER_ENABLE
                    and items_fingerprints):
                self.__class__.dedup.add(items_fingerprints, skip_check=True)
        finally:
            self._is_adding_to_db = False

    def check_datas(self, table, datas):
        """
        Monitoring hook called before each export; does nothing here.

        @param table: table name
        @param datas: list of rows
        @return:
        """
        pass

    def close(self):
        """Hook called when the ``run`` loop ends; does nothing here."""
        pass
Esempio n. 23
0
    def _cache_db(self):
        """
        Return the ``RedisDB`` stored on the class, creating it on first use.

        @return: the value of the class attribute ``cache_db``
        """
        owner = self.__class__
        if owner.cache_db:
            return owner.cache_db

        owner.cache_db = RedisDB()  # .from_url(setting.pika_spider_1_uri)
        return owner.cache_db