Beispiel #1
0
    def __init__(self, thread_count=None):
        """
        基于内存队列的爬虫,不支持分布式
        :param thread_count: 线程数
        """
        super(AirSpider, self).__init__()

        for key, value in self.__class__.__custom_setting__.items():
            setattr(setting, key, value)
            log.reload()

        self._thread_count = (setting.SPIDER_THREAD_COUNT
                              if not thread_count else thread_count)

        self._memory_db = MemoryDB()
        self._parser_controls = []
Beispiel #2
0
    def __init__(
        self,
        redis_key=None,
        thread_count=None,
        begin_callback=None,
        end_callback=None,
        delete_keys=(),
        auto_stop_when_spider_done=None,
        auto_start_requests=None,
        send_run_time=True,
        batch_interval=0,
        wait_lock=True,
        task_table=None,
    ):
        """
        @summary: 调度器
        ---------
        @param redis_key: 爬虫request及item存放reis中的文件夹
        @param thread_count: 线程数,默认为配置文件中的线程数
        @param begin_callback: 爬虫开始回调函数
        @param end_callback: 爬虫结束回调函数
        @param delete_keys: 爬虫启动时删除的key,类型: 元组/bool/string。 支持正则
        @param auto_stop_when_spider_done: 爬虫抓取完毕后是否自动结束或等待任务,默认自动结束
        @param auto_start_requests: 爬虫是否自动添加任务
        @param send_run_time: 发送运行时间
        @param batch_interval: 抓取时间间隔 默认为0 天为单位 多次启动时,只有当前时间与第一次抓取结束的时间间隔大于指定的时间间隔时,爬虫才启动
        @param wait_lock: 下发任务时否等待锁,若不等待锁,可能会存在多进程同时在下发一样的任务,因此分布式环境下请将该值设置True
        @param task_table: 任务表, 批次爬虫传递
        ---------
        @result:
        """

        super(Scheduler, self).__init__()

        for key, value in self.__class__.__custom_setting__.items():
            setattr(setting, key, value)
            log.reload()

        self._redis_key = redis_key or setting.REDIS_KEY
        if not self._redis_key:
            raise Exception(
                """
                redis_key 为redis中存放request与item的目录。不能为空,
                可在setting中配置,如 REDIS_KEY = 'test'
                或spider初始化时传参, 如 TestSpider(redis_key='test')
                """
            )

        self._request_buffer = RequestBuffer(redis_key)
        self._item_buffer = ItemBuffer(redis_key, task_table)

        self._collector = Collector(redis_key)
        self._parsers = []
        self._parser_controls = []
        self._parser_control_obj = PaserControl

        self._auto_stop_when_spider_done = (
            auto_stop_when_spider_done
            if auto_stop_when_spider_done is not None
            else setting.AUTO_STOP_WHEN_SPIDER_DONE
        )
        self._auto_start_requests = (
            auto_start_requests
            if auto_start_requests is not None
            else setting.SPIDER_AUTO_START_REQUESTS
        )
        self._send_run_time = send_run_time
        self._batch_interval = batch_interval

        self._begin_callback = (
            begin_callback
            if begin_callback
            else lambda: log.info("\n********** feapder begin **********")
        )
        self._end_callback = (
            end_callback
            if end_callback
            else lambda: log.info("\n********** feapder end **********")
        )

        self._thread_count = (
            setting.SPIDER_THREAD_COUNT if not thread_count else thread_count
        )

        self._spider_name = redis_key
        self._project_name = redis_key.split(":")[0]

        self._tab_spider_time = setting.TAB_SPIDER_TIME.format(redis_key=redis_key)
        self._tab_spider_status = setting.TAB_SPIDER_STATUS.format(redis_key=redis_key)
        self._tab_requests = setting.TAB_REQUSETS.format(redis_key=redis_key)
        self._tab_failed_requests = setting.TAB_FAILED_REQUSETS.format(
            redis_key=redis_key
        )

        self._is_notify_end = False  # 是否已经通知结束
        self._last_task_count = 0  # 最近一次任务数量
        self._redisdb = RedisDB()

        self._project_total_state_table = "{}_total_state".format(self._project_name)
        self._is_exist_project_total_state_table = False

        # Request 缓存设置
        Request.cached_redis_key = redis_key
        Request.cached_expire_time = setting.RESPONSE_CACHED_EXPIRE_TIME

        delete_keys = delete_keys or setting.DELETE_KEYS
        if delete_keys:
            self.delete_tables(delete_keys)

        self._last_check_task_status_time = 0
        self.wait_lock = wait_lock