Example No. 1
    def __init__(self):
        self.spider = Spider()
        self.scheduler = Scheduler()
        self.pipeline = Pipeline()
        self.downloader = Downloader()
        self.spider_mid = SpiderMiddleware()  # initialize the spider middleware object
        self.downloader_mid = DownloaderMiddleware()  # initialize the downloader middleware object
Example No. 2
    def __init__(self):
        self.spiders = self._auto_import_instances(path=SPIDERS, isspider=True)
        self.scheduler = Scheduler()
        self.downloader = Downloader()
        self.pipelines = self._auto_import_instances(path=PIPELINES)
        self.spider_mids = self._auto_import_instances(path=SPIDER_MIDDLEWARES)
        self.downloader_mids = self._auto_import_instances(
            path=DOWNLOADER_MIDDLEWARES)

        self.total_response_nums = 0
        self.total_request_nums = 0
Example No. 3
    def __init__(self, spiders):
        self.spiders = spiders
        self.scheduler = Scheduler()
        self.downloader = Downloader()
        self.pipeline = Pipeline()

        self.spider_mid = SpiderMiddleware()
        self.downloader_mid = DownloaderMiddleware()

        self.total_response_nums = 0
        self.total_request_nums = 0
Example No. 4
    def __init__(self):
        self.spiders = self._auto_import_ret(path=SPIDERS, isspider=True)
        self.scheduler = Scheduler()
        self.downloader = Downloader()
        self.pipelines = self._auto_import_ret(path=PIPELINES)

        self.spider_mids = self._auto_import_ret(path=SPIDER_MIDDLEWARES)
        self.downloader_mids = self._auto_import_ret(
            path=DOWNLOADER_MIDDLEWARES)

        self.total_request_nums = 0
        self.total_response_nums = 0

        self.pool = Pool(MAX_REQUEST_NUMS)
        self.is_running = True
Example No. 5
    def __init__(self):  # receives the spider objects passed in from outside
        """Instantiate the other components; the engine drives them by calling their methods"""
        # print(spiders)
        self.scheduler = Scheduler()  # initialize the scheduler object
        self.downloader = Downloader()  # initialize the downloader object
        self.spiders = self._auto_import_instances(SPIDERS,
                                                   is_spider=True)  # dict of spider objects
        self.pipelines = self._auto_import_instances(PIPELINES)  # list of pipeline objects
        self.spider_mids = self._auto_import_instances(
            SPIDER_MIDDLEWARES)  # list
        self.downloader_mids = self._auto_import_instances(
            DOWNLOADER_MIDDLEWARES)  # list
        self.total_request_nums = 0
        self.total_response_nums = 0
        self.pool = Pool()  # instantiate the thread pool object
        self.is_running = False  # flag indicating whether the engine is running
Example No. 6
class Engine(object):
    def __init__(self):
        self.spider = Spider()
        self.scheduler = Scheduler()
        self.pipeline = Pipeline()
        self.downloader = Downloader()

    def start(self):
        """启动整个引擎,主要调用逻辑代码写在_start_engine中"""
        self._start_engine()

    def _start_engine(self):
        '''Wire the individual components together'''
        # the spider module produces the initial request
        start_request = self.spider.start_request()
        # hand the initial request to the scheduler
        self.scheduler.add_request(start_request)
        # get a request object from the scheduler
        request = self.scheduler.get_request()
        # have the downloader send the request
        response = self.downloader.get_response(request)
        # call the spider's parse method to process the response and get a result
        result = self.spider.parse(response)
        if isinstance(result, Request):
            self.scheduler.add_request(result)
        else:
            self.pipeline.process_item(result)
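
Example No. 6 only runs if the four components it instantiates actually exist. The stubs below are a minimal sketch, assuming the interface implied by the calls above (start_request, add_request/get_request, get_response, parse, process_item); the class and method names are taken from the example, but the bodies are illustrative placeholders rather than the original implementations, and the URL is a dummy value.

from queue import Queue

import requests


class Request(object):
    """Bare request object; only a URL is assumed here."""
    def __init__(self, url):
        self.url = url


class Response(object):
    """Bare response object wrapping the downloaded body."""
    def __init__(self, url, body):
        self.url = url
        self.body = body


class Spider(object):
    start_url = 'http://www.example.com'  # placeholder URL

    def start_request(self):
        # produce the initial request the engine feeds to the scheduler
        return Request(self.start_url)

    def parse(self, response):
        # turn a response into an item; returning a Request instead would re-enter the scheduler
        return {'url': response.url, 'length': len(response.body)}


class Scheduler(object):
    def __init__(self):
        self.q = Queue()

    def add_request(self, request):
        self.q.put(request)

    def get_request(self):
        return self.q.get()


class Downloader(object):
    def get_response(self, request):
        # fetch the URL and wrap it in the framework's Response type
        resp = requests.get(request.url)
        return Response(request.url, resp.content)


class Pipeline(object):
    def process_item(self, item):
        print(item)
        return item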
Example No. 7
    def __init__(self):
        self.spiders = self.__auto_import(
            settings.SPIDERS, is_spider=True)  # this returns a dict of {spider name: spider object}
        # create the stats collector object
        self.stats_collector = StatsCollector()
        # pass the stats collector to the scheduler
        self.scheduler = Scheduler(self.stats_collector)
        self.downloader = Downloader()
        self.pipelines = self.__auto_import(settings.PIPELINES)
        self.spider_middlewares = self.__auto_import(
            settings.SPIDER_MIDDLEWARES)
        self.downloader_middlewares = self.__auto_import(
            settings.DOWNLOADER_MIDDLEWARES)
        self.pool = Pool()  # create the thread pool object
        # counter recording how many spiders have finished their start requests
        self.start_request_finished_spider_count = 0
Example No. 8
    def __init__(self):
        self.spiders = self._auto_import_instances(path=SPIDERS,
                                                   isspider=True)  # spider dict
        self.scheduler = Scheduler()
        self.downloader = Downloader()

        self.pipelines = self._auto_import_instances(path=PIPELINES)
        self.spider_mids = self._auto_import_instances(path=SPIDER_MIDDLEWARES)
        self.downloader_mids = self._auto_import_instances(
            path=DOWNLOADER_MIDDLEWARES)

        self.total_response_nums = 0
        self.total_request_nums = 0

        self.pool = Pool(5)  # os.cpu_count() or 1
        self.is_running = True
Example No. 9
    def __init__(self):
        self.spiders = self._auto_import_instances(path=SPIDERS, isspider=True)  # spider dict
        self.pipelines = self._auto_import_instances(path=PIPELINES)
        self.spider_mids = self._auto_import_instances(path=SPIDER_MIDDLEWARES)
        self.downloader_mids = self._auto_import_instances(path=DOWNLOADER_MIDDLEWARES)

        if SCHEDULER_PERSIST:
            self.collector = ReidsStatsCollector()
        else:
            self.collector = NormalStatsCollector()
        # self.total_response_nums = 0
        # self.total_request_nums = 0

        self.scheduler = Scheduler(self.collector)
        self.downloader = Downloader()
        self.pool = Pool(5) # os.cpu_count() or 1
        self.is_running = True
Example No. 10
class Engine():
    def __init__(self):
        self.spider = Spider()
        self.scheduler = Scheduler()
        self.downloader = Downloader()
        self.pipeline = Pipeline()

        self.spider_mid = SpiderMiddleware()
        self.downloader_mid = DownloaderMiddleware()

    def start(self):
        # entry point called to run the engine logic
        self._start_engine()

    def _start_engine(self):
        # the logic that drives the framework

        # 1. the spider module produces the initial request
        start_request = self.spider.start_request()

        # preprocess the request object with the spider middleware
        start_request = self.spider_mid.process_request(start_request)

        # 2. hand the initial request to the scheduler
        self.scheduler.add_request(start_request)

        # 3. get a request object from the scheduler
        request = self.scheduler.get_request()

        # preprocess the request object with the downloader middleware
        request = self.downloader_mid.process_request(request)

        # 4. have the downloader send the request
        response = self.downloader.get_response(request)

        # preprocess the response object with the downloader middleware
        response = self.downloader_mid.process_response(response)

        # preprocess the response object with the spider middleware
        response = self.spider_mid.process_response(response)

        # 5. use the spider's parse method to process the response and get a result
        result = self.spider.parse(response)

        # 6. check the type of the result
        if isinstance(result, Request):

            # preprocess the request object with the spider middleware
            result = self.spider_mid.process_request(result)

            # 6.1 if it is a request object, hand it back to the scheduler
            self.scheduler.add_request(result)

        else:

            # 6.2 otherwise, hand it to the pipeline
            self.pipeline.process_item(result)
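
The middleware objects in Example No. 10 only need to expose process_request and process_response and return the object they receive. A pass-through sketch consistent with those calls (the print statements are illustrative, not part of the original framework):

class SpiderMiddleware(object):
    def process_request(self, request):
        # hook for modifying a request produced by a spider before it is scheduled
        print('spider middleware: process_request', request)
        return request

    def process_response(self, response):
        # hook for modifying a response before the spider parses it
        print('spider middleware: process_response', response)
        return response


class DownloaderMiddleware(object):
    def process_request(self, request):
        # hook for modifying a request before the downloader sends it
        print('downloader middleware: process_request', request)
        return request

    def process_response(self, response):
        # hook for modifying a response returned by the downloader
        print('downloader middleware: process_response', response)
        return response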
Example No. 11
class Engine(object):
    def __init__(self):
        self.spider = Spider()
        self.scheduler = Scheduler()
        self.pipeline = Pipeline()
        self.downloader = Downloader()
        self.spider_mid = SpiderMiddleware()  # initialize the spider middleware object
        self.downloader_mid = DownloaderMiddleware()  # initialize the downloader middleware object

    def start(self):
        """Start the whole engine; the main driving logic lives in _start_engine"""
        start = datetime.now()  # get the current time
        logger.info('start time: %s' % start)
        self._start_engine()
        stop = datetime.now()
        logger.info('end time: %s' % stop)
        # total elapsed time
        logger.info('elapsed: %.2f' % (stop - start).total_seconds())

    def _start_engine(self):
        '''Wire the individual components together'''
        # the spider module produces the initial request
        start_request = self.spider.start_request()

        # spider middleware
        start_request = self.spider_mid.process_request(start_request)
        # hand the initial request to the scheduler
        self.scheduler.add_request(start_request)
        # get a request object from the scheduler
        request = self.scheduler.get_request()

        # preprocess the request object with the downloader middleware
        request = self.downloader_mid.process_request(request)
        # have the downloader send the request
        response = self.downloader.get_response(request)

        # preprocess the response with the downloader middleware
        response = self.downloader_mid.process_response(response)
        # call the spider's parse method to process the response and get a result
        result = self.spider.parse(response)
        if isinstance(result, Request):
            result = self.spider_mid.process_request(result)
            self.scheduler.add_request(result)
        else:
            self.pipeline.process_item(result)
Example No. 12
class Engine(object):
    """
    1.对其他模块进行初始化
    2.启动引擎(实现引擎调用核心逻辑)
    """
    def __init__(self):
        self.spiders = self.__auto_import(
            settings.SPIDERS, is_spider=True)  # this returns a dict of {spider name: spider object}
        # create the stats collector object
        self.stats_collector = StatsCollector()
        # pass the stats collector to the scheduler
        self.scheduler = Scheduler(self.stats_collector)
        self.downloader = Downloader()
        self.pipelines = self.__auto_import(settings.PIPELINES)
        self.spider_middlewares = self.__auto_import(
            settings.SPIDER_MIDDLEWARES)
        self.downloader_middlewares = self.__auto_import(
            settings.DOWNLOADER_MIDDLEWARES)
        self.pool = Pool()  # create the thread pool object
        # counter recording how many spiders have finished their start requests
        self.start_request_finished_spider_count = 0

    @staticmethod
    def __auto_import(full_names, is_spider=False):
        """
        动态导入:根据配置信息自动创建对象,封装成需要的格式返回
        :param full_names: 配置全类名(路径)列表
        :param is_spider: 判断是不是爬虫对象
        :return: 配置类的对象列表或者字典
        """
        # 如果是爬虫就是字典,否则是列表
        instances = {} if is_spider else []
        for full_name in full_names:
            result = full_name.rsplit('.', maxsplit=1)
            # 模块名
            module_name = result[0]
            # 类名
            class_name = result[1]
            # 根据模块名,导入模块,获取模块对象
            module = importlib.import_module(module_name)
            # 根据类名,从模块对象中获取类对象
            cls = getattr(module, class_name)
            # 使用类对象,创建实例对象
            instance = cls()
            # 如果是爬虫,存储到字典中,否则存到列表里
            if is_spider:
                instances[instance.name] = instance
            else:
                instances.append(instance)

        return instances

    def start(self):
        """启动引擎,对外提供接口"""
        start = datetime.now()
        logger.info("开始运行时间:%s" % start)
        self.__start()
        stop = datetime.now()
        logger.info("运行结束时间:%s" % stop)
        logger.info("耗时:%.2f秒" % (stop - start).total_seconds())
        # 总起始请求数量
        logger.info('总起始请求数量:%s' % self.stats_collector.start_request_nums)
        # 记录总请求数量
        logger.info('总请求数量:%s' % self.stats_collector.request_nums)
        # 总过滤请求数量
        logger.info('过滤掉的请求数量:%s' % self.stats_collector.repeat_request_nums)
        # 总响应数量
        logger.info('总响应处理数量:%s' % self.stats_collector.response_nums)
        # 如果启用分布式,当前程序结束的时候,清空统计信息
        if settings.SCHEDULER_PERSIST:
            self.stats_collector.clear()
            if not settings.FP_PERSIST:
                # 如果不开启断点续爬,清空指纹和请求队列数据
                self.scheduler.clear()

    @staticmethod
    def __error_callback(e):
        """错误回调函数"""
        try:
            raise e
        except Exception as e:
            logger.exception(e)

    def __callback_execute(self, temp):
        """异步线程池回调函数"""
        self.pool.apply_async(self.__execute_request_response_item,
                              callback=self.__callback_execute,
                              error_callback=self.__error_callback)

    def __start(self):
        """私有启动引擎的方法,实现核心代码"""
        # 添加起始请求到调度器中
        # 异步执行__add_start_requests任务
        self.pool.apply_async(self.__add_start_requests,
                              error_callback=self.__error_callback)

        # 配置多少个异步任务,这个异步调用就执行多少次
        for i in range(settings.ASYNC_COUNT):
            # 异步执行__execute_request_response_item任务
            self.pool.apply_async(self.__execute_request_response_item,
                                  callback=self.__callback_execute,
                                  error_callback=self.__error_callback)

        # 让主线程等待一下,让上面的异步任务启动起来
        time.sleep(1)

        while True:
            # 死循环进行轮询,非常消耗cpu性能,稍睡一下,降低消耗
            time.sleep(0.1)

            # 当所有爬虫的起始请求都执行完了才结束
            if self.start_request_finished_spider_count >= len(self.spiders):
                # 当所有请求都处理完成,要结束循环
                if self.stats_collector.response_nums >= self.stats_collector.request_nums:
                    # 没有请求了,退出循环
                    break

    def __execute_request_response_item(self):
        """处理请求、响应和数据方法"""
        # 从调度器获取请求对象,交给下载器发起请求,获取一个响应对象
        request = self.scheduler.get_request()

        # 取出该请求对应爬虫对象,根据爬虫名去爬虫字典中取出爬虫对象
        spider = self.spiders[request.spider_name]

        # 遍历下载器中间件列表,获取每一个下载器中间件
        for downloader_middleware in self.downloader_middlewares:
            # 利用下载器中间件预处理请求对象
            request = downloader_middleware.process_request(request)
        # 利用下载器发起请求
        response = self.downloader.get_response(request)
        # 把请求的meta数据传递给response
        response.meta = request.meta

        # 利用下载器中间件预处理响应对象
        # 遍历下载器中间件列表,获取每个下载器中间件
        for downloader_middleware in self.downloader_middlewares:
            response = downloader_middleware.process_response(response)

        # 遍历爬虫中间件列表,获取每一个爬虫中间件
        for spider_middleware in self.spider_middlewares:
            # 调用爬虫中间件的process_response方法,处理响应
            response = spider_middleware.process_response(response)

        # 如果有该请求有对应解析函数callback,就使用callback来解析数据
        if request.callback:
            # 接收解析函数,处理结果
            results = request.callback(response)
        else:
            # 如果没有callback就使用parse函数来解析数据
            results = spider.parse(response)

        # 判断results是不是可迭代对象,如果不可迭代,变为可迭代的
        if not isinstance(results, Iterable):
            results = [results]
        for result in results:
            # 判断结果对象
            if isinstance(result, Request):
                # 如果是请求对象,就再交给调度器
                # 遍历爬虫中间件列表,获取每个爬虫中间件
                for spider_middleware in self.spider_middlewares:
                    # 利用爬虫中间件预处理请求对象
                    result = spider_middleware.process_request(result)

                # 设置请求对象对应的爬虫名
                result.spider_name = spider.name

                self.scheduler.add_request(result)
            else:
                # 否则,交给管道处理
                for pipeline in self.pipelines:
                    result = pipeline.process_item(result, spider)

        # 统计总的响应数量,每次递增1
        self.stats_collector.incr(self.stats_collector.response_nums_key)

    def __add_start_requests(self):
        # iterate over the spider dict and take out each spider object
        for spider_name, spider in self.spiders.items():
            # run each spider's start requests asynchronously, so an incremental spider that never
            # stops does not prevent the spiders after it from running
            self.pool.apply_async(
                self.__add_one_spider_start_requests,
                args=(spider, spider_name),
                error_callback=self.__error_callback,
                callback=self.__add_one_spider_start_requests_callback)

    def __add_one_spider_start_requests_callback(self, temp):
        """Each call increments the count of spiders whose start requests have finished"""
        self.start_request_finished_spider_count += 1

    def __add_one_spider_start_requests(self, spider, spider_name):
        # call the spider's start_request method to get its request objects
        for request in spider.start_request():
            # set the spider name this request belongs to
            request.spider_name = spider_name

            # count the start requests
            self.stats_collector.incr(
                self.stats_collector.start_request_nums_key)

            # preprocess the request object with the spider middlewares
            # iterate over the spider middlewares
            for spider_middleware in self.spider_middlewares:
                request = spider_middleware.process_request(request)
            # call the scheduler's add_request to add the request to the scheduler
            self.scheduler.add_request(request)
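
The __auto_import helper above builds its instances from lists of dotted paths defined in settings. A hypothetical settings module matching that format (apart from the 'spiders.baidu.BaiduSpider' style strings quoted in the later examples, the module and class names below are placeholders; only the 'package.module.ClassName' shape matters to the import code):

# settings.py (illustrative)
# each entry is '<module path>.<ClassName>'; __auto_import splits on the last dot,
# imports the module with importlib.import_module and instantiates the class
SPIDERS = [
    'spiders.baidu.BaiduSpider',
]
PIPELINES = [
    'pipelines.BaiduPipeline',
]
SPIDER_MIDDLEWARES = [
    'middlewares.SpiderMiddleware',
]
DOWNLOADER_MIDDLEWARES = [
    'middlewares.DownloaderMiddleware',
]
ASYNC_COUNT = 5            # number of concurrent tasks polled in __start
SCHEDULER_PERSIST = False  # whether the Redis-backed scheduler/stats are used
FP_PERSIST = False         # whether fingerprints survive between runs (resumable crawling)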
Example No. 13
class Engine():
    def __init__(self, spiders):
        self.spiders = spiders
        self.scheduler = Scheduler()
        self.downloader = Downloader()
        self.pipeline = Pipeline()

        self.spider_mid = SpiderMiddleware()
        self.downloader_mid = DownloaderMiddleware()

        self.total_response_nums = 0
        self.total_request_nums = 0

    def start(self):
        # entry point called to run the engine logic

        start = datetime.now()  # start time
        logger.info("start time: %s" % start)  # log the start time
        self._start_engine()
        stop = datetime.now()  # end time
        logger.info("end time: %s" % stop)  # log the end time
        logger.info("elapsed: %.2f" % (stop - start).total_seconds())  # log the elapsed time
        logger.info("total requests: {}".format(self.total_request_nums))
        logger.info("total responses: {}".format(self.total_response_nums))

    def _start_requests(self):
        """Build a request for every start URL of every spider and add it to the request queue"""
        for spider_name, spider in self.spiders.items():
            for start_request in spider.start_requests():

                # 1. the spider module produces the initial request
                # preprocess the request object with the spider middleware
                start_request = self.spider_mid.process_request(start_request)
                # add the spider_name attribute to the request object
                start_request.spider_name = spider_name
                # 2. hand the initial request to the scheduler
                self.scheduler.add_request(start_request)
                # total requests + 1
                self.total_request_nums += 1

    def _execute_request_response_item(self):
        """队列中取出一个request,直到处理完毕"""
        # 3. 从调度器获取请求对象
        request = self.scheduler.get_request()
        # 判断队列是否取空
        if request is None:
            return  # 提前终止
        # 利用下载器中间件预处理请求对象
        request = self.downloader_mid.process_request(request)
        # 4. 利用下载器发起请求
        response = self.downloader.get_response(request)
        # 传递meta
        response.meta = request.meta
        # 利用下载器中间件预处理响应对象
        response = self.downloader_mid.process_response(response)
        # 利用爬虫中间件预处理响应对象
        response = self.spider_mid.process_response(response)
        # 根据爬虫名获取爬虫类对象
        spider = self.spiders[request.spider_name]
        # 5. 利用爬虫的解析响应的方法,处理响应,得到结果
        # request.parse指定的解析函数 = getattr(爬虫类对象, 指定的解析函数的字符串)
        parse_func = getattr(spider, request.parse)
        results = parse_func(response)
        for result in results:
            # 6. 判断结果对象
            if isinstance(result, Request):
                # 利用爬虫中间件预处理请求对象
                result = self.spider_mid.process_request(result)
                # 给request对象增加spider_name的属性
                result.spider_name = request.spider_name
                # 6.1 如果是请求对象,那么就再交给调度器
                self.scheduler.add_request(result)
                # 总请求数 + 1
                self.total_request_nums += 1
            else:
                # 6.2 否则,就交给管道处理
                self.pipeline.process_item(result)
        # 总响应数 +1
        self.total_response_nums += 1

    def _start_engine(self):
        # the logic that drives the framework

        self._start_requests()  # put all initial requests into the queue

        while True:
            time.sleep(0.1)
            self._execute_request_response_item()  # process one request taken from the queue

            # exit condition for the program
            if self.scheduler.q.empty():
                break  # the queue is empty
Example No. 14
class Engine():
    def __init__(self):
        self.spider = Spider()
        self.scheduler = Scheduler()
        self.downloader = Downloader()
        self.pipeline = Pipeline()

        self.spider_mid = SpiderMiddleware()
        self.downloader_mid = DownloaderMiddleware()

    def start(self):
        # entry point called to run the engine logic

        start = datetime.now()  # start time
        logger.info("start time: %s" % start)  # log the start time
        self._start_engine()
        stop = datetime.now()  # end time
        logger.info("end time: %s" % stop)  # log the end time
        logger.info("elapsed: %.2f" % (stop - start).total_seconds())  # log the elapsed time

    def _start_engine(self):
        # the logic that drives the framework

        # 1. the spider module produces the initial request
        start_request = self.spider.start_request()

        # preprocess the request object with the spider middleware
        start_request = self.spider_mid.process_request(start_request)

        # 2. hand the initial request to the scheduler
        self.scheduler.add_request(start_request)

        # 3. get a request object from the scheduler
        request = self.scheduler.get_request()

        # preprocess the request object with the downloader middleware
        request = self.downloader_mid.process_request(request)

        # 4. have the downloader send the request
        response = self.downloader.get_response(request)

        # preprocess the response object with the downloader middleware
        response = self.downloader_mid.process_response(response)

        # preprocess the response object with the spider middleware
        response = self.spider_mid.process_response(response)

        # 5. use the spider's parse method to process the response and get a result
        result = self.spider.parse(response)

        # 6. check the type of the result
        if isinstance(result, Request):

            # preprocess the request object with the spider middleware
            result = self.spider_mid.process_request(result)

            # 6.1 if it is a request object, hand it back to the scheduler
            self.scheduler.add_request(result)

        else:

            # 6.2 otherwise, hand it to the pipeline
            self.pipeline.process_item(result)
Example No. 15
class Engine():
    def __init__(self):
        self.spiders = self._auto_import_instances(path=SPIDERS,
                                                   isspider=True)  # spider dict
        self.pipelines = self._auto_import_instances(path=PIPELINES)
        self.spider_mids = self._auto_import_instances(path=SPIDER_MIDDLEWARES)
        self.downloader_mids = self._auto_import_instances(
            path=DOWNLOADER_MIDDLEWARES)

        if SCHEDULER_PERSIST:
            self.collector = ReidsStatsCollector()
        else:
            self.collector = NormalStatsCollector()
        # self.total_response_nums = 0
        # self.total_request_nums = 0

        self.scheduler = Scheduler(self.collector)
        self.downloader = Downloader()
        self.pool = Pool(5)  # os.cpu_count() or 1
        self.is_running = True

    def _auto_import_instances(self, path=[], isspider=False):
        """传入项目配置中爬虫,管道,中间件配置变量(list),返回爬虫字段 或 包含管道类中间件对象的list"""

        results = {} if isspider else []

        for p in path:  # p == 'spiders.baidu.BaiduSpider' or 'pipelines.BaiduPipeline'
            py_name_str = p.rsplit('.', 1)[0]  # 'spiders.baidu' or 'pipelines'
            cls_name_str = p.rsplit('.',
                                    1)[1]  # 'BaiduSpider' or 'BaiduPipeline'
            # 动态导入py文件模块对象
            # importlib.import_module(模块/py文件名字的字符串) == 模块/py文件对象
            py_obj = importlib.import_module(py_name_str)
            # getattr(模块/py文件对象, 类名称字符串) == 类对象 # 还没有实例化
            cls_obj = getattr(py_obj, cls_name_str)

            if isspider:
                results[cls_obj.name] = cls_obj()
            else:
                results.append(cls_obj())
        return results

    def start(self):
        # entry point for starting the framework
        start = datetime.now()  # start time
        logger.info("start time: %s" % start)  # log the start time
        self._start_engine()
        stop = datetime.now()  # end time
        logger.info("end time: %s" % stop)  # log the end time
        logger.info("elapsed: %.2f" % (stop - start).total_seconds())  # log the elapsed time
        logger.info("total requests: {}".format(self.collector.request_nums))
        logger.info("total responses: {}".format(self.collector.response_nums))
        logger.info("duplicate requests: {}".format(self.collector.repeat_request_nums))
        self.collector.clear()  # clear the counters!

    def _start_requests(self):
        """专门处理所有爬虫起始url:构造request,加入队列"""
        def func(spider_name, spider):
            """对一个爬虫的所有起始url构造request,并入队"""
            # 1. 爬虫模块发出初始请求
            for start_request in spider.start_requests():
                # 给request添加spider_name属性
                start_request.spider_name = spider_name
                # 利用爬虫中间件预处理请求对象
                for spider_mid in self.spider_mids:
                    start_request = spider_mid.process_request(
                        start_request, spider)
                # 2. 把初始请求添加给调度器
                self.scheduler.add_request(start_request)
                # 总请求数 +1
                # self.total_request_nums += 1
                self.collector.incr(self.collector.request_nums_key)

        for spider_name, spider in self.spiders.items():
            # 异步的让每个爬虫的起始request入队
            self.pool.apply_async(func, args=(spider_name, spider))

    def _execute_request_response_item(self):
        """只处理一个请求对象:从队列取出一个request->response->item进管道/request入队"""
        # 3. 从调度器获取请求对象,交给下载器发起请求,获取一个响应对象
        request = self.scheduler.get_request()
        if request is None:  # 此时队列取空了
            return  # 提前终止函数
        # 根据爬虫的名字来获取爬虫类对象 # 爬虫名字在请求对象进入队列之前已经赋值了!
        spider = self.spiders[request.spider_name]

        # 利用下载器中间件预处理请求对象
        for downloader_mid in self.downloader_mids:
            request = downloader_mid.process_request(request, spider)
        # 4. 利用下载器发起请求
        response = self.downloader.get_response(request)
        # meta属性的传递!!!
        response.meta = request.meta
        # 利用下载器中间件预处理响应对象
        for downloader_mid in self.downloader_mids:
            response = downloader_mid.process_response(request, response,
                                                       spider)
        # 利用爬虫中间件预处理响应对象
        for spider_mid in self.spider_mids:
            response = spider_mid.process_response(request, response, spider)

        # 5. 利用爬虫的解析响应的方法,处理响应,得到结果
        # 利用getattr函数和爬虫类中构造的request指定的解析函数名称字符串 获取指定解析函数对象
        # func = getattr(类, 类中函数的字符串) # func还没有执行被调用
        parse_func = getattr(spider, request.parse)
        # 调用request.parse指定的解析函数
        if request.parse == 'start_requests':  # 防止爬虫中request.parse指定为start_requests函数
            results = parse_func()
        else:
            results = parse_func(response)  # 解析函数有可能返回可迭代对象或None
        if results is None:  # 如果解析函数返回None就提前return
            # 总响应数 +1
            # self.total_response_nums += 1
            self.collector.incr(self.collector.response_nums_key)
            return

        for result in results:
            # 6. 判断结果对象
            if isinstance(result, Request):
                # 给request添加spider_name属性
                result.spider_name = request.spider_name
                # 利用爬虫中间件预处理请求对象
                for spider_mid in self.spider_mids:
                    result = spider_mid.process_request(result, spider)
                # 6.1 如果是请求对象,那么就再交给调度器
                self.scheduler.add_request(result)
                # 总请求数 +1
                # self.total_request_nums += 1
                self.collector.incr(self.collector.request_nums_key)
            else:
                # 6.2 否则,就交给管道处理
                for pipeline in self.pipelines:
                    result = pipeline.process_item(result, spider)
        # 总响应数 +1
        # self.total_response_nums += 1
        self.collector.incr(self.collector.response_nums_key)

    def _error_callback(self, exception):
        try:
            raise exception  # re-raise so the logger can record the full traceback
        except Exception as e:
            logger.exception(e)

    def _callback(self, xxx):
        if self.is_running:
            # print(self.total_response_nums, self.total_request_nums, self.scheduler.total_repeat_nums)

            self.pool.apply_async(self._execute_request_response_item,
                                  callback=self._callback,
                                  error_callback=self._error_callback)

    def _start_engine(self):
        """引擎执行的逻辑"""
        self._start_requests()  # 把所有起始url构造request入队

        for i in range(MAX_ASYNC_THREAD_NUMBER):
            # 不断 处理一个请求对象,直到该request处理完毕
            self.pool.apply_async(self._execute_request_response_item,
                                  callback=self._callback,
                                  error_callback=self._error_callback)
        while True:  # 退出条件
            time.sleep(0.01)
            # 总响应数 + 重复的请求数 == 总请求数
            if self.collector.response_nums + self.collector.repeat_request_nums == self.collector.request_nums:
                self.is_running = False
                # print(self.total_response_nums, self.total_request_nums, self.scheduler.total_repeat_nums)
                break
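
Examples 9, 12, and 15 replace the plain integer counters with a stats collector exposing incr(key), *_nums_key attributes, *_nums counts and clear(). The sketch below is a minimal in-memory version assuming exactly the names used above; the real NormalStatsCollector and ReidsStatsCollector classes are not shown in these examples, and a Redis-backed variant would keep the counters in Redis rather than a local dict.

from collections import defaultdict


class NormalStatsCollector(object):
    """In-memory counters with the interface the engines above rely on."""

    start_request_nums_key = 'start_request_nums'
    request_nums_key = 'request_nums'
    response_nums_key = 'response_nums'
    repeat_request_nums_key = 'repeat_request_nums'

    def __init__(self):
        self._counts = defaultdict(int)

    def incr(self, key):
        # increment the named counter by one
        self._counts[key] += 1

    @property
    def start_request_nums(self):
        return self._counts[self.start_request_nums_key]

    @property
    def request_nums(self):
        return self._counts[self.request_nums_key]

    @property
    def response_nums(self):
        return self._counts[self.response_nums_key]

    @property
    def repeat_request_nums(self):
        return self._counts[self.repeat_request_nums_key]

    def clear(self):
        # drop all counters, e.g. at the end of a run
        self._counts.clear()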
Example No. 16
class Engine(object):
    """
    1.对外提供整个程序的如旧
    2.依次调用其他组件对外提供的接口,实现整个框架的运作(驱动)
    """
    def __init__(self):  # receives the spider objects passed in from outside
        """Instantiate the other components; the engine drives them by calling their methods"""
        # print(spiders)
        self.scheduler = Scheduler()  # initialize the scheduler object
        self.downloader = Downloader()  # initialize the downloader object
        self.spiders = self._auto_import_instances(SPIDERS,
                                                   is_spider=True)  # dict of spider objects
        self.pipelines = self._auto_import_instances(PIPELINES)  # list of pipeline objects
        self.spider_mids = self._auto_import_instances(
            SPIDER_MIDDLEWARES)  # list
        self.downloader_mids = self._auto_import_instances(
            DOWNLOADER_MIDDLEWARES)  # list
        self.total_request_nums = 0
        self.total_response_nums = 0
        self.pool = Pool()  # instantiate the thread pool object
        self.is_running = False  # flag indicating whether the engine is running

    def _auto_import_instances(self, path, is_spider=False):
        """
        实现模块的动态导入,传入模块路径列表,返回类的实例
        :param self:
        :param path:包含模块位置字符串的列表
        :return:{“name”:spider}/[pipeline等]
        """
        if is_spider:
            instances = {}
        else:
            instances = []
        for p in path:
            module_name = p.rsplit(".", 1)[0]  # 获取模块路径的名字
            cls_name = p.rsplit(".", 1)[-1]  # 获取类名
            module = importlib.import_module(module_name)  # 导入模块
            cls = getattr(module, cls_name)  # 获取module下的类
            if is_spider:
                instances[cls().name] = cls()
            else:
                instances.append(cls())
        print(instances)
        return instances

    def start(self):
        """启动整个引擎"""
        start_time = datetime.now()
        print("爬虫启动:", start_time)
        self._start_engine()
        endtime = datetime.now()
        print("爬虫结束:", endtime)
        print(endtime - start_time)
        print("请求数量:", self.total_request_nums)
        print("响应数量:", self.total_response_nums)
        print("重复数量:", self.scheduler.repeat_request_nums)

    def _start_request(self):
        """初始化请求,调用爬虫的start_request方法,把所有等等请求添加到调度器"""
        for spider_name, spider in self.spiders.items():
            for start_request in spider.start_request():

                # 1.对start_request进过爬虫中间件进行处理
                for spider_mid in self.spider_mids:
                    start_request = spider_mid.process_request(start_request)

                # 给请求对象添加spider_name 属性
                start_request.spider_name = spider_name

                # 2. 调用调度器的add_request方法,添加request对象到调度器中
                self.scheduler.add_request(start_request)

                # 3.请求数加1
                self.total_request_nums += 1

    def _execute_request_response_item(self):

        # 4. call the scheduler's get_request method to get a request
        request = self.scheduler.get_request()

        # if there is no request object, return immediately
        if request is None:
            return

        # 5. run the request object through the downloader middlewares' process_request
        for downloader_mid in self.downloader_mids:
            request = downloader_mid.process_request(request)

        # 6. call the downloader's get_response method to get the response
        response = self.downloader.get_response(request)

        # pass the request's meta value on to the response's meta
        response.meta = request.meta

        # 7. run the response object through the downloader middlewares' process_response
        for downloader_mid in self.downloader_mids:
            response = downloader_mid.process_response(response)

        # 8. run the response object through the spider middlewares' process_response
        for spider_mid in self.spider_mids:
            response = spider_mid.process_response(response)

        # get the spider instance from the request's spider_name attribute
        spider = self.spiders[request.spider_name]

        # get the parse method this request refers to
        parse = getattr(spider, request.parse)

        # 9. call the spider's parse method to process the response
        for result in parse(response):
            # check the result type; if it is a request, call the scheduler's add_request again
            if isinstance(result, Request):
                # after the parse function yields a request object, run it through process_request
                for spider_mid in self.spider_mids:
                    result = spider_mid.process_request(result)
                # set the spider_name attribute on the request object
                result.spider_name = request.spider_name
                # add it to the queue
                self.scheduler.add_request(result)
                self.total_request_nums += 1
            # if it is not, call the pipelines' process_item method on the result
            else:
                for pipeline in self.pipelines:
                    result = pipeline.process_item(result, spider)
        # responses + 1
        self.total_response_nums += 1

    def _callback(self, temp):
        """执行新的请求回调函数,实现循环"""
        if self.is_running is True:
            self.pool.apply_async(self._execute_request_response_item,
                                  callback=self._callback)

    def _start_engine(self):
        """依次调用其他组件对外提供的接口,实现整个框架的运作(驱动)"""
        #
        # # 1. 调用爬虫的start_request方法,获取request对象
        # start_request = self.spider.start_requests()
        # # 对start_request进过爬虫中间件进行处理
        # start_request = self.spider_mid.process_request(start_request)
        #
        # # 2. 调用调度器的add_request方法,添加request对象到调度器中
        # self.scheduler.add_request(start_request)
        # # 3. 调用调度器的get_request方法,获取request对象
        # request = self.scheduler.get_request()
        # # request对象经过下载器中间件的process_request进行处理
        # request = self.downloader_mid.process_request(request)
        #
        # # 4. 调用下载器的get_response方法,获取响应
        # response = self.downloader.get_response(request)
        # # response对象经过下载器中间件的process_response进行处理
        # response = self.downloader_mid.process_response(response)
        # # response对象经过下爬虫中间件的process_response进行处理
        # response = self.spider_mid.process_response(response)
        #
        # # 5. 调用爬虫的parse方法,处理响应
        # result = self.spider.parse(response)
        # # 6.判断结果的类型,如果是request,重新调用调度器的add_request方法
        # if isinstance(result, Request):
        #     # 在解析函数得到request对象之后,使用process_request进行处理
        #     result = self.spider_mid.process_request(result)
        #     self.scheduler.add_request(result)
        # # 7如果不是,调用pipeline的process_item方法处理结果
        # else:
        #     self.pipeline.process_item(result)
        self.is_running = True  # 启动引擎,设置状态为True
        self.pool.apply_async(self._start_request)  # 使用异步,使用子线程
        for i in range(COCOURRENT_REQUEST):
            self.pool.apply_async(self._execute_request_response_item,
                                  callback=self._callback)  # 使用子线程

        self._start_request()

        # 设置循环,处理多个请求,阻塞,等待子线程结束
        while True:
            time.sleep(0.0001)

            # self._execute_request_response_item()

            # 设置退出条件:当请求数和响应数相等时,退出循环
            # 因为是异步,需要增加判断条件,请求书不能是0
            # if self.total_response_nums +self.scheduler.repeat_request_nums >= self.total_request_nums:
            # if self.total_response_number >= self.scheduler.total_request_number and self.scheduler.total_request_number != 0:
            if self.total_response_nums >= self.scheduler.total_request_number and self.scheduler.total_request_number != 0:
                self.running = False  # 满足循环退出条件后,设置运行状态为False
                break
Example No. 17
class Engine():
    def __init__(self):
        self.spiders = self._auto_import_ret(path=SPIDERS, isspider=True)
        self.scheduler = Scheduler()
        self.downloader = Downloader()
        self.pipelines = self._auto_import_ret(path=PIPELINES)

        self.spider_mids = self._auto_import_ret(path=SPIDER_MIDDLEWARES)
        self.downloader_mids = self._auto_import_ret(
            path=DOWNLOADER_MIDDLEWARES)

        self.total_request_nums = 0
        self.total_response_nums = 0

        self.pool = Pool(MAX_REQUEST_NUMS)
        self.is_running = True

    def _auto_import_ret(self, path=[], isspider=False):
        """利用动态导包的方式,自动根据配置获取爬虫字典或管道、中间件列表"""
        ret = {} if isspider else []
        for p in path:  # 'spiders.baidu.BaiduSpider'
            py_name_str = p.rsplit('.', 1)[0]  # 'spiders.baidu'
            cls_name_str = p.rsplit('.', 1)[1]  # 'BaiduSpider'
            # 利用importlib.import_module函数来根据执行位置.模块名字符串来获取模块对象
            py_obj = importlib.import_module(py_name_str)
            cls = getattr(py_obj, cls_name_str)  #此时返回的是没有实例化的类对象!
            if isspider:
                ret[cls.name] = cls()
            else:
                ret.append(cls())
        return ret

    def start(self):
        # entry point
        start_time = datetime.datetime.now()
        logger.info('start time: {}'.format(start_time))
        self._start_engine()
        end_time = datetime.datetime.now()
        logger.info('end time: {}'.format(end_time))
        logger.info('elapsed: {}'.format(end_time - start_time))
        logger.info('total requests: {}'.format(self.total_request_nums))
        logger.info('total responses: {}'.format(self.total_response_nums))
        logger.info('duplicate requests: {}'.format(self.scheduler.repeat_request_nums))

    def _start_requests(self):
        """把爬虫的所有起始url全部构造成request,放入队列"""
        # 1. 爬虫模块发出初始请求
        for spider_name, spider in self.spiders.items():
            for start_request in spider.start_requests():
                # 给request添加爬虫名的属性
                start_request.spider_name = spider_name
                # 利用爬虫中间件预处理请求对象
                for spider_mid in self.spider_mids:
                    start_request = spider_mid.process_request(
                        start_request, spider)
                # 2. 把初始请求添加给调度器
                self.scheduler.add_request(start_request)
                # 总请求数 + 1
                self.total_request_nums += 1

    def execute_request_response_item(self):
        """把队列中取出的一个request进行处理,直到不再需要该request"""
        # 3. 从调度器获取请求对象
        request = self.scheduler.get_request()
        if request is None:
            return  # 此时请求队列被取空
        # 通过request携带的spider_name来按键取出爬虫类对象
        spider = self.spiders[request.spider_name]

        # 利用下载器中间件预处理请求对象
        for downloader_mid in self.downloader_mids:
            request = downloader_mid.process_request(request, spider)
        # 4. 利用下载器发起请求
        response = self.downloader.get_response(request)
        # 传递meta!!!
        response.meta = request.meta
        # 利用下载器中间件预处理响应对象
        for downloader_mid in self.downloader_mids:
            response = downloader_mid.process_response(response, spider)
        # 利用爬虫中间件预处理响应对象
        for spider_mid in self.spider_mids:
            response = spider_mid.process_response(response, spider)

        # request.parse == 解析函数名字符串
        # 函数 = getattr(类对象, 类的函数名字符串)
        # 获取request在构造时指定的解析函数
        parse_func = getattr(spider, request.parse)

        # 5. 利用爬虫的解析响应的方法,处理响应,得到结果
        results = parse_func(response)

        for result in results:
            # 6. 判断结果对象
            if isinstance(result, Request):
                # 给request添加爬虫名的属性
                result.spider_name = request.spider_name
                # result.spider_name = spider.name # 结果同上一行
                # 利用爬虫中间件预处理请求对象
                for spider_mid in self.spider_mids:
                    result = spider_mid.process_request(result, spider)
                # 6.1 如果是请求对象,那么就再交给调度器
                self.scheduler.add_request(result)
                # 总请求数 + 1
                self.total_request_nums += 1
            else:
                # 6.2 否则,就交给管道处理
                for pipeline in self.pipelines:
                    result = pipeline.process_item(result, spider)

        # 总响应数 + 1
        self.total_response_nums += 1

    def _callback(self, temp):
        if self.is_running:
            self.pool.apply_async(func=self.execute_request_response_item,
                                  callback=self._callback)

    def _start_engine(self):
        # call the other components' attributes and functions to organize the run logic

        self._start_requests()

        for i in range(MAX_REQUEST_NUMS):  # the configuration controls the level of concurrency
            self.pool.apply_async(func=self.execute_request_response_item,
                                  callback=self._callback)

        while True:
            # exit condition for the program: total responses + duplicate requests == total requests
            if self.total_response_nums + self.scheduler.repeat_request_nums == self.total_request_nums:
                self.is_running = False
                break
Example No. 18
    def __init__(self):
        self.spider = Spider()
        self.scheduler = Scheduler()
        self.downloader = Downloader()
        self.pipeline = Pipeline()
Example No. 19
class Engine():
    def __init__(self):
        self.spiders = self._auto_import_instances(path=SPIDERS, isspider=True)
        self.scheduler = Scheduler()
        self.downloader = Downloader()
        self.pipelines = self._auto_import_instances(path=PIPELINES)
        self.spider_mids = self._auto_import_instances(path=SPIDER_MIDDLEWARES)
        self.downloader_mids = self._auto_import_instances(
            path=DOWNLOADER_MIDDLEWARES)

        self.total_response_nums = 0
        self.total_request_nums = 0

    def _auto_import_instances(self, path=[], isspider=False):
        """根据项目配置动态返回爬虫字典 或管道中间件列表"""
        instances = {} if isspider else []
        for p in path:  # p = 'pipelines.BaiduPipeline'
            py_name_str = p.rsplit('.', 1)[0]  # 'spipelines'
            cls_name_str = p.rsplit('.', 1)[1]  # 'BaiduPipeline'
            py_obj = importlib.import_module(py_name_str)  # 获取py文件对象
            cls_obj = getattr(py_obj, cls_name_str)  # 获取py文件中的类对象,此时没有实例化!
            if isspider:
                instances[cls_obj.name] = cls_obj()
            else:
                instances.append(cls_obj())
        return instances

    def start(self):
        # entry point called to run the engine logic

        start = datetime.now()  # start time
        logger.info("start time: %s" % start)  # log the start time
        self._start_engine()
        stop = datetime.now()  # end time
        logger.info("end time: %s" % stop)  # log the end time
        logger.info("elapsed: %.2f" % (stop - start).total_seconds())  # log the elapsed time
        logger.info("total requests: {}".format(self.total_request_nums))
        logger.info("total responses: {}".format(self.total_response_nums))

    def _start_requests(self):
        """Build a request for every start URL of every spider and add it to the request queue"""
        for spider_name, spider in self.spiders.items():
            for start_request in spider.start_requests():
                # 1. the spider module produces the initial request
                # preprocess the request object with the spider middlewares
                for spider_mid in self.spider_mids:
                    start_request = spider_mid.process_request(
                        start_request, spider)
                # add the spider_name attribute to the request object
                start_request.spider_name = spider_name
                # 2. hand the initial request to the scheduler
                self.scheduler.add_request(start_request)
                # total requests + 1
                self.total_request_nums += 1

    def _execute_request_response_item(self):
        """队列中取出一个request,直到处理完毕"""
        # 3. 从调度器获取请求对象
        request = self.scheduler.get_request()
        # 判断队列是否取空
        if request is None:
            return  # 提前终止
        # 根据爬虫名获取爬虫类对象
        spider = self.spiders[request.spider_name]
        # 利用下载器中间件预处理请求对象
        for downloader_mid in self.downloader_mids:
            request = downloader_mid.process_request(request, spider)
        # 4. 利用下载器发起请求
        response = self.downloader.get_response(request)
        # 传递meta
        response.meta = request.meta
        # 利用下载器中间件预处理响应对象
        for downloader_mid in self.downloader_mids:
            response = downloader_mid.process_response(request, response,
                                                       spider)
        # 利用爬虫中间件预处理响应对象
        for spider_mid in self.spider_mids:
            response = spider_mid.process_response(request, response, spider)
        # 5. 利用爬虫的解析响应的方法,处理响应,得到结果
        # request.parse指定的解析函数 = getattr(爬虫类对象, 指定的解析函数的字符串)
        parse_func = getattr(spider, request.parse)
        results = parse_func(response)
        for result in results:
            # 6. 判断结果对象
            if isinstance(result, Request):
                # 利用爬虫中间件预处理请求对象
                for spider_mid in self.spider_mids:
                    result = spider_mid.process_request(result, spider)
                # 给request对象增加spider_name的属性
                result.spider_name = request.spider_name
                # 6.1 如果是请求对象,那么就再交给调度器
                self.scheduler.add_request(result)
                # 总请求数 + 1
                self.total_request_nums += 1
            else:
                # 6.2 否则,就交给管道处理
                for pipeline in self.pipelines:
                    result = pipeline.process_item(result, spider)
        # 总响应数 +1
        self.total_response_nums += 1

    def _start_engine(self):
        # the logic that drives the framework

        self._start_requests()  # put all initial requests into the queue

        while True:
            time.sleep(0.1)
            self._execute_request_response_item()  # process one request taken from the queue

            # exit condition for the program
            if self.scheduler.q.empty():
                break  # the queue is empty