Exemple #1
0
 def __init__(self):
     """Instantiate one of each collaborating component and keep it on self."""
     # Core components.
     self.spider = Spider()
     self.scheduler = Scheduler()
     self.pipeline = Pipeline()
     self.downloader = Downloader()

     # Middleware objects: one for the spider side, one for the downloader side.
     self.spider_mid = SpiderMiddleware()
     self.downloader_mid = DownloaderMiddleware()
Exemple #2
0
    def __init__(self, spiders):
        """Store the supplied spiders and build the remaining components.

        Args:
            spiders: Spider collection provided by the caller; it is stored
                unchanged on ``self.spiders`` (its concrete type is not
                visible from this method).
        """
        self.spiders = spiders

        # Core components.
        self.scheduler = Scheduler()
        self.downloader = Downloader()
        self.pipeline = Pipeline()

        # Middleware objects.
        self.spider_mid = SpiderMiddleware()
        self.downloader_mid = DownloaderMiddleware()

        # Response and request counters both start at zero.
        self.total_response_nums = self.total_request_nums = 0
Exemple #3
0
    def __init__(self):
        """Build the components; pluggable ones come from the auto-importer.

        Spiders, pipelines and both middleware collections are obtained by
        passing the SPIDERS / PIPELINES / SPIDER_MIDDLEWARES /
        DOWNLOADER_MIDDLEWARES values to ``self._auto_import_instances``.
        """
        load = self._auto_import_instances  # shorthand for the repeated call

        self.spiders = load(path=SPIDERS, isspider=True)
        self.scheduler = Scheduler()
        self.downloader = Downloader()
        self.pipelines = load(path=PIPELINES)
        self.spider_mids = load(path=SPIDER_MIDDLEWARES)
        self.downloader_mids = load(path=DOWNLOADER_MIDDLEWARES)

        # Response and request counters both start at zero.
        self.total_response_nums = self.total_request_nums = 0
Exemple #4
0
    def __init__(self):
        """Build the components, counters, worker pool and running flag.

        Spiders, pipelines and both middleware collections are obtained by
        passing the corresponding path constants to ``self._auto_import_ret``.
        """
        load = self._auto_import_ret  # shorthand for the repeated call

        self.spiders = load(path=SPIDERS, isspider=True)
        self.scheduler = Scheduler()
        self.downloader = Downloader()
        self.pipelines = load(path=PIPELINES)

        self.spider_mids = load(path=SPIDER_MIDDLEWARES)
        self.downloader_mids = load(path=DOWNLOADER_MIDDLEWARES)

        # Request and response counters both start at zero.
        self.total_request_nums = self.total_response_nums = 0

        # Pool is constructed with MAX_REQUEST_NUMS as its only argument.
        self.pool = Pool(MAX_REQUEST_NUMS)
        self.is_running = True
Exemple #5
0
    def __init__(self):
        """Build the components, counters, worker pool and running flag.

        Spiders, pipelines and both middleware collections are obtained by
        passing the corresponding path constants to
        ``self._auto_import_instances``.
        """
        load = self._auto_import_instances  # shorthand for the repeated call

        # The original author describes the spiders result as a dict.
        self.spiders = load(path=SPIDERS, isspider=True)
        self.scheduler = Scheduler()
        self.downloader = Downloader()

        self.pipelines = load(path=PIPELINES)
        self.spider_mids = load(path=SPIDER_MIDDLEWARES)
        self.downloader_mids = load(path=DOWNLOADER_MIDDLEWARES)

        # Response and request counters both start at zero.
        self.total_response_nums = self.total_request_nums = 0

        # Fixed pool argument of 5; `os.cpu_count() or 1` was noted by the
        # author as an alternative.
        self.pool = Pool(5)
        self.is_running = True
Exemple #6
0
 def __init__(self):
     """Instantiate the other components.

     The engine implements its functionality by calling methods on these
     components.
     """
     self.scheduler = Scheduler()
     self.downloader = Downloader()

     load = self._auto_import_instances  # shorthand for the repeated call
     # Per the original author's notes: the spiders come back as a dict,
     # the pipelines and both middleware collections as lists.
     self.spiders = load(SPIDERS, is_spider=True)
     self.pipelines = load(PIPELINES)
     self.spider_mids = load(SPIDER_MIDDLEWARES)
     self.downloader_mids = load(DOWNLOADER_MIDDLEWARES)

     # Request and response counters both start at zero.
     self.total_request_nums = self.total_response_nums = 0

     self.pool = Pool()  # worker pool (described as a thread pool by the author)
     self.is_running = False  # flag: is the program currently running
Exemple #7
0
 def __init__(self):
     """Build the components, wiring a stats collector into the scheduler.

     Spiders, pipelines and both middleware collections are obtained by
     passing the matching ``settings`` values to the private auto-importer.
     """
     auto_import = self.__auto_import  # shorthand for the repeated call

     # Per the original author's note, with is_spider=True the result is
     # a dict of {spider name: spider object}.
     self.spiders = auto_import(settings.SPIDERS, is_spider=True)

     # The stats collector is created first so it can be given to the scheduler.
     self.stats_collector = StatsCollector()
     self.scheduler = Scheduler(self.stats_collector)
     self.downloader = Downloader()

     self.pipelines = auto_import(settings.PIPELINES)
     self.spider_middlewares = auto_import(settings.SPIDER_MIDDLEWARES)
     self.downloader_middlewares = auto_import(settings.DOWNLOADER_MIDDLEWARES)

     self.pool = Pool()  # worker pool (described as a thread pool by the author)

     # Number of spiders whose start requests have finished.
     self.start_request_finished_spider_count = 0
Exemple #8
0
    def __init__(self):
        """Build the components, choosing the stats collector by setting.

        Spiders, pipelines and both middleware collections are obtained by
        passing the corresponding path constants to
        ``self._auto_import_instances``. The collector class depends on the
        truthiness of SCHEDULER_PERSIST, and the resulting collector is
        passed to the scheduler.
        """
        load = self._auto_import_instances  # shorthand for the repeated call

        # The original author describes the spiders result as a dict.
        self.spiders = load(path=SPIDERS, isspider=True)
        self.pipelines = load(path=PIPELINES)
        self.spider_mids = load(path=SPIDER_MIDDLEWARES)
        self.downloader_mids = load(path=DOWNLOADER_MIDDLEWARES)

        # Only the selected collector class is looked up and instantiated.
        collector_cls = (
            ReidsStatsCollector if SCHEDULER_PERSIST else NormalStatsCollector
        )
        self.collector = collector_cls()
        # NOTE(review): no request/response counters are kept on self here;
        # presumably the collector tracks them -- confirm.

        self.scheduler = Scheduler(self.collector)
        self.downloader = Downloader()

        # Fixed pool argument of 5; `os.cpu_count() or 1` was noted by the
        # author as an alternative.
        self.pool = Pool(5)
        self.is_running = True
Exemple #9
0
 def __init__(self):
     """Instantiate one of each collaborating component and keep it on self."""
     # Created in this order: spider, scheduler, downloader, pipeline.
     self.spider = Spider()
     self.scheduler = Scheduler()
     self.downloader = Downloader()
     self.pipeline = Pipeline()