Example no. 1
 def __init__(self, crawler):
     self.slots = {}
     self.spidermw = SpiderMiddlewareManager.from_crawler(crawler)
     itemproc_cls = load_object(crawler.settings['ITEM_PROCESSOR'])
     self.itemproc = itemproc_cls.from_crawler(crawler)
     self.concurrent_items = crawler.settings.getint('CONCURRENT_ITEMS')
     self.crawler = crawler
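
These constructors all resolve the ITEM_PROCESSOR setting through scrapy.utils.misc.load_object, which turns a dotted path into the object it names. A simplified sketch of that helper (the real one also handles error cases):

    # Simplified sketch of load_object(): import the module portion of a
    # dotted path, then fetch the final attribute from it.
    import importlib

    def load_object(path):
        module_path, _, name = path.rpartition('.')
        module = importlib.import_module(module_path)
        return getattr(module, name)

    ItemPipelineManager = load_object('scrapy.pipelines.ItemPipelineManager')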
Example no. 2
 def __init__(self, crawler):
     self.slot = None
     self.spidermw = SpiderMiddlewareManager.from_crawler(crawler)
     itemproc_cls = load_object(crawler.settings["ITEM_PROCESSOR"])
     self.itemproc = itemproc_cls.from_crawler(crawler)
     self.concurrent_items = crawler.settings.getint("CONCURRENT_ITEMS")
     self.crawler = crawler
     self.signals = crawler.signals
     self.logformatter = crawler.logformatter
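
The signals manager stored on self.signals is what later emits events such as item_scraped. A hedged usage sketch of connecting a handler through it (standard Scrapy signal API; the handler name is invented here):

    from scrapy import signals

    def on_item_scraped(item, response, spider):
        spider.logger.info("scraped an item from %s", response.url)

    crawler.signals.connect(on_item_scraped, signal=signals.item_scraped)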
Example no. 3
 def __init__(self, crawler):
     self.slot: Optional[Slot] = None
     self.spidermw = SpiderMiddlewareManager.from_crawler(crawler)
     itemproc_cls = load_object(crawler.settings['ITEM_PROCESSOR'])
     self.itemproc = itemproc_cls.from_crawler(crawler)
     self.concurrent_items = crawler.settings.getint('CONCURRENT_ITEMS')
     self.crawler = crawler
     self.signals = crawler.signals
     self.logformatter = crawler.logformatter
Example no. 4
 def __init__(self, crawler):
     self.slot = None
     self.spidermw = SpiderMiddlewareManager.from_crawler(crawler)
     itemproc_cls = load_object(
         crawler.settings['ITEM_PROCESSOR'])  # class that handles items; defaults to ItemPipelineManager
     self.itemproc = itemproc_cls.from_crawler(crawler)
     self.concurrent_items = crawler.settings.getint(
         'CONCURRENT_ITEMS')  # controls how many items are processed at the same time
     self.crawler = crawler
     self.signals = crawler.signals
     self.logformatter = crawler.logformatter
Example no. 5
 def __init__(self, crawler):
     self.slot = None  # unlike the engine's slot, this one mainly handles returned responses and requests
     self.spidermw = SpiderMiddlewareManager.from_crawler(crawler)
     itemproc_cls = load_object(crawler.settings['ITEM_PROCESSOR'])
     self.itemproc = itemproc_cls.from_crawler(
         crawler)  # instantiate the ITEM_PROCESSOR class
     self.concurrent_items = crawler.settings.getint(
         'CONCURRENT_ITEMS')  # number of items processed at the same time
     self.crawler = crawler
     self.signals = crawler.signals
     self.logformatter = crawler.logformatter
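
The comment on self.slot above points out that the scraper's slot differs from the engine's. A rough sketch of what the scraper's Slot typically tracks (field names follow scrapy/core/scraper.py but vary across versions, so treat them as assumptions):

    from collections import deque

    class Slot:
        def __init__(self):
            self.queue = deque()   # (response, request, deferred) waiting for the spider
            self.active = set()    # requests whose responses are currently being parsed
            self.active_size = 0   # bytes of response bodies in flight, used for backpressure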
Example no. 6
 def __init__(self, crawler):
     self.slot = None
     self.spidermw = SpiderMiddlewareManager.from_crawler(
         crawler)  # instantiate the spider middleware manager, which holds instances of everything in SPIDER_MIDDLEWARES_BASE
     itemproc_cls = load_object(
         crawler.settings['ITEM_PROCESSOR'])  # load the pipeline processor class from the settings
     self.itemproc = itemproc_cls.from_crawler(crawler)  # instantiate the pipeline processor
     self.concurrent_items = crawler.settings.getint(
         'CONCURRENT_ITEMS')  # read from the settings how many outputs are processed at the same time
     self.crawler = crawler
     self.signals = crawler.signals
     self.logformatter = crawler.logformatter
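
The comments in Example no. 6 note that the manager is built from everything in SPIDER_MIDDLEWARES_BASE. A simplified sketch of the merge-and-order step (not Scrapy's exact build_component_list, but it follows the documented convention: a value of None disables an entry, and lower numbers sit closer to the engine):

    # Simplified sketch: merge the base and user middleware dicts, drop
    # disabled entries, and order the rest by priority value.
    def build_component_list(base, custom):
        merged = {**base, **custom}
        enabled = {path: order for path, order in merged.items() if order is not None}
        return [path for path, _ in sorted(enabled.items(), key=lambda kv: kv[1])]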
Example no. 7
 def _get_middleware_result(self,
                            *mw_classes,
                            start_index: Optional[int] = None):
     setting = self._construct_mw_setting(*mw_classes,
                                          start_index=start_index)
     self.crawler = get_crawler(Spider, {'SPIDER_MIDDLEWARES': setting})
     self.spider = self.crawler._create_spider('foo')
     self.mwman = SpiderMiddlewareManager.from_crawler(self.crawler)
     result = yield self.mwman.scrape_response(self._scrape_func,
                                               self.response, self.request,
                                               self.spider)
     return result
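
The bare yield on scrape_response only works if this helper runs as a Twisted coroutine; in the full test class it is presumably decorated with inlineCallbacks, along these lines:

    from twisted.internet import defer

    @defer.inlineCallbacks
    def _get_middleware_result(self, *mw_classes, start_index=None):
        ...  # body as above; each yield waits on a Deferred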
Example no. 8
    def __init__(self, crawler):
        self.slot = None
        self.spidermw = SpiderMiddlewareManager.from_crawler(crawler)

        # ITEM_PROCESSOR = 'scrapy.contrib.pipeline.ItemPipelineManager'
        itemproc_cls = load_object(crawler.settings['ITEM_PROCESSOR'])
        self.itemproc = itemproc_cls.from_crawler(crawler)

        # CONCURRENT_ITEMS = 100
        self.concurrent_items = crawler.settings.getint('CONCURRENT_ITEMS')
        self.crawler = crawler
        self.signals = crawler.signals
        self.logformatter = crawler.logformatter
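
Because ITEM_PROCESSOR is resolved by dotted path, anything exposing the same small interface can stand in for ItemPipelineManager. A hypothetical minimal replacement (the class name and behavior are invented for illustration):

    from twisted.internet import defer

    class NoopItemProcessor:
        """Hypothetical ITEM_PROCESSOR stand-in that accepts every item unchanged."""

        @classmethod
        def from_crawler(cls, crawler):
            return cls()

        def open_spider(self, spider):
            pass

        def close_spider(self, spider):
            pass

        def process_item(self, item, spider):
            return defer.succeed(item)  # the scraper expects a Deferred back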
Example no. 9
 def __init__(self, crawler):
     self.slot = None
     ## instantiate the spider middleware manager
     self.spidermw = SpiderMiddlewareManager.from_crawler(crawler)
     ## load the pipeline processor class from the settings
     itemproc_cls = load_object(crawler.settings['ITEM_PROCESSOR'])
     ## instantiate the pipeline processor
     self.itemproc = itemproc_cls.from_crawler(crawler)
     ## read the item-processing concurrency from the settings
     self.concurrent_items = crawler.settings.getint('CONCURRENT_ITEMS')
     self.crawler = crawler
     self.signals = crawler.signals
     self.logformatter = crawler.logformatter
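
Both settings read above can be overridden in a project's settings.py; the values below are illustrative:

    # settings.py -- the two knobs the scraper reads in __init__:
    CONCURRENT_ITEMS = 200  # default is 100; max items processed in parallel per response
    ITEM_PROCESSOR = 'scrapy.pipelines.ItemPipelineManager'  # the default manager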
Example no. 10
    def init_smw(self, custom_settings):
        class TestSpider(Spider):
            name = 'test'

        self.spider = TestSpider
        scrapy_default_middlewares = {
            'scrapy.spidermiddlewares.referer.RefererMiddleware': 700
        }

        # monkey patch SPIDER_MIDDLEWARES_BASE to include only referer middleware
        sys.modules['scrapy.settings.default_settings'].SPIDER_MIDDLEWARES_BASE = scrapy_default_middlewares
        crawler = get_crawler(self.spider, custom_settings)
        self.add_frontera_scheduler(crawler)
        self.smw = SpiderMiddlewareManager.from_crawler(crawler)
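
Patching SPIDER_MIDDLEWARES_BASE through sys.modules leaks into every later test unless it is undone. A hedged sketch of making the patch reversible (the _orig_base attribute is invented here):

    import sys

    def init_smw(self, custom_settings):
        mod = sys.modules['scrapy.settings.default_settings']
        self._orig_base = mod.SPIDER_MIDDLEWARES_BASE  # save for tearDown
        mod.SPIDER_MIDDLEWARES_BASE = {
            'scrapy.spidermiddlewares.referer.RefererMiddleware': 700
        }
        ...

    def tearDown(self):
        sys.modules['scrapy.settings.default_settings'].SPIDER_MIDDLEWARES_BASE = self._orig_base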
Example no. 11
 def __init__(self, crawler):
     self.slot = None
     self.spidermw = SpiderMiddlewareManager.from_crawler(
         crawler
     )  # surprisingly, a spider middleware manager is instantiated inside the scraper; it implements process_spider_input, process_spider_exception, process_spider_requests
     itemproc_cls = load_object(
         crawler.settings['ITEM_PROCESSOR']
     )  # ITEM_PROCESSOR = 'scrapy.pipelines.ItemPipelineManager'
     self.itemproc = itemproc_cls.from_crawler(crawler)  # get a manager object for all the pipeline middlewares
     self.concurrent_items = crawler.settings.getint(
         'CONCURRENT_ITEMS')  # 100
     self.crawler = crawler
     self.signals = crawler.signals
     self.logformatter = crawler.logformatter
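
The comment in Example no. 11 lists the hooks the manager dispatches to. A minimal pass-through spider middleware implementing the standard interface (this class is illustrative, not from the source):

    class PassThroughSpiderMiddleware:
        @classmethod
        def from_crawler(cls, crawler):
            return cls()

        def process_spider_input(self, response, spider):
            return None  # None lets the response through to the spider

        def process_spider_output(self, response, result, spider):
            yield from result  # pass requests and items along unchanged

        def process_spider_exception(self, response, exception, spider):
            return None  # None defers to the remaining middlewares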
Example no. 12
 def _get_middleware_result(self,
                            *mw_classes,
                            start_index: Optional[int] = None):
     setting = self._construct_mw_setting(*mw_classes,
                                          start_index=start_index)
     self.crawler = get_crawler(Spider, {
         'SPIDER_MIDDLEWARES_BASE': {},
         'SPIDER_MIDDLEWARES': setting
     })
     self.spider = self.crawler._create_spider('foo')
     self.mwman = SpiderMiddlewareManager.from_crawler(self.crawler)
     start_requests = iter(self._start_requests())
     results = yield self.mwman.process_start_requests(
         start_requests, self.spider)
     return results
Example no. 13
    def __init__(self, crawler):
        self.slot = None
        self.spidermw = SpiderMiddlewareManager.from_crawler(crawler)
        itemproc_cls = load_object(crawler.settings['ITEM_PROCESSOR'])
        #ITEM_PROCESSOR = 'scrapy.pipelines.ItemPipelineManager'
        #ItemPipelineManager is a subclass of MiddlewareManager.
        #It adds one feature: each pipeline's process_item method is appended
        #to the callback chain (_add_middleware), and the chain is then
        #invoked as callback(spider) (pipelinemanager.process_item).

        self.itemproc = itemproc_cls.from_crawler(crawler)
        #instantiate the pipeline manager.

        self.concurrent_items = crawler.settings.getint('CONCURRENT_ITEMS')
        #defaults to 100

        self.crawler = crawler
        self.signals = crawler.signals
        self.logformatter = crawler.logformatter
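
The comments above describe each pipeline's process_item being appended to a Deferred callback chain. A simplified sketch of that chaining (not Scrapy's exact _process_chain):

    from twisted.internet import defer

    def process_item_chain(pipelines, item, spider):
        d = defer.Deferred()
        for pipeline in pipelines:
            # each callback fires as pipeline.process_item(item, spider)
            d.addCallback(pipeline.process_item, spider)
        d.callback(item)  # start the chain with the original item
        return d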
Example no. 14
    def setUp(self):
        class TestSpider(Spider):
            name = 'test'

        self.spider = TestSpider
        scrapy_default_middlewares = {
            'scrapy.spidermiddlewares.referer.RefererMiddleware': 700
        }

        # monkey patch SPIDER_MIDDLEWARES_BASE to include only referer middleware
        sys.modules['scrapy.settings.default_settings'].SPIDER_MIDDLEWARES_BASE = scrapy_default_middlewares

        custom_settings = {
            'SPIDER_MIDDLEWARES': {'frontera.contrib.scrapy.middlewares.schedulers.SchedulerSpiderMiddleware': 1000}
        }
        crawler = get_crawler(self.spider, custom_settings)
        self.add_frontera_scheduler(crawler)
        self.smw = SpiderMiddlewareManager.from_crawler(crawler)
Example no. 15
    def __init__(self, crawler):
        self.slot = None
        self.spidermw = SpiderMiddlewareManager.from_crawler(crawler)
        itemproc_cls = load_object(crawler.settings['ITEM_PROCESSOR'])
        #ITEM_PROCESSOR = 'scrapy.pipelines.ItemPipelineManager'
        #ItemPipelineManager is a subclass of MiddlewareManager.
        #It adds one feature: each pipeline's process_item method is appended
        #to the callback chain (_add_middleware), and the chain is then
        #invoked as callback(spider) (pipelinemanager.process_item).

        self.itemproc = itemproc_cls.from_crawler(crawler)
        #instantiate the pipeline manager.

        self.concurrent_items = crawler.settings.getint('CONCURRENT_ITEMS')
        #defaults to 100

        self.crawler = crawler
        self.signals = crawler.signals
        self.logformatter = crawler.logformatter
Example no. 16
 def setUp(self):
     self.request = Request('http://example.com/index.html')
     self.response = Response(self.request.url, request=self.request)
     self.crawler = get_crawler(Spider, {'SPIDER_MIDDLEWARES_BASE': {}})
     self.spider = self.crawler._create_spider('foo')
     self.mwman = SpiderMiddlewareManager.from_crawler(self.crawler)
Example no. 17
 def setUp(self):
     self.request = Request('http://example.com/index.html')
     self.response = Response(self.request.url, request=self.request)
     self.crawler = get_crawler(Spider)
     self.spider = self.crawler._create_spider('foo')
     self.mwman = SpiderMiddlewareManager.from_crawler(self.crawler)
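
The only difference between Example no. 16 and Example no. 17 is whether SPIDER_MIDDLEWARES_BASE is cleared. Assuming the manager exposes the loaded instances on a middlewares attribute (true in recent Scrapy, but treat it as an assumption), the effect is easy to check:

    # Empty base settings yield an empty manager; the defaults populate it.
    print(len(self.mwman.middlewares))  # 0 in Example no. 16, > 0 in Example no. 17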