def __init__(self, crawler):
    """Wire the scraper up to *crawler*: middleware, item pipeline and limits."""
    self.slots = {}
    self.spidermw = SpiderMiddlewareManager.from_crawler(crawler)
    # Resolve the configured item-processor class and instantiate it.
    pipeline_cls = load_object(crawler.settings['ITEM_PROCESSOR'])
    self.itemproc = pipeline_cls.from_crawler(crawler)
    # Upper bound on how many items are processed at the same time.
    self.concurrent_items = crawler.settings.getint('CONCURRENT_ITEMS')
    self.crawler = crawler
def __init__(self, crawler):
    """Set up spider middleware, the item processor and crawl-wide helpers."""
    self.slot = None
    self.spidermw = SpiderMiddlewareManager.from_crawler(crawler)
    # Load and instantiate the class configured as the item processor.
    processor_cls = load_object(crawler.settings["ITEM_PROCESSOR"])
    self.itemproc = processor_cls.from_crawler(crawler)
    # Maximum number of items handled concurrently.
    self.concurrent_items = crawler.settings.getint("CONCURRENT_ITEMS")
    self.crawler = crawler
    self.signals = crawler.signals
    self.logformatter = crawler.logformatter
def __init__(self, crawler):
    """Build the scraper's middleware manager and item pipeline from *crawler*."""
    self.slot: Optional[Slot] = None
    self.spidermw = SpiderMiddlewareManager.from_crawler(crawler)
    # The item-processor class comes from settings; instantiate it via the crawler.
    processor_cls = load_object(crawler.settings['ITEM_PROCESSOR'])
    self.itemproc = processor_cls.from_crawler(crawler)
    # Cap on concurrently processed items.
    self.concurrent_items = crawler.settings.getint('CONCURRENT_ITEMS')
    self.crawler = crawler
    self.signals = crawler.signals
    self.logformatter = crawler.logformatter
def __init__(self, crawler):
    """Initialize the scraper: spider middleware, item pipeline, concurrency cap."""
    self.slot = None
    self.spidermw = SpiderMiddlewareManager.from_crawler(crawler)
    # Class that processes items; per the original note, defaults to
    # ItemPipelineManager.
    pipeline_cls = load_object(crawler.settings['ITEM_PROCESSOR'])
    self.itemproc = pipeline_cls.from_crawler(crawler)
    # Controls how many items may be processed simultaneously.
    self.concurrent_items = crawler.settings.getint('CONCURRENT_ITEMS')
    self.crawler = crawler
    self.signals = crawler.signals
    self.logformatter = crawler.logformatter
def __init__(self, crawler):
    """Construct the scraper's middleware manager and item processor."""
    # Unlike the engine's slot, this one mainly deals with the returned
    # responses and requests.
    self.slot = None
    self.spidermw = SpiderMiddlewareManager.from_crawler(crawler)
    pipeline_cls = load_object(crawler.settings['ITEM_PROCESSOR'])
    # Instantiate the ITEM_PROCESSOR class.
    self.itemproc = pipeline_cls.from_crawler(crawler)
    # Number of items processed at the same time.
    self.concurrent_items = crawler.settings.getint('CONCURRENT_ITEMS')
    self.crawler = crawler
    self.signals = crawler.signals
    self.logformatter = crawler.logformatter
def __init__(self, crawler):
    """Assemble the scraper's collaborators from the crawler's settings."""
    self.slot = None
    # Spider-middleware manager; per the original note it holds instances of
    # everything in SPIDER_MIDDLEWARES_BASE.
    self.spidermw = SpiderMiddlewareManager.from_crawler(crawler)
    # Load the pipeline-processor class named in settings, then instantiate it.
    pipeline_cls = load_object(crawler.settings['ITEM_PROCESSOR'])
    self.itemproc = pipeline_cls.from_crawler(crawler)
    # How many scraped outputs may be handled concurrently.
    self.concurrent_items = crawler.settings.getint('CONCURRENT_ITEMS')
    self.crawler = crawler
    self.signals = crawler.signals
    self.logformatter = crawler.logformatter
def _get_middleware_result(self, *mw_classes, start_index: Optional[int] = None):
    """Run ``scrape_response`` through a middleware chain built from *mw_classes*.

    Yields the deferred result produced by the middleware manager.
    """
    mw_setting = self._construct_mw_setting(*mw_classes, start_index=start_index)
    self.crawler = get_crawler(Spider, {'SPIDER_MIDDLEWARES': mw_setting})
    self.spider = self.crawler._create_spider('foo')
    self.mwman = SpiderMiddlewareManager.from_crawler(self.crawler)
    scraped = yield self.mwman.scrape_response(
        self._scrape_func, self.response, self.request, self.spider)
    return scraped
def __init__(self, crawler):
    """Create the scraper's middleware manager, item pipeline and limits."""
    self.slot = None
    self.spidermw = SpiderMiddlewareManager.from_crawler(crawler)
    # Per the original note, the default is
    # ITEM_PROCESSOR = 'scrapy.contrib.pipeline.ItemPipelineManager'.
    pipeline_cls = load_object(crawler.settings['ITEM_PROCESSOR'])
    self.itemproc = pipeline_cls.from_crawler(crawler)
    # Default setting noted in the original: CONCURRENT_ITEMS = 100.
    self.concurrent_items = crawler.settings.getint('CONCURRENT_ITEMS')
    self.crawler = crawler
    self.signals = crawler.signals
    self.logformatter = crawler.logformatter
def __init__(self, crawler):
    """Initialize the scraper from the crawler's configuration."""
    self.slot = None
    ## Instantiate the spider-middleware manager.
    self.spidermw = SpiderMiddlewareManager.from_crawler(crawler)
    ## Load the pipeline-processor class from settings.
    pipeline_cls = load_object(crawler.settings['ITEM_PROCESSOR'])
    ## Instantiate the pipeline processor.
    self.itemproc = pipeline_cls.from_crawler(crawler)
    ## Concurrency limit for simultaneously processed items.
    self.concurrent_items = crawler.settings.getint('CONCURRENT_ITEMS')
    self.crawler = crawler
    self.signals = crawler.signals
    self.logformatter = crawler.logformatter
def init_smw(self, custom_settings):
    """Build ``self.smw`` with a base middleware set reduced to RefererMiddleware."""
    class TestSpider(Spider):
        name = 'test'
    self.spider = TestSpider
    referer_only = {
        'scrapy.spidermiddlewares.referer.RefererMiddleware': 700,
    }
    # Monkey-patch SPIDER_MIDDLEWARES_BASE so only the referer middleware loads.
    sys.modules['scrapy.settings.default_settings'].SPIDER_MIDDLEWARES_BASE = referer_only
    crawler = get_crawler(self.spider, custom_settings)
    self.add_frontera_scheduler(crawler)
    self.smw = SpiderMiddlewareManager.from_crawler(crawler)
def __init__(self, crawler):
    """Initialize the scraper's middleware chain, item pipeline and limits."""
    self.slot = None
    # Spider-middleware manager lives here in the scraper; per the original
    # note it is the piece implementing the process_spider_input /
    # process_spider_exception / process_spider_* hooks.
    self.spidermw = SpiderMiddlewareManager.from_crawler(crawler)
    # Original note: ITEM_PROCESSOR = 'scrapy.pipelines.ItemPipelineManager'.
    pipeline_cls = load_object(crawler.settings['ITEM_PROCESSOR'])
    # Manager object over all configured pipeline middlewares.
    self.itemproc = pipeline_cls.from_crawler(crawler)
    # Original note: default is 100.
    self.concurrent_items = crawler.settings.getint('CONCURRENT_ITEMS')
    self.crawler = crawler
    self.signals = crawler.signals
    self.logformatter = crawler.logformatter
def _get_middleware_result(self, *mw_classes, start_index: Optional[int] = None):
    """Feed the stubbed start requests through middlewares built from *mw_classes*.

    Yields the deferred result of ``process_start_requests``.
    """
    mw_setting = self._construct_mw_setting(*mw_classes, start_index=start_index)
    self.crawler = get_crawler(Spider, {
        'SPIDER_MIDDLEWARES_BASE': {},
        'SPIDER_MIDDLEWARES': mw_setting,
    })
    self.spider = self.crawler._create_spider('foo')
    self.mwman = SpiderMiddlewareManager.from_crawler(self.crawler)
    seed_requests = iter(self._start_requests())
    processed = yield self.mwman.process_start_requests(seed_requests, self.spider)
    return processed
def __init__(self, crawler):
    """Create the scraper's middleware manager and item-pipeline manager."""
    self.slot = None
    self.spidermw = SpiderMiddlewareManager.from_crawler(crawler)
    # Original notes: ITEM_PROCESSOR = 'scrapy.pipelines.ItemPipelineManager';
    # ItemPipelineManager derives from MiddlewareManager and adds one feature:
    # each pipeline's process_item is chained in via _add_middleware, and the
    # chain is later invoked as callback(spider) (pipelinemanager.process_item).
    pipeline_cls = load_object(crawler.settings['ITEM_PROCESSOR'])
    # Instantiate the pipeline manager.
    self.itemproc = pipeline_cls.from_crawler(crawler)
    # Original note: default is 100.
    self.concurrent_items = crawler.settings.getint('CONCURRENT_ITEMS')
    self.crawler = crawler
    self.signals = crawler.signals
    self.logformatter = crawler.logformatter
def setUp(self):
    """Prepare a middleware manager with referer + frontera scheduler middleware."""
    class TestSpider(Spider):
        name = 'test'
    self.spider = TestSpider
    referer_only = {
        'scrapy.spidermiddlewares.referer.RefererMiddleware': 700,
    }
    # Monkey-patch SPIDER_MIDDLEWARES_BASE down to just the referer middleware.
    sys.modules['scrapy.settings.default_settings'].SPIDER_MIDDLEWARES_BASE = referer_only
    settings = {
        'SPIDER_MIDDLEWARES': {
            'frontera.contrib.scrapy.middlewares.schedulers.SchedulerSpiderMiddleware': 1000,
        },
    }
    crawler = get_crawler(self.spider, settings)
    self.add_frontera_scheduler(crawler)
    self.smw = SpiderMiddlewareManager.from_crawler(crawler)
def __init__(self, crawler):
    """Set up spider middleware, the item-pipeline manager and limits."""
    self.slot = None
    self.spidermw = SpiderMiddlewareManager.from_crawler(crawler)
    # Per the original notes: ITEM_PROCESSOR defaults to
    # 'scrapy.pipelines.ItemPipelineManager', a MiddlewareManager subclass that
    # registers each pipeline's process_item into a callback chain
    # (_add_middleware) and later fires it as callback(spider).
    pipeline_cls = load_object(crawler.settings['ITEM_PROCESSOR'])
    self.itemproc = pipeline_cls.from_crawler(crawler)  # pipeline manager instance
    self.concurrent_items = crawler.settings.getint('CONCURRENT_ITEMS')  # default 100 per original note
    self.crawler = crawler
    self.signals = crawler.signals
    self.logformatter = crawler.logformatter
def setUp(self):
    """Create a request/response pair and a crawler with an empty base middleware set."""
    url = 'http://example.com/index.html'
    self.request = Request(url)
    self.response = Response(self.request.url, request=self.request)
    self.crawler = get_crawler(Spider, {'SPIDER_MIDDLEWARES_BASE': {}})
    self.spider = self.crawler._create_spider('foo')
    self.mwman = SpiderMiddlewareManager.from_crawler(self.crawler)
def setUp(self):
    """Create a request/response pair and a default-configured crawler fixture."""
    url = 'http://example.com/index.html'
    self.request = Request(url)
    self.response = Response(self.request.url, request=self.request)
    self.crawler = get_crawler(Spider)
    self.spider = self.crawler._create_spider('foo')
    self.mwman = SpiderMiddlewareManager.from_crawler(self.crawler)