Example #1
 def __init__(self, crawler, spider_closed_callback):
     ## store the crawler instance on the execution engine
     self.crawler = crawler
     ## also keep a reference to the crawler's settings
     self.settings = crawler.settings
     ## signal manager
     self.signals = crawler.signals
     ## log formatter
     self.logformatter = crawler.logformatter
     self.slot = None
     self.spider = None
     ## whether the engine is running
     self.running = False
     ## whether execution is paused
     self.paused = False
     ## load the scheduler class named in the settings
     self.scheduler_cls = load_object(self.settings['SCHEDULER'])
     ## load the downloader class named in the settings
     downloader_cls = load_object(self.settings['DOWNLOADER'])
     ## instantiate the downloader
     self.downloader = downloader_cls(crawler)
     ## instantiate the Scraper, the bridge between the engine, the Spider and the item pipelines
     self.scraper = Scraper(crawler)
     ## callback invoked once the spider has been closed
     self._spider_closed_callback = spider_closed_callback
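For context, the spider_closed_callback seen in all of these signatures is supplied by the crawler that builds the engine. A minimal sketch of that wiring, loosely following recent Scrapy versions (the exact crawl() body and _create_spider are illustrative, not verbatim):

@defer.inlineCallbacks
def crawl(self, *args, **kwargs):
    self.spider = self._create_spider(*args, **kwargs)
    # the closed-callback simply stops the crawler once its spider is done
    self.engine = ExecutionEngine(self, lambda _: self.stop())
    start_requests = iter(self.spider.start_requests())
    yield self.engine.open_spider(self.spider, start_requests)
    yield defer.maybeDeferred(self.engine.start)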
Example #2
 def __init__(self, settings, spider_closed_callback):
     self.settings = settings
     self.closing = {}  # dict (spider -> reason) of spiders being closed
     self.closing_dfds = {}  # dict (spider -> deferred) of spiders being closed
     self.running = False
     self.paused = False
     self._next_request_calls = {}
     self.scheduler = load_object(settings['SCHEDULER'])()
     self.downloader = Downloader()
     self.scraper = Scraper(self, self.settings)
     self._spider_closed_callback = spider_closed_callback
Example #3
 def __init__(self, crawler, spider_closed_callback):
     self.settings = crawler.settings
     self.slots = {}
     self.running = False
     self.paused = False
     self.scheduler_cls = load_object(self.settings['SCHEDULER'])
     self.downloader = Downloader(crawler)
     self.scraper = Scraper(crawler)
     self._concurrent_spiders = self.settings.getint('CONCURRENT_SPIDERS', 1)
     if self._concurrent_spiders != 1:
         warnings.warn("CONCURRENT_SPIDERS settings is deprecated, use " \
             "Scrapyd max_proc config instead", ScrapyDeprecationWarning)
     self._spider_closed_callback = spider_closed_callback
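load_object here resolves a dotted path such as 'scrapy.core.scheduler.Scheduler' to the object it names. A rough equivalent of the helper (the real one lives in scrapy.utils.misc and carries more error handling):

import importlib

def load_object(path):
    # split 'pkg.module.Name' into the module path and the attribute name
    module_path, _, name = path.rpartition('.')
    module = importlib.import_module(module_path)
    try:
        return getattr(module, name)
    except AttributeError:
        raise NameError("Module %r has no attribute %r" % (module_path, name))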
Example #4
 def __init__(self, crawler, spider_closed_callback: Callable) -> None:
     self.crawler = crawler
     self.settings = crawler.settings
     self.signals = crawler.signals
     self.logformatter = crawler.logformatter
     self.slot: Optional[Slot] = None
     self.spider: Optional[Spider] = None
     self.running = False
     self.paused = False
     self.scheduler_cls = load_object(crawler.settings["SCHEDULER"])
     downloader_cls = load_object(self.settings['DOWNLOADER'])
     self.downloader = downloader_cls(crawler)
     self.scraper = Scraper(crawler)
     self._spider_closed_callback = spider_closed_callback
Example #5
 def __init__(self, crawler, spider_closed_callback):
     self.crawler = crawler
     self.settings = crawler.settings
     self.signals = crawler.signals
     self.logformatter = crawler.logformatter
     self.slot = None
     self.spider = None
     self.running = False
     self.paused = False
     self.scheduler_cls = load_object(self.settings['SCHEDULER'])
     downloader_cls = load_object(self.settings['DOWNLOADER'])
     self.downloader = downloader_cls(crawler)
     self.scraper = Scraper(crawler)
     self._spider_closed_callback = spider_closed_callback
Example #6
 def __init__(self, crawler, spider_closed_callback):
     self.crawler = crawler
     self.settings = crawler.settings
     self.signals = crawler.signals  # use the crawler's signal manager
     self.logformatter = crawler.logformatter
     self.slot = None
     self.spider = None
     self.running = False
     self.paused = False
     self.scheduler_cls = load_object(
         self.settings['SCHEDULER'])  # load the scheduler class named in the settings
     downloader_cls = load_object(
         self.settings['DOWNLOADER'])  # load the downloader class named in the settings
     self.downloader = downloader_cls(crawler)
     self.scraper = Scraper(crawler)  # create a Scraper
     self._spider_closed_callback = spider_closed_callback
Example #7
 def __init__(self, crawler, spider_closed_callback):
     self.crawler = crawler
     self.settings = crawler.settings  # settings
     self.signals = crawler.signals  # signal manager
     self.logformatter = crawler.logformatter  # log formatter
     self.slot = None
     self.spider = None
     self.running = False
     self.paused = False
     # load the scheduler class (not instantiated here; instantiation happens in open_spider)
     self.scheduler_cls = load_object(self.settings['SCHEDULER'])
     # load the downloader class and instantiate it; see scrapy/core/downloader/__init__.py
     downloader_cls = load_object(self.settings['DOWNLOADER'])
     self.downloader = downloader_cls(crawler)
     # instantiate the Scraper: the bridge between the engine and the spider; see scrapy/core/scraper.py
     self.scraper = Scraper(crawler)
     self._spider_closed_callback = spider_closed_callback
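As the comments above note, only the scheduler class is stored in __init__; the instance is created later in open_spider through the usual from_crawler factory. A sketch of that pattern (the constructor arguments are illustrative):

class Scheduler:
    def __init__(self, settings):
        self.settings = settings

    @classmethod
    def from_crawler(cls, crawler):
        # the factory gives the component access to the crawler's settings,
        # signals, stats, etc., without hard-coding them in __init__
        return cls(crawler.settings)

Inside the engine this is invoked as scheduler = self.scheduler_cls.from_crawler(self.crawler), as the full listings further below show.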
Example #8
 def __init__(self, crawler, spider_closed_callback):
     self.crawler = crawler
     self.settings = crawler.settings
     self.signals = crawler.signals
     self.logformatter = crawler.logformatter
     self.slot = None
     self.spider = None
     self.running = False
     self.paused = False
     self.scheduler_cls = load_object(self.settings['SCHEDULER'])  # look up the Scheduler class named in settings
     downloader_cls = load_object(self.settings['DOWNLOADER'])  # likewise, look up the Downloader class
     self.downloader = downloader_cls(crawler)  # instantiate the Downloader
     self.scraper = Scraper(crawler)  # instantiate the Scraper, the engine's bridge to the spider
     self._spider_closed_callback = spider_closed_callback
Example #9
 def configure(self, spider_closed_callback):
     """
     Configure execution engine with the given scheduling policy and downloader.
     """
     self.scheduler = load_object(settings['SCHEDULER'])()
     self.downloader = Downloader()
     self.scraper = Scraper(self)
     self.configured = True
     self._spider_closed_callback = spider_closed_callback
Example #10
 def configure(self):
     """
     Configure execution engine with the given scheduling policy and downloader.
     """
     self.scheduler = load_object(settings['SCHEDULER'])()
     self.spider_scheduler = load_object(settings['SPIDER_SCHEDULER'])()
     self.downloader = Downloader()
     self.scraper = Scraper(self)
     self.configured = True
Example #11
 def __init__(self, crawler, spider_closed_callback):
     self.locker = threading.Condition()
     self.crawler = crawler
     self.settings = crawler.settings
     self.signals = crawler.signals
     self.logformatter = crawler.logformatter
     self.slot = None
     self.spider = None
     self.running = False
     self.paused = False
     self.scheduler_cls = load_object(self.settings['SCHEDULER'])
     downloader_cls = load_object(self.settings['DOWNLOADER'])
     self.downloader = downloader_cls(crawler)
     self.scraper = Scraper(crawler)
     self._concurrent_spiders = self.settings.getint('CONCURRENT_SPIDERS', 1)
     if self._concurrent_spiders != 1:
         warnings.warn("CONCURRENT_SPIDERS settings is deprecated, use " \
             "Scrapyd max_proc config instead", ScrapyDeprecationWarning)
     self._spider_closed_callback = spider_closed_callback
Example #12
 def __init__(self, crawler, spider_closed_callback):
     self.settings = crawler.settings
     self.slots = {}
     self.running = False
     self.paused = False
     self.scheduler_cls = load_object(self.settings['SCHEDULER'])
     self.downloader = Downloader(crawler)
     self.scraper = Scraper(crawler)
     self._concurrent_spiders = self.settings.getint('CONCURRENT_SPIDERS')
     self._spider_closed_callback = spider_closed_callback
Example #13
 def __init__(self, settings, spider_closed_callback):
     self.settings = settings
     self.slots = {}
     self.running = False
     self.paused = False
     self._next_request_calls = {}
     self.scheduler = load_object(settings['SCHEDULER'])()
     self.downloader = Downloader()
     self.scraper = Scraper(self, self.settings)
     self._spider_closed_callback = spider_closed_callback
Example #14
 def __init__(self, settings, spider_closed_callback):
     self.settings = settings
     self.closing = {} # dict (spider -> reason) of spiders being closed
     self.closing_dfds = {} # dict (spider -> deferred) of spiders being closed
     self.running = False
     self.paused = False
     self._next_request_calls = {}
     self.scheduler = load_object(settings['SCHEDULER'])()
     self.downloader = Downloader()
     self.scraper = Scraper(self, self.settings)
     self._spider_closed_callback = spider_closed_callback
Example #15
    def __init__(self, crawler, spider_closed_callback):
        self.crawler = crawler
        self.settings = crawler.settings
        self.signals = crawler.signals
        self.logformatter = crawler.logformatter
        self.slot = None
        self.spider = None
        self.running = False
        self.paused = False
        self.scheduler_cls = load_object(self.settings['SCHEDULER'])
        # the scheduler class is loaded here
        # default: 'scrapy.core.scheduler.Scheduler'

        downloader_cls = load_object(self.settings['DOWNLOADER'])
        # the downloader class is loaded here
        # default: 'scrapy.core.downloader.Downloader'
        self.downloader = downloader_cls(crawler)
        # instantiate the downloader

        self.scraper = Scraper(crawler)
        self._spider_closed_callback = spider_closed_callback
Example #16
 def __init__(self, crawler, spider_closed_callback):
     self.crawler = crawler
     self.settings = crawler.settings
     self.signals = crawler.signals
     self.logformatter = crawler.logformatter
     self.slot = None
     self.spider = None
     self.running = False
     self.paused = False
     # SCHEDULER = 'scrapy.core.scheduler.Scheduler'; this only fetches the class object, nothing more happens here
     self.scheduler_cls = load_object(self.settings['SCHEDULER'])
     # DOWNLOADER = 'scrapy.core.downloader.Downloader'
     downloader_cls = load_object(self.settings['DOWNLOADER'])
     # the downloader sets up the download handlers and the downloader-middleware
     # process_* chain, so the concrete download and middleware machinery is in place here
     self.downloader = downloader_cls(crawler)
     # the Scraper holds the spider middleware and the item pipeline objects,
     # i.e. data processing and storage are wired up here
     self.scraper = Scraper(crawler)
     # this callback decides whether the crawl can stop: it is the lambda
     # `lambda _: self.stop()` passed in by the crawler, which ultimately runs engine.stop()
     self._spider_closed_callback = spider_closed_callback
Example #17
    def __init__(self, crawler, spider_closed_callback: Callable) -> None:
        self.crawler = crawler
        self.settings = crawler.settings
        self.signals = crawler.signals
        self.logformatter = crawler.logformatter
        self.slot: Optional[Slot] = None
        self.spider: Optional[Spider] = None
        self.running = False
        # whether the engine is paused
        self.paused = False
        self.scheduler_cls = self._get_scheduler_class(crawler.settings)

        # load the downloader
        downloader_cls = load_object(self.settings['DOWNLOADER'])
        self.downloader = downloader_cls(crawler)

        # instantiate the Scraper
        self.scraper = Scraper(crawler)

        # close callback supplied by the caller
        self._spider_closed_callback = spider_closed_callback
Example #18
 def __init__(self, crawler, spider_closed_callback):
     self.crawler = crawler
     self.settings = crawler.settings
     self.signals = crawler.signals
     self.logformatter = crawler.logformatter
     self.slot = None
     self.spider = None
     self.running = False
     self.paused = False
     self.scheduler_cls = load_object(self.settings['SCHEDULER'])
     downloader_cls = load_object(self.settings['DOWNLOADER'])
     self.downloader = downloader_cls(crawler)
     self.scraper = Scraper(crawler)
     self._spider_closed_callback = spider_closed_callback
Example #19
 def __init__(self, crawler, spider_closed_callback):
     self.settings = crawler.settings
     self.slots = {}
     self.running = False
     self.paused = False
     self.scheduler_cls = load_object(self.settings["SCHEDULER"])
     self.downloader = Downloader(crawler)
     self.scraper = Scraper(crawler)
     self._concurrent_spiders = self.settings.getint("CONCURRENT_SPIDERS", 1)
     if self._concurrent_spiders != 1:
         warnings.warn(
             "CONCURRENT_SPIDERS settings is deprecated, use " "Scrapyd max_proc config instead",
             ScrapyDeprecationWarning,
         )
     self._spider_closed_callback = spider_closed_callback
Example #20
 def __init__(self, crawler, spider_closed_callback):
     self.crawler = crawler
     self.settings = crawler.settings
     self.signals = crawler.signals
     self.logformatter = crawler.logformatter
     self.slot = None
     self.spider = None
     self.running = False
     self.paused = False
     self.scheduler_cls = load_object(self.settings['SCHEDULER'])
     self.downloader = Downloader(crawler)
     self.scraper = Scraper(crawler)
     self._concurrent_spiders = self.settings.getint('CONCURRENT_SPIDERS', 1)
     if self._concurrent_spiders != 1:
         warnings.warn("CONCURRENT_SPIDERS settings is deprecated, use " \
             "Scrapyd max_proc config instead", ScrapyDeprecationWarning)
     self._spider_closed_callback = spider_closed_callback
Example #21
 def __init__(self, crawler, spider_closed_callback):
     self.crawler = crawler
     self.settings = crawler.settings
     self.signals = crawler.signals
     self.logformatter = crawler.logformatter
     self.slot = None
     self.spider = None
     self.running = False
     self.paused = False
     self.scheduler_cls = load_object(self.settings['SCHEDULER'])
     self.scraper = Scraper(crawler)
     self._spider_closed_callback = spider_closed_callback
     if self.downloader is None:
         downloader_cls = load_object(self.settings['DOWNLOADER'])
         MyExecutionEngine.downloader = downloader_cls(crawler)
     self.downloader = MyExecutionEngine.downloader
     self.downloader.close = CloseOnlyLastTime(self.downloader.close)
Example #22
    def __init__(self, crawler, spider_closed_callback):
        self.crawler = crawler
        self.settings = crawler.settings
        self.signals = crawler.signals
        self.logformatter = crawler.logformatter
        self.slot = None
        self.spider = None
        self.running = False
        self.paused = False
        self.scheduler_cls = load_object(self.settings['SCHEDULER'])
        # the scheduler class is loaded here
        # default: 'scrapy.core.scheduler.Scheduler'

        downloader_cls = load_object(self.settings['DOWNLOADER'])
        # the downloader class is loaded here
        # default: 'scrapy.core.downloader.Downloader'
        self.downloader = downloader_cls(crawler)
        # instantiate the downloader

        self.scraper = Scraper(crawler)
        self._spider_closed_callback = spider_closed_callback
Example #23
class ExecutionEngine(object):

    def __init__(self, crawler, spider_closed_callback):
        self.crawler = crawler
        self.settings = crawler.settings
        self.signals = crawler.signals
        self.logformatter = crawler.logformatter
        self.slot = None
        self.spider = None
        self.running = False
        self.paused = False
        self.scheduler_cls = load_object(self.settings['SCHEDULER'])
        downloader_cls = load_object(self.settings['DOWNLOADER'])
        self.downloader = downloader_cls(crawler)
        self.scraper = Scraper(crawler)
        self._spider_closed_callback = spider_closed_callback

    @defer.inlineCallbacks
    def start(self):
        """Start the execution engine"""
        assert not self.running, "Engine already running"
        self.start_time = time()
        yield self.signals.send_catch_log_deferred(signal=signals.engine_started)
        self.running = True
        self._closewait = defer.Deferred()
        yield self._closewait

    def stop(self):
        """Stop the execution engine gracefully"""
        assert self.running, "Engine not running"
        self.running = False
        dfd = self._close_all_spiders()
        return dfd.addBoth(lambda _: self._finish_stopping_engine())

    def close(self):
        """Close the execution engine gracefully.

        If it has already been started, stop it. In all cases, close all spiders
        and the downloader.
        """
        if self.running:
            # Will also close spiders and downloader
            return self.stop()
        elif self.open_spiders:
            # Will also close downloader
            return self._close_all_spiders()
        else:
            return defer.succeed(self.downloader.close())

    def pause(self):
        """Pause the execution engine"""
        self.paused = True

    def unpause(self):
        """Resume the execution engine"""
        self.paused = False

    def _next_request(self, spider):
        slot = self.slot
        if not slot:
            return

        if self.paused:
            return

        while not self._needs_backout(spider):
            if not self._next_request_from_scheduler(spider):
                break

        if slot.start_requests and not self._needs_backout(spider):
            try:
                request = next(slot.start_requests)
            except StopIteration:
                slot.start_requests = None
            except Exception:
                slot.start_requests = None
                logger.error('Error while obtaining start requests',
                             exc_info=True, extra={'spider': spider})
            else:
                self.crawl(request, spider)

        if self.spider_is_idle(spider) and slot.close_if_idle:
            self._spider_idle(spider)

    def _needs_backout(self, spider):
        slot = self.slot
        return not self.running \
            or slot.closing \
            or self.downloader.needs_backout() \
            or self.scraper.slot.needs_backout()

    def _next_request_from_scheduler(self, spider):
        slot = self.slot
        request = slot.scheduler.next_request()
        if not request:
            return
        d = self._download(request, spider)
        d.addBoth(self._handle_downloader_output, request, spider)
        d.addErrback(lambda f: logger.info('Error while handling downloader output',
                                           exc_info=failure_to_exc_info(f),
                                           extra={'spider': spider}))
        d.addBoth(lambda _: slot.remove_request(request))
        d.addErrback(lambda f: logger.info('Error while removing request from slot',
                                           exc_info=failure_to_exc_info(f),
                                           extra={'spider': spider}))
        d.addBoth(lambda _: slot.nextcall.schedule())
        d.addErrback(lambda f: logger.info('Error while scheduling new request',
                                           exc_info=failure_to_exc_info(f),
                                           extra={'spider': spider}))
        return d

    def _handle_downloader_output(self, response, request, spider):
        assert isinstance(response, (Request, Response, Failure)), response
        # downloader middleware can return requests (for example, redirects)
        if isinstance(response, Request):
            self.crawl(response, spider)
            return
        # response is a Response or Failure
        d = self.scraper.enqueue_scrape(response, request, spider)
        d.addErrback(lambda f: logger.error('Error while enqueuing downloader output',
                                            exc_info=failure_to_exc_info(f),
                                            extra={'spider': spider}))
        return d

    def spider_is_idle(self, spider):
        if not self.scraper.slot.is_idle():
            # scraper is not idle
            return False

        if self.downloader.active:
            # downloader has pending requests
            return False

        if self.slot.start_requests is not None:
            # not all start requests are handled
            return False

        if self.slot.scheduler.has_pending_requests():
            # scheduler has pending requests
            return False

        return True

    @property
    def open_spiders(self):
        return [self.spider] if self.spider else []

    def has_capacity(self):
        """Does the engine have capacity to handle more spiders"""
        return not bool(self.slot)

    def crawl(self, request, spider):
        assert spider in self.open_spiders, \
            "Spider %r not opened when crawling: %s" % (spider.name, request)
        self.schedule(request, spider)
        self.slot.nextcall.schedule()

    def schedule(self, request, spider):
        self.signals.send_catch_log(signal=signals.request_scheduled,
                request=request, spider=spider)
        if not self.slot.scheduler.enqueue_request(request):
            self.signals.send_catch_log(signal=signals.request_dropped,
                                        request=request, spider=spider)

    def download(self, request, spider):
        slot = self.slot
        slot.add_request(request)
        d = self._download(request, spider)
        d.addBoth(self._downloaded, slot, request, spider)
        return d

    def _downloaded(self, response, slot, request, spider):
        slot.remove_request(request)
        return self.download(response, spider) \
                if isinstance(response, Request) else response

    def _download(self, request, spider):
        slot = self.slot
        slot.add_request(request)
        def _on_success(response):
            assert isinstance(response, (Response, Request))
            if isinstance(response, Response):
                response.request = request # tie request to response received
                logkws = self.logformatter.crawled(request, response, spider)
                logger.log(*logformatter_adapter(logkws), extra={'spider': spider})
                self.signals.send_catch_log(signal=signals.response_received, \
                    response=response, request=request, spider=spider)
            return response

        def _on_complete(_):
            slot.nextcall.schedule()
            return _

        dwld = self.downloader.fetch(request, spider)
        dwld.addCallbacks(_on_success)
        dwld.addBoth(_on_complete)
        return dwld

    @defer.inlineCallbacks
    def open_spider(self, spider, start_requests=(), close_if_idle=True):
        assert self.has_capacity(), "No free spider slot when opening %r" % \
            spider.name
        logger.info("Spider opened", extra={'spider': spider})
        nextcall = CallLaterOnce(self._next_request, spider)
        scheduler = self.scheduler_cls.from_crawler(self.crawler)
        start_requests = yield self.scraper.spidermw.process_start_requests(start_requests, spider)
        slot = Slot(start_requests, close_if_idle, nextcall, scheduler)
        self.slot = slot
        self.spider = spider
        yield scheduler.open(spider)
        yield self.scraper.open_spider(spider)
        self.crawler.stats.open_spider(spider)
        yield self.signals.send_catch_log_deferred(signals.spider_opened, spider=spider)
        slot.nextcall.schedule()
        slot.heartbeat.start(5)

    def _spider_idle(self, spider):
        """Called when a spider gets idle. This function is called when there
        are no remaining pages to download or schedule. It can be called
        multiple times. If some extension raises a DontCloseSpider exception
        (in the spider_idle signal handler) the spider is not closed until the
        next loop and this function is guaranteed to be called (at least) once
        again for this spider.
        """
        res = self.signals.send_catch_log(signal=signals.spider_idle, \
            spider=spider, dont_log=DontCloseSpider)
        if any(isinstance(x, Failure) and isinstance(x.value, DontCloseSpider) \
                for _, x in res):
            return

        if self.spider_is_idle(spider):
            self.close_spider(spider, reason='finished')

    def close_spider(self, spider, reason='cancelled'):
        """Close (cancel) spider and clear all its outstanding requests"""

        slot = self.slot
        if slot.closing:
            return slot.closing
        logger.info("Closing spider (%(reason)s)",
                    {'reason': reason},
                    extra={'spider': spider})

        dfd = slot.close()

        def log_failure(msg):
            def errback(failure):
                logger.error(
                    msg,
                    exc_info=failure_to_exc_info(failure),
                    extra={'spider': spider}
                )
            return errback

        dfd.addBoth(lambda _: self.downloader.close())
        dfd.addErrback(log_failure('Downloader close failure'))

        dfd.addBoth(lambda _: self.scraper.close_spider(spider))
        dfd.addErrback(log_failure('Scraper close failure'))

        dfd.addBoth(lambda _: slot.scheduler.close(reason))
        dfd.addErrback(log_failure('Scheduler close failure'))

        dfd.addBoth(lambda _: self.signals.send_catch_log_deferred(
            signal=signals.spider_closed, spider=spider, reason=reason))
        dfd.addErrback(log_failure('Error while sending spider_close signal'))

        dfd.addBoth(lambda _: self.crawler.stats.close_spider(spider, reason=reason))
        dfd.addErrback(log_failure('Stats close failure'))

        dfd.addBoth(lambda _: logger.info("Spider closed (%(reason)s)",
                                          {'reason': reason},
                                          extra={'spider': spider}))

        dfd.addBoth(lambda _: setattr(self, 'slot', None))
        dfd.addErrback(log_failure('Error while unassigning slot'))

        dfd.addBoth(lambda _: setattr(self, 'spider', None))
        dfd.addErrback(log_failure('Error while unassigning spider'))

        dfd.addBoth(lambda _: self._spider_closed_callback(spider))

        return dfd

    def _close_all_spiders(self):
        dfds = [self.close_spider(s, reason='shutdown') for s in self.open_spiders]
        dlist = defer.DeferredList(dfds)
        return dlist

    @defer.inlineCallbacks
    def _finish_stopping_engine(self):
        yield self.signals.send_catch_log_deferred(signal=signals.engine_stopped)
        self._closewait.callback(None)
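Putting the pieces together: open_spider prepares the slot, scheduler and scraper, and start() then parks on _closewait until close_spider/stop() fires it. A sketch of how a caller might drive this engine (assuming crawler and spider are already built; in Scrapy proper the closed-callback is the crawler's own stop method):

@defer.inlineCallbacks
def run(crawler, spider):
    # the lambda closes over `engine`, which is bound by the time it runs
    engine = ExecutionEngine(crawler, lambda _: engine.stop())
    yield engine.open_spider(spider, iter(spider.start_requests()))
    yield engine.start()  # resolves only after the spider has closed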
Example #24
class ExecutionEngine(object):

    def __init__(self, crawler, spider_closed_callback):
        self.crawler = crawler
        self.settings = crawler.settings
        self.signals = crawler.signals
        self.logformatter = crawler.logformatter
        self.slot = None
        self.spider = None
        self.running = False
        self.paused = False
        self.scheduler_cls = load_object(self.settings['SCHEDULER'])
        self.downloader = Downloader(crawler)
        self.scraper = Scraper(crawler)
        self._concurrent_spiders = self.settings.getint('CONCURRENT_SPIDERS', 1)
        if self._concurrent_spiders != 1:
            warnings.warn("CONCURRENT_SPIDERS settings is deprecated, use " \
                "Scrapyd max_proc config instead", ScrapyDeprecationWarning)
        self._spider_closed_callback = spider_closed_callback

    @defer.inlineCallbacks
    def start(self):
        """Start the execution engine"""
        assert not self.running, "Engine already running"
        self.start_time = time()
        yield self.signals.send_catch_log_deferred(signal=signals.engine_started)
        self.running = True
        self._closewait = defer.Deferred()
        yield self._closewait

    def stop(self):
        """Stop the execution engine gracefully"""
        assert self.running, "Engine not running"
        self.running = False
        dfd = self._close_all_spiders()
        return dfd.addBoth(lambda _: self._finish_stopping_engine())

    def pause(self):
        """Pause the execution engine"""
        self.paused = True

    def unpause(self):
        """Resume the execution engine"""
        self.paused = False

    def _next_request(self, spider):
        slot = self.slot
        if not slot:
            return

        if self.paused:
            slot.nextcall.schedule(5)
            return

        while not self._needs_backout(spider):
            if not self._next_request_from_scheduler(spider):
                break

        if slot.start_requests and not self._needs_backout(spider):
            try:
                request = next(slot.start_requests)
            except StopIteration:
                slot.start_requests = None
            except Exception as exc:
                log.err(None, 'Obtaining request from start requests', \
                        spider=spider)
            else:
                self.crawl(request, spider)

        if self.spider_is_idle(spider) and slot.close_if_idle:
            self._spider_idle(spider)

    def _needs_backout(self, spider):
        slot = self.slot
        return not self.running \
            or slot.closing \
            or self.downloader.needs_backout() \
            or self.scraper.slot.needs_backout()

    def _next_request_from_scheduler(self, spider):
        slot = self.slot
        request = slot.scheduler.next_request()
        if not request:
            return
        d = self._download(request, spider)
        d.addBoth(self._handle_downloader_output, request, spider)
        d.addErrback(log.msg, spider=spider)
        d.addBoth(lambda _: slot.remove_request(request))
        d.addErrback(log.msg, spider=spider)
        d.addBoth(lambda _: slot.nextcall.schedule())
        d.addErrback(log.msg, spider=spider)
        return d

    def _handle_downloader_output(self, response, request, spider):
        assert isinstance(response, (Request, Response, Failure)), response
        # downloader middleware can return requests (for example, redirects)
        if isinstance(response, Request):
            self.crawl(response, spider)
            return
        # response is a Response or Failure
        d = self.scraper.enqueue_scrape(response, request, spider)
        d.addErrback(log.err, spider=spider)
        return d

    def spider_is_idle(self, spider):
        scraper_idle = self.scraper.slot.is_idle()
        pending = self.slot.scheduler.has_pending_requests()
        downloading = bool(self.downloader.active)
        idle = scraper_idle and not (pending or downloading)
        return idle

    @property
    def open_spiders(self):
        return [self.spider] if self.spider else []

    def has_capacity(self):
        """Does the engine have capacity to handle more spiders"""
        return not bool(self.slot)

    def crawl(self, request, spider):
        assert spider in self.open_spiders, \
            "Spider %r not opened when crawling: %s" % (spider.name, request)
        self.schedule(request, spider)
        self.slot.nextcall.schedule()

    def schedule(self, request, spider):
        self.signals.send_catch_log(signal=signals.request_scheduled,
                request=request, spider=spider)
        return self.slot.scheduler.enqueue_request(request)

    def download(self, request, spider):
        slot = self.slot
        slot.add_request(request)
        d = self._download(request, spider)
        d.addBoth(self._downloaded, slot, request, spider)
        return d

    def _downloaded(self, response, slot, request, spider):
        slot.remove_request(request)
        return self.download(response, spider) \
                if isinstance(response, Request) else response

    def _download(self, request, spider):
        slot = self.slot
        slot.add_request(request)
        def _on_success(response):
            assert isinstance(response, (Response, Request))
            if isinstance(response, Response):
                response.request = request # tie request to response received
                logkws = self.logformatter.crawled(request, response, spider)
                log.msg(spider=spider, **logkws)
                self.signals.send_catch_log(signal=signals.response_received, \
                    response=response, request=request, spider=spider)
            return response

        def _on_complete(_):
            slot.nextcall.schedule()
            return _

        dwld = self.downloader.fetch(request, spider)
        dwld.addCallbacks(_on_success)
        dwld.addBoth(_on_complete)
        return dwld

    @defer.inlineCallbacks
    def open_spider(self, spider, start_requests=(), close_if_idle=True):
        assert self.has_capacity(), "No free spider slot when opening %r" % \
            spider.name
        log.msg("Spider opened", spider=spider)
        nextcall = CallLaterOnce(self._next_request, spider)
        scheduler = self.scheduler_cls.from_crawler(self.crawler)
        start_requests = yield self.scraper.spidermw.process_start_requests(start_requests, spider)
        slot = Slot(start_requests, close_if_idle, nextcall, scheduler)
        self.slot = slot
        self.spider = spider
        yield scheduler.open(spider)
        yield self.scraper.open_spider(spider)
        self.crawler.stats.open_spider(spider)
        yield self.signals.send_catch_log_deferred(signals.spider_opened, spider=spider)
        slot.nextcall.schedule()

    def _spider_idle(self, spider):
        """Called when a spider gets idle. This function is called when there
        are no remaining pages to download or schedule. It can be called
        multiple times. If some extension raises a DontCloseSpider exception
        (in the spider_idle signal handler) the spider is not closed until the
        next loop and this function is guaranteed to be called (at least) once
        again for this spider.
        """
        res = self.signals.send_catch_log(signal=signals.spider_idle, \
            spider=spider, dont_log=DontCloseSpider)
        if any(isinstance(x, Failure) and isinstance(x.value, DontCloseSpider) \
                for _, x in res):
            self.slot.nextcall.schedule(5)
            return

        if self.spider_is_idle(spider):
            self.close_spider(spider, reason='finished')

    def close_spider(self, spider, reason='cancelled'):
        """Close (cancel) spider and clear all its outstanding requests"""

        slot = self.slot
        if slot.closing:
            return slot.closing
        log.msg(format="Closing spider (%(reason)s)", reason=reason, spider=spider)

        dfd = slot.close()

        dfd.addBoth(lambda _: self.downloader.close())
        dfd.addErrback(log.err, spider=spider)

        dfd.addBoth(lambda _: self.scraper.close_spider(spider))
        dfd.addErrback(log.err, spider=spider)

        dfd.addBoth(lambda _: slot.scheduler.close(reason))
        dfd.addErrback(log.err, spider=spider)

        # XXX: spider_stats argument was added for backwards compatibility with
        # stats collection refactoring added in 0.15. it should be removed in 0.17.
        dfd.addBoth(lambda _: self.signals.send_catch_log_deferred(signal=signals.spider_closed, \
            spider=spider, reason=reason, spider_stats=self.crawler.stats.get_stats()))
        dfd.addErrback(log.err, spider=spider)

        dfd.addBoth(lambda _: self.crawler.stats.close_spider(spider, reason=reason))
        dfd.addErrback(log.err, spider=spider)

        dfd.addBoth(lambda _: log.msg(format="Spider closed (%(reason)s)", reason=reason, spider=spider))

        dfd.addBoth(lambda _: setattr(self, 'slot', None))
        dfd.addErrback(log.err, spider=spider)

        dfd.addBoth(lambda _: setattr(self, 'spider', None))
        dfd.addErrback(log.err, spider=spider)

        dfd.addBoth(lambda _: self._spider_closed_callback(spider))

        return dfd

    def _close_all_spiders(self):
        dfds = [self.close_spider(s, reason='shutdown') for s in self.open_spiders]
        dlist = defer.DeferredList(dfds)
        return dlist

    @defer.inlineCallbacks
    def _finish_stopping_engine(self):
        yield self.signals.send_catch_log_deferred(signal=signals.engine_stopped)
        self._closewait.callback(None)
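Both full engines above funnel their main-loop wakeups through slot.nextcall, a CallLaterOnce (from scrapy.utils.reactor): any number of callers may request a wakeup, but at most one call sits pending in the reactor at a time. Roughly:

from twisted.internet import reactor

class CallLaterOnce:
    """Schedule a function call, collapsing repeated requests into one pending call."""

    def __init__(self, func, *a, **kw):
        self._func, self._a, self._kw = func, a, kw
        self._call = None

    def schedule(self, delay=0):
        if self._call is None:  # ignore requests while a call is already pending
            self._call = reactor.callLater(delay, self)

    def cancel(self):
        if self._call:
            self._call.cancel()

    def __call__(self):
        self._call = None  # the next schedule() may book a new call
        return self._func(*self._a, **self._kw)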
Example #26
 def test_spider_output_handling(self):
     spider = self.MySpider()
     scraper = Scraper(Crawler(spider))
     scraper.open_spider(spider)
     scraper._process_spidermw_output(RssItem(), None, None, None)
     scraper._process_spidermw_output(ExtendableItem(), None, None, None)
     scraper._process_spidermw_output(RssedItem(), None, None, None)
     scraper.close_spider(spider)
Example #27
class ExecutionEngine(object):

    def __init__(self):
        self.configured = False
        self.closing = {} # dict (spider -> reason) of spiders being closed
        self.running = False
        self.killed = False
        self.paused = False
        self._next_request_calls = {}
        self._crawled_logline = load_object(settings['LOG_FORMATTER_CRAWLED'])

    def configure(self, spider_closed_callback):
        """
        Configure execution engine with the given scheduling policy and downloader.
        """
        self.scheduler = load_object(settings['SCHEDULER'])()
        self.downloader = Downloader()
        self.scraper = Scraper(self)
        self.configured = True
        self._spider_closed_callback = spider_closed_callback

    def start(self):
        """Start the execution engine"""
        assert not self.running, "Engine already running"
        self.start_time = time()
        send_catch_log(signal=signals.engine_started, sender=self.__class__)
        self.running = True

    def stop(self):
        """Stop the execution engine gracefully"""
        assert self.running, "Engine not running"
        self.running = False
        dfd = self._close_all_spiders()
        return dfd.addBoth(lambda _: self._finish_stopping_engine())

    def kill(self):
        """Forces shutdown without waiting for pending transfers to finish.
        stop() must have been called first
        """
        assert not self.running, "Call engine.stop() before engine.kill()"
        self.killed = True

    def pause(self):
        """Pause the execution engine"""
        self.paused = True

    def unpause(self):
        """Resume the execution engine"""
        self.paused = False

    def is_idle(self):
        return self.scheduler.is_idle() and self.downloader.is_idle() and \
            self.scraper.is_idle()

    def next_request(self, spider, now=False):
        """Scrape the next request for the spider passed.

        The next request to be scraped is retrieved from the scheduler and
        requested from the downloader.

        The spider is closed if there are no more pages to scrape.
        """
        if now:
            self._next_request_calls.pop(spider, None)
        elif spider not in self._next_request_calls:
            call = reactor.callLater(0, self.next_request, spider, now=True)
            self._next_request_calls[spider] = call
            return call
        else:
            return

        if self.paused:
            return reactor.callLater(5, self.next_request, spider)

        while not self._needs_backout(spider):
            if not self._next_request(spider):
                break

        if self.spider_is_idle(spider):
            self._spider_idle(spider)

    def _needs_backout(self, spider):
        return not self.running \
            or self.spider_is_closed(spider) \
            or self.downloader.sites[spider].needs_backout() \
            or self.scraper.sites[spider].needs_backout()

    def _next_request(self, spider):
        # Next pending request from scheduler
        request, deferred = self.scheduler.next_request(spider)
        if request:
            dwld = mustbe_deferred(self.download, request, spider)
            dwld.chainDeferred(deferred).addBoth(lambda _: deferred)
            dwld.addErrback(log.err, "Unhandled error on engine._next_request()",
                spider=spider)
            return dwld

    def spider_is_idle(self, spider):
        scraper_idle = spider in self.scraper.sites \
            and self.scraper.sites[spider].is_idle()
        pending = self.scheduler.spider_has_pending_requests(spider)
        downloading = spider in self.downloader.sites \
            and self.downloader.sites[spider].active
        return scraper_idle and not (pending or downloading)

    def spider_is_closed(self, spider):
        """Return True if the spider is fully closed (ie. not even in the
        closing stage)"""
        return spider not in self.downloader.sites

    def spider_is_open(self, spider):
        """Return True if the spider is fully opened (ie. not in closing
        stage)"""
        return spider in self.downloader.sites and spider not in self.closing

    @property
    def open_spiders(self):
        return self.downloader.sites.keys()

    def has_capacity(self):
        """Does the engine have capacity to handle more spiders"""
        return len(self.downloader.sites) < self.downloader.concurrent_spiders

    def crawl(self, request, spider):
        assert spider in self.open_spiders, \
            "Spider %r not opened when crawling: %s" % (spider.name, request)
        if spider in self.closing: # ignore requests for spiders being closed
            return
        schd = mustbe_deferred(self.schedule, request, spider)
        # FIXME: we can't log errors because we would be preventing them from
        # propagating to the request errback. This should be fixed after the
        # next core refactoring.
        #schd.addErrback(log.err, "Error on engine.crawl()")
        schd.addBoth(self.scraper.enqueue_scrape, request, spider)
        schd.addErrback(log.err, "Unhandled error on engine.crawl()", spider=spider)
        schd.addBoth(lambda _: self.next_request(spider))

    def schedule(self, request, spider):
        if spider in self.closing:
            raise IgnoreRequest()
        self.next_request(spider)
        return self.scheduler.enqueue_request(spider, request)

    def download(self, request, spider):
        def _on_success(response):
            """handle the result of a page download"""
            assert isinstance(response, (Response, Request))
            if isinstance(response, Response):
                response.request = request # tie request to response received
                log.msg(self._crawled_logline(request, response), \
                    level=log.DEBUG, spider=spider)
                return response
            elif isinstance(response, Request):
                newrequest = response
                dfd = mustbe_deferred(self.schedule, newrequest, spider)
                if newrequest.callback:
                    # XXX: this is a bit hacky and should be removed
                    dfd.addCallbacks(newrequest.callback, newrequest.errback)
                return dfd

        def _on_error(_failure):
            """handle an error processing a page"""
            exc = _failure.value
            if isinstance(exc, IgnoreRequest):
                errmsg = _failure.getErrorMessage()
                level = exc.level
            else:
                errmsg = str(_failure)
                level = log.ERROR
            if errmsg:
                log.msg("Crawling <%s>: %s" % (request.url, errmsg), \
                    level=level, spider=spider)
            return Failure(IgnoreRequest(str(exc)))

        def _on_complete(_):
            self.next_request(spider)
            return _

        if spider not in self.downloader.sites:
            return defer.fail(Failure(IgnoreRequest())).addBoth(_on_complete)

        dwld = mustbe_deferred(self.downloader.fetch, request, spider)
        dwld.addCallbacks(_on_success, _on_error)
        dwld.addBoth(_on_complete)
        return dwld

    def open_spider(self, spider):
        assert self.has_capacity(), "No free spider slots when opening %r" % \
            spider.name
        log.msg("Spider opened", spider=spider)
        self.scheduler.open_spider(spider)
        self.downloader.open_spider(spider)
        self.scraper.open_spider(spider)
        stats.open_spider(spider)
        send_catch_log(signals.spider_opened, sender=self.__class__, spider=spider)
        self.next_request(spider)

    def _spider_idle(self, spider):
        """Called when a spider gets idle. This function is called when there
        are no remaining pages to download or schedule. It can be called
        multiple times. If some extension raises a DontCloseSpider exception
        (in the spider_idle signal handler) the spider is not closed until the
        next loop and this function is guaranteed to be called (at least) once
        again for this spider.
        """
        try:
            dispatcher.send(signal=signals.spider_idle, sender=self.__class__, \
                spider=spider)
        except DontCloseSpider:
            reactor.callLater(5, self.next_request, spider)
            return
        except Exception as e:
            log.msg("Exception caught on 'spider_idle' signal dispatch: %r" % e, \
                level=log.ERROR)
        if self.spider_is_idle(spider):
            self.close_spider(spider, reason='finished')
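The older engines lean on mustbe_deferred (from scrapy.utils.defer) so that synchronous return values and raised exceptions travel through the same Deferred pipeline as asynchronous results. Approximately (the real helper also special-cases IgnoreRequest):

from twisted.internet import defer
from twisted.python import failure

def mustbe_deferred(f, *args, **kw):
    # call f and normalize whatever happens into a Deferred
    try:
        result = f(*args, **kw)
    except Exception:
        return defer.fail(failure.Failure())  # raised exception -> errback
    if isinstance(result, defer.Deferred):
        return result
    if isinstance(result, failure.Failure):
        return defer.fail(result)
    return defer.succeed(result)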
Example #28
class ExecutionEngine(object):

    def __init__(self, settings, spider_closed_callback):
        self.settings = settings
        self.closing = {} # dict (spider -> reason) of spiders being closed
        self.closing_dfds = {} # dict (spider -> deferred) of spiders being closed
        self.running = False
        self.paused = False
        self._next_request_calls = {}
        self.scheduler = load_object(settings['SCHEDULER'])()
        self.downloader = Downloader()
        self.scraper = Scraper(self, self.settings)
        self._spider_closed_callback = spider_closed_callback

    @defer.inlineCallbacks
    def start(self):
        """Start the execution engine"""
        assert not self.running, "Engine already running"
        self.start_time = time()
        yield send_catch_log_deferred(signal=signals.engine_started)
        self.running = True

    def stop(self):
        """Stop the execution engine gracefully"""
        assert self.running, "Engine not running"
        self.running = False
        dfd = self._close_all_spiders()
        return dfd.addBoth(lambda _: self._finish_stopping_engine())

    def pause(self):
        """Pause the execution engine"""
        self.paused = True

    def unpause(self):
        """Resume the execution engine"""
        self.paused = False

    def is_idle(self):
        return self.scheduler.is_idle() and self.downloader.is_idle() and \
            self.scraper.is_idle()

    def next_request(self, spider, now=False):
        """Scrape the next request for the spider passed.

        The next request to be scraped is retrieved from the scheduler and
        requested from the downloader.

        The spider is closed if there are no more pages to scrape.
        """
        if now:
            self._next_request_calls.pop(spider, None)
        elif spider not in self._next_request_calls:
            call = reactor.callLater(0, self.next_request, spider, now=True)
            self._next_request_calls[spider] = call
            return call
        else:
            return

        if self.paused:
            return reactor.callLater(5, self.next_request, spider)

        while not self._needs_backout(spider):
            if not self._next_request(spider):
                break

        if self.spider_is_idle(spider):
            self._spider_idle(spider)

    def _needs_backout(self, spider):
        return not self.running \
            or self.spider_is_closed(spider) \
            or self.downloader.sites[spider].needs_backout() \
            or self.scraper.sites[spider].needs_backout()

    def _next_request(self, spider):
        # Next pending request from scheduler
        request, deferred = self.scheduler.next_request(spider)
        if request:
            dwld = mustbe_deferred(self.download, request, spider)
            dwld.chainDeferred(deferred).addBoth(lambda _: deferred)
            dwld.addErrback(log.err, "Unhandled error on engine._next_request()",
                spider=spider)
            return dwld

    def spider_is_idle(self, spider):
        scraper_idle = spider in self.scraper.sites \
            and self.scraper.sites[spider].is_idle()
        pending = self.scheduler.spider_has_pending_requests(spider)
        downloading = spider in self.downloader.sites \
            and self.downloader.sites[spider].active
        return scraper_idle and not (pending or downloading)

    def spider_is_closed(self, spider):
        """Return True if the spider is fully closed (ie. not even in the
        closing stage)"""
        return spider not in self.downloader.sites

    def spider_is_open(self, spider):
        """Return True if the spider is fully opened (ie. not in closing
        stage)"""
        return spider in self.downloader.sites and spider not in self.closing

    @property
    def open_spiders(self):
        return self.downloader.sites.keys()

    def has_capacity(self):
        """Does the engine have capacity to handle more spiders"""
        return len(self.downloader.sites) < self.downloader.concurrent_spiders

    def crawl(self, request, spider):
        assert spider in self.open_spiders, \
            "Spider %r not opened when crawling: %s" % (spider.name, request)
        if spider in self.closing: # ignore requests for spiders being closed
            return
        schd = mustbe_deferred(self.schedule, request, spider)
        # FIXME: we can't log errors because we would be preventing them from
        # propagating to the request errback. This should be fixed after the
        # next core refactoring.
        #schd.addErrback(log.err, "Error on engine.crawl()")
        schd.addBoth(self.scraper.enqueue_scrape, request, spider)
        schd.addErrback(log.err, "Unhandled error on engine.crawl()", spider=spider)
        schd.addBoth(lambda _: self.next_request(spider))

    def schedule(self, request, spider):
        if spider in self.closing:
            raise IgnoreRequest()
        self.next_request(spider)
        return self.scheduler.enqueue_request(spider, request)

    def download(self, request, spider):
        def _on_success(response):
            """handle the result of a page download"""
            assert isinstance(response, (Response, Request))
            if isinstance(response, Response):
                response.request = request # tie request to response received
                log.msg(log.formatter.crawled(request, response, spider), \
                    level=log.DEBUG, spider=spider)
                return response
            elif isinstance(response, Request):
                return mustbe_deferred(self.schedule, response, spider)

        def _on_error(_failure):
            """handle an error processing a page"""
            exc = _failure.value
            if isinstance(exc, IgnoreRequest):
                errmsg = _failure.getErrorMessage()
                level = exc.level
            else:
                errmsg = str(_failure)
                level = log.ERROR
            if errmsg:
                log.msg("Error downloading <%s>: %s" % (request.url, errmsg), \
                    level=level, spider=spider)
            return Failure(IgnoreRequest(str(exc)))

        def _on_complete(_):
            self.next_request(spider)
            return _

        if spider not in self.downloader.sites:
            return defer.fail(Failure(IgnoreRequest())).addBoth(_on_complete)

        dwld = mustbe_deferred(self.downloader.fetch, request, spider)
        dwld.addCallbacks(_on_success, _on_error)
        dwld.addBoth(_on_complete)
        return dwld

    @defer.inlineCallbacks
    def open_spider(self, spider):
        assert self.has_capacity(), "No free spider slots when opening %r" % \
            spider.name
        log.msg("Spider opened", spider=spider)
        yield self.scheduler.open_spider(spider)
        self.downloader.open_spider(spider)
        yield self.scraper.open_spider(spider)
        stats.open_spider(spider)
        yield send_catch_log_deferred(signals.spider_opened, spider=spider)
        self.next_request(spider)

    def _spider_idle(self, spider):
        """Called when a spider gets idle. This function is called when there
        are no remaining pages to download or schedule. It can be called
        multiple times. If some extension raises a DontCloseSpider exception
        (in the spider_idle signal handler) the spider is not closed until the
        next loop and this function is guaranteed to be called (at least) once
        again for this spider.
        """
        res = send_catch_log(signal=signals.spider_idle, \
            spider=spider, dont_log=DontCloseSpider)
        if any(isinstance(x, Failure) and isinstance(x.value, DontCloseSpider) \
                for _, x in res):
            reactor.callLater(5, self.next_request, spider)
            return

        if self.spider_is_idle(spider):
            self.close_spider(spider, reason='finished')

    def close_spider(self, spider, reason='cancelled'):
        """Close (cancel) spider and clear all its outstanding requests"""
        if spider in self.closing:
            return defer.succeed(None)
        log.msg("Closing spider (%s)" % reason, spider=spider)
        self.closing[spider] = reason
        self.scheduler.clear_pending_requests(spider)
        dfd = self.downloader.close_spider(spider)
        self.closing_dfds[spider] = dfd
        dfd.addBoth(lambda _: self.scheduler.close_spider(spider))
        dfd.addErrback(log.err, "Unhandled error in scheduler.close_spider()", \
            spider=spider)
        dfd.addBoth(lambda _: self.scraper.close_spider(spider))
        dfd.addErrback(log.err, "Unhandled error in scraper.close_spider()", \
            spider=spider)
        dfd.addBoth(lambda _: self._finish_closing_spider(spider))
        return dfd

    def _close_all_spiders(self):
        dfds = [self.close_spider(s, reason='shutdown') for s in self.open_spiders]
        dfds += self.closing_dfds.values()
        dlist = defer.DeferredList(dfds)
        return dlist

    def _finish_closing_spider(self, spider):
        """This function is called after the spider has been closed"""
        reason = self.closing.pop(spider, 'finished')
        call = self._next_request_calls.pop(spider, None)
        if call and call.active():
            call.cancel()
        dfd = send_catch_log_deferred(signal=signals.spider_closed, \
            spider=spider, reason=reason)
        dfd.addBoth(lambda _: stats.close_spider(spider, reason=reason))
        dfd.addErrback(log.err, "Unhandled error in stats.close_spider()",
            spider=spider)
        dfd.addBoth(lambda _: log.msg("Spider closed (%s)" % reason, spider=spider))
        dfd.addBoth(lambda _: self.closing_dfds.pop(spider).callback(spider))
        dfd.addBoth(lambda _: self._spider_closed_callback(spider))
        return dfd

    @defer.inlineCallbacks
    def _finish_stopping_engine(self):
        yield send_catch_log_deferred(signal=signals.engine_stopped)
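_spider_idle above is also the hook extensions use to keep a spider alive: a handler connected to signals.spider_idle raises DontCloseSpider, and the engine schedules another idle check instead of closing. An illustrative extension (the external work queue is hypothetical):

from scrapy import signals
from scrapy.exceptions import DontCloseSpider

class KeepAliveWhileQueued:
    def __init__(self, crawler, work_queue):
        self.work_queue = work_queue  # hypothetical external queue of pending work
        crawler.signals.connect(self.spider_idle, signal=signals.spider_idle)

    def spider_idle(self, spider):
        if not self.work_queue.empty():
            # veto the close; the engine will call us again on the next idle check
            raise DontCloseSpider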
Example #29
class ExecutionEngine:

    def __init__(self, crawler, spider_closed_callback):
        self.crawler = crawler
        self.settings = crawler.settings
        self.signals = crawler.signals  # the crawler set up a SignalManager at init; its sender is the crawler itself
        self.logformatter = crawler.logformatter
        self.slot = None
        self.spider = None
        self.running = False
        self.paused = False
        self.scheduler_cls = load_object(self.settings['SCHEDULER'])
        downloader_cls = load_object(self.settings['DOWNLOADER'])
        self.downloader = downloader_cls(crawler)
        self.scraper = Scraper(crawler)
        self._spider_closed_callback = spider_closed_callback
    # start() only sets the running flag and returns a deferred; the real setup happens in open_spider
    @defer.inlineCallbacks
    def start(self):
        """Start the execution engine"""
        if self.running:
            raise RuntimeError("Engine already running")
        self.start_time = time()
        yield self.signals.send_catch_log_deferred(signal=signals.engine_started)  # send engine_started once; handler errors are caught via errbacks
        self.running = True
        self._closewait = defer.Deferred()
        yield self._closewait

    def stop(self):
        """Stop the execution engine gracefully"""
        if not self.running:
            raise RuntimeError("Engine not running")
        self.running = False  # mark the engine as stopped
        dfd = self._close_all_spiders()  # deferred that fires once all spiders are closed
        return dfd.addBoth(lambda _: self._finish_stopping_engine())  # chain the final shutdown step

    def close(self):
        """Close the execution engine gracefully.

        If it has already been started, stop it. In all cases, close all spiders
        and the downloader.
        """
        if self.running:
            # Will also close spiders and downloader
            return self.stop()
        elif self.open_spiders:
            # Will also close downloader
            return self._close_all_spiders()
        else:
            return defer.succeed(self.downloader.close())  # effectively the same as firing d.callback(result) immediately

    def pause(self):
        """Pause the execution engine"""
        self.paused = True

    def unpause(self):
        """Resume the execution engine"""
        self.paused = False
    # Check the pause flag, pull a request from the scheduler and process it via crawl, and finally close the spider if it is idle
    def _next_request(self, spider):
        slot = self.slot
        if not slot:
            return

        if self.paused:
            return

        while not self._needs_backout(spider):
            if not self._next_request_from_scheduler(spider):  # pull a request from the scheduler and attach its processing callbacks; break when nothing comes back
                break

        if slot.start_requests and not self._needs_backout(spider):  # feed remaining start requests while no backout is needed
            try:
                request = next(slot.start_requests)
            except StopIteration:
                slot.start_requests = None
            except Exception:
                slot.start_requests = None
                logger.error('Error while obtaining start requests',
                             exc_info=True, extra={'spider': spider})
            else:
                self.crawl(request, spider)  # call our own crawl method to continue the crawl

        if self.spider_is_idle(spider) and slot.close_if_idle:  # close the spider when idle, if configured to
            self._spider_idle(spider)
    # Check flags in several places to decide whether to back out of the crawl loop
    def _needs_backout(self, spider):
        slot = self.slot
        return (
            not self.running
            or slot.closing
            or self.downloader.needs_backout()
            or self.scraper.slot.needs_backout()
        )
    # Pop a request from the scheduler, hand it to _download, then attach callbacks that handle the output and schedule the next loop iteration
    def _next_request_from_scheduler(self, spider):
        slot = self.slot
        request = slot.scheduler.next_request()  # pop a request from the scheduler
        if not request:
            return
        d = self._download(request, spider)  # start the download and get a Deferred back
        d.addBoth(self._handle_downloader_output, request, spider)  # callback handling the downloader output; its first argument is whatever the Deferred fired with
        d.addErrback(lambda f: logger.info('Error while handling downloader output',
                                           exc_info=failure_to_exc_info(f),
                                           extra={'spider': spider}))
        d.addBoth(lambda _: slot.remove_request(request))  # remove the request from the slot
        d.addErrback(lambda f: logger.info('Error while removing request from slot',
                                           exc_info=failure_to_exc_info(f),
                                           extra={'spider': spider}))
        d.addBoth(lambda _: slot.nextcall.schedule())  # add a callback that schedules the next request
        d.addErrback(lambda f: logger.info('Error while scheduling new request',
                                           exc_info=failure_to_exc_info(f),
                                           extra={'spider': spider}))
        return d
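    # How the interleaved addBoth/addErrback pairs above behave: each addErrback
    # traps only Failures raised by the callback added just before it, logs them,
    # and returns None so the rest of the chain keeps running. A minimal
    # stand-alone illustration (hypothetical, plain Twisted):
    #
    #     from twisted.internet import defer
    #     d = defer.Deferred()
    #     d.addBoth(lambda _: 1 / 0)                              # a step that fails
    #     d.addErrback(lambda f: print("step failed:", f.value))  # traps the Failure
    #     d.addBoth(lambda _: print("later steps still run"))
    #     d.callback(None)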
    # Check the concrete type of the downloader output; for a Response or Failure, call scraper.enqueue_scrape, returning its Deferred with a logging errback
    def _handle_downloader_output(self, response, request, spider):
        if not isinstance(response, (Request, Response, Failure)):
            raise TypeError(
                "Incorrect type: expected Request, Response or Failure, got "
                f"{type(response)}: {response!r}"
            )
        # downloader middleware can return requests (for example, redirects)
        if isinstance(response, Request):
            self.crawl(response, spider)  # push the request into the scheduler
            return
        # response is a Response or Failure
        d = self.scraper.enqueue_scrape(response, request, spider)  # hand the response, request and spider to the scraper for processing
        d.addErrback(lambda f: logger.error('Error while enqueuing downloader output',
                                            exc_info=failure_to_exc_info(f),
                                            extra={'spider': spider}))
        return d  # return the Deferred
    # Report whether the crawl as a whole (scraper, downloader, start requests, scheduler) is idle
    def spider_is_idle(self, spider):
        if not self.scraper.slot.is_idle():
            # scraper is not idle
            return False

        if self.downloader.active:
            # downloader has pending requests
            return False

        if self.slot.start_requests is not None:
            # not all start requests are handled
            return False

        if self.slot.scheduler.has_pending_requests():
            # scheduler has pending requests
            return False

        return True

    @property
    def open_spiders(self):  # an engine currently drives at most one spider
        return [self.spider] if self.spider else []

    def has_capacity(self):  # one engine holds exactly one slot
        """Does the engine have capacity to handle more spiders"""
        return not bool(self.slot)
    # Push a single request into the scheduler and trigger the next step
    def crawl(self, request, spider):
        if spider not in self.open_spiders:
            raise RuntimeError(f"Spider {spider.name!r} not opened when crawling: {request}")
        self.schedule(request, spider)  # push the request into the scheduler (data in)
        self.slot.nextcall.schedule()  # trigger the next step; nextcall invokes _next_request() (take a request out and process it)
    # Push the request into the scheduler queue
    def schedule(self, request, spider):
        self.signals.send_catch_log(signals.request_scheduled, request=request, spider=spider)
        if not self.slot.scheduler.enqueue_request(request):  # enqueue, checking whether the dupe filter dropped it
            self.signals.send_catch_log(signals.request_dropped, request=request, spider=spider)
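    # The request_scheduled / request_dropped signals sent above are the hooks
    # user code can attach to; e.g. a hypothetical handler for dropped requests:
    #
    #     from scrapy import signals
    #
    #     def on_dropped(request, spider):
    #         spider.logger.debug("dropped by the dupefilter: %s", request.url)
    #
    #     crawler.signals.connect(on_dropped, signal=signals.request_dropped)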
    # Call the _download method to do the download, then append _downloaded to the callback chain
    def download(self, request, spider):
        d = self._download(request, spider)
        d.addBoth(self._downloaded, self.slot, request, spider)  # addBoth(func, *extra_args)
        return d
    # If the result is a Request, call download again; otherwise return the Response
    def _downloaded(self, response, slot, request, spider):
        slot.remove_request(request)  # remove this request from the slot
        return self.download(response, spider) if isinstance(response, Request) else response
    #### download entry point #### register the request with the slot, fetch it via the downloader, and attach the matching callbacks
    def _download(self, request, spider):
        slot = self.slot
        slot.add_request(request)  # add this request to the slot's set of in-progress requests

        def _on_success(response):
            if not isinstance(response, (Response, Request)):
                raise TypeError(
                    "Incorrect type: expected Response or Request, got "
                    f"{type(response)}: {response!r}"
                )
            if isinstance(response, Response):
                if response.request is None:
                    response.request = request
                logkws = self.logformatter.crawled(response.request, response, spider)
                if logkws is not None:
                    logger.log(*logformatter_adapter(logkws), extra={'spider': spider})
                self.signals.send_catch_log(
                    signal=signals.response_received,
                    response=response,
                    request=response.request,
                    spider=spider,
                )
            return response

        def _on_complete(_):
            slot.nextcall.schedule()
            return _

        dwld = self.downloader.fetch(request, spider)  # the actual download happens here
        dwld.addCallbacks(_on_success)  # attach the success callback
        dwld.addBoth(_on_complete)  # attach _on_complete as both callback and errback
        return dwld
    ######## where the crawl actually starts ######## an engine drives a single spider: throughout this method, spider is one instance, never a list
    @defer.inlineCallbacks
    def open_spider(self, spider, start_requests=(), close_if_idle=True):
        if not self.has_capacity():
            raise RuntimeError(f"No free spider slot when opening {spider.name!r}")
        logger.info("Spider opened", extra={'spider': spider})
        nextcall = CallLaterOnce(self._next_request, spider)  # wrap _next_request in a CallLaterOnce that will drive requests out of start_requests; at this point nothing is scheduled yet
        scheduler = self.scheduler_cls.from_crawler(self.crawler)  # instantiate the scheduler
        start_requests = yield self.scraper.spidermw.process_start_requests(start_requests, spider)  # let the spider middlewares' process_start_requests handle the start requests
        slot = Slot(start_requests, close_if_idle, nextcall, scheduler)  # create the slot for this spider
        self.slot = slot
        self.spider = spider
        yield scheduler.open(spider)  # initialize the scheduler and create its queues
        yield self.scraper.open_spider(spider)  # open the scraper for this spider
        self.crawler.stats.open_spider(spider)
        yield self.signals.send_catch_log_deferred(signals.spider_opened, spider=spider)  # emit the spider_opened signal
        slot.nextcall.schedule()  # register a reactor task that actually starts _next_request
        slot.heartbeat.start(5)  # start the 5-second heartbeat that keeps rescheduling _next_request

    def _spider_idle(self, spider):
        """Called when a spider gets idle. This function is called when there
        are no remaining pages to download or schedule. It can be called
        multiple times. If some extension raises a DontCloseSpider exception
        (in the spider_idle signal handler) the spider is not closed until the
        next loop and this function is guaranteed to be called (at least) once
        again for this spider.
        """
        res = self.signals.send_catch_log(signals.spider_idle, spider=spider, dont_log=DontCloseSpider)  # returns a list of (receiver, result) pairs
        if any(isinstance(x, Failure) and isinstance(x.value, DontCloseSpider) for _, x in res):
            return  # if any handler raised DontCloseSpider, keep the spider open

        if self.spider_is_idle(spider):
            self.close_spider(spider, reason='finished')

    def close_spider(self, spider, reason='cancelled'):
        """Close (cancel) spider and clear all its outstanding requests"""

        slot = self.slot
        if slot.closing:
            return slot.closing
        logger.info("Closing spider (%(reason)s)",
                    {'reason': reason},
                    extra={'spider': spider})

        dfd = slot.close()

        def log_failure(msg):
            def errback(failure):
                logger.error(
                    msg,
                    exc_info=failure_to_exc_info(failure),
                    extra={'spider': spider}
                )
            return errback

        dfd.addBoth(lambda _: self.downloader.close())  # tell the downloader to close
        dfd.addErrback(log_failure('Downloader close failure'))

        dfd.addBoth(lambda _: self.scraper.close_spider(spider))  # tell the scraper to close
        dfd.addErrback(log_failure('Scraper close failure'))

        dfd.addBoth(lambda _: slot.scheduler.close(reason))  # tell the slot's scheduler to close
        dfd.addErrback(log_failure('Scheduler close failure'))

        dfd.addBoth(lambda _: self.signals.send_catch_log_deferred(  # emit the spider_closed signal
            signal=signals.spider_closed, spider=spider, reason=reason))
        dfd.addErrback(log_failure('Error while sending spider_close signal'))

        dfd.addBoth(lambda _: self.crawler.stats.close_spider(spider, reason=reason))  # close the crawler stats for this spider
        dfd.addErrback(log_failure('Stats close failure'))

        dfd.addBoth(lambda _: logger.info("Spider closed (%(reason)s)",
                                          {'reason': reason},
                                          extra={'spider': spider}))

        dfd.addBoth(lambda _: setattr(self, 'slot', None))  # release the slot
        dfd.addErrback(log_failure('Error while unassigning slot'))

        dfd.addBoth(lambda _: setattr(self, 'spider', None))
        dfd.addErrback(log_failure('Error while unassigning spider'))

        dfd.addBoth(lambda _: self._spider_closed_callback(spider))  # invoke the spider-closed callback

        return dfd

    def _close_all_spiders(self):
        dfds = [self.close_spider(s, reason='shutdown') for s in self.open_spiders]
        dlist = defer.DeferredList(dfds)
        return dlist

    @defer.inlineCallbacks
    def _finish_stopping_engine(self):
        yield self.signals.send_catch_log_deferred(signal=signals.engine_stopped)
        self._closewait.callback(None)
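The nextcall / heartbeat machinery in the example above reduces to a tiny helper. A simplified sketch, assuming the shape of scrapy.utils.reactor.CallLaterOnce (trimmed for illustration):

from twisted.internet import reactor

class CallLaterOnce:
    """Schedule a function call in the next reactor loop, collapsing repeated
    schedule() calls into a single pending call at a time."""

    def __init__(self, func, *a, **kw):
        self._func, self._a, self._kw = func, a, kw
        self._call = None

    def schedule(self, delay=0):
        if self._call is None:          # idempotent until the call actually runs
            self._call = reactor.callLater(delay, self)

    def cancel(self):
        if self._call:
            self._call.cancel()

    def __call__(self):
        self._call = None               # allow the next schedule() to queue again
        return self._func(*self._a, **self._kw)

slot.heartbeat, started with heartbeat.start(5) in open_spider, is by all appearances a twisted.internet.task.LoopingCall around nextcall.schedule, so the crawl loop is re-entered even when no download event fires.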
Beispiel #30
0
class ExecutionEngine(object):
    def __init__(self, settings, spider_closed_callback):
        self.settings = settings
        self.closing = {}  # dict (spider -> reason) of spiders being closed
        self.closing_dfds = {
        }  # dict (spider -> deferred) of spiders being closed
        self.running = False
        self.paused = False
        self._next_request_calls = {}
        self.scheduler = load_object(settings['SCHEDULER'])()
        self.downloader = Downloader()
        self.scraper = Scraper(self, self.settings)
        self._spider_closed_callback = spider_closed_callback

    @defer.inlineCallbacks
    def start(self):
        """Start the execution engine"""
        assert not self.running, "Engine already running"
        self.start_time = time()
        yield send_catch_log_deferred(signal=signals.engine_started)
        self.running = True

    def stop(self):
        """Stop the execution engine gracefully"""
        assert self.running, "Engine not running"
        self.running = False
        dfd = self._close_all_spiders()
        return dfd.addBoth(lambda _: self._finish_stopping_engine())

    def pause(self):
        """Pause the execution engine"""
        self.paused = True

    def unpause(self):
        """Resume the execution engine"""
        self.paused = False

    def is_idle(self):
        return self.scheduler.is_idle() and self.downloader.is_idle() and \
            self.scraper.is_idle()

    def next_request(self, spider, now=False):
        """Scrape the next request for the spider passed.

        The next request to be scraped is retrieved from the scheduler and
        requested from the downloader.

        The spider is closed if there are no more pages to scrape.
        """
        if now:
            self._next_request_calls.pop(spider, None)
        elif spider not in self._next_request_calls:
            call = reactor.callLater(0, self.next_request, spider, now=True)
            self._next_request_calls[spider] = call
            return call
        else:
            return

        if self.paused:
            return reactor.callLater(5, self.next_request, spider)

        while not self._needs_backout(spider):
            if not self._next_request(spider):
                break

        if self.spider_is_idle(spider):
            self._spider_idle(spider)

    def _needs_backout(self, spider):
        return not self.running \
            or self.spider_is_closed(spider) \
            or self.downloader.sites[spider].needs_backout() \
            or self.scraper.sites[spider].needs_backout()

    def _next_request(self, spider):
        # Next pending request from scheduler
        request, deferred = self.scheduler.next_request(spider)
        if request:
            dwld = mustbe_deferred(self.download, request, spider)
            dwld.chainDeferred(deferred).addBoth(lambda _: deferred)
            dwld.addErrback(log.err,
                            "Unhandled error on engine._next_request()",
                            spider=spider)
            return dwld

    def spider_is_idle(self, spider):
        scraper_idle = spider in self.scraper.sites \
            and self.scraper.sites[spider].is_idle()
        pending = self.scheduler.spider_has_pending_requests(spider)
        downloading = spider in self.downloader.sites \
            and self.downloader.sites[spider].active
        return scraper_idle and not (pending or downloading)

    def spider_is_closed(self, spider):
        """Return True if the spider is fully closed (ie. not even in the
        closing stage)"""
        return spider not in self.downloader.sites

    def spider_is_open(self, spider):
        """Return True if the spider is fully opened (ie. not in closing
        stage)"""
        return spider in self.downloader.sites and spider not in self.closing

    @property
    def open_spiders(self):
        return self.downloader.sites.keys()

    def has_capacity(self):
        """Does the engine have capacity to handle more spiders"""
        return len(self.downloader.sites) < self.downloader.concurrent_spiders

    def crawl(self, request, spider):
        if spider in self.closing:  # ignore requests for spiders being closed
            return
        assert spider in self.open_spiders, \
            "Spider %r not opened when crawling: %s" % (spider.name, request)
        schd = mustbe_deferred(self.schedule, request, spider)
        # FIXME: we can't log errors because we would be preventing them from
        # propagating to the request errback. This should be fixed after the
        # next core refactoring.
        #schd.addErrback(log.err, "Error on engine.crawl()")
        schd.addBoth(self.scraper.enqueue_scrape, request, spider)
        schd.addErrback(log.err,
                        "Unhandled error on engine.crawl()",
                        spider=spider)
        schd.addBoth(lambda _: self.next_request(spider))

    def schedule(self, request, spider):
        if spider in self.closing:
            raise IgnoreRequest()
        self.next_request(spider)
        return self.scheduler.enqueue_request(spider, request)

    def download(self, request, spider):
        def _on_success(response):
            """handle the result of a page download"""
            assert isinstance(response, (Response, Request))
            if isinstance(response, Response):
                response.request = request  # tie request to response received
                log.msg(log.formatter.crawled(request, response, spider), \
                    level=log.DEBUG, spider=spider)
                return response
            elif isinstance(response, Request):
                return mustbe_deferred(self.schedule, response, spider)

        def _on_error(_failure):
            """handle an error processing a page"""
            exc = _failure.value
            if isinstance(exc, IgnoreRequest):
                errmsg = _failure.getErrorMessage()
            else:
                errmsg = str(_failure)
            if errmsg:
                log.msg("Error downloading <%s>: %s" % (request.url, errmsg), \
                    level=log.ERROR, spider=spider)
            return Failure(IgnoreRequest(str(exc)))

        def _on_complete(_):
            self.next_request(spider)
            return _

        if spider not in self.downloader.sites:
            return defer.fail(Failure(IgnoreRequest())).addBoth(_on_complete)

        dwld = mustbe_deferred(self.downloader.fetch, request, spider)
        dwld.addCallbacks(_on_success, _on_error)
        dwld.addBoth(_on_complete)
        return dwld

    @defer.inlineCallbacks
    def open_spider(self, spider):
        assert self.has_capacity(), "No free spider slots when opening %r" % \
            spider.name
        log.msg("Spider opened", spider=spider)
        yield self.scheduler.open_spider(spider)
        self.downloader.open_spider(spider)
        yield self.scraper.open_spider(spider)
        stats.open_spider(spider)
        yield send_catch_log_deferred(signals.spider_opened, spider=spider)
        self.next_request(spider)

    def _spider_idle(self, spider):
        """Called when a spider gets idle. This function is called when there
        are no remaining pages to download or schedule. It can be called
        multiple times. If some extension raises a DontCloseSpider exception
        (in the spider_idle signal handler) the spider is not closed until the
        next loop and this function is guaranteed to be called (at least) once
        again for this spider.
        """
        res = send_catch_log(signal=signals.spider_idle, \
            spider=spider, dont_log=DontCloseSpider)
        if any(isinstance(x, Failure) and isinstance(x.value, DontCloseSpider) \
                for _, x in res):
            reactor.callLater(5, self.next_request, spider)
            return

        if self.spider_is_idle(spider):
            self.close_spider(spider, reason='finished')

    def close_spider(self, spider, reason='cancelled'):
        """Close (cancel) spider and clear all its outstanding requests"""
        if spider in self.closing:
            return defer.succeed(None)
        log.msg("Closing spider (%s)" % reason, spider=spider)
        self.closing[spider] = reason
        self.scheduler.clear_pending_requests(spider)
        dfd = self.downloader.close_spider(spider)
        self.closing_dfds[spider] = dfd
        dfd.addBoth(lambda _: self.scheduler.close_spider(spider))
        dfd.addErrback(log.err, "Unhandled error in scheduler.close_spider()", \
            spider=spider)
        dfd.addBoth(lambda _: self.scraper.close_spider(spider))
        dfd.addErrback(log.err, "Unhandled error in scraper.close_spider()", \
            spider=spider)
        dfd.addBoth(lambda _: self._finish_closing_spider(spider))
        return dfd

    def _close_all_spiders(self):
        dfds = [
            self.close_spider(s, reason='shutdown') for s in self.open_spiders
        ]
        dfds += self.closing_dfds.values()
        dlist = defer.DeferredList(dfds)
        return dlist

    def _finish_closing_spider(self, spider):
        """This function is called after the spider has been closed"""
        reason = self.closing.pop(spider, 'finished')
        call = self._next_request_calls.pop(spider, None)
        if call and call.active():
            call.cancel()
        dfd = send_catch_log_deferred(signal=signals.spider_closed, \
            spider=spider, reason=reason)
        dfd.addBoth(lambda _: stats.close_spider(spider, reason=reason))
        dfd.addErrback(log.err,
                       "Unhandled error in stats.close_spider()",
                       spider=spider)
        dfd.addBoth(
            lambda _: log.msg("Spider closed (%s)" % reason, spider=spider))
        dfd.addBoth(lambda _: self.closing_dfds.pop(spider).callback(spider))
        dfd.addBoth(lambda _: self._spider_closed_callback(spider))
        return dfd

    @defer.inlineCallbacks
    def _finish_stopping_engine(self):
        yield send_catch_log_deferred(signal=signals.engine_stopped)
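The mustbe_deferred helper used throughout this older example behaves, to a first approximation, like Twisted's defer.maybeDeferred: call the function and wrap either its return value or its exception in a Deferred, so callers can attach callbacks uniformly. A stand-alone illustration with a hypothetical download function:

from twisted.internet import defer

def download(url):
    if not url.startswith("http"):
        raise ValueError("bad url")   # surfaces as a Failure on the Deferred
    return "body of " + url

d = defer.maybeDeferred(download, "http://example.com")
d.addCallback(print)                  # prints "body of http://example.com"
d.addErrback(lambda f: print("failed:", f.value))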
Beispiel #31
0
class ExecutionEngine:
    def __init__(self, crawler, spider_closed_callback):
        self.crawler = crawler
        self.settings = crawler.settings
        self.signals = crawler.signals
        self.logformatter = crawler.logformatter
        self.slot = None
        self.spider = None
        self.running = False
        self.paused = False
        self.scheduler_cls = load_object(self.settings['SCHEDULER'])
        downloader_cls = load_object(self.settings['DOWNLOADER'])
        self.downloader = downloader_cls(crawler)
        self.scraper = Scraper(crawler)
        self._spider_closed_callback = spider_closed_callback

    @defer.inlineCallbacks
    def start(self):
        """Start the execution engine"""
        if self.running:
            raise RuntimeError("Engine already running")
        self.start_time = time()
        yield self.signals.send_catch_log_deferred(
            signal=signals.engine_started)
        self.running = True
        self._closewait = defer.Deferred()
        yield self._closewait

    def stop(self):
        """Stop the execution engine gracefully"""
        if not self.running:
            raise RuntimeError("Engine not running")
        self.running = False
        dfd = self._close_all_spiders()
        return dfd.addBoth(lambda _: self._finish_stopping_engine())

    def close(self):
        """Close the execution engine gracefully.

        If it has already been started, stop it. In all cases, close all spiders
        and the downloader.
        """
        if self.running:
            # Will also close spiders and downloader
            return self.stop()
        elif self.open_spiders:
            # Will also close downloader
            return self._close_all_spiders()
        else:
            return defer.succeed(self.downloader.close())

    def pause(self):
        """Pause the execution engine"""
        self.paused = True

    def unpause(self):
        """Resume the execution engine"""
        self.paused = False

    def _next_request(self, spider):
        slot = self.slot
        if not slot:
            return

        if self.paused:
            return

        while not self._needs_backout(spider):
            if not self._next_request_from_scheduler(spider):
                break

        if slot.start_requests and not self._needs_backout(spider):
            try:
                request = next(slot.start_requests)
            except StopIteration:
                slot.start_requests = None
            except Exception:
                slot.start_requests = None
                logger.error('Error while obtaining start requests',
                             exc_info=True,
                             extra={'spider': spider})
            else:
                self.crawl(request, spider)

        if self.spider_is_idle(spider) and slot.close_if_idle:
            self._spider_idle(spider)

    def _needs_backout(self, spider):
        slot = self.slot
        return (not self.running or slot.closing
                or self.downloader.needs_backout()
                or self.scraper.slot.needs_backout())

    def _next_request_from_scheduler(self, spider):
        slot = self.slot
        request = slot.scheduler.next_request()
        if not request:
            return
        d = self._download(request, spider)
        d.addBoth(self._handle_downloader_output, request, spider)
        d.addErrback(
            lambda f: logger.info('Error while handling downloader output',
                                  exc_info=failure_to_exc_info(f),
                                  extra={'spider': spider}))
        d.addBoth(lambda _: slot.remove_request(request))
        d.addErrback(
            lambda f: logger.info('Error while removing request from slot',
                                  exc_info=failure_to_exc_info(f),
                                  extra={'spider': spider}))
        d.addBoth(lambda _: slot.nextcall.schedule())
        d.addErrback(
            lambda f: logger.info('Error while scheduling new request',
                                  exc_info=failure_to_exc_info(f),
                                  extra={'spider': spider}))
        return d

    def _handle_downloader_output(self, response, request, spider):
        if not isinstance(response, (Request, Response, Failure)):
            raise TypeError(
                "Incorrect type: expected Request, Response or Failure, got "
                f"{type(response)}: {response!r}")
        # downloader middleware can return requests (for example, redirects)
        if isinstance(response, Request):
            self.crawl(response, spider)
            return
        # response is a Response or Failure
        d = self.scraper.enqueue_scrape(response, request, spider)
        d.addErrback(
            lambda f: logger.error('Error while enqueuing downloader output',
                                   exc_info=failure_to_exc_info(f),
                                   extra={'spider': spider}))
        return d

    def spider_is_idle(self, spider):
        if not self.scraper.slot.is_idle():
            # scraper is not idle
            return False

        if self.downloader.active:
            # downloader has pending requests
            return False

        if self.slot.start_requests is not None:
            # not all start requests are handled
            return False

        if self.slot.scheduler.has_pending_requests():
            # scheduler has pending requests
            return False

        return True

    @property
    def open_spiders(self):
        return [self.spider] if self.spider else []

    def has_capacity(self):
        """Does the engine have capacity to handle more spiders"""
        return not bool(self.slot)

    def crawl(self, request, spider):
        if spider not in self.open_spiders:
            raise RuntimeError(
                f"Spider {spider.name!r} not opened when crawling: {request}")
        self.schedule(request, spider)
        self.slot.nextcall.schedule()

    def schedule(self, request, spider):
        self.signals.send_catch_log(signals.request_scheduled,
                                    request=request,
                                    spider=spider)
        if not self.slot.scheduler.enqueue_request(request):
            self.signals.send_catch_log(signals.request_dropped,
                                        request=request,
                                        spider=spider)

    def download(self, request, spider):
        d = self._download(request, spider)
        d.addBoth(self._downloaded, self.slot, request, spider)
        return d

    def _downloaded(self, response, slot, request, spider):
        slot.remove_request(request)
        return self.download(response, spider) if isinstance(
            response, Request) else response

    def _download(self, request, spider):
        slot = self.slot
        slot.add_request(request)

        def _on_success(response):
            if not isinstance(response, (Response, Request)):
                raise TypeError(
                    "Incorrect type: expected Response or Request, got "
                    f"{type(response)}: {response!r}")
            if isinstance(response, Response):
                if response.request is None:
                    response.request = request
                logkws = self.logformatter.crawled(response.request, response,
                                                   spider)
                if logkws is not None:
                    logger.log(*logformatter_adapter(logkws),
                               extra={'spider': spider})
                self.signals.send_catch_log(
                    signal=signals.response_received,
                    response=response,
                    request=response.request,
                    spider=spider,
                )
            return response

        def _on_complete(_):
            slot.nextcall.schedule()
            return _

        dwld = self.downloader.fetch(request, spider)
        dwld.addCallbacks(_on_success)
        dwld.addBoth(_on_complete)
        return dwld

    @defer.inlineCallbacks
    def open_spider(self, spider, start_requests=(), close_if_idle=True):
        if not self.has_capacity():
            raise RuntimeError(
                f"No free spider slot when opening {spider.name!r}")
        logger.info("Spider opened", extra={'spider': spider})
        nextcall = CallLaterOnce(self._next_request, spider)
        scheduler = self.scheduler_cls.from_crawler(self.crawler)
        start_requests = yield self.scraper.spidermw.process_start_requests(
            start_requests, spider)
        slot = Slot(start_requests, close_if_idle, nextcall, scheduler)
        self.slot = slot
        self.spider = spider
        yield scheduler.open(spider)
        yield self.scraper.open_spider(spider)
        self.crawler.stats.open_spider(spider)
        yield self.signals.send_catch_log_deferred(signals.spider_opened,
                                                   spider=spider)
        slot.nextcall.schedule()
        slot.heartbeat.start(5)

    def _spider_idle(self, spider):
        """Called when a spider gets idle. This function is called when there
        are no remaining pages to download or schedule. It can be called
        multiple times. If some extension raises a DontCloseSpider exception
        (in the spider_idle signal handler) the spider is not closed until the
        next loop and this function is guaranteed to be called (at least) once
        again for this spider.
        """
        res = self.signals.send_catch_log(signals.spider_idle,
                                          spider=spider,
                                          dont_log=DontCloseSpider)
        if any(
                isinstance(x, Failure) and isinstance(x.value, DontCloseSpider)
                for _, x in res):
            return

        if self.spider_is_idle(spider):
            self.close_spider(spider, reason='finished')
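    # DontCloseSpider, checked above, is how extensions veto the idle shutdown;
    # a typical (hypothetical) handler wired to the spider_idle signal:
    #
    #     from scrapy import signals
    #     from scrapy.exceptions import DontCloseSpider
    #
    #     def keep_alive(spider):
    #         if external_queue_has_urls():     # hypothetical predicate
    #             raise DontCloseSpider
    #
    #     crawler.signals.connect(keep_alive, signal=signals.spider_idle)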

    def close_spider(self, spider, reason='cancelled'):
        """Close (cancel) spider and clear all its outstanding requests"""

        slot = self.slot
        if slot.closing:
            return slot.closing
        logger.info("Closing spider (%(reason)s)", {'reason': reason},
                    extra={'spider': spider})

        dfd = slot.close()

        def log_failure(msg):
            def errback(failure):
                logger.error(msg,
                             exc_info=failure_to_exc_info(failure),
                             extra={'spider': spider})

            return errback

        dfd.addBoth(lambda _: self.downloader.close())
        dfd.addErrback(log_failure('Downloader close failure'))

        dfd.addBoth(lambda _: self.scraper.close_spider(spider))
        dfd.addErrback(log_failure('Scraper close failure'))

        dfd.addBoth(lambda _: slot.scheduler.close(reason))
        dfd.addErrback(log_failure('Scheduler close failure'))

        dfd.addBoth(lambda _: self.signals.send_catch_log_deferred(
            signal=signals.spider_closed, spider=spider, reason=reason))
        dfd.addErrback(log_failure('Error while sending spider_close signal'))

        dfd.addBoth(
            lambda _: self.crawler.stats.close_spider(spider, reason=reason))
        dfd.addErrback(log_failure('Stats close failure'))

        dfd.addBoth(lambda _: logger.info("Spider closed (%(reason)s)",
                                          {'reason': reason},
                                          extra={'spider': spider}))

        dfd.addBoth(lambda _: setattr(self, 'slot', None))
        dfd.addErrback(log_failure('Error while unassigning slot'))

        dfd.addBoth(lambda _: setattr(self, 'spider', None))
        dfd.addErrback(log_failure('Error while unassigning spider'))

        dfd.addBoth(lambda _: self._spider_closed_callback(spider))

        return dfd

    def _close_all_spiders(self):
        dfds = [
            self.close_spider(s, reason='shutdown') for s in self.open_spiders
        ]
        dlist = defer.DeferredList(dfds)
        return dlist

    @defer.inlineCallbacks
    def _finish_stopping_engine(self):
        yield self.signals.send_catch_log_deferred(
            signal=signals.engine_stopped)
        self._closewait.callback(None)
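The _closewait pattern above keeps the inlineCallbacks coroutine in start() suspended for the engine's whole lifetime: start() yields a fresh Deferred, and _finish_stopping_engine() fires it once shutdown completes. A minimal sketch of that pattern under those assumptions (hypothetical Service class):

from twisted.internet import defer

class Service:
    @defer.inlineCallbacks
    def start(self):
        self._closewait = defer.Deferred()
        yield self._closewait             # suspend here until stop() fires it
        print("service fully stopped")

    def stop(self):
        self._closewait.callback(None)    # resumes start() past the yield

svc = Service()
d = svc.start()      # d stays unfired...
svc.stop()           # ...until here; "service fully stopped" is printed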
Beispiel #32
0
class ExecutionEngine(object):

    def __init__(self, crawler, spider_closed_callback):
        ## store the crawler instance on the execution engine
        self.crawler = crawler
        ## also store the crawler's settings on the engine
        self.settings = crawler.settings
        ## signal manager
        self.signals = crawler.signals
        ## log formatter
        self.logformatter = crawler.logformatter
        self.slot = None
        self.spider = None
        ## whether the engine is running
        self.running = False
        ## whether execution is paused
        self.paused = False
        ## load the scheduler class from the settings
        self.scheduler_cls = load_object(self.settings['SCHEDULER'])
        ## load the downloader class from the settings
        downloader_cls = load_object(self.settings['DOWNLOADER'])
        ## instantiate the downloader
        self.downloader = downloader_cls(crawler)
        ## instantiate the Scraper, the bridge between the engine, the Spider classes and the item Pipelines
        self.scraper = Scraper(crawler)
        ## callback invoked when a spider is closed
        self._spider_closed_callback = spider_closed_callback

    @defer.inlineCallbacks
    def start(self):
        """Start the execution engine"""
        assert not self.running, "Engine already running"
        self.start_time = time()
        yield self.signals.send_catch_log_deferred(signal=signals.engine_started)
        self.running = True
        self._closewait = defer.Deferred()
        yield self._closewait

    def stop(self):
        """Stop the execution engine gracefully"""
        assert self.running, "Engine not running"
        self.running = False
        dfd = self._close_all_spiders()
        return dfd.addBoth(lambda _: self._finish_stopping_engine())

    def close(self):
        """Close the execution engine gracefully.

        If it has already been started, stop it. In all cases, close all spiders
        and the downloader.
        """
        if self.running:
            # Will also close spiders and downloader
            return self.stop()
        elif self.open_spiders:
            # Will also close downloader
            return self._close_all_spiders()
        else:
            return defer.succeed(self.downloader.close())

    def pause(self):
        """Pause the execution engine"""
        self.paused = True

    def unpause(self):
        """Resume the execution engine"""
        self.paused = False

    def _next_request(self, spider):
        ## This method is called repeatedly by the scheduling loop

        slot = self.slot
        if not slot:
            return

        if self.paused:
            return

        ## should we back out?
        while not self._needs_backout(spider):
            ## fetch the next request from the scheduler
            ## note: on the first pass there is none, so we break out
            ## and fall through to the logic below
            if not self._next_request_from_scheduler(spider):
                break

        ## if start_requests still has items and no backout is needed
        if slot.start_requests and not self._needs_backout(spider):
            try:
                ## fetch the next seed request
                request = next(slot.start_requests)
            except StopIteration:
                slot.start_requests = None
            except Exception:
                slot.start_requests = None
                logger.error('Error while obtaining start requests',
                             exc_info=True, extra={'spider': spider})
            else:
                ## call crawl, which actually puts the request into the scheduler queue
                self.crawl(request, spider)

        ## close the spider if it has gone idle
        if self.spider_is_idle(spider) and slot.close_if_idle:
            self._spider_idle(spider)

    def _needs_backout(self, spider):
        ## 是否需要撤销,取决于 4 个条件
        ## 1. engine 是否在运行
        ## 2. slot 是否关闭
        ## 3. 下载器网络下载是否超过预设
        ## 4. scraper 处理输出是否超过预设

        slot = self.slot
        return not self.running \
            or slot.closing \
            or self.downloader.needs_backout() \
            or self.scraper.slot.needs_backout()

    def _next_request_from_scheduler(self, spider):
        slot = self.slot
        ## fetch the next request from the scheduler
        request = slot.scheduler.next_request()
        if not request:
            return
        ## download it
        d = self._download(request, spider)

        ## attach callbacks to the download result, which may be a Request, Response or Failure

        d.addBoth(self._handle_downloader_output, request, spider)
        d.addErrback(lambda f: logger.info('Error while handling downloader output',
                                           exc_info=failure_to_exc_info(f),
                                           extra={'spider': spider}))
        d.addBoth(lambda _: slot.remove_request(request))
        d.addErrback(lambda f: logger.info('Error while removing request from slot',
                                           exc_info=failure_to_exc_info(f),
                                           extra={'spider': spider}))
        d.addBoth(lambda _: slot.nextcall.schedule())
        d.addErrback(lambda f: logger.info('Error while scheduling new request',
                                           exc_info=failure_to_exc_info(f),
                                           extra={'spider': spider}))
        return d

    def _handle_downloader_output(self, response, request, spider):
        ## the downloader output must be one of Request, Response, Failure
        assert isinstance(response, (Request, Response, Failure)), response
        # downloader middleware can return requests (for example, redirects)
        ## if the output is a Request, call crawl again to run the scheduler's enqueue logic
        if isinstance(response, Request):
            self.crawl(response, spider)
            return
        # response is a Response or Failure
        ## if the output is a Response or Failure, hand it to the scraper's enqueue_scrape
        ## method for further processing, mainly interacting with the spiders and pipelines
        d = self.scraper.enqueue_scrape(response, request, spider)
        d.addErrback(lambda f: logger.error('Error while enqueuing downloader output',
                                            exc_info=failure_to_exc_info(f),
                                            extra={'spider': spider}))
        return d

    def spider_is_idle(self, spider):
        if not self.scraper.slot.is_idle():
            # scraper is not idle
            return False

        if self.downloader.active:
            # downloader has pending requests
            return False

        if self.slot.start_requests is not None:
            # not all start requests are handled
            return False

        if self.slot.scheduler.has_pending_requests():
            # scheduler has pending requests
            return False

        return True

    @property
    def open_spiders(self):
        return [self.spider] if self.spider else []

    def has_capacity(self):
        """Does the engine have capacity to handle more spiders"""
        return not bool(self.slot)

    def crawl(self, request, spider):
        assert spider in self.open_spiders, \
            "Spider %r not opened when crawling: %s" % (spider.name, request)
        ## put the request into the scheduler queue
        self.schedule(request, spider)
        ## call nextcall.schedule() to trigger the next scheduling pass
        self.slot.nextcall.schedule()

    def schedule(self, request, spider):
        self.signals.send_catch_log(signal=signals.request_scheduled,
                request=request, spider=spider)
        ## call the scheduler's enqueue_request to put the request into its queue
        if not self.slot.scheduler.enqueue_request(request):
            self.signals.send_catch_log(signal=signals.request_dropped,
                                        request=request, spider=spider)

    def download(self, request, spider):
        d = self._download(request, spider)
        d.addBoth(self._downloaded, self.slot, request, spider)
        return d

    def _downloaded(self, response, slot, request, spider):
        slot.remove_request(request)
        return self.download(response, spider) \
                if isinstance(response, Request) else response

    def _download(self, request, spider):
        slot = self.slot
        slot.add_request(request)
        ## success callback for the download: returns the processed response
        def _on_success(response):
            assert isinstance(response, (Response, Request))
            if isinstance(response, Response):
                ## store the request on the response's request attribute
                response.request = request # tie request to response received
                logkws = self.logformatter.crawled(request, response, spider)
                logger.log(*logformatter_adapter(logkws), extra={'spider': spider})
                self.signals.send_catch_log(signal=signals.response_received, \
                    response=response, request=request, spider=spider)
            return response

        ## completion callback for the download: triggers the next scheduling pass
        def _on_complete(_):
            slot.nextcall.schedule()
            return _

        ## hand the request to the downloader
        dwld = self.downloader.fetch(request, spider)
        ## register the success callback
        dwld.addCallbacks(_on_success)
        ## register the completion callback
        dwld.addBoth(_on_complete)
        return dwld

    @defer.inlineCallbacks
    def open_spider(self, spider, start_requests=(), close_if_idle=True):
        assert self.has_capacity(), "No free spider slot when opening %r" % \
            spider.name
        logger.info("Spider opened", extra={'spider': spider})
        ## register the _next_request scheduling method, to be invoked in a loop
        nextcall = CallLaterOnce(self._next_request, spider)
        ## instantiate the scheduler class
        scheduler = self.scheduler_cls.from_crawler(self.crawler)
        ## run the seed requests through the spider middlewares' process_start_requests;
        ## you can define several spider middlewares, each overriding this method, and each
        ## is invoked in turn before scheduling, keeping the handling modular and maintainable
        start_requests = yield self.scraper.spidermw.process_start_requests(start_requests, spider)
        ## wrap everything in a Slot object
        slot = Slot(start_requests, close_if_idle, nextcall, scheduler)
        self.slot = slot
        self.spider = spider
        ## call the scheduler's open method
        yield scheduler.open(spider)
        ## call the scraper's open_spider method
        yield self.scraper.open_spider(spider)
        self.crawler.stats.open_spider(spider)
        yield self.signals.send_catch_log_deferred(signals.spider_opened, spider=spider)
        ## kick off the scheduling loop
        slot.nextcall.schedule()
        slot.heartbeat.start(5)

    def _spider_idle(self, spider):
        """Called when a spider gets idle. This function is called when there
        are no remaining pages to download or schedule. It can be called
        multiple times. If some extension raises a DontCloseSpider exception
        (in the spider_idle signal handler) the spider is not closed until the
        next loop and this function is guaranteed to be called (at least) once
        again for this spider.
        """
        res = self.signals.send_catch_log(signal=signals.spider_idle, \
            spider=spider, dont_log=DontCloseSpider)
        if any(isinstance(x, Failure) and isinstance(x.value, DontCloseSpider) \
                for _, x in res):
            return

        if self.spider_is_idle(spider):
            self.close_spider(spider, reason='finished')

    def close_spider(self, spider, reason='cancelled'):
        """Close (cancel) spider and clear all its outstanding requests"""

        slot = self.slot
        if slot.closing:
            return slot.closing
        logger.info("Closing spider (%(reason)s)",
                    {'reason': reason},
                    extra={'spider': spider})

        dfd = slot.close()

        def log_failure(msg):
            def errback(failure):
                logger.error(
                    msg,
                    exc_info=failure_to_exc_info(failure),
                    extra={'spider': spider}
                )
            return errback

        dfd.addBoth(lambda _: self.downloader.close())
        dfd.addErrback(log_failure('Downloader close failure'))

        dfd.addBoth(lambda _: self.scraper.close_spider(spider))
        dfd.addErrback(log_failure('Scraper close failure'))

        dfd.addBoth(lambda _: slot.scheduler.close(reason))
        dfd.addErrback(log_failure('Scheduler close failure'))

        dfd.addBoth(lambda _: self.signals.send_catch_log_deferred(
            signal=signals.spider_closed, spider=spider, reason=reason))
        dfd.addErrback(log_failure('Error while sending spider_close signal'))

        dfd.addBoth(lambda _: self.crawler.stats.close_spider(spider, reason=reason))
        dfd.addErrback(log_failure('Stats close failure'))

        dfd.addBoth(lambda _: logger.info("Spider closed (%(reason)s)",
                                          {'reason': reason},
                                          extra={'spider': spider}))

        dfd.addBoth(lambda _: setattr(self, 'slot', None))
        dfd.addErrback(log_failure('Error while unassigning slot'))

        dfd.addBoth(lambda _: setattr(self, 'spider', None))
        dfd.addErrback(log_failure('Error while unassigning spider'))

        dfd.addBoth(lambda _: self._spider_closed_callback(spider))

        return dfd

    def _close_all_spiders(self):
        dfds = [self.close_spider(s, reason='shutdown') for s in self.open_spiders]
        dlist = defer.DeferredList(dfds)
        return dlist

    @defer.inlineCallbacks
    def _finish_stopping_engine(self):
        yield self.signals.send_catch_log_deferred(signal=signals.engine_stopped)
        self._closewait.callback(None)
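The slot.heartbeat.start(5) call in the example above appears to rest on twisted.internet.task.LoopingCall wrapping nextcall.schedule, so _next_request is re-entered at least every five seconds even when no download event fires. A small stand-alone sketch of that mechanism:

from twisted.internet import reactor, task

def tick():
    print("schedule another _next_request pass")

heartbeat = task.LoopingCall(tick)
heartbeat.start(5)                    # fires immediately, then every 5 seconds
reactor.callLater(12, reactor.stop)   # let it tick three times in this demo
reactor.run()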
Beispiel #33
0
class ExecutionEngine(object):
    """
    有三个重要的实例化
    一、schedule
        调度器,实例化了dupefilter过滤器,然后还初始化了三个队列。
        dupefilter:过滤器,通过存储 method + url + response.body 生成sha1指纹,来进行过滤
        pqclass:一个优先级队列queuelib.PriorityQueue
        dqclass:一个FIFO队列,先进先出规则,并且通过pickle序列化了
        mqclass:一个FIFO队列,先进先出规则,直接存储在内存中

    二、downloader
        实例化了Handler对象,还实例化了下载器中间件
        Handler:具体的下载逻辑
        DownloaderMiddlewareManager:收集所有的下载中间件,在收集其中的process_request、process_exception、process_response三种方法

    三、scraper
        实例化了爬虫中间件,还实例化了管道处理器
        SpiderMiddlewareManager:实例化后获取process_spider_input、process_spider_output、process_spider_exception、process_start_requests
        itemproc_cls:获取ItemPipelineManager,实例化其中的ITEM_PIPELINES,获取process_item

    """
    def __init__(self, crawler, spider_closed_callback):
        self.crawler = crawler
        self.settings = crawler.settings
        self.signals = crawler.signals
        self.logformatter = crawler.logformatter
        self.slot = None
        self.spider = None
        self.running = False
        self.paused = False
        self.scheduler_cls = load_object(
            self.settings['SCHEDULER']
        )  # SCHEDULER = 'scrapy.core.scheduler.Scheduler'; this only loads the class, nothing more
        downloader_cls = load_object(
            self.settings['DOWNLOADER']
        )  # DOWNLOADER = 'scrapy.core.downloader.Downloader'
        self.downloader = downloader_cls(
            crawler
        )  # the downloader instantiates the handlers and the process_* middleware chain; both the concrete download machinery and the middleware hooks are wired up here
        self.scraper = Scraper(
            crawler)  # the scraper holds the spidermw spider middlewares and the ITEM_PIPELINES objects; data processing and storage are wired up here
        self._spider_closed_callback = spider_closed_callback  # this callback matters for shutting the crawl down: it is the lambda _: self.stop() passed in by the crawler, which ultimately runs self.engine.stop

    @defer.inlineCallbacks
    def start(self):
        """Start the execution engine"""
        assert not self.running, "Engine already running"
        self.start_time = time()
        yield self.signals.send_catch_log_deferred(
            signal=signals.engine_started)
        self.running = True
        self._closewait = defer.Deferred()
        yield self._closewait

    def stop(self):
        """Stop the execution engine gracefully"""
        assert self.running, "Engine not running"
        self.running = False
        dfd = self._close_all_spiders()
        return dfd.addBoth(lambda _: self._finish_stopping_engine())

    def close(self):
        """Close the execution engine gracefully.

        If it has already been started, stop it. In all cases, close all spiders
        and the downloader.
        """
        if self.running:
            # Will also close spiders and downloader
            return self.stop()
        elif self.open_spiders:
            # Will also close downloader
            return self._close_all_spiders()
        else:
            return defer.succeed(self.downloader.close())

    def pause(self):
        """Pause the execution engine"""
        self.paused = True

    def unpause(self):
        """Resume the execution engine"""
        self.paused = False

    def _next_request(self, spider):
        slot = self.slot
        if not slot:
            return

        if self.paused:
            return

        while not self._needs_backout(spider):  # when do we ever leave this loop?
            # Flags on the first pass: True False False False.
            # We back out when the engine is no longer running, when the heartbeat
            # closes the slot (slot.closing), when the downloader has more than 16
            # active requests, or when the scraper's active size exceeds 5,000,000.
            if not self._next_request_from_scheduler(spider):
                # This keeps recursing, pulling every queued request, handing each to
                # the downloader, with the scraper polishing the output as the last step:
                # pop a request from the scheduler, run the download, and if the result
                # is a Request enqueue it again and reschedule the heartbeat; otherwise
                # continue on to result handling.
                # Surprisingly this runs before start_requests are consumed; on the very
                # first pass it bails out immediately because the queue holds no data,
                # and with queued records it runs at least twice, once to process and
                # once more to finish.
                # Requests are then next()-ed out of start_requests one at a time and
                # pushed into the queue -- a remarkably clever piece of async design.
                break

        if slot.start_requests and not self._needs_backout(
                spider):  # note: _needs_backout is what actually bounds the concurrency here
            try:
                request = next(
                    slot.start_requests
                )  # requests are pulled out of start_requests one at a time here and pushed back into the queue; async at work
            except StopIteration:
                slot.start_requests = None
            except Exception:
                slot.start_requests = None
                logger.error('Error while obtaining start requests',
                             exc_info=True,
                             extra={'spider': spider})
            else:
                """
                所以我感觉现在的情况就很尴尬,我刚往里面push一个数据,然后继续调用时,又立马给我pop出来了,真是醉了
                """
                self.crawl(request, spider)

        if self.spider_is_idle(spider) and slot.close_if_idle:
            self._spider_idle(spider)

    def _needs_backout(self, spider):
        # downloader backout: len(self.active) >= self.total_concurrency
        # scraper backout: self.active_size > self.max_active_size
        slot = self.slot
        return not self.running \
            or slot.closing \
            or self.downloader.needs_backout() \
            or self.scraper.slot.needs_backout()
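    # The two thresholds referenced above come from Scrapy's defaults: the
    # downloader backs out once its active set reaches CONCURRENT_REQUESTS (16),
    # the scraper once its buffered response data exceeds 5,000,000 bytes.
    # Sketched as plain checks with simplified, hypothetical names:
    #
    #     class DownloaderLike:
    #         total_concurrency = 16
    #         def __init__(self):
    #             self.active = set()
    #         def needs_backout(self):
    #             return len(self.active) >= self.total_concurrency
    #
    #     class ScraperSlotLike:
    #         max_active_size = 5_000_000
    #         def __init__(self):
    #             self.active_size = 0
    #         def needs_backout(self):
    #             return self.active_size > self.max_active_size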

    def _next_request_from_scheduler(self, spider):
        # this single function effectively drives the whole request pipeline
        slot = self.slot
        request = slot.scheduler.next_request()  # pop one request record from the scheduler
        if not request:
            return
        d = self._download(request, spider)
        d.addBoth(self._handle_downloader_output, request, spider)
        d.addErrback(
            lambda f: logger.info('Error while handling downloader output',
                                  exc_info=failure_to_exc_info(f),
                                  extra={'spider': spider}))
        d.addBoth(lambda _: slot.remove_request(request))
        d.addErrback(
            lambda f: logger.info('Error while removing request from slot',
                                  exc_info=failure_to_exc_info(f),
                                  extra={'spider': spider}))
        d.addBoth(lambda _: slot.nextcall.schedule())
        d.addErrback(
            lambda f: logger.info('Error while scheduling new request',
                                  exc_info=failure_to_exc_info(f),
                                  extra={'spider': spider}))
        return d

    def _handle_downloader_output(self, response, request,
                                  spider):  # receives the output produced by _download
        assert isinstance(response, (Request, Response, Failure)), response
        # downloader middleware can return requests (for example, redirects)
        if isinstance(response, Request):  # if the result is a Request, enqueue it directly via self.crawl
            self.crawl(response, spider)  # fingerprint-filter the request, enqueue it if it passes, then reschedule the heartbeat
            return
        # response is a Response or Failure
        d = self.scraper.enqueue_scrape(
            response, request, spider
        )  # for a proper response, run the downloader output through the scraper's three hooks; Requests are re-enqueued, dicts and Items go on to process_item for further handling
        d.addErrback(
            lambda f: logger.error('Error while enqueuing downloader output',
                                   exc_info=failure_to_exc_info(f),
                                   extra={'spider': spider}))
        return d

    def spider_is_idle(self, spider):  # is the spider idle?
        if not self.scraper.slot.is_idle():  # the scraper must be idle
            # scraper is not idle
            return False

        if self.downloader.active:  # the downloader must be idle
            # downloader has pending requests
            return False

        if self.slot.start_requests is not None:  # all start_requests must be consumed
            # not all start requests are handled
            return False

        if self.slot.scheduler.has_pending_requests():  # the scheduler must have no pending work
            # scheduler has pending requests
            return False

        return True  # idle requires all four to be quiet: start_requests, scheduler, downloader and scraper

    @property
    def open_spiders(self):
        return [self.spider] if self.spider else []

    def has_capacity(self):
        """Does the engine have capacity to handle more spiders"""
        return not bool(self.slot)

    def crawl(self, request, spider):  # fingerprint-filter the request, enqueue it if it passes, then reschedule the heartbeat
        assert spider in self.open_spiders, \
            "Spider %r not opened when crawling: %s" % (spider.name, request)
        self.schedule(request, spider)
        self.slot.nextcall.schedule()  # schedule yet another pass

    def schedule(self, request, spider):
        self.signals.send_catch_log(signal=signals.request_scheduled,
                                    request=request,
                                    spider=spider)
        if not self.slot.scheduler.enqueue_request(
                request
        ):  # enqueue_request fingerprint-filters the request and, if it survives, pushes it into the queue via self._dqpush(request); this branch fires when the request was dropped
            self.signals.send_catch_log(signal=signals.request_dropped,
                                        request=request,
                                        spider=spider)

    def download(self, request, spider):
        d = self._download(request, spider)
        d.addBoth(self._downloaded, self.slot, request, spider)
        return d

    def _downloaded(self, response, slot, request, spider):  # download finished: drop the request from the slot
        slot.remove_request(request)
        return self.download(response, spider) \
                if isinstance(response, Request) else response  # a Request goes through download() again

    def _download(self, request, spider):
        slot = self.slot
        slot.add_request(request)

        def _on_success(
                response):  # at this point a plain Response object; spider callbacks have not run yet
            assert isinstance(response, (Response, Request))
            if isinstance(response, Response):
                response.request = request  # tie request to response received
                logkws = self.logformatter.crawled(request, response, spider)
                logger.log(*logformatter_adapter(logkws),
                           extra={'spider': spider})
                self.signals.send_catch_log(signal=signals.response_received, \
                    response=response, request=request, spider=spider)
            return response

        def _on_complete(_):
            slot.nextcall.schedule()
            return _

        dwld = self.downloader.fetch(
            request, spider
        )  # the actual download entry point: the downloader middlewares run first, with the real download function passed along as the final step
        dwld.addCallbacks(_on_success)
        dwld.addBoth(_on_complete)
        return dwld

    @defer.inlineCallbacks
    def open_spider(self, spider, start_requests=(), close_if_idle=True):
        assert self.has_capacity(), "No free spider slot when opening %r" % \
            spider.name
        logger.info("Spider opened", extra={'spider': spider})
        nextcall = CallLaterOnce(self._next_request, spider)
        scheduler = self.scheduler_cls.from_crawler(
            self.crawler
        )  # instantiate the scheduler: this builds the dupefilter and three queues (a priority queue plus two FIFOs, one in memory, one pickled to disk)
        start_requests = yield self.scraper.spidermw.process_start_requests(
            start_requests, spider)  # the very first step is the spider middlewares' process_start_requests
        slot = Slot(start_requests, close_if_idle, nextcall, scheduler)
        self.slot = slot
        self.spider = spider
        yield scheduler.open(spider)  # open the in-memory FIFO and priority queues, and enable the dupe filter
        yield self.scraper.open_spider(spider)  # mostly a no-op here
        self.crawler.stats.open_spider(spider)  # also essentially a no-op
        yield self.signals.send_catch_log_deferred(
            signals.spider_opened,
            spider=spider)  # fires spider_opened: log stats, extensions and middlewares all hook this signal
        slot.nextcall.schedule()  # run self._next_request once
        # this spot runs only once, during initialization; schedule() merely registers
        # reactor.callLater(delay, self), so nothing executes until the reactor starts
        slot.heartbeat.start(5)

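The nextcall object used above is a CallLaterOnce: however many times schedule() is called within one reactor turn, _next_request gets registered at most once, which is what makes the liberal schedule() calls throughout this file cheap. Roughly, Scrapy's helper (scrapy/utils/reactor.py) looks like this:

from twisted.internet import reactor

class CallLaterOnce:
    """Schedule a function to be called in the next reactor loop,
    but only if it hasn't been scheduled since the last time it ran."""

    def __init__(self, func, *a, **kw):
        self._func, self._a, self._kw = func, a, kw
        self._call = None

    def schedule(self, delay=0):
        if self._call is None:  # coalesce repeated schedule() calls
            self._call = reactor.callLater(delay, self)

    def cancel(self):
        if self._call:
            self._call.cancel()

    def __call__(self):
        self._call = None  # allow re-scheduling from inside the call
        return self._func(*self._a, **self._kw)
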
    def _spider_idle(self, spider):
        """Called when a spider gets idle. This function is called when there
        are no remaining pages to download or schedule. It can be called
        multiple times. If some extension raises a DontCloseSpider exception
        (in the spider_idle signal handler) the spider is not closed until the
        next loop and this function is guaranteed to be called (at least) once
        again for this spider.
        """
        res = self.signals.send_catch_log(signal=signals.spider_idle, \
            spider=spider, dont_log=DontCloseSpider)
        if any(isinstance(x, Failure) and isinstance(x.value, DontCloseSpider) \
                for _, x in res):
            return

        if self.spider_is_idle(spider):
            self.close_spider(spider, reason='finished')

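As the docstring above says, any spider_idle handler can veto the shutdown by raising DontCloseSpider. A sketch of such an extension (the has_pending_work check is hypothetical; only the signal wiring and the exception are real Scrapy API):

from scrapy import signals
from scrapy.exceptions import DontCloseSpider

class KeepAliveExtension:
    @classmethod
    def from_crawler(cls, crawler):
        ext = cls()
        crawler.signals.connect(ext.spider_idle, signal=signals.spider_idle)
        return ext

    def spider_idle(self, spider):
        if self.has_pending_work(spider):
            raise DontCloseSpider  # engine skips close_spider for this round

    def has_pending_work(self, spider):
        return False  # hypothetical check, e.g. the length of an external queue
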
    def close_spider(self, spider, reason='cancelled'):
        """Close (cancel) spider and clear all its outstanding requests"""

        slot = self.slot
        if slot.closing:
            return slot.closing
        logger.info("Closing spider (%(reason)s)", {'reason': reason},
                    extra={'spider': spider})

        dfd = slot.close()

        def log_failure(msg):
            def errback(failure):
                logger.error(msg,
                             exc_info=failure_to_exc_info(failure),
                             extra={'spider': spider})

            return errback

        dfd.addBoth(lambda _: self.downloader.close())
        dfd.addErrback(log_failure('Downloader close failure'))

        dfd.addBoth(lambda _: self.scraper.close_spider(spider))
        dfd.addErrback(log_failure('Scraper close failure'))

        dfd.addBoth(lambda _: slot.scheduler.close(reason))
        dfd.addErrback(log_failure('Scheduler close failure'))

        dfd.addBoth(lambda _: self.signals.send_catch_log_deferred(
            signal=signals.spider_closed, spider=spider, reason=reason))
        dfd.addErrback(log_failure('Error while sending spider_close signal'))

        dfd.addBoth(
            lambda _: self.crawler.stats.close_spider(spider, reason=reason))
        dfd.addErrback(log_failure('Stats close failure'))

        dfd.addBoth(lambda _: logger.info("Spider closed (%(reason)s)",
                                          {'reason': reason},
                                          extra={'spider': spider}))

        dfd.addBoth(lambda _: setattr(self, 'slot', None))
        dfd.addErrback(log_failure('Error while unassigning slot'))

        dfd.addBoth(lambda _: setattr(self, 'spider', None))  # setattr because a lambda body cannot contain an assignment
        dfd.addErrback(log_failure('Error while unassigning spider'))

        dfd.addBoth(lambda _: self._spider_closed_callback(spider))

        return dfd

    def _close_all_spiders(self):
        dfds = [
            self.close_spider(s, reason='shutdown') for s in self.open_spiders
        ]
        dlist = defer.DeferredList(dfds)
        return dlist

    @defer.inlineCallbacks
    def _finish_stopping_engine(self):
        yield self.signals.send_catch_log_deferred(
            signal=signals.engine_stopped)
        self._closewait.callback(None)
Beispiel #34
0
class ExecutionEngine:
    def __init__(self, crawler, spider_closed_callback: Callable) -> None:
        self.crawler = crawler
        self.settings = crawler.settings
        self.signals = crawler.signals
        self.logformatter = crawler.logformatter
        self.slot: Optional[Slot] = None
        self.spider: Optional[Spider] = None
        self.running = False
        self.paused = False
        self.scheduler_cls = load_object(crawler.settings["SCHEDULER"])
        downloader_cls = load_object(self.settings['DOWNLOADER'])
        self.downloader = downloader_cls(crawler)
        self.scraper = Scraper(crawler)
        self._spider_closed_callback = spider_closed_callback

    @inlineCallbacks
    def start(self) -> Deferred:
        if self.running:
            raise RuntimeError("Engine already running")
        self.start_time = time()
        yield self.signals.send_catch_log_deferred(
            signal=signals.engine_started)
        self.running = True
        self._closewait = Deferred()
        yield self._closewait

    def stop(self) -> Deferred:
        """Gracefully stop the execution engine"""
        @inlineCallbacks
        def _finish_stopping_engine(_) -> Deferred:
            yield self.signals.send_catch_log_deferred(
                signal=signals.engine_stopped)
            self._closewait.callback(None)

        if not self.running:
            raise RuntimeError("Engine not running")

        self.running = False
        dfd = self.close_spider(
            self.spider,
            reason="shutdown") if self.spider is not None else succeed(None)
        return dfd.addBoth(_finish_stopping_engine)

    def close(self) -> Deferred:
        """
        Gracefully close the execution engine.
        If it has already been started, stop it. In all cases, close the spider and the downloader.
        """
        if self.running:
            return self.stop()  # will also close spider and downloader
        if self.spider is not None:
            return self.close_spider(
                self.spider, reason="shutdown")  # will also close downloader
        return succeed(self.downloader.close())

    def pause(self) -> None:
        self.paused = True

    def unpause(self) -> None:
        self.paused = False

    def _next_request(self) -> None:
        assert self.slot is not None  # typing
        assert self.spider is not None  # typing

        if self.paused:
            return None

        while (not self._needs_backout()
               and self._next_request_from_scheduler() is not None):
            pass

        if self.slot.start_requests is not None and not self._needs_backout():
            try:
                request = next(self.slot.start_requests)
            except StopIteration:
                self.slot.start_requests = None
            except Exception:
                self.slot.start_requests = None
                logger.error('Error while obtaining start requests',
                             exc_info=True,
                             extra={'spider': self.spider})
            else:
                self.crawl(request)

        if self.spider_is_idle() and self.slot.close_if_idle:
            self._spider_idle()

    def _needs_backout(self) -> bool:
        return (
            not self.running
            or self.slot.closing  # type: ignore[union-attr]
            or self.downloader.needs_backout()
            or self.scraper.slot.needs_backout()  # type: ignore[union-attr]
        )

    def _next_request_from_scheduler(self) -> Optional[Deferred]:
        assert self.slot is not None  # typing
        assert self.spider is not None  # typing

        request = self.slot.scheduler.next_request()
        if request is None:
            return None

        d = self._download(request, self.spider)
        d.addBoth(self._handle_downloader_output, request)
        d.addErrback(
            lambda f: logger.info('Error while handling downloader output',
                                  exc_info=failure_to_exc_info(f),
                                  extra={'spider': self.spider}))
        d.addBoth(lambda _: self.slot.remove_request(request))
        d.addErrback(
            lambda f: logger.info('Error while removing request from slot',
                                  exc_info=failure_to_exc_info(f),
                                  extra={'spider': self.spider}))
        d.addBoth(lambda _: self.slot.nextcall.schedule())
        d.addErrback(
            lambda f: logger.info('Error while scheduling new request',
                                  exc_info=failure_to_exc_info(f),
                                  extra={'spider': self.spider}))
        return d

    def _handle_downloader_output(
            self, result: Union[Request, Response, Failure],
            request: Request) -> Optional[Deferred]:
        assert self.spider is not None  # typing

        if not isinstance(result, (Request, Response, Failure)):
            raise TypeError(
                f"Incorrect type: expected Request, Response or Failure, got {type(result)}: {result!r}"
            )

        # downloader middleware can return requests (for example, redirects)
        if isinstance(result, Request):
            self.crawl(result)
            return None

        d = self.scraper.enqueue_scrape(result, request, self.spider)
        d.addErrback(lambda f: logger.error(
            "Error while enqueuing downloader output",
            exc_info=failure_to_exc_info(f),
            extra={'spider': self.spider},
        ))
        return d

    def spider_is_idle(self, spider: Optional[Spider] = None) -> bool:
        if spider is not None:
            warnings.warn(
                "Passing a 'spider' argument to ExecutionEngine.spider_is_idle is deprecated",
                category=ScrapyDeprecationWarning,
                stacklevel=2,
            )
        if self.slot is None:
            raise RuntimeError("Engine slot not assigned")
        if not self.scraper.slot.is_idle():  # type: ignore[union-attr]
            return False
        if self.downloader.active:  # downloader has pending requests
            return False
        if self.slot.start_requests is not None:  # not all start requests are handled
            return False
        if self.slot.scheduler.has_pending_requests():
            return False
        return True

    def crawl(self, request: Request, spider: Optional[Spider] = None) -> None:
        """Inject the request into the spider <-> downloader pipeline"""
        if spider is not None:
            warnings.warn(
                "Passing a 'spider' argument to ExecutionEngine.crawl is deprecated",
                category=ScrapyDeprecationWarning,
                stacklevel=2,
            )
            if spider is not self.spider:
                raise RuntimeError(
                    f"The spider {spider.name!r} does not match the open spider"
                )
        if self.spider is None:
            raise RuntimeError(f"No open spider to crawl: {request}")
        self._schedule_request(request, self.spider)
        self.slot.nextcall.schedule()  # type: ignore[union-attr]

    def _schedule_request(self, request: Request, spider: Spider) -> None:
        self.signals.send_catch_log(signals.request_scheduled,
                                    request=request,
                                    spider=spider)
        if not self.slot.scheduler.enqueue_request(
                request):  # type: ignore[union-attr]
            self.signals.send_catch_log(signals.request_dropped,
                                        request=request,
                                        spider=spider)

    def download(self,
                 request: Request,
                 spider: Optional[Spider] = None) -> Deferred:
        """Return a Deferred which fires with a Response as result, only downloader middlewares are applied"""
        if spider is None:
            spider = self.spider
        else:
            warnings.warn(
                "Passing a 'spider' argument to ExecutionEngine.download is deprecated",
                category=ScrapyDeprecationWarning,
                stacklevel=2,
            )
            if spider is not self.spider:
                logger.warning(
                    "The spider '%s' does not match the open spider",
                    spider.name)
        if spider is None:
            raise RuntimeError(f"No open spider to crawl: {request}")
        return self._download(request, spider).addBoth(self._downloaded,
                                                       request, spider)

    def _downloaded(self, result: Union[Response, Request], request: Request,
                    spider: Spider) -> Union[Deferred, Response]:
        assert self.slot is not None  # typing
        self.slot.remove_request(request)
        return self.download(result, spider) if isinstance(result, Request) else result

    def _download(self, request: Request, spider: Spider) -> Deferred:
        assert self.slot is not None  # typing

        self.slot.add_request(request)

        def _on_success(
                result: Union[Response, Request]) -> Union[Response, Request]:
            if not isinstance(result, (Response, Request)):
                raise TypeError(
                    f"Incorrect type: expected Response or Request, got {type(result)}: {result!r}"
                )
            if isinstance(result, Response):
                if result.request is None:
                    result.request = request
                logkws = self.logformatter.crawled(result.request, result,
                                                   spider)
                if logkws is not None:
                    logger.log(*logformatter_adapter(logkws),
                               extra={"spider": spider})
                self.signals.send_catch_log(
                    signal=signals.response_received,
                    response=result,
                    request=result.request,
                    spider=spider,
                )
            return result

        def _on_complete(_):
            self.slot.nextcall.schedule()
            return _

        dwld = self.downloader.fetch(request, spider)
        dwld.addCallbacks(_on_success)
        dwld.addBoth(_on_complete)
        return dwld

    @inlineCallbacks
    def open_spider(self,
                    spider: Spider,
                    start_requests: Iterable = (),
                    close_if_idle: bool = True):
        if self.slot is not None:
            raise RuntimeError(
                f"No free spider slot when opening {spider.name!r}")
        logger.info("Spider opened", extra={'spider': spider})
        nextcall = CallLaterOnce(self._next_request)
        scheduler = create_instance(self.scheduler_cls,
                                    settings=None,
                                    crawler=self.crawler)
        start_requests = yield self.scraper.spidermw.process_start_requests(
            start_requests, spider)
        self.slot = Slot(start_requests, close_if_idle, nextcall, scheduler)
        self.spider = spider
        yield scheduler.open(spider)
        yield self.scraper.open_spider(spider)
        self.crawler.stats.open_spider(spider)
        yield self.signals.send_catch_log_deferred(signals.spider_opened,
                                                   spider=spider)
        self.slot.nextcall.schedule()
        self.slot.heartbeat.start(5)

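The heartbeat started at the end of open_spider is, in Scrapy's Slot, a twisted.internet.task.LoopingCall wrapping nextcall.schedule, so _next_request is retried every five seconds even when nothing else triggers it. A standalone sketch of that mechanism:

from twisted.internet import reactor, task

def tick():
    print('heartbeat: would call nextcall.schedule() here')

heartbeat = task.LoopingCall(tick)
heartbeat.start(5)                     # fire immediately, then every 5 seconds
reactor.callLater(12, heartbeat.stop)  # stop after two more ticks
reactor.callLater(13, reactor.stop)
reactor.run()
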
    def _spider_idle(self) -> None:
        """
        Called when a spider gets idle, i.e. when there are no remaining requests to download or schedule.
        It can be called multiple times. If a handler for the spider_idle signal raises a DontCloseSpider
        exception, the spider is not closed until the next loop and this function is guaranteed to be called
        (at least) once again.
        """
        assert self.spider is not None  # typing
        res = self.signals.send_catch_log(signals.spider_idle,
                                          spider=self.spider,
                                          dont_log=DontCloseSpider)
        if any(
                isinstance(x, Failure) and isinstance(x.value, DontCloseSpider)
                for _, x in res):
            return None
        if self.spider_is_idle():
            self.close_spider(self.spider, reason='finished')

    def close_spider(self,
                     spider: Spider,
                     reason: str = "cancelled") -> Deferred:
        """Close (cancel) spider and clear all its outstanding requests"""
        if self.slot is None:
            raise RuntimeError("Engine slot not assigned")

        if self.slot.closing is not None:
            return self.slot.closing

        logger.info("Closing spider (%(reason)s)", {'reason': reason},
                    extra={'spider': spider})

        dfd = self.slot.close()

        def log_failure(msg: str) -> Callable:
            def errback(failure: Failure) -> None:
                logger.error(msg,
                             exc_info=failure_to_exc_info(failure),
                             extra={'spider': spider})

            return errback

        dfd.addBoth(lambda _: self.downloader.close())
        dfd.addErrback(log_failure('Downloader close failure'))

        dfd.addBoth(lambda _: self.scraper.close_spider(spider))
        dfd.addErrback(log_failure('Scraper close failure'))

        dfd.addBoth(lambda _: self.slot.scheduler.close(reason))
        dfd.addErrback(log_failure('Scheduler close failure'))

        dfd.addBoth(lambda _: self.signals.send_catch_log_deferred(
            signal=signals.spider_closed,
            spider=spider,
            reason=reason,
        ))
        dfd.addErrback(log_failure('Error while sending spider_close signal'))

        dfd.addBoth(
            lambda _: self.crawler.stats.close_spider(spider, reason=reason))
        dfd.addErrback(log_failure('Stats close failure'))

        dfd.addBoth(lambda _: logger.info("Spider closed (%(reason)s)",
                                          {'reason': reason},
                                          extra={'spider': spider}))

        dfd.addBoth(lambda _: setattr(self, 'slot', None))
        dfd.addErrback(log_failure('Error while unassigning slot'))

        dfd.addBoth(lambda _: setattr(self, 'spider', None))
        dfd.addErrback(log_failure('Error while unassigning spider'))

        dfd.addBoth(lambda _: self._spider_closed_callback(spider))

        return dfd

    @property
    def open_spiders(self) -> list:
        warnings.warn(
            "ExecutionEngine.open_spiders is deprecated, please use ExecutionEngine.spider instead",
            category=ScrapyDeprecationWarning,
            stacklevel=2,
        )
        return [self.spider] if self.spider is not None else []

    def has_capacity(self) -> bool:
        warnings.warn("ExecutionEngine.has_capacity is deprecated",
                      ScrapyDeprecationWarning,
                      stacklevel=2)
        return not bool(self.slot)

    def schedule(self, request: Request, spider: Spider) -> None:
        warnings.warn(
            "ExecutionEngine.schedule is deprecated, please use "
            "ExecutionEngine.crawl or ExecutionEngine.download instead",
            category=ScrapyDeprecationWarning,
            stacklevel=2,
        )
        if self.slot is None:
            raise RuntimeError("Engine slot not assigned")
        self._schedule_request(request, spider)
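None of these engine methods are meant to be called by user code directly; the Crawler drives them (crawler.crawl() builds the engine, then calls open_spider and start). From the outside, the usual entry point is CrawlerProcess, e.g.:

import scrapy
from scrapy.crawler import CrawlerProcess

class QuotesSpider(scrapy.Spider):
    name = 'quotes'
    start_urls = ['https://quotes.toscrape.com/']

    def parse(self, response):
        for text in response.css('div.quote span.text::text').getall():
            yield {'text': text}

process = CrawlerProcess(settings={'LOG_LEVEL': 'INFO'})
process.crawl(QuotesSpider)   # internally ends up in engine.open_spider(...) + engine.start()
process.start()               # starts the reactor and blocks until the crawl finishes
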
Beispiel #35
0
class ExecutionEngine(object):

    def __init__(self, crawler, spider_closed_callback):
        self.settings = crawler.settings
        self.slots = {}
        self.running = False
        self.paused = False
        self.scheduler_cls = load_object(self.settings['SCHEDULER'])
        self.downloader = Downloader(crawler)
        self.scraper = Scraper(crawler)
        self._concurrent_spiders = self.settings.getint('CONCURRENT_SPIDERS')
        self._spider_closed_callback = spider_closed_callback

    @defer.inlineCallbacks
    def start(self):
        """Start the execution engine"""
        assert not self.running, "Engine already running"
        self.start_time = time()
        yield send_catch_log_deferred(signal=signals.engine_started)
        self.running = True

    def stop(self):
        """Stop the execution engine gracefully"""
        assert self.running, "Engine not running"
        self.running = False
        dfd = self._close_all_spiders()
        return dfd.addBoth(lambda _: self._finish_stopping_engine())

    def pause(self):
        """Pause the execution engine"""
        self.paused = True

    def unpause(self):
        """Resume the execution engine"""
        self.paused = False

    def _next_request(self, spider):
        try:
            slot = self.slots[spider]
        except KeyError:
            return

        if self.paused:
            slot.nextcall.schedule(5)
            return

        while not self._needs_backout(spider):
            if not self._next_request_from_scheduler(spider):
                break

        if slot.start_requests and not self._needs_backout(spider):
            try:
                request = slot.start_requests.next()
                self.crawl(request, spider)
            except StopIteration:
                slot.start_requests = None

        if self.spider_is_idle(spider) and slot.close_if_idle:
            self._spider_idle(spider)

    def _needs_backout(self, spider):
        slot = self.slots[spider]
        return not self.running \
            or slot.closing \
            or self.downloader.needs_backout() \
            or self.scraper.slots[spider].needs_backout()

    def _next_request_from_scheduler(self, spider):
        slot = self.slots[spider]
        request = slot.scheduler.next_request()
        if not request:
            return
        d = self._download(request, spider)
        d.addBoth(self._handle_downloader_output, request, spider)
        d.addErrback(log.msg, spider=spider)
        d.addBoth(lambda _: slot.remove_request(request))
        d.addErrback(log.msg, spider=spider)
        d.addBoth(lambda _: slot.nextcall.schedule())
        d.addErrback(log.msg, spider=spider)
        return d

    def _handle_downloader_output(self, response, request, spider):
        assert isinstance(response, (Request, Response, Failure)), response
        # downloader middleware can return requests (for example, redirects)
        if isinstance(response, Request):
            self.crawl(response, spider)
            return
        # response is a Response or Failure
        d = self.scraper.enqueue_scrape(response, request, spider)
        d.addErrback(log.err, spider=spider)
        return d

    def spider_is_idle(self, spider):
        scraper_idle = spider in self.scraper.slots \
            and self.scraper.slots[spider].is_idle()
        pending = self.slots[spider].scheduler.has_pending_requests()
        downloading = bool(self.downloader.slots)
        idle = scraper_idle and not (pending or downloading)
        return idle

    @property
    def open_spiders(self):
        return self.slots.keys()

    def has_capacity(self):
        """Does the engine have capacity to handle more spiders"""
        return len(self.slots) < self._concurrent_spiders

    def crawl(self, request, spider):
        assert spider in self.open_spiders, \
            "Spider %r not opened when crawling: %s" % (spider.name, request)
        self.schedule(request, spider)
        self.slots[spider].nextcall.schedule()

    def schedule(self, request, spider):
        return self.slots[spider].scheduler.enqueue_request(request)

    def download(self, request, spider):
        slot = self.slots[spider]
        slot.add_request(request)
        d = self._download(request, spider)
        d.addBoth(self._downloaded, slot, request, spider)
        return d

    def _downloaded(self, response, slot, request, spider):
        slot.remove_request(request)
        return self.download(response, spider) \
                if isinstance(response, Request) else response

    def _download(self, request, spider):
        slot = self.slots[spider]
        slot.add_request(request)
        def _on_success(response):
            assert isinstance(response, (Response, Request))
            if isinstance(response, Response):
                response.request = request # tie request to response received
                log.msg(log.formatter.crawled(request, response, spider), \
                    level=log.DEBUG, spider=spider)
                send_catch_log(signal=signals.response_received, \
                    response=response, request=request, spider=spider)
            return response

        def _on_error(failure):
            failure.request = request
            return failure

        def _on_complete(_):
            slot.nextcall.schedule()
            return _

        dwld = self.downloader.fetch(request, spider)
        dwld.addCallbacks(_on_success, _on_error)
        dwld.addBoth(_on_complete)
        return dwld

    @defer.inlineCallbacks
    def open_spider(self, spider, start_requests=None, close_if_idle=True):
        assert self.has_capacity(), "No free spider slots when opening %r" % \
            spider.name
        log.msg("Spider opened", spider=spider)
        nextcall = CallLaterOnce(self._next_request, spider)
        scheduler = self.scheduler_cls.from_settings(self.settings)
        slot = Slot(start_requests or (), close_if_idle, nextcall, scheduler)
        self.slots[spider] = slot
        yield scheduler.open(spider)
        yield self.scraper.open_spider(spider)
        stats.open_spider(spider)
        yield send_catch_log_deferred(signals.spider_opened, spider=spider)
        slot.nextcall.schedule()

    def _spider_idle(self, spider):
        """Called when a spider gets idle. This function is called when there
        are no remaining pages to download or schedule. It can be called
        multiple times. If some extension raises a DontCloseSpider exception
        (in the spider_idle signal handler) the spider is not closed until the
        next loop and this function is guaranteed to be called (at least) once
        again for this spider.
        """
        res = send_catch_log(signal=signals.spider_idle, \
            spider=spider, dont_log=DontCloseSpider)
        if any(isinstance(x, Failure) and isinstance(x.value, DontCloseSpider) \
                for _, x in res):
            self.slots[spider].nextcall.schedule(5)
            return

        if self.spider_is_idle(spider):
            self.close_spider(spider, reason='finished')

    def close_spider(self, spider, reason='cancelled'):
        """Close (cancel) spider and clear all its outstanding requests"""

        slot = self.slots[spider]
        if slot.closing:
            return slot.closing
        log.msg("Closing spider (%s)" % reason, spider=spider)

        dfd = slot.close()

        dfd.addBoth(lambda _: self.scraper.close_spider(spider))
        dfd.addErrback(log.err, spider=spider)

        dfd.addBoth(lambda _: slot.scheduler.close(reason))
        dfd.addErrback(log.err, spider=spider)

        dfd.addBoth(lambda _: send_catch_log_deferred(signal=signals.spider_closed, \
            spider=spider, reason=reason))
        dfd.addErrback(log.err, spider=spider)

        dfd.addBoth(lambda _: stats.close_spider(spider, reason=reason))
        dfd.addErrback(log.err, spider=spider)

        dfd.addBoth(lambda _: log.msg("Spider closed (%s)" % reason, spider=spider))

        dfd.addBoth(lambda _: self.slots.pop(spider))
        dfd.addErrback(log.err, spider=spider)

        dfd.addBoth(lambda _: self._spider_closed_callback(spider))

        return dfd

    def _close_all_spiders(self):
        dfds = [self.close_spider(s, reason='shutdown') for s in self.open_spiders]
        dlist = defer.DeferredList(dfds)
        return dlist

    @defer.inlineCallbacks
    def _finish_stopping_engine(self):
        yield send_catch_log_deferred(signal=signals.engine_stopped)
        yield stats.engine_stopped()
Beispiel #36
0
class ExecutionEngine(object):
    def __init__(self, crawler, spider_closed_callback):
        self.crawler = crawler
        self.settings = crawler.settings  # settings
        self.signals = crawler.signals  # signal manager
        self.logformatter = crawler.logformatter  # log formatter
        self.slot = None
        self.spider = None
        self.running = False
        self.paused = False
        # resolve the scheduler class from settings (not instantiated here; that happens in open_spider)
        self.scheduler_cls = load_object(self.settings['SCHEDULER'])
        # resolve the downloader class and instantiate it, see scrapy/core/downloader/__init__.py
        downloader_cls = load_object(self.settings['DOWNLOADER'])
        self.downloader = downloader_cls(crawler)
        # instantiate the Scraper: the bridge between the engine and the spiders, see scrapy/core/scraper.py
        self.scraper = Scraper(crawler)
        self._spider_closed_callback = spider_closed_callback

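load_object, used twice in this constructor, simply resolves a dotted path into the object it names; that indirection is what keeps SCHEDULER and DOWNLOADER configurable. A tiny sketch, using the default DOWNLOADER setting value:

from scrapy.utils.misc import load_object

# resolve the dotted path from the settings into a class, ready to instantiate
downloader_cls = load_object('scrapy.core.downloader.Downloader')
print(downloader_cls)  # <class 'scrapy.core.downloader.Downloader'>
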
    @defer.inlineCallbacks
    def start(self):
        """Start the execution engine"""
        assert not self.running, "Engine already running"
        self.start_time = time()
        yield self.signals.send_catch_log_deferred(
            signal=signals.engine_started)
        self.running = True
        self._closewait = defer.Deferred()
        yield self._closewait

    def stop(self):
        """Stop the execution engine gracefully"""
        assert self.running, "Engine not running"
        self.running = False
        dfd = self._close_all_spiders()
        return dfd.addBoth(lambda _: self._finish_stopping_engine())

    def close(self):
        """Close the execution engine gracefully.

        If it has already been started, stop it. In all cases, close all spiders
        and the downloader.
        """
        if self.running:
            # Will also close spiders and downloader
            return self.stop()
        elif self.open_spiders:
            # Will also close downloader
            return self._close_all_spiders()
        else:
            return defer.succeed(self.downloader.close())

    def pause(self):
        """Pause the execution engine"""
        self.paused = True

    def unpause(self):
        """Resume the execution engine"""
        self.paused = False

    def _next_request(self, spider):
        """Registered with CallLaterOnce; each nextcall.schedule() triggers one scheduling pass."""
        slot = self.slot
        if not slot:
            return

        if self.paused:  # do nothing while paused
            return

        while not self._needs_backout(spider):  # until back-pressure kicks in,
            # keep pulling requests from the scheduler and starting downloads
            # (on the very first pass the queue is still empty, so this fails fast)
            if not self._next_request_from_scheduler(spider):
                break

        # lazily consume the start_requests iterable; the slot was built in open_spider,
        # after the spider middlewares' process_start_requests wrapped the seed requests,
        # so start_requests here is an iterable of Request objects
        if slot.start_requests and not self._needs_backout(spider):
            # seed requests remain and there is no back-pressure
            try:
                request = next(slot.start_requests)  # pull the next seed request
            except StopIteration:
                slot.start_requests = None
            except Exception:
                slot.start_requests = None
                logger.error('Error while obtaining start requests',
                             exc_info=True,
                             extra={'spider': spider})
            else:
                # push the request into the scheduler's queue
                self.crawl(request, spider)

        if self.spider_is_idle(spider) and slot.close_if_idle:
            self._spider_idle(spider)  # close the spider once it is idle and close_if_idle is set

    def _needs_backout(self, spider):
        # back-pressure check; we back out when any of the following holds:
        # 1. the engine is no longer running
        # 2. the slot is closing
        # 3. the downloader is over its limit (CONCURRENT_REQUESTS)
        # 4. the scraper has more responses queued than its limit
        slot = self.slot
        return not self.running \
            or slot.closing \
            or self.downloader.needs_backout() \
            or self.scraper.slot.needs_backout()

    def _next_request_from_scheduler(self, spider):
        slot = self.slot
        # pop the next request from the scheduler's queues, see scrapy/core/scheduler.py
        request = slot.scheduler.next_request()
        if not request:
            return
        # hand the request to the Downloader after registering a batch of callbacks;
        # the Deferred fires once the download has finished
        d = self._download(request, spider)
        # process the download result (the real download lives in scrapy/core/downloader/__init__.py)
        d.addBoth(self._handle_downloader_output, request, spider)
        d.addErrback(
            lambda f: logger.info('Error while handling downloader output',
                                  exc_info=failure_to_exc_info(f),
                                  extra={'spider': spider}))
        d.addBoth(lambda _: slot.remove_request(request))
        d.addErrback(
            lambda f: logger.info('Error while removing request from slot',
                                  exc_info=failure_to_exc_info(f),
                                  extra={'spider': spider}))
        d.addBoth(lambda _: slot.nextcall.schedule())
        d.addErrback(
            lambda f: logger.info('Error while scheduling new request',
                                  exc_info=failure_to_exc_info(f),
                                  extra={'spider': spider}))
        return d

    def _handle_downloader_output(self, response, request, spider):
        # the download result must be one of: Request / Response / Failure
        assert isinstance(response, (Request, Response, Failure)), response

        # downloader middleware can return requests (for example, redirects)
        # case 1: a Request -> another full round of scheduling and downloading
        if isinstance(response, Request):
            self.crawl(response, spider)
            return
        # response is a Response or Failure
        # case 2: let the scraper drive the spider/pipeline side, see scrapy/core/scraper.py
        d = self.scraper.enqueue_scrape(response, request, spider)
        d.addErrback(
            lambda f: logger.error('Error while enqueuing downloader output',
                                   exc_info=failure_to_exc_info(f),
                                   extra={'spider': spider}))
        return d

    def spider_is_idle(self, spider):
        if not self.scraper.slot.is_idle():
            # scraper is not idle
            return False

        if self.downloader.active:
            # downloader has pending requests
            return False

        if self.slot.start_requests is not None:
            # not all start requests are handled
            return False

        if self.slot.scheduler.has_pending_requests():
            # scheduler has pending requests
            return False

        return True

    @property
    def open_spiders(self):
        return [self.spider] if self.spider else []

    def has_capacity(self):
        """Does the engine have capacity to handle more spiders"""
        return not bool(self.slot)

    def crawl(self, request, spider):
        assert spider in self.open_spiders, \
            "Spider %r not opened when crawling: %s" % (spider.name, request)
        # put the request into the scheduler's queue for a later pass
        self.schedule(request, spider)
        # and trigger the next scheduling pass
        self.slot.nextcall.schedule()

    def schedule(self, request, spider):
        self.signals.send_catch_log(signal=signals.request_scheduled,
                                    request=request,
                                    spider=spider)
        # enqueue; enqueue_request returns False if the dupe filter rejects the request
        if not self.slot.scheduler.enqueue_request(request):
            self.signals.send_catch_log(signal=signals.request_dropped,
                                        request=request,
                                        spider=spider)

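The request_dropped signal sent above is the hook for noticing requests the scheduler refused, typically because the dupe filter had already seen them. A small extension sketch that logs them:

from scrapy import signals

class DroppedRequestLogger:
    @classmethod
    def from_crawler(cls, crawler):
        ext = cls()
        crawler.signals.connect(ext.request_dropped, signal=signals.request_dropped)
        return ext

    def request_dropped(self, request, spider):
        spider.logger.info('Scheduler dropped (dupe-filtered?): %s', request.url)
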
    def download(self, request, spider):
        d = self._download(request, spider)
        d.addBoth(self._downloaded, self.slot, request, spider)
        return d

    def _downloaded(self, response, slot, request, spider):
        slot.remove_request(request)
        return self.download(response, spider) \
            if isinstance(response, Request) else response

    def _download(self, request, spider):
        slot = self.slot
        slot.add_request(request)

        def _on_success(response):
            # success callback; the result must be a Request or a Response
            assert isinstance(response, (Response, Request))
            if isinstance(response, Response):
                response.request = request  # tie request to response received
                logkws = self.logformatter.crawled(request, response, spider)
                if logkws is not None:
                    logger.log(*logformatter_adapter(logkws),
                               extra={'spider': spider})
                self.signals.send_catch_log(signal=signals.response_received,
                                            response=response,
                                            request=request,
                                            spider=spider)
            return response

        def _on_complete(_):
            # once the download completes, immediately trigger the next pass
            slot.nextcall.schedule()
            return _

        # hand the request to the Downloader (the fetch itself is asynchronous),
        # see scrapy/core/downloader/__init__.py
        dwld = self.downloader.fetch(request, spider)
        # register the callbacks
        dwld.addCallbacks(_on_success)
        dwld.addBoth(_on_complete)
        return dwld

    @defer.inlineCallbacks
    def open_spider(self, spider, start_requests=(), close_if_idle=True):
        assert self.has_capacity(), "No free spider slot when opening %r" % \
            spider.name
        logger.info("Spider opened", extra={'spider': spider})
        # register _next_request for repeated scheduling via the Twisted reactor
        nextcall = CallLaterOnce(self._next_request, spider)
        # instantiate the scheduler, see scrapy/core/scheduler.py
        scheduler = self.scheduler_cls.from_crawler(self.crawler)
        # let the spider middlewares process the seed requests, see scrapy/core/spidermw.py;
        # start_requests is typically an iterable of Request objects
        start_requests = yield self.scraper.spidermw.process_start_requests(
            start_requests, spider)
        # wrap everything in a Slot and keep the (still lazy) start_requests iterable on it
        slot = Slot(start_requests, close_if_idle, nextcall, scheduler)
        self.slot = slot
        self.spider = spider
        # scheduler.open binds the spider, initializes the task queues and enables
        # the fingerprint-based dupe filter, see scrapy/core/scheduler.py
        yield scheduler.open(spider)
        # scraper.open_spider binds the spider to the pipeline manager (see
        # scrapy/pipelines/__init__.py) and calls every middleware's open_spider method
        yield self.scraper.open_spider(spider)
        # see scrapy/statscollectors/__init__.py
        self.crawler.stats.open_spider(spider)
        yield self.signals.send_catch_log_deferred(signals.spider_opened,
                                                   spider=spider)
        # kick off scheduling: this eventually runs the _next_request registered above
        slot.nextcall.schedule()
        slot.heartbeat.start(5)

    def _spider_idle(self, spider):
        """Called when a spider gets idle. This function is called when there
        are no remaining pages to download or schedule. It can be called
        multiple times. If some extension raises a DontCloseSpider exception
        (in the spider_idle signal handler) the spider is not closed until the
        next loop and this function is guaranteed to be called (at least) once
        again for this spider.
        """
        res = self.signals.send_catch_log(signal=signals.spider_idle,
                                          spider=spider,
                                          dont_log=DontCloseSpider)
        if any(
                isinstance(x, Failure) and isinstance(x.value, DontCloseSpider)
                for _, x in res):
            return

        if self.spider_is_idle(spider):
            self.close_spider(spider, reason='finished')

    def close_spider(self, spider, reason='cancelled'):
        """Close (cancel) spider and clear all its outstanding requests"""

        slot = self.slot
        if slot.closing:
            return slot.closing
        logger.info("Closing spider (%(reason)s)", {'reason': reason},
                    extra={'spider': spider})

        dfd = slot.close()

        def log_failure(msg):
            def errback(failure):
                logger.error(msg,
                             exc_info=failure_to_exc_info(failure),
                             extra={'spider': spider})

            return errback

        dfd.addBoth(lambda _: self.downloader.close())
        dfd.addErrback(log_failure('Downloader close failure'))

        dfd.addBoth(lambda _: self.scraper.close_spider(spider))
        dfd.addErrback(log_failure('Scraper close failure'))

        dfd.addBoth(lambda _: slot.scheduler.close(reason))
        dfd.addErrback(log_failure('Scheduler close failure'))

        dfd.addBoth(lambda _: self.signals.send_catch_log_deferred(
            signal=signals.spider_closed, spider=spider, reason=reason))
        dfd.addErrback(log_failure('Error while sending spider_close signal'))

        dfd.addBoth(
            lambda _: self.crawler.stats.close_spider(spider, reason=reason))
        dfd.addErrback(log_failure('Stats close failure'))

        dfd.addBoth(lambda _: logger.info("Spider closed (%(reason)s)",
                                          {'reason': reason},
                                          extra={'spider': spider}))

        dfd.addBoth(lambda _: setattr(self, 'slot', None))
        dfd.addErrback(log_failure('Error while unassigning slot'))

        dfd.addBoth(lambda _: setattr(self, 'spider', None))
        dfd.addErrback(log_failure('Error while unassigning spider'))

        dfd.addBoth(lambda _: self._spider_closed_callback(spider))

        return dfd

    def _close_all_spiders(self):
        dfds = [
            self.close_spider(s, reason='shutdown') for s in self.open_spiders
        ]
        dlist = defer.DeferredList(dfds)
        return dlist

    @defer.inlineCallbacks
    def _finish_stopping_engine(self):
        yield self.signals.send_catch_log_deferred(
            signal=signals.engine_stopped)
        self._closewait.callback(None)
Beispiel #37
0
class ExecutionEngine(object):
    def __init__(self, crawler, spider_closed_callback):
        self.crawler = crawler
        self.settings = crawler.settings
        self.signals = crawler.signals
        self.logformatter = crawler.logformatter
        self.slot = None
        self.spider = None
        self.running = False
        self.paused = False
        self.scheduler_cls = load_object(self.settings['SCHEDULER'])
        downloader_cls = load_object(self.settings['DOWNLOADER'])
        self.downloader = downloader_cls(crawler)
        self.scraper = Scraper(crawler)
        self._concurrent_spiders = self.settings.getint(
            'CONCURRENT_SPIDERS', 1)
        if self._concurrent_spiders != 1:
            warnings.warn("CONCURRENT_SPIDERS settings is deprecated, use " \
                "Scrapyd max_proc config instead", ScrapyDeprecationWarning)
        self._spider_closed_callback = spider_closed_callback

    @defer.inlineCallbacks
    def start(self):
        """Start the execution engine"""
        assert not self.running, "Engine already running"
        self.start_time = time()
        yield self.signals.send_catch_log_deferred(
            signal=signals.engine_started)
        self.running = True
        self._closewait = defer.Deferred()
        yield self._closewait

    def stop(self):
        """Stop the execution engine gracefully"""
        assert self.running, "Engine not running"
        self.running = False
        dfd = self._close_all_spiders()
        return dfd.addBoth(lambda _: self._finish_stopping_engine())

    def pause(self):
        """Pause the execution engine"""
        self.paused = True

    def unpause(self):
        """Resume the execution engine"""
        self.paused = False

    def _next_request(self, spider):
        slot = self.slot
        if not slot:
            return

        if self.paused:
            slot.nextcall.schedule(5)
            return

        while not self._needs_backout(spider):
            if not self._next_request_from_scheduler(spider):
                break

        if slot.start_requests and not self._needs_backout(spider):
            try:
                request = next(slot.start_requests)
            except StopIteration:
                slot.start_requests = None
            except Exception:
                slot.start_requests = None
                log.err(None, 'Obtaining request from start requests', \
                        spider=spider)
            else:
                self.crawl(request, spider)

        if self.spider_is_idle(spider) and slot.close_if_idle:
            self._spider_idle(spider)

    def _needs_backout(self, spider):
        slot = self.slot
        return not self.running \
            or slot.closing \
            or self.downloader.needs_backout() \
            or self.scraper.slot.needs_backout()

    def _next_request_from_scheduler(self, spider):
        slot = self.slot
        request = slot.scheduler.next_request()
        if not request:
            return
        d = self._download(request, spider)
        d.addBoth(self._handle_downloader_output, request, spider)
        d.addErrback(log.msg, spider=spider)
        d.addBoth(lambda _: slot.remove_request(request))
        d.addErrback(log.msg, spider=spider)
        d.addBoth(lambda _: slot.nextcall.schedule())
        d.addErrback(log.msg, spider=spider)
        return d

    def _handle_downloader_output(self, response, request, spider):
        assert isinstance(response, (Request, Response, Failure)), response
        # downloader middleware can return requests (for example, redirects)
        if isinstance(response, Request):
            self.crawl(response, spider)
            return
        # response is a Response or Failure
        d = self.scraper.enqueue_scrape(response, request, spider)
        d.addErrback(log.err, spider=spider)
        return d

    def spider_is_idle(self, spider):
        scraper_idle = self.scraper.slot.is_idle()
        pending = self.slot.scheduler.has_pending_requests()
        downloading = bool(self.downloader.active)
        pending_start_requests = self.slot.start_requests is not None
        idle = scraper_idle and not (pending or downloading
                                     or pending_start_requests)
        return idle

    @property
    def open_spiders(self):
        return [self.spider] if self.spider else []

    def has_capacity(self):
        """Does the engine have capacity to handle more spiders"""
        return not bool(self.slot)

    def crawl(self, request, spider):
        assert spider in self.open_spiders, \
            "Spider %r not opened when crawling: %s" % (spider.name, request)
        self.schedule(request, spider)
        self.slot.nextcall.schedule()

    def schedule(self, request, spider):
        self.signals.send_catch_log(signal=signals.request_scheduled,
                                    request=request,
                                    spider=spider)
        return self.slot.scheduler.enqueue_request(request)

    def download(self, request, spider):
        slot = self.slot
        slot.add_request(request)
        d = self._download(request, spider)
        d.addBoth(self._downloaded, slot, request, spider)
        return d

    def _downloaded(self, response, slot, request, spider):
        slot.remove_request(request)
        return self.download(response, spider) \
                if isinstance(response, Request) else response

    def _download(self, request, spider):
        slot = self.slot
        slot.add_request(request)

        def _on_success(response):
            assert isinstance(response, (Response, Request))
            if isinstance(response, Response):
                response.request = request  # tie request to response received
                logkws = self.logformatter.crawled(request, response, spider)
                log.msg(spider=spider, **logkws)
                self.signals.send_catch_log(signal=signals.response_received, \
                    response=response, request=request, spider=spider)
            return response

        def _on_complete(_):
            slot.nextcall.schedule()
            return _

        dwld = self.downloader.fetch(request, spider)
        dwld.addCallbacks(_on_success)
        dwld.addBoth(_on_complete)
        return dwld

    @defer.inlineCallbacks
    def open_spider(self, spider, start_requests=(), close_if_idle=True):
        assert self.has_capacity(), "No free spider slot when opening %r" % \
            spider.name
        log.msg("Spider opened", spider=spider)
        nextcall = CallLaterOnce(self._next_request, spider)
        scheduler = self.scheduler_cls.from_crawler(self.crawler)
        start_requests = yield self.scraper.spidermw.process_start_requests(
            start_requests, spider)
        slot = Slot(start_requests, close_if_idle, nextcall, scheduler)
        self.slot = slot
        self.spider = spider
        yield scheduler.open(spider)
        yield self.scraper.open_spider(spider)
        self.crawler.stats.open_spider(spider)
        yield self.signals.send_catch_log_deferred(signals.spider_opened,
                                                   spider=spider)
        slot.nextcall.schedule()

    def _spider_idle(self, spider):
        """Called when a spider gets idle. This function is called when there
        are no remaining pages to download or schedule. It can be called
        multiple times. If some extension raises a DontCloseSpider exception
        (in the spider_idle signal handler) the spider is not closed until the
        next loop and this function is guaranteed to be called (at least) once
        again for this spider.
        """
        res = self.signals.send_catch_log(signal=signals.spider_idle, \
            spider=spider, dont_log=DontCloseSpider)
        if any(isinstance(x, Failure) and isinstance(x.value, DontCloseSpider) \
                for _, x in res):
            self.slot.nextcall.schedule(5)
            return

        if self.spider_is_idle(spider):
            self.close_spider(spider, reason='finished')

    def close_spider(self, spider, reason='cancelled'):
        """Close (cancel) spider and clear all its outstanding requests"""

        slot = self.slot
        if slot.closing:
            return slot.closing
        log.msg(format="Closing spider (%(reason)s)",
                reason=reason,
                spider=spider)

        dfd = slot.close()

        dfd.addBoth(lambda _: self.downloader.close())
        dfd.addErrback(log.err, spider=spider)

        dfd.addBoth(lambda _: self.scraper.close_spider(spider))
        dfd.addErrback(log.err, spider=spider)

        dfd.addBoth(lambda _: slot.scheduler.close(reason))
        dfd.addErrback(log.err, spider=spider)

        # XXX: spider_stats argument was added for backwards compatibility with
        # stats collection refactoring added in 0.15. it should be removed in 0.17.
        dfd.addBoth(lambda _: self.signals.send_catch_log_deferred(signal=signals.spider_closed, \
            spider=spider, reason=reason, spider_stats=self.crawler.stats.get_stats()))
        dfd.addErrback(log.err, spider=spider)

        dfd.addBoth(
            lambda _: self.crawler.stats.close_spider(spider, reason=reason))
        dfd.addErrback(log.err, spider=spider)

        dfd.addBoth(lambda _: log.msg(
            format="Spider closed (%(reason)s)", reason=reason, spider=spider))

        dfd.addBoth(lambda _: setattr(self, 'slot', None))
        dfd.addErrback(log.err, spider=spider)

        dfd.addBoth(lambda _: setattr(self, 'spider', None))
        dfd.addErrback(log.err, spider=spider)

        dfd.addBoth(lambda _: self._spider_closed_callback(spider))

        return dfd

    def _close_all_spiders(self):
        dfds = [
            self.close_spider(s, reason='shutdown') for s in self.open_spiders
        ]
        dlist = defer.DeferredList(dfds)
        return dlist

    @defer.inlineCallbacks
    def _finish_stopping_engine(self):
        yield self.signals.send_catch_log_deferred(
            signal=signals.engine_stopped)
        self._closewait.callback(None)
Beispiel #38
0
class ExecutionEngine(object):
    def __init__(self, crawler, spider_closed_callback):
        self.crawler = crawler
        self.settings = crawler.settings
        self.signals = crawler.signals
        self.logformatter = crawler.logformatter
        self.slot = None
        self.spider = None
        self.running = False
        self.paused = False
        self.scheduler_cls = load_object(
            self.settings['SCHEDULER']
        )  # resolve the Scheduler class named in the settings
        downloader_cls = load_object(
            self.settings['DOWNLOADER'])  # likewise, resolve the Downloader class
        self.downloader = downloader_cls(crawler)  # instantiate the Downloader
        self.scraper = Scraper(crawler)  # instantiate the Scraper, the engine's bridge to the spider classes
        self._spider_closed_callback = spider_closed_callback

    @defer.inlineCallbacks
    def start(self):
        """Start the execution engine"""
        assert not self.running, "Engine already running"
        self.start_time = time()
        yield self.signals.send_catch_log_deferred(
            signal=signals.engine_started)
        self.running = True
        self._closewait = defer.Deferred()
        yield self._closewait

    def stop(self):
        """Stop the execution engine gracefully"""
        assert self.running, "Engine not running"
        self.running = False
        dfd = self._close_all_spiders()
        return dfd.addBoth(lambda _: self._finish_stopping_engine())

    def close(self):
        """Close the execution engine gracefully.

        If it has already been started, stop it. In all cases, close all spiders
        and the downloader.
        """
        if self.running:
            # Will also close spiders and downloader
            return self.stop()
        elif self.open_spiders:
            # Will also close downloader
            return self._close_all_spiders()
        else:
            return defer.succeed(self.downloader.close())

    def pause(self):
        """Pause the execution engine"""
        self.paused = True

    def unpause(self):
        """Resume the execution engine"""
        self.paused = False

    def _next_request(self, spider):
        """
        _next_request is reached in two ways: via the 5-second reactor heartbeat,
        and via explicit calls wherever the flow needs another scheduling pass.
        """
        slot = self.slot
        if not slot:
            return

        if self.paused:
            return

        while not self._needs_backout(
                spider
        ):  # drain the scheduler: put as many queued requests into asynchronous download as possible, until back-pressure (e.g. the concurrency limit) kicks in
            if not self._next_request_from_scheduler(
                    spider):  # this call actually kicks off a download
                break

        if slot.start_requests and not self._needs_backout(
                spider):  # 如果start_requests有数据且不需要等待
            try:
                request = next(slot.start_requests)
            except StopIteration:
                slot.start_requests = None
            except Exception:
                slot.start_requests = None
                logger.error('Error while obtaining start requests',
                             exc_info=True,
                             extra={'spider': spider})
            else:
                # crawl() puts the request into the scheduler's queue and
                # immediately schedules another _next_request call
                self.crawl(request, spider)

        if self.spider_is_idle(spider) and slot.close_if_idle:
            self._spider_idle(spider)

    def _needs_backout(self, spider):
        """
        是否需要等待,取决4个条件
        1. Engine是否stop
        2. slot是否close
        3. downloader下载超过预设
        4. scraper处理response超过预设
        """
        slot = self.slot
        return not self.running \
            or slot.closing \
            or self.downloader.needs_backout() \
            or self.scraper.slot.needs_backout()

    def _next_request_from_scheduler(self, spider):
        slot = self.slot
        request = slot.scheduler.next_request()  # pop the next request from the scheduler
        if not request:
            return
        # _download runs the downloader middlewares' process_request methods,
        # hands the request to the downloader, and wires up callbacks such as
        # process_response
        d = self._download(request, spider)
        d.addBoth(self._handle_downloader_output, request, spider)
        d.addErrback(
            lambda f: logger.info('Error while handling downloader output',
                                  exc_info=failure_to_exc_info(f),
                                  extra={'spider': spider}))
        d.addBoth(lambda _: slot.remove_request(request))
        d.addErrback(
            lambda f: logger.info('Error while removing request from slot',
                                  exc_info=failure_to_exc_info(f),
                                  extra={'spider': spider}))
        d.addBoth(lambda _: slot.nextcall.schedule())
        d.addErrback(
            lambda f: logger.info('Error while scheduling new request',
                                  exc_info=failure_to_exc_info(f),
                                  extra={'spider': spider}))
        return d

    def _handle_downloader_output(self, response, request, spider):
        # the download result must be a Request, a Response, or a Failure
        assert isinstance(response, (Request, Response, Failure)), response
        # downloader middleware can return requests (for example, redirects)
        if isinstance(response, Request):
            # a Request goes back through crawl(), i.e. the scheduler's enqueue logic
            self.crawl(response, spider)
            return
        # response is a Response or Failure
        # enqueue_scrape drives the interaction with the spider and the item pipelines
        d = self.scraper.enqueue_scrape(response, request, spider)
        d.addErrback(
            lambda f: logger.error('Error while enqueuing downloader output',
                                   exc_info=failure_to_exc_info(f),
                                   extra={'spider': spider}))
        return d

    def spider_is_idle(self, spider):
        if not self.scraper.slot.is_idle():
            # scraper is not idle
            return False

        if self.downloader.active:
            # downloader has pending requests
            return False

        if self.slot.start_requests is not None:
            # not all start requests are handled
            return False

        if self.slot.scheduler.has_pending_requests():
            # scheduler has pending requests
            return False

        return True

    @property
    def open_spiders(self):
        return [self.spider] if self.spider else []

    def has_capacity(self):
        """Does the engine have capacity to handle more spiders"""
        return not bool(self.slot)

    def crawl(self, request, spider):
        assert spider in self.open_spiders, \
            "Spider %r not opened when crawling: %s" % (spider.name, request)
        # enqueue the request (it is dropped if its fingerprint is a duplicate)
        self.schedule(request, spider)
        # schedule _next_request for the next reactor loop, so the engine keeps
        # pulling requests from the spider and feeding them into the queue
        self.slot.nextcall.schedule()

    def schedule(self, request, spider):
        self.signals.send_catch_log(signal=signals.request_scheduled,
                                    request=request,
                                    spider=spider)
        # enqueue_request returns False for a duplicate fingerprint; the
        # request is then dropped and request_dropped is sent
        if not self.slot.scheduler.enqueue_request(request):
            self.signals.send_catch_log(signal=signals.request_dropped,
                                        request=request,
                                        spider=spider)

    def download(self, request, spider):  # apparently not called from within the engine
        d = self._download(request, spider)
        d.addBoth(self._downloaded, self.slot, request, spider)
        return d

    def _downloaded(self, response, slot, request, spider):  # apparently not called from within the engine
        slot.remove_request(request)
        # if a Request comes back (e.g. from a middleware), download it again;
        # a Response is returned as-is
        return self.download(response, spider) \
                if isinstance(response, Request) else response

    def _download(self, request, spider):
        slot = self.slot
        slot.add_request(request)  # track the request in the in-progress set

        def _on_success(response):
            assert isinstance(response, (Response, Request))
            if isinstance(response, Response):  # a finished download yields a Response
                response.request = request  # tie request to response received
                logkws = self.logformatter.crawled(request, response, spider)
                logger.log(*logformatter_adapter(logkws),
                           extra={'spider': spider})
                self.signals.send_catch_log(signal=signals.response_received, \
                    response=response, request=request, spider=spider)
            return response

        def _on_complete(_):
            slot.nextcall.schedule()  # key step: once a download finishes, reschedule so the next request is fetched and sent
            return _

        # fetch() runs the downloader middlewares' process_request methods,
        # enqueues the request for download, and attaches callbacks such as
        # process_response
        dwld = self.downloader.fetch(request, spider)
        dwld.addCallbacks(_on_success)  # the nested functions close over request/spider
        dwld.addBoth(_on_complete)
        return dwld

    @defer.inlineCallbacks
    def open_spider(self, spider, start_requests=(), close_if_idle=True):
        assert self.has_capacity(), "No free spider slot when opening %r" % \
            spider.name
        logger.info("Spider opened", extra={'spider': spider})
        # wrap _next_request so it can be (re)scheduled on the reactor loop
        nextcall = CallLaterOnce(self._next_request, spider)
        scheduler = self.scheduler_cls.from_crawler(self.crawler)  # build the scheduler
        # run the spider middlewares over the seed requests
        start_requests = yield self.scraper.spidermw.process_start_requests(
            start_requests, spider)
        slot = Slot(start_requests, close_if_idle, nextcall, scheduler)  # bundle per-spider state
        self.slot = slot
        self.spider = spider
        # scheduler.open() sets up the request queue(s); it returns None here
        yield scheduler.open(spider)
        # scraper.open_spider() mainly calls open_spider on every item pipeline
        yield self.scraper.open_spider(spider)
        self.crawler.stats.open_spider(spider)  # notify the stats collector
        yield self.signals.send_catch_log_deferred(signals.spider_opened,
                                                   spider=spider)
        slot.nextcall.schedule()  # kick off the first scheduling pass
        slot.heartbeat.start(5)  # call CallLaterOnce.schedule every 5 seconds

    def _spider_idle(self, spider):
        """Called when a spider gets idle. This function is called when there
        are no remaining pages to download or schedule. It can be called
        multiple times. If some extension raises a DontCloseSpider exception
        (in the spider_idle signal handler) the spider is not closed until the
        next loop and this function is guaranteed to be called (at least) once
        again for this spider.
        """
        res = self.signals.send_catch_log(signal=signals.spider_idle, \
            spider=spider, dont_log=DontCloseSpider)
        if any(isinstance(x, Failure) and isinstance(x.value, DontCloseSpider) \
                for _, x in res):
            return

        if self.spider_is_idle(spider):
            self.close_spider(spider, reason='finished')

    def close_spider(self, spider, reason='cancelled'):
        """Close (cancel) spider and clear all its outstanding requests"""

        slot = self.slot
        if slot.closing:
            return slot.closing
        logger.info("Closing spider (%(reason)s)", {'reason': reason},
                    extra={'spider': spider})

        dfd = slot.close()

        def log_failure(msg):
            def errback(failure):
                logger.error(msg,
                             exc_info=failure_to_exc_info(failure),
                             extra={'spider': spider})

            return errback

        dfd.addBoth(lambda _: self.downloader.close())
        dfd.addErrback(log_failure('Downloader close failure'))

        dfd.addBoth(lambda _: self.scraper.close_spider(spider))
        dfd.addErrback(log_failure('Scraper close failure'))

        dfd.addBoth(lambda _: slot.scheduler.close(reason))
        dfd.addErrback(log_failure('Scheduler close failure'))

        dfd.addBoth(lambda _: self.signals.send_catch_log_deferred(
            signal=signals.spider_closed, spider=spider, reason=reason))
        dfd.addErrback(log_failure('Error while sending spider_close signal'))

        dfd.addBoth(
            lambda _: self.crawler.stats.close_spider(spider, reason=reason))
        dfd.addErrback(log_failure('Stats close failure'))

        dfd.addBoth(lambda _: logger.info("Spider closed (%(reason)s)",
                                          {'reason': reason},
                                          extra={'spider': spider}))

        dfd.addBoth(lambda _: setattr(self, 'slot', None))
        dfd.addErrback(log_failure('Error while unassigning slot'))

        dfd.addBoth(lambda _: setattr(self, 'spider', None))
        dfd.addErrback(log_failure('Error while unassigning spider'))

        dfd.addBoth(lambda _: self._spider_closed_callback(spider))

        return dfd

    def _close_all_spiders(self):
        dfds = [
            self.close_spider(s, reason='shutdown') for s in self.open_spiders
        ]
        dlist = defer.DeferredList(dfds)
        return dlist

    @defer.inlineCallbacks
    def _finish_stopping_engine(self):
        yield self.signals.send_catch_log_deferred(
            signal=signals.engine_stopped)
        self._closewait.callback(None)
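
The comments in this example lean heavily on CallLaterOnce: a wrapper that schedules a function for the next reactor iteration at most once, however many times schedule() is called before it fires. A rough sketch of the idea, assuming only the Twisted reactor API (an illustration, not Scrapy's exact implementation):

from twisted.internet import reactor

class CallLaterOnceSketch:
    """Schedule func() for the next reactor iteration, coalescing repeats."""

    def __init__(self, func, *args, **kwargs):
        self._func, self._args, self._kwargs = func, args, kwargs
        self._call = None

    def schedule(self, delay=0):
        # arm the timer only if no call is already pending
        if self._call is None:
            self._call = reactor.callLater(delay, self)

    def __call__(self):
        self._call = None  # let the next schedule() re-arm
        return self._func(*self._args, **self._kwargs)

    def cancel(self):
        if self._call:
            self._call.cancel()
            self._call = None

The slot's heartbeat started with slot.heartbeat.start(5) above simply invokes schedule() every 5 seconds, so the engine re-checks for work even when nothing else triggers a pass.
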
Beispiel #39
0
class ExecutionEngine(object):
    def __init__(self, crawler, spider_closed_callback):
        self.crawler = crawler
        self.settings = crawler.settings
        self.signals = crawler.signals  # use the crawler's signal manager
        self.logformatter = crawler.logformatter
        self.slot = None
        self.spider = None
        self.running = False
        self.paused = False
        self.scheduler_cls = load_object(
            self.settings['SCHEDULER'])  # resolve the configured scheduler class
        downloader_cls = load_object(
            self.settings['DOWNLOADER'])  # resolve the configured downloader class
        self.downloader = downloader_cls(crawler)
        self.scraper = Scraper(crawler)  # build the scraper
        self._spider_closed_callback = spider_closed_callback

    @defer.inlineCallbacks
    def start(self):
        """Start the execution engine"""
        assert not self.running, "Engine already running"
        self.start_time = time()
        yield self.signals.send_catch_log_deferred(
            signal=signals.engine_started)
        self.running = True
        self._closewait = defer.Deferred()
        yield self._closewait

    def stop(self):
        """Stop the execution engine gracefully"""
        assert self.running, "Engine not running"
        self.running = False
        dfd = self._close_all_spiders()
        return dfd.addBoth(lambda _: self._finish_stopping_engine())

    def close(self):
        """Close the execution engine gracefully.

        If it has already been started, stop it. In all cases, close all spiders
        and the downloader.
        """
        if self.running:
            # Will also close spiders and downloader
            return self.stop()
        elif self.open_spiders:
            # Will also close downloader
            return self._close_all_spiders()
        else:
            return defer.succeed(self.downloader.close())

    def pause(self):
        """Pause the execution engine"""
        self.paused = True

    def unpause(self):
        """Resume the execution engine"""
        self.paused = False

    """
    被CallLaterOnce包装后被slot设置,
    主要在reactor中的heartbeat中被定时调用(在slot中设置),不过也可以被代码主动调用
    """

    def _next_request(self, spider):
        slot = self.slot
        if not slot:
            return

        if self.paused:
            return
        # pull pending requests from the scheduler and dispatch them asynchronously
        while not self._needs_backout(spider):
            if not self._next_request_from_scheduler(spider):
                break

        if slot.start_requests and not self._needs_backout(spider):
            try:
                request = next(slot.start_requests)
            except StopIteration:
                slot.start_requests = None
            except Exception:
                slot.start_requests = None
                logger.error('Error while obtaining start requests',
                             exc_info=True,
                             extra={'spider': spider})
            else:
                self.crawl(request, spider)

        if self.spider_is_idle(spider) and slot.close_if_idle:
            self._spider_idle(spider)

    """
    判断当前引擎的状态是不是异常,需不需要回退(backout)
    """

    def _needs_backout(self, spider):
        slot = self.slot
        return not self.running \
            or slot.closing \
            or self.downloader.needs_backout() \
            or self.scraper.slot.needs_backout()

    """
    从调度器中请求下一个request,如果有request待处理,
    那么就对这个request进行下载处理,并对下载的操作添加一下回调函数
    """

    def _next_request_from_scheduler(self, spider):
        slot = self.slot
        request = slot.scheduler.next_request()
        if not request:
            return

        # hand the request to the downloader
        d = self._download(request, spider)
        d.addBoth(self._handle_downloader_output, request, spider)
        d.addErrback(
            lambda f: logger.info('Error while handling downloader output',
                                  exc_info=failure_to_exc_info(f),
                                  extra={'spider': spider}))
        d.addBoth(lambda _: slot.remove_request(request))
        d.addErrback(
            lambda f: logger.info('Error while removing request from slot',
                                  exc_info=failure_to_exc_info(f),
                                  extra={'spider': spider}))
        d.addBoth(lambda _: slot.nextcall.schedule())
        d.addErrback(
            lambda f: logger.info('Error while scheduling new request',
                                  exc_info=failure_to_exc_info(f),
                                  extra={'spider': spider}))
        return d

    """
    对下载器的结果输出进行的异步处理
    """

    def _handle_downloader_output(self, response, request, spider):
        assert isinstance(response, (Request, Response, Failure)), response
        # downloader middleware can return requests (for example, redirects)
        if isinstance(response, Request):
            self.crawl(response, spider)
            return
        # response is a Response or Failure
        d = self.scraper.enqueue_scrape(response, request, spider)
        d.addErrback(
            lambda f: logger.error('Error while enqueuing downloader output',
                                   exc_info=failure_to_exc_info(f),
                                   extra={'spider': spider}))
        return d

    def spider_is_idle(self, spider):
        if not self.scraper.slot.is_idle():
            # scraper is not idle
            return False

        if self.downloader.active:
            # downloader has pending requests
            return False

        if self.slot.start_requests is not None:
            # not all start requests are handled
            return False

        if self.slot.scheduler.has_pending_requests():
            # scheduler has pending requests
            return False

        return True

    @property
    def open_spiders(self):
        return [self.spider] if self.spider else []

    def has_capacity(self):
        """Does the engine have capacity to handle more spiders"""
        return not bool(self.slot)

    """
    调用schedule请求slot处理request,并且显式通知slot进行处理
    """

    def crawl(self, request, spider):
        assert spider in self.open_spiders, \
            "Spider %r not opened when crawling: %s" % (spider.name, request)
        self.schedule(request, spider)
        self.slot.nextcall.schedule()

    """
    通过slot将request入队,等待被reactor处理,
    """

    def schedule(self, request, spider):
        self.signals.send_catch_log(signal=signals.request_scheduled,
                                    request=request,
                                    spider=spider)
        if not self.slot.scheduler.enqueue_request(request):
            self.signals.send_catch_log(signal=signals.request_dropped,
                                        request=request,
                                        spider=spider)

    """
    根据请求进行下载,其实是调用_download进行下载
    并且在下载完成之后,通过reactor异步调度_downloaded函数。
    """

    def download(self, request, spider):
        d = self._download(request, spider)
        d.addBoth(self._downloaded, self.slot, request, spider)
        return d

    """
    在下载完成之后,从slot中将要对应的request移除,然后在判断response的类型:
        如果是Request,则继续进行下载;若是Response,则直接返回
    """

    def _downloaded(self, response, slot, request, spider):
        slot.remove_request(request)
        return self.download(response, spider) if isinstance(
            response, Request) else response

    """
    将下载的任务由下载器downloader进行下载的操作
    并添加了两个回调函数:
        在下载完毕complete的时候
        在下载成功success的时候
    """

    def _download(self, request, spider):
        slot = self.slot
        slot.add_request(request)

        def _on_success(response):
            assert isinstance(response, (Response, Request))
            if isinstance(response, Response):
                response.request = request  # tie request to response received
                logkws = self.logformatter.crawled(request, response, spider)
                logger.log(*logformatter_adapter(logkws),
                           extra={'spider': spider})
                self.signals.send_catch_log(signal=signals.response_received,
                                            response=response,
                                            request=request,
                                            spider=spider)
            return response

        """
        在下载完成的时候显式调用slot进行调度处理
        """

        def _on_complete(_):
            slot.nextcall.schedule()
            return _

        # fetch returns a deferred that fires with the download result
        dwld = self.downloader.fetch(request, spider)
        dwld.addCallbacks(_on_success)
        dwld.addBoth(_on_complete)
        return dwld

    """
    ### 被scrapy.crawler.crawl调用
    开启爬虫系统
    创建调度器并开启,
    """

    @defer.inlineCallbacks
    def open_spider(self, spider, start_requests=(), close_if_idle=True):
        assert self.has_capacity(), "No free spider slot when opening %r" % \
            spider.name
        logger.info("Spider opened", extra={'spider': spider})
        nextcall = CallLaterOnce(self._next_request, spider)
        scheduler = self.scheduler_cls.from_crawler(self.crawler)
        start_requests = yield self.scraper.spidermw.process_start_requests(
            start_requests, spider)  # run the spider middlewares first
        slot = Slot(start_requests, close_if_idle, nextcall, scheduler)
        self.slot = slot
        self.spider = spider
        yield scheduler.open(spider)  # open the scheduler
        yield self.scraper.open_spider(spider)
        self.crawler.stats.open_spider(spider)
        yield self.signals.send_catch_log_deferred(signals.spider_opened,
                                                   spider=spider)
        slot.nextcall.schedule()
        slot.heartbeat.start(5)

    """ 
    当调度器空闲的时候调用(在_next_request中判断)。
    可以被多次调用。
    如果某些extension引起了DontCloseSpider异常(在spider_idle 信号的处理器中),spider就不会关闭,直到下一个循环。
    并且这个方法会保证至少被执行一次
    """

    def _spider_idle(self, spider):
        """Called when a spider gets idle. This function is called when there
        are no remaining pages to download or schedule. It can be called
        multiple times. If some extension raises a DontCloseSpider exception
        (in the spider_idle signal handler) the spider is not closed until the
        next loop and this function is guaranteed to be called (at least) once
        again for this spider.
        """
        res = self.signals.send_catch_log(signal=signals.spider_idle,
                                          spider=spider,
                                          dont_log=DontCloseSpider)
        if any(
                isinstance(x, Failure) and isinstance(x.value, DontCloseSpider)
                for _, x in res):
            return

        if self.spider_is_idle(spider):
            self.close_spider(spider, reason='finished')

    """
    关闭爬虫(引擎)
    发送信息:关闭下载器、使用scrapyer关闭爬虫spider、关闭调度器
        发送关闭日志、关闭scawler关闭爬虫的信息、打印日志、
        重设当前的slot为空、重设当前的spider为空等
    """

    def close_spider(self, spider, reason='cancelled'):
        """Close (cancel) spider and clear all its outstanding requests"""

        slot = self.slot
        if slot.closing:
            return slot.closing
        logger.info("Closing spider (%(reason)s)", {'reason': reason},
                    extra={'spider': spider})

        dfd = slot.close()

        def log_failure(msg):
            def errback(failure):
                logger.error(msg,
                             exc_info=failure_to_exc_info(failure),
                             extra={'spider': spider})

            return errback

        dfd.addBoth(lambda _: self.downloader.close())
        dfd.addErrback(log_failure('Downloader close failure'))

        dfd.addBoth(lambda _: self.scraper.close_spider(spider))
        dfd.addErrback(log_failure('Scraper close failure'))

        dfd.addBoth(lambda _: slot.scheduler.close(reason))
        dfd.addErrback(log_failure('Scheduler close failure'))

        dfd.addBoth(lambda _: self.signals.send_catch_log_deferred(
            signal=signals.spider_closed, spider=spider, reason=reason))
        dfd.addErrback(log_failure('Error while sending spider_close signal'))

        dfd.addBoth(
            lambda _: self.crawler.stats.close_spider(spider, reason=reason))
        dfd.addErrback(log_failure('Stats close failure'))

        dfd.addBoth(lambda _: logger.info("Spider closed (%(reason)s)",
                                          {'reason': reason},
                                          extra={'spider': spider}))

        dfd.addBoth(lambda _: setattr(self, 'slot', None))
        dfd.addErrback(log_failure('Error while unassigning slot'))

        dfd.addBoth(lambda _: setattr(self, 'spider', None))
        dfd.addErrback(log_failure('Error while unassigning spider'))

        dfd.addBoth(lambda _: self._spider_closed_callback(spider))

        return dfd

    def _close_all_spiders(self):
        dfds = [
            self.close_spider(s, reason='shutdown') for s in self.open_spiders
        ]
        dlist = defer.DeferredList(dfds)
        return dlist

    @defer.inlineCallbacks
    def _finish_stopping_engine(self):
        yield self.signals.send_catch_log_deferred(
            signal=signals.engine_stopped)
        self._closewait.callback(None)
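
_spider_idle above is the hook that extensions use to veto shutdown: any handler connected to the spider_idle signal may raise DontCloseSpider, and the engine then skips close_spider until the next idle check. A minimal extension along those lines (more_work_pending is a made-up flag for illustration):

from scrapy import signals
from scrapy.exceptions import DontCloseSpider

class KeepAliveExtension:
    """Veto idle shutdown while the spider still expects more work."""

    @classmethod
    def from_crawler(cls, crawler):
        ext = cls()
        crawler.signals.connect(ext.spider_idle, signal=signals.spider_idle)
        return ext

    def spider_idle(self, spider):
        # raising DontCloseSpider makes the engine skip close_spider();
        # _spider_idle will be called again on a later idle check
        if getattr(spider, 'more_work_pending', False):
            raise DontCloseSpider
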
Beispiel #40
0
class ExecutionEngine(object):

    def __init__(self, settings, spider_closed_callback):
        self.settings = settings
        self.slots = {}
        self.running = False
        self.paused = False
        self._next_request_calls = {}
        self.scheduler = load_object(settings['SCHEDULER'])()
        self.downloader = Downloader()
        self.scraper = Scraper(self, self.settings)
        self._spider_closed_callback = spider_closed_callback

    @defer.inlineCallbacks
    def start(self):
        """Start the execution engine"""
        assert not self.running, "Engine already running"
        self.start_time = time()
        yield send_catch_log_deferred(signal=signals.engine_started)
        self.running = True

    def stop(self):
        """Stop the execution engine gracefully"""
        assert self.running, "Engine not running"
        self.running = False
        dfd = self._close_all_spiders()
        return dfd.addBoth(lambda _: self._finish_stopping_engine())

    def pause(self):
        """Pause the execution engine"""
        self.paused = True

    def unpause(self):
        """Resume the execution engine"""
        self.paused = False

    def is_idle(self):
        return self.scheduler.is_idle() and self.downloader.is_idle() and \
            self.scraper.is_idle()

    def next_request(self, spider, now=False):
        """Scrape the next request for the spider passed.

        The next request to be scraped is retrieved from the scheduler and
        requested from the downloader.

        The spider is closed if there are no more pages to scrape.
        """
        if now:
            self._next_request_calls.pop(spider, None)
        elif spider not in self._next_request_calls:
            call = reactor.callLater(0, self.next_request, spider, now=True)
            self._next_request_calls[spider] = call
            return call
        else:
            return

        if self.paused:
            return reactor.callLater(5, self.next_request, spider)

        while not self._needs_backout(spider):
            if not self._next_request(spider):
                break

        if self.spider_is_idle(spider):
            self._spider_idle(spider)

    def _needs_backout(self, spider):
        slot = self.slots[spider]
        return not self.running \
            or slot.closing \
            or self.spider_is_closed(spider) \
            or self.downloader.sites[spider].needs_backout() \
            or self.scraper.sites[spider].needs_backout()

    def _next_request(self, spider):
        request = self.scheduler.next_request(spider)
        if not request:
            return
        d = self._download(request, spider)
        d.addBoth(self._handle_downloader_output, request, spider)
        d.addErrback(log.msg, spider=spider)
        slot = self.slots[spider]
        d.addBoth(lambda _: slot.remove_request(request))
        d.addErrback(log.msg, spider=spider)
        d.addBoth(lambda _: self.next_request(spider))
        return d

    def _handle_downloader_output(self, response, request, spider):
        assert isinstance(response, (Request, Response, Failure)), response
        # downloader middleware can return requests (for example, redirects)
        if isinstance(response, Request):
            self.crawl(response, spider)
            return
        # response is a Response or Failure
        d = defer.Deferred()
        d.addBoth(self.scraper.enqueue_scrape, request, spider)
        d.addErrback(log.err, spider=spider)
        if isinstance(response, Failure):
            d.errback(response)
        else:
            d.callback(response)
        return d

    def spider_is_idle(self, spider):
        scraper_idle = spider in self.scraper.sites \
            and self.scraper.sites[spider].is_idle()
        pending = self.scheduler.spider_has_pending_requests(spider)
        downloading = spider in self.downloader.sites \
            and self.downloader.sites[spider].active
        return scraper_idle and not (pending or downloading)

    def spider_is_closed(self, spider):
        """Return True if the spider is fully closed (ie. not even in the
        closing stage)"""
        return spider not in self.downloader.sites

    @property
    def open_spiders(self):
        return self.downloader.sites.keys()

    def has_capacity(self):
        """Does the engine have capacity to handle more spiders"""
        return len(self.downloader.sites) < self.downloader.concurrent_spiders

    def crawl(self, request, spider):
        assert spider in self.open_spiders, \
            "Spider %r not opened when crawling: %s" % (spider.name, request)
        self.schedule(request, spider)
        self.next_request(spider)

    def schedule(self, request, spider):
        return self.scheduler.enqueue_request(spider, request)

    def download(self, request, spider):
        slot = self.slots[spider]  # slots are keyed by spider, not by request
        slot.add_request(request)
        if isinstance(request, Response):
            return request
        d = self._download(request, spider)
        d.addCallback(self.download, spider)
        d.addBoth(self._remove_request, slot, request)
        return d

    def _remove_request(self, _, slot, request):
        slot.remove_request(request)
        return _

    def _download(self, request, spider):
        slot = self.slots[spider]
        slot.add_request(request)
        def _on_success(response):
            """handle the result of a page download"""
            assert isinstance(response, (Response, Request))
            if isinstance(response, Response):
                response.request = request # tie request to response received
                log.msg(log.formatter.crawled(request, response, spider), \
                    level=log.DEBUG, spider=spider)
                send_catch_log(signal=signals.response_received, \
                    response=response, request=request, spider=spider)
            return response

        def _on_complete(_):
            self.next_request(spider)
            return _

        dwld = mustbe_deferred(self.downloader.fetch, request, spider)
        dwld.addCallback(_on_success)
        dwld.addBoth(_on_complete)
        return dwld

    @defer.inlineCallbacks
    def open_spider(self, spider):
        assert self.has_capacity(), "No free spider slots when opening %r" % \
            spider.name
        log.msg("Spider opened", spider=spider)
        self.slots[spider] = Slot()
        yield self.scheduler.open_spider(spider)
        self.downloader.open_spider(spider)
        yield self.scraper.open_spider(spider)
        stats.open_spider(spider)
        yield send_catch_log_deferred(signals.spider_opened, spider=spider)
        self.next_request(spider)

    def _spider_idle(self, spider):
        """Called when a spider gets idle. This function is called when there
        are no remaining pages to download or schedule. It can be called
        multiple times. If some extension raises a DontCloseSpider exception
        (in the spider_idle signal handler) the spider is not closed until the
        next loop and this function is guaranteed to be called (at least) once
        again for this spider.
        """
        res = send_catch_log(signal=signals.spider_idle, \
            spider=spider, dont_log=DontCloseSpider)
        if any(isinstance(x, Failure) and isinstance(x.value, DontCloseSpider) \
                for _, x in res):
            reactor.callLater(5, self.next_request, spider)
            return

        if self.spider_is_idle(spider):
            self.close_spider(spider, reason='finished')

    def close_spider(self, spider, reason='cancelled'):
        """Close (cancel) spider and clear all its outstanding requests"""

        slot = self.slots[spider]
        if slot.closing:
            return slot.closing
        log.msg("Closing spider (%s)" % reason, spider=spider)

        self.scheduler.clear_pending_requests(spider)

        dfd = slot.close()

        dfd.addBoth(lambda _: self.downloader.close_spider(spider))
        dfd.addErrback(log.err, spider=spider)

        dfd.addBoth(lambda _: self.scraper.close_spider(spider))
        dfd.addErrback(log.err, spider=spider)

        dfd.addBoth(lambda _: self.scheduler.close_spider(spider))
        dfd.addErrback(log.err, spider=spider)

        dfd.addBoth(lambda _: self._cancel_next_call(spider))
        dfd.addErrback(log.err, spider=spider)

        dfd.addBoth(lambda _: send_catch_log_deferred(signal=signals.spider_closed, \
            spider=spider, reason=reason))
        dfd.addErrback(log.err, spider=spider)

        dfd.addBoth(lambda _: stats.close_spider(spider, reason=reason))
        dfd.addErrback(log.err, spider=spider)

        dfd.addBoth(lambda _: log.msg("Spider closed (%s)" % reason, spider=spider))

        dfd.addBoth(lambda _: self.slots.pop(spider))
        dfd.addErrback(log.err, spider=spider)

        dfd.addBoth(lambda _: self._spider_closed_callback(spider))

        return dfd

    def _cancel_next_call(self, spider):
        call = self._next_request_calls.pop(spider, None)
        if call and call.active():  # DelayedCall.active is a method
            call.cancel()

    def _close_all_spiders(self):
        dfds = [self.close_spider(s, reason='shutdown') for s in self.open_spiders]
        dlist = defer.DeferredList(dfds)
        return dlist

    @defer.inlineCallbacks
    def _finish_stopping_engine(self):
        yield send_catch_log_deferred(signal=signals.engine_stopped)
        yield stats.engine_stopped()
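
This older engine predates CallLaterOnce and debounces next_request by hand: the first call per spider arms reactor.callLater(0, ...), subsequent calls are no-ops until it fires, and _cancel_next_call cancels any pending call on close. The same pattern in isolation, assuming only the Twisted reactor (the Debouncer name is invented for this sketch):

from twisted.internet import reactor

class Debouncer:
    """Coalesce bursts of trigger() calls into one delayed invocation per key."""

    def __init__(self, func):
        self._func = func
        self._pending = {}

    def trigger(self, key):
        # the first call arms the timer; repeats before it fires are no-ops
        if key not in self._pending:
            self._pending[key] = reactor.callLater(0, self._fire, key)

    def _fire(self, key):
        self._pending.pop(key, None)
        self._func(key)

    def cancel(self, key):
        call = self._pending.pop(key, None)
        if call and call.active():
            call.cancel()
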