Example #1
@defer.inlineCallbacks  # needed: the test body yields Deferreds
def test_crawl_deprecated_spider_arg(self):
    with pytest.warns(ScrapyDeprecationWarning,
                      match="Passing a 'spider' argument to "
                            "ExecutionEngine.crawl is deprecated"):
        e = ExecutionEngine(get_crawler(TestSpider), lambda _: None)
        spider = TestSpider()
        yield e.open_spider(spider, [])
        e.start()
        e.crawl(Request("data:,"), spider)
        yield e.close()
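
The snippet above is a method of a Twisted trial test case; the extractor trimmed the surrounding class and imports (it appears to come from Scrapy's engine test suite). A minimal sketch of the scaffolding and imports it assumes, with illustrative class and spider names:

import pytest
from twisted.internet import defer
from twisted.trial import unittest

from scrapy import Request, Spider
from scrapy.core.engine import ExecutionEngine
from scrapy.exceptions import ScrapyDeprecationWarning
from scrapy.utils.test import get_crawler


class TestSpider(Spider):  # stand-in for the spider the test instantiates
    name = "test"


class EngineTest(unittest.TestCase):  # hypothetical host class for the method
    @defer.inlineCallbacks
    def test_example(self):
        # trial waits on the Deferred returned by inlineCallbacks,
        # so the test body may yield Deferreds safely
        yield defer.succeed(None)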
Example #2
@defer.inlineCallbacks  # needed: the test body yields Deferreds
def test_crawl_deprecated_spider_arg(self):
    with warnings.catch_warnings(record=True) as warning_list:
        e = ExecutionEngine(get_crawler(TestSpider), lambda _: None)
        spider = TestSpider()
        yield e.open_spider(spider, [])
        e.start()
        e.crawl(Request("data:,"), spider)
        yield e.close()
        self.assertEqual(warning_list[0].category, ScrapyDeprecationWarning)
        self.assertEqual(
            str(warning_list[0].message),
            "Passing a 'spider' argument to ExecutionEngine.crawl is deprecated",
        )
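
Example #2 performs the same check with the standard library instead of pytest: warnings.catch_warnings(record=True) collects every warning emitted inside the block into a list. A self-contained illustration of that stdlib pattern:

import warnings

with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")  # make sure no filter suppresses the warning
    warnings.warn("demo message", DeprecationWarning)

assert caught[0].category is DeprecationWarning
assert str(caught[0].message) == "demo message"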
Example #3
class ExecutionManager(object):

    def __init__(self):
        self.configured = False
        self.control_reactor = True
        self.engine = ExecutionEngine()

    def configure(self, control_reactor=True, queue=None):
        self.control_reactor = control_reactor
        if control_reactor:
            install_shutdown_handlers(self._signal_shutdown)

        if not log.started:
            log.start()
        if not extensions.loaded:
            extensions.load()
        if not spiders.loaded:
            spiders.load()
        log.msg("Enabled extensions: %s" % ", ".join(extensions.enabled.iterkeys()),
            level=log.DEBUG)

        self.queue = queue or ExecutionQueue()
        self.engine.configure(self._spider_closed)
        self.configured = True

    @defer.inlineCallbacks
    def _start_next_spider(self):
        spider, requests = yield defer.maybeDeferred(self.queue.get_next)
        if spider:
            self._start_spider(spider, requests)
        if self.engine.has_capacity() and not self._nextcall.active():
            self._nextcall = reactor.callLater(self.queue.polling_delay,
                self._start_next_spider)

    @defer.inlineCallbacks
    def _start_spider(self, spider, requests):
        """Don't call this method. Use self.queue to start new spiders"""
        yield defer.maybeDeferred(self.engine.open_spider, spider)
        for request in requests:
            self.engine.crawl(request, spider)

    @defer.inlineCallbacks
    def _spider_closed(self, spider=None):
        if not self.engine.open_spiders:
            is_finished = yield defer.maybeDeferred(self.queue.is_finished)
            if is_finished:
                self.stop()
                return
        if self.engine.has_capacity():
            self._start_next_spider()

    @defer.inlineCallbacks
    def start(self):
        yield defer.maybeDeferred(self.engine.start)
        self._nextcall = reactor.callLater(0, self._spider_closed)
        reactor.addSystemEventTrigger('before', 'shutdown', self.stop)
        if self.control_reactor:
            reactor.run(installSignalHandlers=False)

    @defer.inlineCallbacks
    def stop(self):
        if self._nextcall.active():
            self._nextcall.cancel()
        if self.engine.running:
            yield defer.maybeDeferred(self.engine.stop)
        try:
            reactor.stop()
        except RuntimeError: # raised if already stopped or in shutdown stage
            pass

    def _signal_shutdown(self, signum, _):
        signame = signal_names[signum]
        log.msg("Received %s, shutting down gracefully. Send again to force " \
            "unclean shutdown" % signame, level=log.INFO)
        reactor.callFromThread(self.stop)
        install_shutdown_handlers(self._signal_kill)

    def _signal_kill(self, signum, _):
        signame = signal_names[signum]
        log.msg('Received %s twice, forcing unclean shutdown' % signame,
            level=log.INFO)
        log.log_level = log.SILENT # disable logging of confusing tracebacks
        reactor.callFromThread(self.engine.kill)
        install_shutdown_handlers(signal.SIG_IGN)
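
The _signal_shutdown/_signal_kill pair above implements a two-phase shutdown: the first signal requests a graceful stop and re-arms the handlers, and a second signal forces an unclean exit. A standalone sketch of the same pattern using only the stdlib signal module (a plain-Python stand-in, not Scrapy's install_shutdown_handlers):

import signal
import sys

def _signal_kill(signum, frame):
    print("second signal: forcing unclean shutdown")
    sys.exit(1)

def _signal_shutdown(signum, frame):
    print("first signal: shutting down gracefully; send again to force")
    # escalate: a repeated signal now triggers the kill handler
    signal.signal(signal.SIGINT, _signal_kill)
    signal.signal(signal.SIGTERM, _signal_kill)

signal.signal(signal.SIGINT, _signal_shutdown)
signal.signal(signal.SIGTERM, _signal_shutdown)
signal.pause()  # wait for signals (POSIX only)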
Example #4
class Crawler(object):
    def __init__(self, settings):
        self.configured = False
        self.settings = settings

    def install(self):
        import scrapy.project
        assert not hasattr(scrapy.project,
                           'crawler'), "crawler already installed"
        scrapy.project.crawler = self

    def uninstall(self):
        import scrapy.project
        assert hasattr(scrapy.project, 'crawler'), "crawler not installed"
        del scrapy.project.crawler

    def configure(self):
        if self.configured:
            return
        self.configured = True
        self.extensions = ExtensionManager.from_settings(self.settings)  # load the extensions
        spman_cls = load_object(self.settings['SPIDER_MANAGER_CLASS'])
        self.spiders = spman_cls.from_settings(self.settings)
        spq_cls = load_object(self.settings['SPIDER_QUEUE_CLASS'])
        spq = spq_cls.from_settings(self.settings)
        keepalive = self.settings.getbool('KEEP_ALIVE')
        pollint = self.settings.getfloat('QUEUE_POLL_INTERVAL')
        self.queue = ExecutionQueue(self.spiders,
                                    spq,
                                    poll_interval=pollint,
                                    keep_alive=keepalive)  # build the spider queue
        self.engine = ExecutionEngine(self.settings,
                                      self._spider_closed)  # create the execution engine

    @defer.inlineCallbacks
    def _start_next_spider(self):
        spider, requests = yield defer.maybeDeferred(self.queue.get_next)
        if spider:
            self._start_spider(spider, requests)
        if self.engine.has_capacity() and not self._nextcall.active():
            self._nextcall = reactor.callLater(self.queue.poll_interval,
                self._spider_closed)

    @defer.inlineCallbacks
    def _start_spider(self, spider, requests):
        """Don't call this method. Use self.queue to start new spiders"""
        spider.set_crawler(self)
        yield defer.maybeDeferred(self.engine.open_spider, spider)
        for request in requests:
            self.engine.crawl(request, spider)

    @defer.inlineCallbacks
    def _spider_closed(self, spider=None):
        if not self.engine.open_spiders:
            is_finished = yield defer.maybeDeferred(self.queue.is_finished)
            if is_finished:
                self.stop()
                return
        if self.engine.has_capacity():
            self._start_next_spider()

    @defer.inlineCallbacks
    def start(self):
        yield defer.maybeDeferred(self.configure)
        yield defer.maybeDeferred(self.engine.start)
        self._nextcall = reactor.callLater(0, self._start_next_spider)

    @defer.inlineCallbacks
    def stop(self):
        if self._nextcall.active():
            self._nextcall.cancel()
        if self.engine.running:
            yield defer.maybeDeferred(self.engine.stop)
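
_start_next_spider and start drive the spider queue by re-arming a reactor.callLater timer rather than blocking on the queue. A minimal, runnable illustration of that polling pattern (the queue contents and interval are invented for the sketch):

from twisted.internet import reactor

POLL_INTERVAL = 0.5
pending = ["spider-a", "spider-b"]  # hypothetical queued work

def poll():
    if pending:
        print("starting", pending.pop(0))
        reactor.callLater(POLL_INTERVAL, poll)  # re-arm while work remains
    else:
        reactor.stop()  # nothing left: let the loop end

reactor.callLater(0, poll)
reactor.run()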