Example #1
 def test_close_engine_spiders_downloader(self):
     e = ExecutionEngine(get_crawler(TestSpider), lambda _: None)
     yield e.open_spider(TestSpider(), [])
     e.start()
     self.assertTrue(e.running)
     yield e.close()
     self.assertFalse(e.running)
     self.assertEqual(len(e.open_spiders), 0)
Example #2
class Crawler(object):

    def __init__(self, settings):
        self.configured = False
        self.settings = settings
        self.signals = SignalManager(self)
        self.stats = load_object(settings['STATS_CLASS'])(self)
        self._start_requests = lambda: ()
        self._spider = None
        # TODO: move SpiderManager to CrawlerProcess
        spman_cls = load_object(self.settings['SPIDER_MANAGER_CLASS'])
        self.spiders = spman_cls.from_crawler(self)

    def install(self):
        # TODO: remove together with scrapy.project.crawler usage
        import scrapy.project
        assert not hasattr(
            scrapy.project, 'crawler'), "crawler already installed"
        scrapy.project.crawler = self

    def uninstall(self):
        # TODO: remove together with scrapy.project.crawler usage
        import scrapy.project
        assert hasattr(scrapy.project, 'crawler'), "crawler not installed"
        del scrapy.project.crawler

    def configure(self):
        if self.configured:
            return

        self.configured = True
        lf_cls = load_object(self.settings['LOG_FORMATTER'])
        self.logformatter = lf_cls.from_crawler(self)
        self.extensions = ExtensionManager.from_crawler(self)
        self.engine = ExecutionEngine(self, self._spider_closed)

    def crawl(self, spider, requests=None):
        assert self._spider is None, 'Spider already attached'
        self._spider = spider
        spider.set_crawler(self)
        if requests is None:
            self._start_requests = spider.start_requests
        else:
            self._start_requests = lambda: requests

    def _spider_closed(self, spider=None):
        if not self.engine.open_spiders:
            self.stop()

    @defer.inlineCallbacks
    def start(self):
        yield defer.maybeDeferred(self.configure)
        if self._spider:
            yield self.engine.open_spider(self._spider, self._start_requests())
        yield defer.maybeDeferred(self.engine.start)

    @defer.inlineCallbacks
    def stop(self):
        if self.configured and self.engine.running:
            yield defer.maybeDeferred(self.engine.stop)
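
A minimal driving sketch for the Crawler in Example #2 (an assumption, not taken from the sources above): MySpider and my_settings are hypothetical placeholders, and the reactor shutdown relies on start()'s Deferred firing once the engine stops, which the single-spider wiring above suggests but does not show.

from twisted.internet import reactor

crawler = Crawler(my_settings)            # my_settings: a Scrapy Settings object (hypothetical)
crawler.install()                         # optional: expose it as scrapy.project.crawler
crawler.crawl(MySpider())                 # attach a single spider; its start_requests() will be used
d = crawler.start()                       # configure(), open the spider, start the engine
d.addBoth(lambda _: reactor.stop())       # stop the reactor when the crawl Deferred fires
reactor.run()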
Example #3
 def configure(self):
     if self.configured:
         return
     self.configured = True
     self.extensions = ExtensionManager.from_crawler(self)
     spman_cls = load_object(self.settings['SPIDER_MANAGER_CLASS'])
     self.spiders = spman_cls.from_crawler(self)
     self.engine = ExecutionEngine(self, self._spider_closed)
Example #4
 def configure(self):
     if self.configured:
         return
     self.configured = True
     self.extensions = ExtensionManager(self.metas, self)
     spman_cls = load_object(self.spider_manager_class.to_value())
     self.spiders = spman_cls(self.metas)
     self.engine = ExecutionEngine(self, self._spider_closed)
Example #5
    def configure(self):
        if self.configured:
            return

        self.configured = True
        lf_cls = load_object(self.settings['LOG_FORMATTER'])
        self.logformatter = lf_cls.from_crawler(self)
        self.extensions = ExtensionManager.from_crawler(self)
        self.engine = ExecutionEngine(self, self._spider_closed)
Example #6
 def configure(self):
     if self.configured:
         return
     self.configured = True
     d = dict(overridden_settings(self.settings))
     log.msg(format="Overridden settings: %(settings)r", settings=d, level=log.DEBUG)
     lf_cls = load_object(self.settings['LOG_FORMATTER'])
     self.logformatter = lf_cls.from_crawler(self)
     self.extensions = ExtensionManager.from_crawler(self)
     spman_cls = load_object(self.settings['SPIDER_MANAGER_CLASS'])
     self.spiders = spman_cls.from_crawler(self)
     self.engine = ExecutionEngine(self, self._spider_closed)
Example #7
class Crawler(object):

    def __init__(self, settings):
        self.configured = False
        self.settings = settings
        self.signals = SignalManager(self)
        self.stats = load_object(settings['STATS_CLASS'])(self)

    def install(self):
        import scrapy.project
        assert not hasattr(scrapy.project, 'crawler'), "crawler already installed"
        scrapy.project.crawler = self

    def uninstall(self):
        import scrapy.project
        assert hasattr(scrapy.project, 'crawler'), "crawler not installed"
        del scrapy.project.crawler

    def configure(self):
        if self.configured:
            return
        self.configured = True
        d = dict(overridden_settings(self.settings))
        log.msg(format="Overridden settings: %(settings)r", settings=d, level=log.DEBUG)
        lf_cls = load_object(self.settings['LOG_FORMATTER'])
        self.logformatter = lf_cls.from_crawler(self)
        self.extensions = ExtensionManager.from_crawler(self)
        spman_cls = load_object(self.settings['SPIDER_MANAGER_CLASS'])
        self.spiders = spman_cls.from_crawler(self)
        self.engine = ExecutionEngine(self, self._spider_closed)

    def crawl(self, spider, requests=None):
        spider.set_crawler(self)
        if requests is None:
            requests = spider.start_requests()
        return self.engine.open_spider(spider, requests)

    def _spider_closed(self, spider=None):
        if not self.engine.open_spiders:
            self.stop()

    @defer.inlineCallbacks
    def start(self):
        yield defer.maybeDeferred(self.configure)
        yield defer.maybeDeferred(self.engine.start)

    @defer.inlineCallbacks
    def stop(self):
        if self.engine.running:
            yield defer.maybeDeferred(self.engine.stop)
Example #8
 def configure(self):
     if self.configured:
         return
     self.configured = True
     self.extensions = ExtensionManager.from_settings(self.settings)
     spman_cls = load_object(self.settings['SPIDER_MANAGER_CLASS'])
     self.spiders = spman_cls.from_settings(self.settings)
     spq_cls = load_object(self.settings['SPIDER_QUEUE_CLASS'])
     spq = spq_cls.from_settings(self.settings)
     keepalive = self.settings.getbool('KEEP_ALIVE')
     pollint = self.settings.getfloat('QUEUE_POLL_INTERVAL')
     self.queue = ExecutionQueue(self.spiders, spq, poll_interval=pollint,
         keep_alive=keepalive)
     self.engine = ExecutionEngine(self.settings, self._spider_closed)
Example #9
class Crawler(object):
    def __init__(self, settings):
        self.configured = False
        self.settings = settings
        self.signals = SignalManager(self)
        self.stats = load_object(settings["STATS_CLASS"])(self)

    def install(self):
        import scrapy.project

        assert not hasattr(scrapy.project, "crawler"), "crawler already installed"
        scrapy.project.crawler = self

    def uninstall(self):
        import scrapy.project

        assert hasattr(scrapy.project, "crawler"), "crawler not installed"
        del scrapy.project.crawler

    def configure(self):
        if self.configured:
            return
        self.configured = True
        self.extensions = ExtensionManager.from_crawler(self)
        spman_cls = load_object(self.settings["SPIDER_MANAGER_CLASS"])
        self.spiders = spman_cls.from_crawler(self)
        self.engine = ExecutionEngine(self, self._spider_closed)

    def crawl(self, spider, requests=None):
        spider.set_crawler(self)
        if requests is None:
            requests = spider.start_requests()
        return self.engine.open_spider(spider, requests)

    def _spider_closed(self, spider=None):
        if not self.engine.open_spiders:
            self.stop()

    @defer.inlineCallbacks
    def start(self):
        yield defer.maybeDeferred(self.configure)
        yield defer.maybeDeferred(self.engine.start)

    @defer.inlineCallbacks
    def stop(self):
        if self.engine.running:
            yield defer.maybeDeferred(self.engine.stop)
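
In Example #9, crawl() talks to self.engine directly, so it can only be called after configure() has created the engine. A plausible driving sequence under that constraint (a sketch, not from the sources; MySpider and my_settings are hypothetical, and the spider_closed signal is used to stop the reactor because stop() here only halts the engine):

from twisted.internet import reactor
from scrapy import signals

crawler = Crawler(my_settings)                  # my_settings: hypothetical Settings object
crawler.configure()                             # creates crawler.engine before crawl() needs it
crawler.signals.connect(reactor.stop, signal=signals.spider_closed)
crawler.crawl(MySpider())                       # opens the spider on the engine
crawler.start()                                 # configure() is now a no-op; starts the engine
reactor.run()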
Example #10
class Crawler(SettingObject):
    
    spider_manager_class = StringField(default="scrapy.spidermanager.SpiderManager")
    
    def __init__(self, settings):
        super(Crawler, self).__init__(settings)
        self.configured = False

    def configure(self):
        if self.configured:
            return
        self.configured = True
        self.extensions = ExtensionManager(self.metas, self)
        spman_cls = load_object(self.spider_manager_class.to_value())
        self.spiders = spman_cls(self.metas)
        self.engine = ExecutionEngine(self, self._spider_closed)

    def crawl(self, spider, requests=None):
        spider.set_crawler(self)
        if requests is None:
            requests = spider.start_requests()
        return self.engine.open_spider(spider, requests)

    def _spider_closed(self, spider=None):
        if not self.engine.open_spiders:
            self.stop()

    @defer.inlineCallbacks
    def start(self):
        yield defer.maybeDeferred(self.configure)
        yield defer.maybeDeferred(self.engine.start)

    @defer.inlineCallbacks
    def stop(self):
        if self.engine.running:
            yield defer.maybeDeferred(self.engine.stop)
Example #11
 def test_close_spiders_downloader(self):
     e = ExecutionEngine(get_crawler(TestSpider), lambda _: None)
     yield e.open_spider(TestSpider(), [])
     self.assertEqual(len(e.open_spiders), 1)
     yield e.close()
     self.assertEqual(len(e.open_spiders), 0)
Example #12
 def test_close_downloader(self):
     e = ExecutionEngine(get_crawler(TestSpider), lambda _: None)
     yield e.close()
Example #13
class Crawler(object):

    def __init__(self, settings):
        self.configured = False
        self.settings = settings
        self.signals = SignalManager(self)
        self.stats = load_object(settings['STATS_CLASS'])(self)

        spman_cls = load_object(self.settings['SPIDER_MANAGER_CLASS'])
        self.spiders = spman_cls.from_crawler(self)
        self._scheduled = {}

    def install(self):
        import scrapy.project
        assert not hasattr(scrapy.project, 'crawler'), "crawler already installed"
        scrapy.project.crawler = self

    def uninstall(self):
        import scrapy.project
        assert hasattr(scrapy.project, 'crawler'), "crawler not installed"
        del scrapy.project.crawler

    def configure(self):
        if self.configured:
            return

        self.configured = True
        lf_cls = load_object(self.settings['LOG_FORMATTER'])
        self.logformatter = lf_cls.from_crawler(self)
        self.extensions = ExtensionManager.from_crawler(self)
        self.engine = ExecutionEngine(self, self._spider_closed)

    def crawl(self, spider, requests=None):
        spider.set_crawler(self)
        if self.configured and self.engine.running:
            assert not self._scheduled
            return self._schedule(spider, requests)
        elif requests is None:
            self._scheduled[spider] = None
        else:
            self._scheduled.setdefault(spider, []).append(requests)

    def _schedule(self, spider, batches=()):
        requests = chain.from_iterable(batches) \
            if batches else spider.start_requests()
        return self.engine.open_spider(spider, requests)

    def _spider_closed(self, spider=None):
        if not self.engine.open_spiders:
            self.stop()

    @defer.inlineCallbacks
    def start(self):
        yield defer.maybeDeferred(self.configure)

        for spider, batches in self._scheduled.iteritems():
            yield self._schedule(spider, batches)

        yield defer.maybeDeferred(self.engine.start)

    @defer.inlineCallbacks
    def stop(self):
        if self.engine.running:
            yield defer.maybeDeferred(self.engine.stop)
Example #14
 def test_close_spiders_downloader(self):
     e = ExecutionEngine(get_crawler(TestSpider), lambda _: None)
     yield e.open_spider(TestSpider(), [])
     self.assertEqual(len(e.open_spiders), 1)
     yield e.close()
     self.assertEqual(len(e.open_spiders), 0)
Example #15
 def _create_engine(self):
     ## Return an instance of the execution engine class
     return ExecutionEngine(self, lambda _: self.stop())
Example #16
class ExecutionManager(object):

    def __init__(self):
        self.configured = False
        self.control_reactor = True
        self.engine = ExecutionEngine()

    def configure(self, control_reactor=True, queue=None):
        self.control_reactor = control_reactor
        if control_reactor:
            install_shutdown_handlers(self._signal_shutdown)

        if not log.started:
            log.start()
        if not extensions.loaded:
            extensions.load()
        if not spiders.loaded:
            spiders.load()
        log.msg("Enabled extensions: %s" % ", ".join(extensions.enabled.iterkeys()),
            level=log.DEBUG)

        self.queue = queue or ExecutionQueue()
        self.engine.configure(self._spider_closed)
        self.configured = True

    @defer.inlineCallbacks
    def _start_next_spider(self):
        spider, requests = yield defer.maybeDeferred(self.queue.get_next)
        if spider:
            self._start_spider(spider, requests)
        if self.engine.has_capacity() and not self._nextcall.active():
            self._nextcall = reactor.callLater(self.queue.polling_delay, \
                self._start_next_spider)

    @defer.inlineCallbacks
    def _start_spider(self, spider, requests):
        """Don't call this method. Use self.queue to start new spiders"""
        yield defer.maybeDeferred(self.engine.open_spider, spider)
        for request in requests:
            self.engine.crawl(request, spider)

    @defer.inlineCallbacks
    def _spider_closed(self, spider=None):
        if not self.engine.open_spiders:
            is_finished = yield defer.maybeDeferred(self.queue.is_finished)
            if is_finished:
                self.stop()
                return
        if self.engine.has_capacity():
            self._start_next_spider()

    @defer.inlineCallbacks
    def start(self):
        yield defer.maybeDeferred(self.engine.start)
        self._nextcall = reactor.callLater(0, self._spider_closed)
        reactor.addSystemEventTrigger('before', 'shutdown', self.stop)
        if self.control_reactor:
            reactor.run(installSignalHandlers=False)

    @defer.inlineCallbacks
    def stop(self):
        if self._nextcall.active():
            self._nextcall.cancel()
        if self.engine.running:
            yield defer.maybeDeferred(self.engine.stop)
        try:
            reactor.stop()
        except RuntimeError: # raised if already stopped or in shutdown stage
            pass

    def _signal_shutdown(self, signum, _):
        signame = signal_names[signum]
        log.msg("Received %s, shutting down gracefully. Send again to force " \
            "unclean shutdown" % signame, level=log.INFO)
        reactor.callFromThread(self.stop)
        install_shutdown_handlers(self._signal_kill)

    def _signal_kill(self, signum, _):
        signame = signal_names[signum]
        log.msg('Received %s twice, forcing unclean shutdown' % signame, \
            level=log.INFO)
        log.log_level = log.SILENT # disable logging of confusing tracebacks
        reactor.callFromThread(self.engine.kill)
        install_shutdown_handlers(signal.SIG_IGN)
Example #17
 def _create_engine(self):
     # p.11 Instantiate the engine and attach the callback used when the engine stops normally
     return ExecutionEngine(self, lambda _: self.stop())
Example #18
class Crawler(object):
    def __init__(self, settings):
        self.configured = False
        self.settings = settings
        self.signals = SignalManager(self)
        self.stats = load_object(settings['STATS_CLASS'])(self)
        self._start_requests = lambda: ()
        self._spider = None
        # TODO: move SpiderManager to CrawlerProcess
        spman_cls = load_object(self.settings['SPIDER_MANAGER_CLASS'])
        self.spiders = spman_cls.from_crawler(self)

    def install(self):
        # TODO: remove together with scrapy.project.crawler usage
        import scrapy.project
        assert not hasattr(scrapy.project,
                           'crawler'), "crawler already installed"
        scrapy.project.crawler = self

    def uninstall(self):
        # TODO: remove together with scrapy.project.crawler usage
        import scrapy.project
        assert hasattr(scrapy.project, 'crawler'), "crawler not installed"
        del scrapy.project.crawler

    def configure(self):
        if self.configured:
            return

        self.configured = True
        lf_cls = load_object(self.settings['LOG_FORMATTER'])
        self.logformatter = lf_cls.from_crawler(self)
        self.extensions = ExtensionManager.from_crawler(self)
        self.engine = ExecutionEngine(self, self._spider_closed)

    def crawl(self, spider, requests=None):
        assert self._spider is None, 'Spider already attached'
        self._spider = spider
        spider.set_crawler(self)
        if requests is None:
            self._start_requests = spider.start_requests
        else:
            self._start_requests = lambda: requests

    def _spider_closed(self, spider=None):
        if not self.engine.open_spiders:
            self.stop()

    # Modified by tangm, 2015.03.27 04:00,
    # to support passing close_if_idle through to the engine
    @defer.inlineCallbacks
    def start(self, close_if_idle=True):
        yield defer.maybeDeferred(self.configure)
        if self._spider:
            yield self.engine.open_spider(self._spider,
                                          self._start_requests(),
                                          close_if_idle=close_if_idle)
        yield defer.maybeDeferred(self.engine.start)

    @defer.inlineCallbacks
    def stop(self):
        if self.configured and self.engine.running:
            yield defer.maybeDeferred(self.engine.stop)
Example #19
class Crawler(object):
    def __init__(self, settings):
        self.configured = False
        self.settings = settings
        self.signals = SignalManager(self)
        self.stats = load_object(settings['STATS_CLASS'])(self)

        spman_cls = load_object(self.settings['SPIDER_MANAGER_CLASS'])
        self.spiders = spman_cls.from_crawler(self)

        self.scheduled = {}

    def install(self):
        import scrapy.project
        assert not hasattr(scrapy.project,
                           'crawler'), "crawler already installed"
        scrapy.project.crawler = self

    def uninstall(self):
        import scrapy.project
        assert hasattr(scrapy.project, 'crawler'), "crawler not installed"
        del scrapy.project.crawler

    def configure(self):
        if self.configured:
            return

        self.configured = True
        lf_cls = load_object(self.settings['LOG_FORMATTER'])
        self.logformatter = lf_cls.from_crawler(self)
        self.extensions = ExtensionManager.from_crawler(self)
        self.engine = ExecutionEngine(self, self._spider_closed)

    def crawl(self, spider, requests=None):
        spider.set_crawler(self)

        if self.configured and self.engine.running:
            assert not self.scheduled
            return self.schedule(spider, requests)
        else:
            self.scheduled.setdefault(spider, []).append(requests)

    def schedule(self, spider, batches=()):  # avoid a mutable default argument
        requests = []
        for batch in batches:
            if batch is None:
                batch = spider.start_requests()
            requests.extend(batch)

        return self.engine.open_spider(spider, requests)

    def _spider_closed(self, spider=None):
        if not self.engine.open_spiders:
            self.stop()

    @defer.inlineCallbacks
    def start(self):
        yield defer.maybeDeferred(self.configure)

        for spider, batches in self.scheduled.iteritems():
            yield self.schedule(spider, batches)

        yield defer.maybeDeferred(self.engine.start)

    @defer.inlineCallbacks
    def stop(self):
        if self.engine.running:
            yield defer.maybeDeferred(self.engine.stop)
Example #20
 def __init__(self):
     self.configured = False
     self.control_reactor = True
     self.engine = ExecutionEngine()
Example #21
 def _create_engine(self):
     return ExecutionEngine(self, lambda _: self.stop())
Example #22
 def test_close_downloader(self):
     e = ExecutionEngine(get_crawler(TestSpider), lambda _: None)
     yield e.close()
Example #23
class Crawler(object):

    def __init__(self, settings):
        self.configured = False
        self.settings = settings

    def install(self):
        import scrapy.project
        assert not hasattr(scrapy.project, 'crawler'), "crawler already installed"
        scrapy.project.crawler = self

    def uninstall(self):
        import scrapy.project
        assert hasattr(scrapy.project, 'crawler'), "crawler not installed"
        del scrapy.project.crawler

    def configure(self):
        if self.configured:
            return
        self.configured = True
        self.extensions = ExtensionManager.from_settings(self.settings)
        spman_cls = load_object(self.settings['SPIDER_MANAGER_CLASS'])
        self.spiders = spman_cls.from_settings(self.settings)
        spq_cls = load_object(self.settings['SPIDER_QUEUE_CLASS'])
        spq = spq_cls.from_settings(self.settings)
        keepalive = self.settings.getbool('KEEP_ALIVE')
        pollint = self.settings.getfloat('QUEUE_POLL_INTERVAL')
        self.queue = ExecutionQueue(self.spiders, spq, poll_interval=pollint,
            keep_alive=keepalive)
        self.engine = ExecutionEngine(self.settings, self._spider_closed)

    @defer.inlineCallbacks
    def _start_next_spider(self):
        spider, requests = yield defer.maybeDeferred(self.queue.get_next)
        if spider:
            self._start_spider(spider, requests)
        if self.engine.has_capacity() and not self._nextcall.active():
            self._nextcall = reactor.callLater(self.queue.poll_interval, \
                self._spider_closed)

    @defer.inlineCallbacks
    def _start_spider(self, spider, requests):
        """Don't call this method. Use self.queue to start new spiders"""
        spider.set_crawler(self)
        yield defer.maybeDeferred(self.engine.open_spider, spider)
        for request in requests:
            self.engine.crawl(request, spider)

    @defer.inlineCallbacks
    def _spider_closed(self, spider=None):
        if not self.engine.open_spiders:
            is_finished = yield defer.maybeDeferred(self.queue.is_finished)
            if is_finished:
                self.stop()
                return
        if self.engine.has_capacity():
            self._start_next_spider()

    @defer.inlineCallbacks
    def start(self):
        yield defer.maybeDeferred(self.configure)
        yield defer.maybeDeferred(self.engine.start)
        self._nextcall = reactor.callLater(0, self._start_next_spider)

    @defer.inlineCallbacks
    def stop(self):
        if self._nextcall.active():
            self._nextcall.cancel()
        if self.engine.running:
            yield defer.maybeDeferred(self.engine.stop)