Example #1
 def test_start_already_running_exception(self):
     e = ExecutionEngine(get_crawler(TestSpider), lambda _: None)
     yield e.open_spider(TestSpider(), [])
     e.start()
     yield self.assertFailure(e.start(), RuntimeError).addBoth(
         lambda exc: self.assertEqual(str(exc), "Engine already running"))
     yield e.stop()
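The test above pins down the engine's behaviour when start() is called twice: the second call must fail with RuntimeError("Engine already running"). A minimal sketch of that kind of guard, purely illustrative and not Scrapy's actual ExecutionEngine code:

class EngineSketch:
    def __init__(self):
        self.running = False

    def start(self):
        # Refuse to start twice; mirrors the error message asserted above.
        if self.running:
            raise RuntimeError("Engine already running")
        self.running = True

    def stop(self):
        self.running = False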
Example #2
class Crawler(object):

    def __init__(self, settings):
        self.configured = False
        self.settings = settings
        self.signals = SignalManager(self)
        self.stats = load_object(settings['STATS_CLASS'])(self)
        self._start_requests = lambda: ()
        self._spider = None
        # TODO: move SpiderManager to CrawlerProcess
        spman_cls = load_object(self.settings['SPIDER_MANAGER_CLASS'])
        self.spiders = spman_cls.from_crawler(self)

    def install(self):
        # TODO: remove together with scrapy.project.crawler usage
        import scrapy.project
        assert not hasattr(
            scrapy.project, 'crawler'), "crawler already installed"
        scrapy.project.crawler = self

    def uninstall(self):
        # TODO: remove together with scrapy.project.crawler usage
        import scrapy.project
        assert hasattr(scrapy.project, 'crawler'), "crawler not installed"
        del scrapy.project.crawler

    def configure(self):
        if self.configured:
            return

        self.configured = True
        lf_cls = load_object(self.settings['LOG_FORMATTER'])
        self.logformatter = lf_cls.from_crawler(self)
        self.extensions = ExtensionManager.from_crawler(self)
        self.engine = ExecutionEngine(self, self._spider_closed)

    def crawl(self, spider, requests=None):
        assert self._spider is None, 'Spider already attached'
        self._spider = spider
        spider.set_crawler(self)
        if requests is None:
            self._start_requests = spider.start_requests
        else:
            self._start_requests = lambda: requests

    def _spider_closed(self, spider=None):
        if not self.engine.open_spiders:
            self.stop()

    @defer.inlineCallbacks
    def start(self):
        yield defer.maybeDeferred(self.configure)
        if self._spider:
            yield self.engine.open_spider(self._spider, self._start_requests())
        yield defer.maybeDeferred(self.engine.start)

    @defer.inlineCallbacks
    def stop(self):
        if self.configured and self.engine.running:
            yield defer.maybeDeferred(self.engine.stop)
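A hedged driver sketch for the legacy Crawler above (roughly the pre-1.0 Scrapy API, which no longer exists in current releases). DemoSpider is a made-up stand-in for a spider of that era, and `settings` is assumed to be a Scrapy settings object providing the STATS_CLASS, SPIDER_MANAGER_CLASS and LOG_FORMATTER defaults:

from twisted.internet import reactor


class DemoSpider:
    # Stand-in implementing only what crawl()/open_spider() need here.
    name = "demo"

    def set_crawler(self, crawler):
        self.crawler = crawler

    def start_requests(self):
        return iter([])  # a real spider would yield Request objects


crawler = Crawler(settings)           # `settings` assumed as described above
crawler.crawl(DemoSpider())           # attach the spider; start() will open it
d = crawler.start()                   # configure(), open_spider(), engine.start()
d.addBoth(lambda _: reactor.stop())   # shut the reactor down when the crawl ends
reactor.run()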
Example #3
class Crawler(object):
    def __init__(self, settings):
        self.configured = False
        self.settings = settings
        self.signals = SignalManager(self)
        self.stats = load_object(settings['STATS_CLASS'])(self)
        self._start_requests = lambda: ()
        self._spider = None
        # TODO: move SpiderManager to CrawlerProcess
        spman_cls = load_object(self.settings['SPIDER_MANAGER_CLASS'])
        self.spiders = spman_cls.from_crawler(self)

    def install(self):
        # TODO: remove together with scrapy.project.crawler usage
        import scrapy.project
        assert not hasattr(scrapy.project,
                           'crawler'), "crawler already installed"
        scrapy.project.crawler = self

    def uninstall(self):
        # TODO: remove together with scrapy.project.crawler usage
        import scrapy.project
        assert hasattr(scrapy.project, 'crawler'), "crawler not installed"
        del scrapy.project.crawler

    def configure(self):
        if self.configured:
            return

        self.configured = True
        lf_cls = load_object(self.settings['LOG_FORMATTER'])
        self.logformatter = lf_cls.from_crawler(self)
        self.extensions = ExtensionManager.from_crawler(self)
        self.engine = ExecutionEngine(self, self._spider_closed)

    def crawl(self, spider, requests=None):
        assert self._spider is None, 'Spider already attached'
        self._spider = spider
        spider.set_crawler(self)
        if requests is None:
            self._start_requests = spider.start_requests
        else:
            self._start_requests = lambda: requests

    def _spider_closed(self, spider=None):
        if not self.engine.open_spiders:
            self.stop()

    @defer.inlineCallbacks
    def start(self):
        yield defer.maybeDeferred(self.configure)
        if self._spider:
            yield self.engine.open_spider(self._spider, self._start_requests())
        yield defer.maybeDeferred(self.engine.start)

    @defer.inlineCallbacks
    def stop(self):
        if self.configured and self.engine.running:
            yield defer.maybeDeferred(self.engine.stop)
Example #4
 def test_close_engine_spiders_downloader(self):
     e = ExecutionEngine(get_crawler(TestSpider), lambda _: None)
     yield e.open_spider(TestSpider(), [])
     e.start()
     self.assertTrue(e.running)
     yield e.close()
     self.assertFalse(e.running)
     self.assertEqual(len(e.open_spiders), 0)
Example #5
 def test_close_spiders_downloader(self):
     with pytest.warns(ScrapyDeprecationWarning,
                       match="ExecutionEngine.open_spiders is deprecated, "
                             "please use ExecutionEngine.spider instead"):
         e = ExecutionEngine(get_crawler(TestSpider), lambda _: None)
         yield e.open_spider(TestSpider(), [])
         self.assertEqual(len(e.open_spiders), 1)
         yield e.close()
         self.assertEqual(len(e.open_spiders), 0)
Example #6
 def test_deprecated_schedule(self):
     with pytest.warns(ScrapyDeprecationWarning,
                       match="ExecutionEngine.schedule is deprecated, please use "
                             "ExecutionEngine.crawl or ExecutionEngine.download instead"):
         e = ExecutionEngine(get_crawler(TestSpider), lambda _: None)
         spider = TestSpider()
         yield e.open_spider(spider, [])
         e.start()
         e.schedule(Request("data:,"), spider)
         yield e.close()
Example #7
 def test_download_deprecated_spider_arg(self):
     with pytest.warns(ScrapyDeprecationWarning,
                       match="Passing a 'spider' argument to "
                             "ExecutionEngine.download is deprecated"):
         e = ExecutionEngine(get_crawler(TestSpider), lambda _: None)
         spider = TestSpider()
         yield e.open_spider(spider, [])
         e.start()
         e.download(Request("data:,"), spider)
         yield e.close()
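The deprecation messages in the three preceding tests spell out the replacement API: ExecutionEngine.spider instead of open_spiders, and crawl()/download() called with just a Request, without a spider argument. A hedged sketch of the non-deprecated calls, written in the same fragment style as the test snippets above (the method name is made up, and exact signatures depend on the Scrapy version):

 def sketch_non_deprecated_calls(self):
     e = ExecutionEngine(get_crawler(TestSpider), lambda _: None)
     yield e.open_spider(TestSpider(), [])
     e.start()
     e.crawl(Request("data:,"))            # schedule a request for the open spider
     yield e.download(Request("data:,"))   # fetch one request through the downloader
     yield e.close()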
Example #8
 def test_deprecated_has_capacity(self):
     with pytest.warns(ScrapyDeprecationWarning,
                       match="ExecutionEngine.has_capacity is deprecated"):
         e = ExecutionEngine(get_crawler(TestSpider), lambda _: None)
         self.assertTrue(e.has_capacity())
         spider = TestSpider()
         yield e.open_spider(spider, [])
         self.assertFalse(e.has_capacity())
         e.start()
         yield e.close()
         self.assertTrue(e.has_capacity())
Example #9
 def test_deprecated_has_capacity(self):
     with warnings.catch_warnings(record=True) as warning_list:
         e = ExecutionEngine(get_crawler(TestSpider), lambda _: None)
         self.assertTrue(e.has_capacity())
         spider = TestSpider()
         yield e.open_spider(spider, [])
         self.assertFalse(e.has_capacity())
         e.start()
         yield e.close()
         self.assertTrue(e.has_capacity())
         self.assertEqual(warning_list[0].category, ScrapyDeprecationWarning)
         self.assertEqual(str(warning_list[0].message), "ExecutionEngine.has_capacity is deprecated")
Example #10
 def test_close_spiders_downloader(self):
     with warnings.catch_warnings(record=True) as warning_list:
         e = ExecutionEngine(get_crawler(TestSpider), lambda _: None)
         yield e.open_spider(TestSpider(), [])
         self.assertEqual(len(e.open_spiders), 1)
         yield e.close()
         self.assertEqual(len(e.open_spiders), 0)
         self.assertEqual(warning_list[0].category, ScrapyDeprecationWarning)
         self.assertEqual(
             str(warning_list[0].message),
             "ExecutionEngine.open_spiders is deprecated, please use ExecutionEngine.spider instead",
         )
Example #11
 def test_download_deprecated_spider_arg(self):
     with warnings.catch_warnings(record=True) as warning_list:
         e = ExecutionEngine(get_crawler(TestSpider), lambda _: None)
         spider = TestSpider()
         yield e.open_spider(spider, [])
         e.start()
         e.download(Request("data:,"), spider)
         yield e.close()
         self.assertEqual(warning_list[0].category, ScrapyDeprecationWarning)
         self.assertEqual(
             str(warning_list[0].message),
             "Passing a 'spider' argument to ExecutionEngine.download is deprecated",
         )
Example #12
class Crawler(object):

    def __init__(self, settings):
        self.configured = False
        self.settings = settings
        self.signals = SignalManager(self)
        self.stats = load_object(settings['STATS_CLASS'])(self)

    def install(self):
        import scrapy.project
        assert not hasattr(scrapy.project, 'crawler'), "crawler already installed"
        scrapy.project.crawler = self

    def uninstall(self):
        import scrapy.project
        assert hasattr(scrapy.project, 'crawler'), "crawler not installed"
        del scrapy.project.crawler

    def configure(self):
        if self.configured:
            return
        self.configured = True
        d = dict(overridden_settings(self.settings))
        log.msg(format="Overridden settings: %(settings)r", settings=d, level=log.DEBUG)
        lf_cls = load_object(self.settings['LOG_FORMATTER'])
        self.logformatter = lf_cls.from_crawler(self)
        self.extensions = ExtensionManager.from_crawler(self)
        spman_cls = load_object(self.settings['SPIDER_MANAGER_CLASS'])
        self.spiders = spman_cls.from_crawler(self)
        self.engine = ExecutionEngine(self, self._spider_closed)

    def crawl(self, spider, requests=None):
        spider.set_crawler(self)
        if requests is None:
            requests = spider.start_requests()
        return self.engine.open_spider(spider, requests)

    def _spider_closed(self, spider=None):
        if not self.engine.open_spiders:
            self.stop()

    @defer.inlineCallbacks
    def start(self):
        yield defer.maybeDeferred(self.configure)
        yield defer.maybeDeferred(self.engine.start)

    @defer.inlineCallbacks
    def stop(self):
        if self.engine.running:
            yield defer.maybeDeferred(self.engine.stop)
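In this Crawler variant, crawl() opens the spider on the engine immediately, so the engine must already exist: configure() has to run before crawl(), unlike the queueing variants further down. A hedged driver sketch, reusing the DemoSpider stand-in and assumed `settings` from the earlier sketch:

crawler = Crawler(settings)
crawler.configure()              # creates self.engine, which crawl() uses directly here
crawler.crawl(DemoSpider())      # returns the Deferred from engine.open_spider()
crawler.start()                  # configure() is now a no-op; starts the engine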
Example #13
 def test_deprecated_schedule(self):
     with warnings.catch_warnings(record=True) as warning_list:
         e = ExecutionEngine(get_crawler(TestSpider), lambda _: None)
         spider = TestSpider()
         yield e.open_spider(spider, [])
         e.start()
         e.schedule(Request("data:,"), spider)
         yield e.close()
         self.assertEqual(warning_list[0].category, ScrapyDeprecationWarning)
         self.assertEqual(
             str(warning_list[0].message),
             "ExecutionEngine.schedule is deprecated, please use "
             "ExecutionEngine.crawl or ExecutionEngine.download instead",
         )
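Several of the tests above assert the same deprecation twice: once with pytest.warns(...) and once with warnings.catch_warnings(record=True). Both idioms are standard; a minimal, self-contained illustration with a made-up old_api() function (not Scrapy code):

import warnings

import pytest


def old_api():
    warnings.warn("old_api is deprecated", DeprecationWarning)


def test_with_pytest_warns():
    with pytest.warns(DeprecationWarning, match="deprecated"):
        old_api()


def test_with_catch_warnings():
    with warnings.catch_warnings(record=True) as caught:
        warnings.simplefilter("always")
        old_api()
    assert caught[0].category is DeprecationWarning
    assert "deprecated" in str(caught[0].message)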
Example #14
class Crawler(object):
    def __init__(self, settings):
        self.configured = False
        self.settings = settings
        self.signals = SignalManager(self)
        self.stats = load_object(settings["STATS_CLASS"])(self)

    def install(self):
        import scrapy.project

        assert not hasattr(scrapy.project, "crawler"), "crawler already installed"
        scrapy.project.crawler = self

    def uninstall(self):
        import scrapy.project

        assert hasattr(scrapy.project, "crawler"), "crawler not installed"
        del scrapy.project.crawler

    def configure(self):
        if self.configured:
            return
        self.configured = True
        self.extensions = ExtensionManager.from_crawler(self)
        spman_cls = load_object(self.settings["SPIDER_MANAGER_CLASS"])
        self.spiders = spman_cls.from_crawler(self)
        self.engine = ExecutionEngine(self, self._spider_closed)

    def crawl(self, spider, requests=None):
        spider.set_crawler(self)
        if requests is None:
            requests = spider.start_requests()
        return self.engine.open_spider(spider, requests)

    def _spider_closed(self, spider=None):
        if not self.engine.open_spiders:
            self.stop()

    @defer.inlineCallbacks
    def start(self):
        yield defer.maybeDeferred(self.configure)
        yield defer.maybeDeferred(self.engine.start)

    @defer.inlineCallbacks
    def stop(self):
        if self.engine.running:
            yield defer.maybeDeferred(self.engine.stop)
Example #15
class Crawler(SettingObject):
    
    spider_manager_class = StringField(default="scrapy.spidermanager.SpiderManager")
    
    def __init__(self, settings):
        super(Crawler, self).__init__(settings)
        self.configured = False

    def configure(self):
        if self.configured:
            return
        self.configured = True
        self.extensions = ExtensionManager(self.metas, self)
        spman_cls = load_object(self.spider_manager_class.to_value())
        self.spiders = spman_cls(self.metas)
        self.engine = ExecutionEngine(self, self._spider_closed)

    def crawl(self, spider, requests=None):
        spider.set_crawler(self)
        if requests is None:
            requests = spider.start_requests()
        return self.engine.open_spider(spider, requests)

    def _spider_closed(self, spider=None):
        if not self.engine.open_spiders:
            self.stop()

    @defer.inlineCallbacks
    def start(self):
        yield defer.maybeDeferred(self.configure)
        yield defer.maybeDeferred(self.engine.start)

    @defer.inlineCallbacks
    def stop(self):
        if self.engine.running:
            yield defer.maybeDeferred(self.engine.stop)
Example #16
class Crawler(object):
    def __init__(self, settings):
        self.configured = False
        self.settings = settings
        self.signals = SignalManager(self)
        self.stats = load_object(settings['STATS_CLASS'])(self)

        spman_cls = load_object(self.settings['SPIDER_MANAGER_CLASS'])
        self.spiders = spman_cls.from_crawler(self)

        self.scheduled = {}

    def install(self):
        import scrapy.project
        assert not hasattr(scrapy.project,
                           'crawler'), "crawler already installed"
        scrapy.project.crawler = self

    def uninstall(self):
        import scrapy.project
        assert hasattr(scrapy.project, 'crawler'), "crawler not installed"
        del scrapy.project.crawler

    def configure(self):
        if self.configured:
            return

        self.configured = True
        lf_cls = load_object(self.settings['LOG_FORMATTER'])
        self.logformatter = lf_cls.from_crawler(self)
        self.extensions = ExtensionManager.from_crawler(self)
        self.engine = ExecutionEngine(self, self._spider_closed)

    def crawl(self, spider, requests=None):
        spider.set_crawler(self)

        if self.configured and self.engine.running:
            assert not self.scheduled
            return self.schedule(spider, requests)
        else:
            self.scheduled.setdefault(spider, []).append(requests)

    def schedule(self, spider, batches=[]):
        requests = []
        for batch in batches:
            if batch is None:
                batch = spider.start_requests()
            requests.extend(batch)

        return self.engine.open_spider(spider, requests)

    def _spider_closed(self, spider=None):
        if not self.engine.open_spiders:
            self.stop()

    @defer.inlineCallbacks
    def start(self):
        yield defer.maybeDeferred(self.configure)

        for spider, batches in self.scheduled.iteritems():
            yield self.schedule(spider, batches)

        yield defer.maybeDeferred(self.engine.start)

    @defer.inlineCallbacks
    def stop(self):
        if self.engine.running:
            yield defer.maybeDeferred(self.engine.stop)
Example #17
 def test_close_spiders_downloader(self):
     e = ExecutionEngine(get_crawler(TestSpider), lambda _: None)
     yield e.open_spider(TestSpider(), [])
     self.assertEqual(len(e.open_spiders), 1)
     yield e.close()
     self.assertEqual(len(e.open_spiders), 0)
Example #18
class Crawler(object):

    def __init__(self, settings):
        self.configured = False
        self.settings = settings
        self.signals = SignalManager(self)
        self.stats = load_object(settings['STATS_CLASS'])(self)

        spman_cls = load_object(self.settings['SPIDER_MANAGER_CLASS'])
        self.spiders = spman_cls.from_crawler(self)
        self._scheduled = {}

    def install(self):
        import scrapy.project
        assert not hasattr(scrapy.project, 'crawler'), "crawler already installed"
        scrapy.project.crawler = self

    def uninstall(self):
        import scrapy.project
        assert hasattr(scrapy.project, 'crawler'), "crawler not installed"
        del scrapy.project.crawler

    def configure(self):
        if self.configured:
            return

        self.configured = True
        lf_cls = load_object(self.settings['LOG_FORMATTER'])
        self.logformatter = lf_cls.from_crawler(self)
        self.extensions = ExtensionManager.from_crawler(self)
        self.engine = ExecutionEngine(self, self._spider_closed)

    def crawl(self, spider, requests=None):
        spider.set_crawler(self)
        if self.configured and self.engine.running:
            assert not self._scheduled
            return self._schedule(spider, requests)
        elif requests is None:
            self._scheduled[spider] = None
        else:
            self._scheduled.setdefault(spider, []).append(requests)

    def _schedule(self, spider, batches=()):
        requests = chain.from_iterable(batches) \
            if batches else spider.start_requests()
        return self.engine.open_spider(spider, requests)

    def _spider_closed(self, spider=None):
        if not self.engine.open_spiders:
            self.stop()

    @defer.inlineCallbacks
    def start(self):
        yield defer.maybeDeferred(self.configure)

        for spider, batches in self._scheduled.iteritems():
            yield self._schedule(spider, batches)

        yield defer.maybeDeferred(self.engine.start)

    @defer.inlineCallbacks
    def stop(self):
        if self.engine.running:
            yield defer.maybeDeferred(self.engine.stop)
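This final variant (like the similar queueing variant a few examples up) stores crawl() calls made before the engine is running in _scheduled and flushes them when start() runs; note that dict.iteritems() is Python 2 only (it would be items() on Python 3). A hedged sketch of the queueing behaviour, reusing the DemoSpider stand-in and assumed `settings` from the earlier sketches:

from scrapy.http import Request

crawler = Crawler(settings)
crawler.crawl(DemoSpider())                       # queued; falls back to the spider's start_requests()
crawler.crawl(DemoSpider(), [Request("data:,")])  # second spider instance, queued with an explicit batch
crawler.start()                                   # configure(), open each queued spider, start the engine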