def test_crawl_deprecated_spider_arg(self):
    with pytest.warns(
        ScrapyDeprecationWarning,
        match="Passing a 'spider' argument to ExecutionEngine.crawl is deprecated",
    ):
        e = ExecutionEngine(get_crawler(TestSpider), lambda _: None)
        spider = TestSpider()
        yield e.open_spider(spider, [])
        e.start()
        e.crawl(Request("data:,"), spider)
        yield e.close()
def test_crawl_deprecated_spider_arg(self):
    with warnings.catch_warnings(record=True) as warning_list:
        e = ExecutionEngine(get_crawler(TestSpider), lambda _: None)
        spider = TestSpider()
        yield e.open_spider(spider, [])
        e.start()
        e.crawl(Request("data:,"), spider)
        yield e.close()
        self.assertEqual(warning_list[0].category, ScrapyDeprecationWarning)
        self.assertEqual(
            str(warning_list[0].message),
            "Passing a 'spider' argument to ExecutionEngine.crawl is deprecated",
        )
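For contrast, the supported call simply drops the spider argument. The following is a minimal sketch in the style of the tests above, not a test from the Scrapy suite: the name test_crawl_without_spider_arg is hypothetical, while TestSpider, get_crawler and Request are assumed to be the same helpers the existing tests use.

def test_crawl_without_spider_arg(self):
    e = ExecutionEngine(get_crawler(TestSpider), lambda _: None)
    spider = TestSpider()
    yield e.open_spider(spider, [])
    e.start()
    # Passing only the request avoids the deprecated code path exercised above.
    e.crawl(Request("data:,"))
    yield e.close()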
class ExecutionManager(object):

    def __init__(self):
        self.configured = False
        self.control_reactor = True
        self.engine = ExecutionEngine()

    def configure(self, control_reactor=True, queue=None):
        self.control_reactor = control_reactor
        if control_reactor:
            install_shutdown_handlers(self._signal_shutdown)
        if not log.started:
            log.start()
        if not extensions.loaded:
            extensions.load()
        if not spiders.loaded:
            spiders.load()
        log.msg("Enabled extensions: %s" % ", ".join(extensions.enabled.iterkeys()),
                level=log.DEBUG)
        self.queue = queue or ExecutionQueue()
        self.engine.configure(self._spider_closed)
        self.configured = True

    @defer.inlineCallbacks
    def _start_next_spider(self):
        spider, requests = yield defer.maybeDeferred(self.queue.get_next)
        if spider:
            self._start_spider(spider, requests)
        if self.engine.has_capacity() and not self._nextcall.active():
            self._nextcall = reactor.callLater(self.queue.polling_delay,
                                               self._start_next_spider)

    @defer.inlineCallbacks
    def _start_spider(self, spider, requests):
        """Don't call this method. Use self.queue to start new spiders"""
        yield defer.maybeDeferred(self.engine.open_spider, spider)
        for request in requests:
            self.engine.crawl(request, spider)

    @defer.inlineCallbacks
    def _spider_closed(self, spider=None):
        if not self.engine.open_spiders:
            is_finished = yield defer.maybeDeferred(self.queue.is_finished)
            if is_finished:
                self.stop()
                return
        if self.engine.has_capacity():
            self._start_next_spider()

    @defer.inlineCallbacks
    def start(self):
        yield defer.maybeDeferred(self.engine.start)
        self._nextcall = reactor.callLater(0, self._spider_closed)
        reactor.addSystemEventTrigger('before', 'shutdown', self.stop)
        if self.control_reactor:
            reactor.run(installSignalHandlers=False)

    @defer.inlineCallbacks
    def stop(self):
        if self._nextcall.active():
            self._nextcall.cancel()
        if self.engine.running:
            yield defer.maybeDeferred(self.engine.stop)
        try:
            reactor.stop()
        except RuntimeError:  # raised if already stopped or in shutdown stage
            pass

    def _signal_shutdown(self, signum, _):
        signame = signal_names[signum]
        log.msg("Received %s, shutting down gracefully. Send again to force "
                "unclean shutdown" % signame, level=log.INFO)
        reactor.callFromThread(self.stop)
        install_shutdown_handlers(self._signal_kill)

    def _signal_kill(self, signum, _):
        signame = signal_names[signum]
        log.msg('Received %s twice, forcing unclean shutdown' % signame,
                level=log.INFO)
        log.log_level = log.SILENT  # disable logging of confusing tracebacks
        reactor.callFromThread(self.engine.kill)
        install_shutdown_handlers(signal.SIG_IGN)
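Driving the manager is a two-step affair from the caller's side. The following is a minimal sketch, not code from the Scrapy source, assuming the default ExecutionQueue and a manager-owned reactor:

# Hypothetical driver for ExecutionManager: configure() installs the shutdown
# handlers and builds the queue, extensions and engine; start() then blocks in
# reactor.run() (because control_reactor is True) until the queue reports it
# is finished and stop() shuts the reactor down.
manager = ExecutionManager()
manager.configure(control_reactor=True)
manager.start()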
class Crawler(object):

    def __init__(self, settings):
        self.configured = False
        self.settings = settings

    def install(self):
        import scrapy.project
        assert not hasattr(scrapy.project, 'crawler'), "crawler already installed"
        scrapy.project.crawler = self

    def uninstall(self):
        import scrapy.project
        assert hasattr(scrapy.project, 'crawler'), "crawler not installed"
        del scrapy.project.crawler

    def configure(self):
        if self.configured:
            return
        self.configured = True
        self.extensions = ExtensionManager.from_settings(self.settings)  # load the extensions
        spman_cls = load_object(self.settings['SPIDER_MANAGER_CLASS'])
        self.spiders = spman_cls.from_settings(self.settings)
        spq_cls = load_object(self.settings['SPIDER_QUEUE_CLASS'])
        spq = spq_cls.from_settings(self.settings)
        keepalive = self.settings.getbool('KEEP_ALIVE')
        pollint = self.settings.getfloat('QUEUE_POLL_INTERVAL')
        self.queue = ExecutionQueue(self.spiders, spq, poll_interval=pollint,
                                    keep_alive=keepalive)  # build the execution queue
        self.engine = ExecutionEngine(self.settings, self._spider_closed)  # create the engine

    @defer.inlineCallbacks
    def _start_next_spider(self):
        spider, requests = yield defer.maybeDeferred(self.queue.get_next)
        if spider:
            self._start_spider(spider, requests)
        if self.engine.has_capacity() and not self._nextcall.active():
            self._nextcall = reactor.callLater(self.queue.poll_interval,
                                               self._spider_closed)

    @defer.inlineCallbacks
    def _start_spider(self, spider, requests):
        """Don't call this method. Use self.queue to start new spiders"""
        spider.set_crawler(self)
        yield defer.maybeDeferred(self.engine.open_spider, spider)
        for request in requests:
            self.engine.crawl(request, spider)

    @defer.inlineCallbacks
    def _spider_closed(self, spider=None):
        if not self.engine.open_spiders:
            is_finished = yield defer.maybeDeferred(self.queue.is_finished)
            if is_finished:
                self.stop()
                return
        if self.engine.has_capacity():
            self._start_next_spider()

    @defer.inlineCallbacks
    def start(self):
        yield defer.maybeDeferred(self.configure)
        yield defer.maybeDeferred(self.engine.start)
        self._nextcall = reactor.callLater(0, self._start_next_spider)

    @defer.inlineCallbacks
    def stop(self):
        if self._nextcall.active():
            self._nextcall.cancel()
        if self.engine.running:
            yield defer.maybeDeferred(self.engine.stop)
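Unlike ExecutionManager.start(), Crawler.start() only schedules work on the reactor, so the caller owns the reactor loop. A minimal sketch of driving it, assuming a populated Settings object named settings (this is an illustration, not code from scrapy.cmdline):

from twisted.internet import reactor

crawler = Crawler(settings)
crawler.install()    # expose the instance as scrapy.project.crawler
crawler.start()      # configure() runs lazily inside start()
reactor.run()        # the caller, not the Crawler, runs and stops the reactor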