def test_close_engine_spiders_downloader(self):
    e = ExecutionEngine(get_crawler(TestSpider), lambda _: None)
    yield e.open_spider(TestSpider(), [])
    e.start()
    self.assertTrue(e.running)
    yield e.close()
    self.assertFalse(e.running)
    self.assertEqual(len(e.open_spiders), 0)
class Crawler(object):

    def __init__(self, settings):
        self.configured = False
        self.settings = settings
        self.signals = SignalManager(self)
        self.stats = load_object(settings['STATS_CLASS'])(self)
        self._start_requests = lambda: ()
        self._spider = None
        # TODO: move SpiderManager to CrawlerProcess
        spman_cls = load_object(self.settings['SPIDER_MANAGER_CLASS'])
        self.spiders = spman_cls.from_crawler(self)

    def install(self):
        # TODO: remove together with scrapy.project.crawler usage
        import scrapy.project
        assert not hasattr(scrapy.project, 'crawler'), "crawler already installed"
        scrapy.project.crawler = self

    def uninstall(self):
        # TODO: remove together with scrapy.project.crawler usage
        import scrapy.project
        assert hasattr(scrapy.project, 'crawler'), "crawler not installed"
        del scrapy.project.crawler

    def configure(self):
        if self.configured:
            return
        self.configured = True
        lf_cls = load_object(self.settings['LOG_FORMATTER'])
        self.logformatter = lf_cls.from_crawler(self)
        self.extensions = ExtensionManager.from_crawler(self)
        self.engine = ExecutionEngine(self, self._spider_closed)

    def crawl(self, spider, requests=None):
        assert self._spider is None, 'Spider already attached'
        self._spider = spider
        spider.set_crawler(self)
        if requests is None:
            self._start_requests = spider.start_requests
        else:
            self._start_requests = lambda: requests

    def _spider_closed(self, spider=None):
        if not self.engine.open_spiders:
            self.stop()

    @defer.inlineCallbacks
    def start(self):
        yield defer.maybeDeferred(self.configure)
        if self._spider:
            yield self.engine.open_spider(self._spider, self._start_requests())
        yield defer.maybeDeferred(self.engine.start)

    @defer.inlineCallbacks
    def stop(self):
        if self.configured and self.engine.running:
            yield defer.maybeDeferred(self.engine.stop)
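# A minimal, hedged driver sketch for the variant above, which stashes the spider
# in crawl() and only opens it inside start(). Assumptions not shown in the snippet:
# Scrapy's default settings (STATS_CLASS, SPIDER_MANAGER_CLASS, ...) are resolvable
# through scrapy.settings.Settings, and TestSpider is the spider class used by the
# engine tests in this listing.
from twisted.internet import reactor
from scrapy.settings import Settings

crawler = Crawler(Settings())
crawler.crawl(TestSpider())           # attach the spider; requests defaults to start_requests()
d = crawler.start()                   # configure(), open the spider, start the engine
d.addBoth(lambda _: reactor.stop())   # start()'s deferred fires once the engine has stopped
reactor.run()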
def configure(self):
    if self.configured:
        return
    self.configured = True
    self.extensions = ExtensionManager.from_crawler(self)
    spman_cls = load_object(self.settings['SPIDER_MANAGER_CLASS'])
    self.spiders = spman_cls.from_crawler(self)
    self.engine = ExecutionEngine(self, self._spider_closed)
def configure(self):
    if self.configured:
        return
    self.configured = True
    self.extensions = ExtensionManager(self.metas, self)
    spman_cls = load_object(self.spider_manager_class.to_value())
    self.spiders = spman_cls(self.metas)
    self.engine = ExecutionEngine(self, self._spider_closed)
def configure(self):
    if self.configured:
        return
    self.configured = True
    lf_cls = load_object(self.settings['LOG_FORMATTER'])
    self.logformatter = lf_cls.from_crawler(self)
    self.extensions = ExtensionManager.from_crawler(self)
    self.engine = ExecutionEngine(self, self._spider_closed)
def configure(self):
    if self.configured:
        return
    self.configured = True
    d = dict(overridden_settings(self.settings))
    log.msg(format="Overridden settings: %(settings)r",
            settings=d, level=log.DEBUG)
    lf_cls = load_object(self.settings['LOG_FORMATTER'])
    self.logformatter = lf_cls.from_crawler(self)
    self.extensions = ExtensionManager.from_crawler(self)
    spman_cls = load_object(self.settings['SPIDER_MANAGER_CLASS'])
    self.spiders = spman_cls.from_crawler(self)
    self.engine = ExecutionEngine(self, self._spider_closed)
class Crawler(object):

    def __init__(self, settings):
        self.configured = False
        self.settings = settings
        self.signals = SignalManager(self)
        self.stats = load_object(settings['STATS_CLASS'])(self)

    def install(self):
        import scrapy.project
        assert not hasattr(scrapy.project, 'crawler'), "crawler already installed"
        scrapy.project.crawler = self

    def uninstall(self):
        import scrapy.project
        assert hasattr(scrapy.project, 'crawler'), "crawler not installed"
        del scrapy.project.crawler

    def configure(self):
        if self.configured:
            return
        self.configured = True
        d = dict(overridden_settings(self.settings))
        log.msg(format="Overridden settings: %(settings)r",
                settings=d, level=log.DEBUG)
        lf_cls = load_object(self.settings['LOG_FORMATTER'])
        self.logformatter = lf_cls.from_crawler(self)
        self.extensions = ExtensionManager.from_crawler(self)
        spman_cls = load_object(self.settings['SPIDER_MANAGER_CLASS'])
        self.spiders = spman_cls.from_crawler(self)
        self.engine = ExecutionEngine(self, self._spider_closed)

    def crawl(self, spider, requests=None):
        spider.set_crawler(self)
        if requests is None:
            requests = spider.start_requests()
        return self.engine.open_spider(spider, requests)

    def _spider_closed(self, spider=None):
        if not self.engine.open_spiders:
            self.stop()

    @defer.inlineCallbacks
    def start(self):
        yield defer.maybeDeferred(self.configure)
        yield defer.maybeDeferred(self.engine.start)

    @defer.inlineCallbacks
    def stop(self):
        if self.engine.running:
            yield defer.maybeDeferred(self.engine.stop)
def configure(self):
    if self.configured:
        return
    self.configured = True
    self.extensions = ExtensionManager.from_settings(self.settings)
    spman_cls = load_object(self.settings['SPIDER_MANAGER_CLASS'])
    self.spiders = spman_cls.from_settings(self.settings)
    spq_cls = load_object(self.settings['SPIDER_QUEUE_CLASS'])
    spq = spq_cls.from_settings(self.settings)
    keepalive = self.settings.getbool('KEEP_ALIVE')
    pollint = self.settings.getfloat('QUEUE_POLL_INTERVAL')
    self.queue = ExecutionQueue(self.spiders, spq, poll_interval=pollint,
                                keep_alive=keepalive)
    self.engine = ExecutionEngine(self.settings, self._spider_closed)
class Crawler(object):

    def __init__(self, settings):
        self.configured = False
        self.settings = settings
        self.signals = SignalManager(self)
        self.stats = load_object(settings["STATS_CLASS"])(self)

    def install(self):
        import scrapy.project
        assert not hasattr(scrapy.project, "crawler"), "crawler already installed"
        scrapy.project.crawler = self

    def uninstall(self):
        import scrapy.project
        assert hasattr(scrapy.project, "crawler"), "crawler not installed"
        del scrapy.project.crawler

    def configure(self):
        if self.configured:
            return
        self.configured = True
        self.extensions = ExtensionManager.from_crawler(self)
        spman_cls = load_object(self.settings["SPIDER_MANAGER_CLASS"])
        self.spiders = spman_cls.from_crawler(self)
        self.engine = ExecutionEngine(self, self._spider_closed)

    def crawl(self, spider, requests=None):
        spider.set_crawler(self)
        if requests is None:
            requests = spider.start_requests()
        return self.engine.open_spider(spider, requests)

    def _spider_closed(self, spider=None):
        if not self.engine.open_spiders:
            self.stop()

    @defer.inlineCallbacks
    def start(self):
        yield defer.maybeDeferred(self.configure)
        yield defer.maybeDeferred(self.engine.start)

    @defer.inlineCallbacks
    def stop(self):
        if self.engine.running:
            yield defer.maybeDeferred(self.engine.stop)
class Crawler(SettingObject):

    spider_manager_class = StringField(default="scrapy.spidermanager.SpiderManager")

    def __init__(self, settings):
        super(Crawler, self).__init__(settings)
        self.configured = False

    def configure(self):
        if self.configured:
            return
        self.configured = True
        self.extensions = ExtensionManager(self.metas, self)
        spman_cls = load_object(self.spider_manager_class.to_value())
        self.spiders = spman_cls(self.metas)
        self.engine = ExecutionEngine(self, self._spider_closed)

    def crawl(self, spider, requests=None):
        spider.set_crawler(self)
        if requests is None:
            requests = spider.start_requests()
        return self.engine.open_spider(spider, requests)

    def _spider_closed(self, spider=None):
        if not self.engine.open_spiders:
            self.stop()

    @defer.inlineCallbacks
    def start(self):
        yield defer.maybeDeferred(self.configure)
        yield defer.maybeDeferred(self.engine.start)

    @defer.inlineCallbacks
    def stop(self):
        if self.engine.running:
            yield defer.maybeDeferred(self.engine.stop)
def test_close_spiders_downloader(self):
    e = ExecutionEngine(get_crawler(TestSpider), lambda _: None)
    yield e.open_spider(TestSpider(), [])
    self.assertEqual(len(e.open_spiders), 1)
    yield e.close()
    self.assertEqual(len(e.open_spiders), 0)
def test_close_downloader(self):
    e = ExecutionEngine(get_crawler(TestSpider), lambda _: None)
    yield e.close()
class Crawler(object):

    def __init__(self, settings):
        self.configured = False
        self.settings = settings
        self.signals = SignalManager(self)
        self.stats = load_object(settings['STATS_CLASS'])(self)
        spman_cls = load_object(self.settings['SPIDER_MANAGER_CLASS'])
        self.spiders = spman_cls.from_crawler(self)
        self._scheduled = {}

    def install(self):
        import scrapy.project
        assert not hasattr(scrapy.project, 'crawler'), "crawler already installed"
        scrapy.project.crawler = self

    def uninstall(self):
        import scrapy.project
        assert hasattr(scrapy.project, 'crawler'), "crawler not installed"
        del scrapy.project.crawler

    def configure(self):
        if self.configured:
            return
        self.configured = True
        lf_cls = load_object(self.settings['LOG_FORMATTER'])
        self.logformatter = lf_cls.from_crawler(self)
        self.extensions = ExtensionManager.from_crawler(self)
        self.engine = ExecutionEngine(self, self._spider_closed)

    def crawl(self, spider, requests=None):
        spider.set_crawler(self)
        if self.configured and self.engine.running:
            assert not self._scheduled
            return self._schedule(spider, requests)
        elif requests is None:
            self._scheduled[spider] = None
        else:
            self._scheduled.setdefault(spider, []).append(requests)

    def _schedule(self, spider, batches=()):
        requests = chain.from_iterable(batches) \
            if batches else spider.start_requests()
        return self.engine.open_spider(spider, requests)

    def _spider_closed(self, spider=None):
        if not self.engine.open_spiders:
            self.stop()

    @defer.inlineCallbacks
    def start(self):
        yield defer.maybeDeferred(self.configure)
        for spider, batches in self._scheduled.iteritems():
            yield self._schedule(spider, batches)
        yield defer.maybeDeferred(self.engine.start)

    @defer.inlineCallbacks
    def stop(self):
        if self.engine.running:
            yield defer.maybeDeferred(self.engine.stop)
def _create_engine(self):
    # Return an instance of the execution engine class
    return ExecutionEngine(self, lambda _: self.stop())
class ExecutionManager(object):

    def __init__(self):
        self.configured = False
        self.control_reactor = True
        self.engine = ExecutionEngine()

    def configure(self, control_reactor=True, queue=None):
        self.control_reactor = control_reactor
        if control_reactor:
            install_shutdown_handlers(self._signal_shutdown)
        if not log.started:
            log.start()
        if not extensions.loaded:
            extensions.load()
        if not spiders.loaded:
            spiders.load()
        log.msg("Enabled extensions: %s" % ", ".join(extensions.enabled.iterkeys()),
                level=log.DEBUG)
        self.queue = queue or ExecutionQueue()
        self.engine.configure(self._spider_closed)
        self.configured = True

    @defer.inlineCallbacks
    def _start_next_spider(self):
        spider, requests = yield defer.maybeDeferred(self.queue.get_next)
        if spider:
            self._start_spider(spider, requests)
        if self.engine.has_capacity() and not self._nextcall.active():
            self._nextcall = reactor.callLater(self.queue.polling_delay,
                                               self._start_next_spider)

    @defer.inlineCallbacks
    def _start_spider(self, spider, requests):
        """Don't call this method. Use self.queue to start new spiders"""
        yield defer.maybeDeferred(self.engine.open_spider, spider)
        for request in requests:
            self.engine.crawl(request, spider)

    @defer.inlineCallbacks
    def _spider_closed(self, spider=None):
        if not self.engine.open_spiders:
            is_finished = yield defer.maybeDeferred(self.queue.is_finished)
            if is_finished:
                self.stop()
                return
        if self.engine.has_capacity():
            self._start_next_spider()

    @defer.inlineCallbacks
    def start(self):
        yield defer.maybeDeferred(self.engine.start)
        self._nextcall = reactor.callLater(0, self._spider_closed)
        reactor.addSystemEventTrigger('before', 'shutdown', self.stop)
        if self.control_reactor:
            reactor.run(installSignalHandlers=False)

    @defer.inlineCallbacks
    def stop(self):
        if self._nextcall.active():
            self._nextcall.cancel()
        if self.engine.running:
            yield defer.maybeDeferred(self.engine.stop)
        try:
            reactor.stop()
        except RuntimeError:  # raised if already stopped or in shutdown stage
            pass

    def _signal_shutdown(self, signum, _):
        signame = signal_names[signum]
        log.msg("Received %s, shutting down gracefully. Send again to force "
                "unclean shutdown" % signame, level=log.INFO)
        reactor.callFromThread(self.stop)
        install_shutdown_handlers(self._signal_kill)

    def _signal_kill(self, signum, _):
        signame = signal_names[signum]
        log.msg('Received %s twice, forcing unclean shutdown' % signame,
                level=log.INFO)
        log.log_level = log.SILENT  # disable logging of confusing tracebacks
        reactor.callFromThread(self.engine.kill)
        install_shutdown_handlers(signal.SIG_IGN)
def _create_engine(self):
    # p.11: instantiate the engine and attach the callback invoked when the
    # engine stops normally
    return ExecutionEngine(self, lambda _: self.stop())
class Crawler(object):

    def __init__(self, settings):
        self.configured = False
        self.settings = settings
        self.signals = SignalManager(self)
        self.stats = load_object(settings['STATS_CLASS'])(self)
        self._start_requests = lambda: ()
        self._spider = None
        # TODO: move SpiderManager to CrawlerProcess
        spman_cls = load_object(self.settings['SPIDER_MANAGER_CLASS'])
        self.spiders = spman_cls.from_crawler(self)

    def install(self):
        # TODO: remove together with scrapy.project.crawler usage
        import scrapy.project
        assert not hasattr(scrapy.project, 'crawler'), "crawler already installed"
        scrapy.project.crawler = self

    def uninstall(self):
        # TODO: remove together with scrapy.project.crawler usage
        import scrapy.project
        assert hasattr(scrapy.project, 'crawler'), "crawler not installed"
        del scrapy.project.crawler

    def configure(self):
        if self.configured:
            return
        self.configured = True
        lf_cls = load_object(self.settings['LOG_FORMATTER'])
        self.logformatter = lf_cls.from_crawler(self)
        self.extensions = ExtensionManager.from_crawler(self)
        self.engine = ExecutionEngine(self, self._spider_closed)

    def crawl(self, spider, requests=None):
        assert self._spider is None, 'Spider already attached'
        self._spider = spider
        spider.set_crawler(self)
        if requests is None:
            self._start_requests = spider.start_requests
        else:
            self._start_requests = lambda: requests

    def _spider_closed(self, spider=None):
        if not self.engine.open_spiders:
            self.stop()

    # Modified by tangm, 2015.03.27 04:00: pass close_if_idle through to open_spider
    @defer.inlineCallbacks
    def start(self, close_if_idle=True):
        yield defer.maybeDeferred(self.configure)
        if self._spider:
            yield self.engine.open_spider(self._spider, self._start_requests(),
                                          close_if_idle=close_if_idle)
        yield defer.maybeDeferred(self.engine.start)

    @defer.inlineCallbacks
    def stop(self):
        if self.configured and self.engine.running:
            yield defer.maybeDeferred(self.engine.stop)
class Crawler(object):

    def __init__(self, settings):
        self.configured = False
        self.settings = settings
        self.signals = SignalManager(self)
        self.stats = load_object(settings['STATS_CLASS'])(self)
        spman_cls = load_object(self.settings['SPIDER_MANAGER_CLASS'])
        self.spiders = spman_cls.from_crawler(self)
        self.scheduled = {}

    def install(self):
        import scrapy.project
        assert not hasattr(scrapy.project, 'crawler'), "crawler already installed"
        scrapy.project.crawler = self

    def uninstall(self):
        import scrapy.project
        assert hasattr(scrapy.project, 'crawler'), "crawler not installed"
        del scrapy.project.crawler

    def configure(self):
        if self.configured:
            return
        self.configured = True
        lf_cls = load_object(self.settings['LOG_FORMATTER'])
        self.logformatter = lf_cls.from_crawler(self)
        self.extensions = ExtensionManager.from_crawler(self)
        self.engine = ExecutionEngine(self, self._spider_closed)

    def crawl(self, spider, requests=None):
        spider.set_crawler(self)
        if self.configured and self.engine.running:
            assert not self.scheduled
            return self.schedule(spider, requests)
        else:
            self.scheduled.setdefault(spider, []).append(requests)

    def schedule(self, spider, batches=[]):
        requests = []
        for batch in batches:
            if batch is None:
                batch = spider.start_requests()
            requests.extend(batch)
        return self.engine.open_spider(spider, requests)

    def _spider_closed(self, spider=None):
        if not self.engine.open_spiders:
            self.stop()

    @defer.inlineCallbacks
    def start(self):
        yield defer.maybeDeferred(self.configure)
        for spider, batches in self.scheduled.iteritems():
            yield self.schedule(spider, batches)
        yield defer.maybeDeferred(self.engine.start)

    @defer.inlineCallbacks
    def stop(self):
        if self.engine.running:
            yield defer.maybeDeferred(self.engine.stop)
def __init__(self):
    self.configured = False
    self.control_reactor = True
    self.engine = ExecutionEngine()
def _create_engine(self):
    return ExecutionEngine(self, lambda _: self.stop())
class Crawler(object):

    def __init__(self, settings):
        self.configured = False
        self.settings = settings

    def install(self):
        import scrapy.project
        assert not hasattr(scrapy.project, 'crawler'), "crawler already installed"
        scrapy.project.crawler = self

    def uninstall(self):
        import scrapy.project
        assert hasattr(scrapy.project, 'crawler'), "crawler not installed"
        del scrapy.project.crawler

    def configure(self):
        if self.configured:
            return
        self.configured = True
        self.extensions = ExtensionManager.from_settings(self.settings)
        spman_cls = load_object(self.settings['SPIDER_MANAGER_CLASS'])
        self.spiders = spman_cls.from_settings(self.settings)
        spq_cls = load_object(self.settings['SPIDER_QUEUE_CLASS'])
        spq = spq_cls.from_settings(self.settings)
        keepalive = self.settings.getbool('KEEP_ALIVE')
        pollint = self.settings.getfloat('QUEUE_POLL_INTERVAL')
        self.queue = ExecutionQueue(self.spiders, spq, poll_interval=pollint,
                                    keep_alive=keepalive)
        self.engine = ExecutionEngine(self.settings, self._spider_closed)

    @defer.inlineCallbacks
    def _start_next_spider(self):
        spider, requests = yield defer.maybeDeferred(self.queue.get_next)
        if spider:
            self._start_spider(spider, requests)
        if self.engine.has_capacity() and not self._nextcall.active():
            self._nextcall = reactor.callLater(self.queue.poll_interval,
                                               self._spider_closed)

    @defer.inlineCallbacks
    def _start_spider(self, spider, requests):
        """Don't call this method. Use self.queue to start new spiders"""
        spider.set_crawler(self)
        yield defer.maybeDeferred(self.engine.open_spider, spider)
        for request in requests:
            self.engine.crawl(request, spider)

    @defer.inlineCallbacks
    def _spider_closed(self, spider=None):
        if not self.engine.open_spiders:
            is_finished = yield defer.maybeDeferred(self.queue.is_finished)
            if is_finished:
                self.stop()
                return
        if self.engine.has_capacity():
            self._start_next_spider()

    @defer.inlineCallbacks
    def start(self):
        yield defer.maybeDeferred(self.configure)
        yield defer.maybeDeferred(self.engine.start)
        self._nextcall = reactor.callLater(0, self._start_next_spider)

    @defer.inlineCallbacks
    def stop(self):
        if self._nextcall.active():
            self._nextcall.cancel()
        if self.engine.running:
            yield defer.maybeDeferred(self.engine.stop)