import itertools
import logging
import operator

import six
from scrapy import signals
from scrapy.crawler import Crawler, CrawlerProcess
from scrapy.signalmanager import SignalManager

# NOTE: the exact module paths for these Arachnado internals are assumptions;
# adjust them to match the project layout.
from arachnado import stats
from arachnado.process_stats import ProcessStatsMonitor
from arachnado.crawler import ArachnadoCrawler
from arachnado.signals import (CrawlerProcessSignals, SCRAPY_SIGNAL_NAMES,
                               STAT_SIGNALS)


class ArachnadoCrawlerProcess(CrawlerProcess):
    """
    CrawlerProcess which sets up a global signals manager,
    assigns unique ids to each spider job, works around some Scrapy issues
    and provides extra stats.
    """
    crawl_ids = itertools.count(start=1)

    def __init__(self, settings=None):
        self.signals = SignalManager(self)
        self.signals.connect(self.on_spider_closed,
                             CrawlerProcessSignals.spider_closed)
        self._finished_jobs = []
        self._paused_jobs = set()
        self.procmon = ProcessStatsMonitor()
        self.procmon.start()

        super(ArachnadoCrawlerProcess, self).__init__(settings or {})

        # don't log DepthMiddleware messages
        # see https://github.com/scrapy/scrapy/issues/1308
        logging.getLogger("scrapy.spidermiddlewares.depth").setLevel(
            logging.INFO)

    def crawl(self, crawler_or_spidercls, *args, **kwargs):
        kwargs['crawl_id'] = next(self.crawl_ids)

        crawler = crawler_or_spidercls
        if not isinstance(crawler_or_spidercls, Crawler):
            crawler = self._create_crawler(crawler_or_spidercls)

        # aggregate all crawler signals
        for name in SCRAPY_SIGNAL_NAMES:
            crawler.signals.connect(self._resend_signal,
                                    getattr(signals, name))

        # aggregate signals from crawler EventedStatsCollectors
        if hasattr(crawler.stats, "signals"):
            crawler.stats.signals.connect(self._resend_signal,
                                          stats.stats_changed)

        # pass the prepared crawler (not the spider class) to the parent,
        # so the crawler with the connected signals is the one that runs
        d = super(ArachnadoCrawlerProcess, self).crawl(crawler,
                                                       *args, **kwargs)
        return d

    def _create_crawler(self, spidercls):
        if isinstance(spidercls, six.string_types):
            spidercls = self.spider_loader.load(spidercls)
        return ArachnadoCrawler(spidercls, self.settings)

    def stop_job(self, crawl_id):
        """ Stop a single crawl job """
        self.get_crawler(crawl_id).stop()

    def pause_job(self, crawl_id):
        """ Pause a crawling job """
        self._paused_jobs.add(crawl_id)
        self.get_crawler(crawl_id).engine.pause()

    def resume_job(self, crawl_id):
        """ Resume a crawling job """
        self._paused_jobs.remove(crawl_id)
        self.get_crawler(crawl_id).engine.unpause()

    def get_crawler(self, crawl_id):
        for crawler in self.crawlers:
            if getattr(crawler.spider, "crawl_id") == crawl_id:
                return crawler
        raise KeyError("Job is not known: %s" % crawl_id)

    def _resend_signal(self, **kwargs):
        # FIXME: this is a mess. Signal handling should be unified somehow:
        # there shouldn't be two separate code paths
        # for CrawlerProcessSignals and STAT_SIGNALS.
        signal = kwargs['signal']
        if signal in STAT_SIGNALS:
            signal = STAT_SIGNALS[signal]
            kwargs['crawler'] = kwargs.pop('sender').crawler
        else:
            signal = CrawlerProcessSignals.signal(signal)
            kwargs['crawler'] = kwargs.pop('sender')

        kwargs['signal'] = signal
        if signal.supports_defer:
            return self.signals.send_catch_log_deferred(**kwargs)
        else:
            return self.signals.send_catch_log(**kwargs)

    def stop(self):
        """ Terminate the process (exit from application). """
        self.procmon.stop()
        return super(ArachnadoCrawlerProcess, self).stop()

    def on_spider_closed(self, spider, reason):
        # spiders are not closed that often, so insert(0, ...) should be fine
        self._finished_jobs.insert(0, {
            'id': spider.crawl_id,
            'job_id': getattr(spider, 'motor_job_id'),
            'seed': spider.domain,
            'status': reason,
            'stats': spider.crawler.stats.get_stats(spider),
            'downloads': self._downloader_stats(spider.crawler)
        })

    # FIXME: methods below are ugly for two reasons:
    # 1. they assume spiders have certain attributes;
    # 2. they try to get crawling status based on auxiliary information.
    def get_jobs(self):
        """ Return a list of active jobs """
        crawlers = [crawler for crawler in self.crawlers
                    if crawler.spider is not None]
        return [{
            'id': crawler.spider.crawl_id,
            'job_id': getattr(crawler.spider, 'motor_job_id'),
            'seed': crawler.spider.domain,
            'status': self._get_crawler_status(crawler),
            'stats': crawler.spider.crawler.stats.get_stats(crawler.spider),
            'downloads': self._downloader_stats(crawler)
            # 'engine_info': dict(get_engine_status(crawler.engine))
        } for crawler in crawlers]

    @classmethod
    def _downloader_stats(cls, crawler):
        downloader = crawler.engine.downloader
        return {
            'active': [cls._request_info(req) for req in downloader.active],
            'slots': sorted([
                cls._slot_info(key, slot)
                for key, slot in downloader.slots.items()
            ], key=operator.itemgetter('key'))
        }

    @classmethod
    def _request_info(cls, request):
        return {'url': request.url, 'method': request.method}

    @classmethod
    def _slot_info(cls, key, slot):
        return {
            'key': key,
            'concurrency': slot.concurrency,
            'delay': slot.delay,
            'lastseen': slot.lastseen,
            'len(queue)': len(slot.queue),
            'transferring': [cls._request_info(req)
                             for req in slot.transferring],
            'active': [cls._request_info(req) for req in slot.active],
        }

    def _get_crawler_status(self, crawler):
        if crawler.spider is None:
            return "unknown"
        if not crawler.crawling:
            return "stopping"
        if int(crawler.spider.crawl_id) in self._paused_jobs:
            return "suspended"
        return "crawling"

    @property
    def jobs(self):
        """ Current crawl state """
        # filter out active jobs which are in fact finished
        finished_ids = {job['id'] for job in self._finished_jobs}
        active_jobs = [job for job in self.get_jobs()
                       if job['id'] not in finished_ids]

        return active_jobs + self._finished_jobs
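
# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the original module): the point of the
# process-level SignalManager above is that any component can subscribe to
# the aggregated signal stream instead of wiring up each crawler separately.
# The helper below is hypothetical; only ``process.signals.connect`` and
# ``CrawlerProcessSignals.spider_closed`` come from the code above.
# ---------------------------------------------------------------------------
def log_finished_jobs(process):
    """ Log every finished job of the given ArachnadoCrawlerProcess. """
    def _on_spider_closed(spider, reason, **kwargs):
        logging.info("job %s finished: %s", spider.crawl_id, reason)

    process.signals.connect(_on_spider_closed,
                            CrawlerProcessSignals.spider_closed)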
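
# ---------------------------------------------------------------------------
# Minimal usage sketch (not part of the original module), assuming a Scrapy
# project that defines a spider named "example"; Arachnado itself drives the
# process from its Tornado handlers rather than from a __main__ block.
# ---------------------------------------------------------------------------
if __name__ == '__main__':
    from scrapy.utils.project import get_project_settings

    process = ArachnadoCrawlerProcess(get_project_settings())
    process.crawl('example')    # a crawl_id is assigned automatically
    process.start()             # blocks until all crawlers have finished

    # While the reactor is running (e.g. from a Tornado callback), jobs can
    # be inspected and controlled by crawl_id:
    #   process.jobs            # active + finished job dicts
    #   process.pause_job(1)    # suspend the engine of job 1
    #   process.resume_job(1)   # resume it
    #   process.stop_job(1)     # stop it gracefully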