class ArachnadoCrawlerProcess(CrawlerProcess):
    """
    CrawlerProcess which sets up a global signals manager,
    assigns unique ids to each spider job, works around some Scrapy
    issues and provides extra stats.
    """
    crawl_ids = itertools.count(start=1)

    def __init__(self, settings=None):
        self.signals = SignalManager(self)
        self.signals.connect(self.on_spider_closed,
                             CrawlerProcessSignals.spider_closed)
        self._finished_jobs = []
        self._paused_jobs = set()
        self.procmon = ProcessStatsMonitor()
        self.procmon.start()

        super(ArachnadoCrawlerProcess, self).__init__(settings or {})

        # don't log DepthMiddleware messages
        # see https://github.com/scrapy/scrapy/issues/1308
        logging.getLogger("scrapy.spidermiddlewares.depth").setLevel(
            logging.INFO)

    def crawl(self, crawler_or_spidercls, *args, **kwargs):
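        """
        Schedule a crawl job. A process-unique crawl_id is generated and
        passed to the spider as a keyword argument; signals of the wrapped
        crawler are re-broadcast through self.signals.
        """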
        kwargs['crawl_id'] = next(self.crawl_ids)

        crawler = crawler_or_spidercls
        if not isinstance(crawler_or_spidercls, Crawler):
            crawler = self._create_crawler(crawler_or_spidercls)

        # aggregate all crawler signals
        for name in SCRAPY_SIGNAL_NAMES:
            crawler.signals.connect(self._resend_signal,
                                    getattr(signals, name))

        # aggregate signals from crawler EventedStatsCollectors
        if hasattr(crawler.stats, "signals"):
            crawler.stats.signals.connect(self._resend_signal,
                                          stats.stats_changed)

        # pass the wrapped crawler to super().crawl so that the signal
        # connections above take effect
        d = super(ArachnadoCrawlerProcess, self).crawl(crawler, *args,
                                                       **kwargs)
        return d

    def _create_crawler(self, spidercls):
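        """ Create an ArachnadoCrawler for a spider class or its name. """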
        if isinstance(spidercls, six.string_types):
            spidercls = self.spider_loader.load(spidercls)
        return ArachnadoCrawler(spidercls, self.settings)

    def stop_job(self, crawl_id):
        """ Stop a single crawl job """
        self.get_crawler(crawl_id).stop()

    def pause_job(self, crawl_id):
        """ Pause a crawling job """
        self._paused_jobs.add(crawl_id)
        self.get_crawler(crawl_id).engine.pause()

    def resume_job(self, crawl_id):
        """ Resume a crawling job """
        self._paused_jobs.remove(crawl_id)
        self.get_crawler(crawl_id).engine.unpause()

    def get_crawler(self, crawl_id):
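        """ Return the crawler which runs the job with the given crawl_id. """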
        for crawler in self.crawlers:
            if getattr(crawler.spider, "crawl_id") == crawl_id:
                return crawler
        raise KeyError("Job is not known: %s" % crawl_id)

    def _resend_signal(self, **kwargs):
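        """
        Re-send a crawler-level signal through the process-wide signal
        manager, replacing the 'sender' argument with 'crawler'.
        """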
        # FIXME: this is a mess. Signal handling should be unified somehow:
        # there shouldn't be two separate code paths
        # for CrawlerProcessSignals and STAT_SIGNALS.
        signal = kwargs['signal']
        if signal in STAT_SIGNALS:
            signal = STAT_SIGNALS[signal]
            kwargs['crawler'] = kwargs.pop('sender').crawler
        else:
            signal = CrawlerProcessSignals.signal(signal)
            kwargs['crawler'] = kwargs.pop('sender')

        kwargs['signal'] = signal
        if signal.supports_defer:
            return self.signals.send_catch_log_deferred(**kwargs)
        else:
            return self.signals.send_catch_log(**kwargs)

    def stop(self):
        """ Terminate the process (exit from application). """
        self.procmon.stop()
        return super(ArachnadoCrawlerProcess, self).stop()

    def on_spider_closed(self, spider, reason):
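        """ Remember the finished job so it stays visible in self.jobs. """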
        # spiders are closed not that often, insert(0,...) should be fine
        self._finished_jobs.insert(
            0, {
                'id': spider.crawl_id,
                'job_id': getattr(spider, 'motor_job_id'),
                'seed': spider.domain,
                'status': reason,
                'stats': spider.crawler.stats.get_stats(spider),
                'downloads': self._downloader_stats(spider.crawler)
            })

    # FIXME: methods below are ugly for two reasons:
    # 1. they assume spiders have certain attributes;
    # 2. they try to get crawling status based on auxiliary information.

    def get_jobs(self):
        """ Return a list of active jobs """
        crawlers = [
            crawler for crawler in self.crawlers if crawler.spider is not None
        ]
        return [
            {
                'id': crawler.spider.crawl_id,
                'job_id': getattr(crawler.spider, 'motor_job_id'),
                'seed': crawler.spider.domain,
                'status': self._get_crawler_status(crawler),
                'stats': crawler.spider.crawler.stats.get_stats(
                    crawler.spider),
                'downloads': self._downloader_stats(crawler)
                # 'engine_info': dict(get_engine_status(crawler.engine))
            } for crawler in crawlers
        ]

    @classmethod
    def _downloader_stats(cls, crawler):
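        """ Snapshot of the downloader state: active requests and slots. """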
        downloader = crawler.engine.downloader
        return {
            'active': [cls._request_info(req) for req in downloader.active],
            'slots': sorted(
                [cls._slot_info(key, slot)
                 for key, slot in downloader.slots.items()],
                key=operator.itemgetter('key')),
        }

    @classmethod
    def _request_info(cls, request):
        return {'url': request.url, 'method': request.method}

    @classmethod
    def _slot_info(cls, key, slot):
        return {
            'key': key,
            'concurrency': slot.concurrency,
            'delay': slot.delay,
            'lastseen': slot.lastseen,
            'len(queue)': len(slot.queue),
            'transferring': [cls._request_info(req)
                             for req in slot.transferring],
            'active': [cls._request_info(req) for req in slot.active],
        }

    def _get_crawler_status(self, crawler):
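        """ Return a human-readable status of the job run by this crawler. """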
        if crawler.spider is None:
            return "unknown"
        if not crawler.crawling:
            return "stopping"
        if int(crawler.spider.crawl_id) in self._paused_jobs:
            return "suspended"
        return "crawling"

    @property
    def jobs(self):
        """ Current crawl state """
        # filter out active jobs which are in fact finished
        finished_ids = {job['id'] for job in self._finished_jobs}
        active_jobs = [
            job for job in self.get_jobs() if job['id'] not in finished_ids
        ]

        return active_jobs + self._finished_jobs
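

# Usage sketch (illustrative only): ArachnadoCrawlerProcess is normally driven
# by Arachnado itself; the names below (MySpider, the settings values, the
# literal job id 1) are assumptions, not part of this module. The spider is
# expected to expose the attributes used above (crawl_id, domain,
# motor_job_id).
#
#     process = ArachnadoCrawlerProcess(settings={'LOG_LEVEL': 'INFO'})
#     process.crawl(MySpider, domain='example.com')  # crawl_id assigned here
#     process.start()
#
# While the reactor is running, jobs can be controlled by their crawl_id,
# e.g. process.pause_job(1), process.resume_job(1) or process.stop_job(1).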