Example #1
class EventedStatsCollector(StatsCollector):
    """
    Stats Collector which allows subscribing to value changes.
    Update notifications are throttled: the interval between updates is no shorter
    than ``accumulate_time``.

    It is assumed that stat keys are never deleted.
    """
    accumulate_time = 0.1  # value is in seconds

    def __init__(self, crawler):
        super(EventedStatsCollector, self).__init__(crawler)
        self.signals = SignalManager(self)
        self._changes = {}
        self._task = PeriodicCallback(self.emit_changes,
                                      self.accumulate_time * 1000)
        self._task.start()

        # FIXME: this is ugly
        self.crawler = crawler  # used by ArachnadoCrawlerProcess

    def emit_changes(self):
        if self._changes:
            changes, self._changes = self._changes, {}
            self.signals.send_catch_log(stats_changed, changes=changes)

    def open_spider(self, spider):
        super(EventedStatsCollector, self).open_spider(spider)
        self._task.start()

    def close_spider(self, spider, reason):
        super(EventedStatsCollector, self).close_spider(spider, reason)
        self._task.stop()
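
A minimal, standalone sketch of the throttling pattern used above: stat changes are accumulated in a dict and flushed through a callback at most once per ``accumulate_time``. It only assumes tornado is installed and does not depend on Scrapy; the class and callback names are illustrative.

from tornado.ioloop import IOLoop, PeriodicCallback

class ThrottledChanges(object):
    accumulate_time = 0.1  # seconds

    def __init__(self, on_flush):
        self._changes = {}
        self._on_flush = on_flush
        # flush accumulated changes at most once per accumulate_time
        self._task = PeriodicCallback(self._flush, self.accumulate_time * 1000)
        self._task.start()

    def set(self, key, value):
        # writes are cheap; nothing is emitted until the next tick
        self._changes[key] = value

    def _flush(self):
        if self._changes:
            changes, self._changes = self._changes, {}
            self._on_flush(changes)

if __name__ == "__main__":
    t = ThrottledChanges(lambda changes: print("flushed:", changes))
    for i in range(5):
        t.set("item_scraped_count", i)
    IOLoop.current().call_later(0.3, IOLoop.current().stop)
    IOLoop.current().start()
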
Example #2
class EventedStatsCollector(StatsCollector):
    """
    Stats Collector which allows subscribing to value changes.
    Update notifications are throttled: the interval between updates is no shorter
    than ``accumulate_time``.

    It is assumed that stat keys are never deleted.
    """
    accumulate_time = 0.1  # value is in seconds

    def __init__(self, crawler):
        super(EventedStatsCollector, self).__init__(crawler)
        self.signals = SignalManager(self)
        self._changes = {}
        self._task = PeriodicCallback(self.emit_changes, self.accumulate_time*1000)
        self._task.start()

        # FIXME: this is ugly
        self.crawler = crawler  # used by ArachnadoCrawlerProcess

    def emit_changes(self):
        if self._changes:
            changes, self._changes = self._changes, {}
            self.signals.send_catch_log(stats_changed, changes=changes)

    def open_spider(self, spider):
        super(EventedStatsCollector, self).open_spider(spider)
        self._task.start()

    def close_spider(self, spider, reason):
        super(EventedStatsCollector, self).close_spider(spider, reason)
        self._task.stop()
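
To register a collector like this in a Scrapy project, the usual route is the STATS_CLASS setting; the dotted module path below is an assumption about where the class lives.

# settings.py
STATS_CLASS = "myproject.stats.EventedStatsCollector"  # illustrative module path
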
Example #3
class ProcessStatsMonitor(object):
    """ A class which emits process stats periodically """

    signal_updated = object()

    def __init__(self, interval=1.0):
        self.signals = SignalManager(self)
        self.process = psutil.Process(os.getpid())
        self.interval = interval
        self._task = PeriodicCallback(self._emit, self.interval * 1000)
        self._recent = {}

    def start(self):
        # yappi.start()
        self._task.start()

    def stop(self):
        self._task.stop()
        # stats = yappi.get_func_stats()
        # stats.sort('tsub', 'desc')
        # with open("func-stats.txt", 'wt') as f:
        #     stats.print_all(f, columns={
        #         0: ("name", 80),
        #         1: ("ncall", 10),
        #         2: ("tsub", 8),
        #         3: ("ttot", 8),
        #         4: ("tavg",8)
        #     })
        #
        # pstats = yappi.convert2pstats(stats)
        # pstats.dump_stats("func-stats.prof")

    def get_recent(self):
        return self._recent

    def _emit(self):
        cpu_times = self.process.cpu_times()
        ram_usage = self.process.memory_info()
        stats = {
            'ram_percent': self.process.memory_percent(),
            'ram_rss': ram_usage.rss,
            'ram_vms': ram_usage.vms,
            'cpu_percent': self.process.cpu_percent(),
            'cpu_time_user': cpu_times.user,
            'cpu_time_system': cpu_times.system,
            'num_fds': self.process.num_fds(),
            'context_switches': self.process.num_ctx_switches(),
            'num_threads': self.process.num_threads(),
            'server_time': int(time.time() * 1000),
        }
        self._recent = stats
        self.signals.send_catch_log(self.signal_updated, stats=stats)
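
A hedged usage sketch for ProcessStatsMonitor: a consumer connects a callback to ``signal_updated`` through the monitor's SignalManager and receives the stats dict. It assumes tornado, psutil and Scrapy are installed and that the ProcessStatsMonitor class above is in scope; the callback name is illustrative.

from tornado.ioloop import IOLoop

def on_stats(stats, **kwargs):
    # called roughly once per `interval` seconds
    print("rss=%d bytes, cpu=%.1f%%" % (stats['ram_rss'], stats['cpu_percent']))

monitor = ProcessStatsMonitor(interval=1.0)
monitor.signals.connect(on_stats, monitor.signal_updated, weak=False)
monitor.start()
IOLoop.current().start()  # PeriodicCallback only fires inside a running IOLoop
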
Example #4
class ProcessStatsMonitor(object):
    """ A class which emits process stats periodically """

    signal_updated = object()

    def __init__(self, interval=1.0):
        self.signals = SignalManager(self)
        self.process = psutil.Process(os.getpid())
        self.interval = interval
        self._task = PeriodicCallback(self._emit, self.interval*1000)
        self._recent = {}

    def start(self):
        # yappi.start()
        self._task.start()

    def stop(self):
        self._task.stop()
        # stats = yappi.get_func_stats()
        # stats.sort('tsub', 'desc')
        # with open("func-stats.txt", 'wt') as f:
        #     stats.print_all(f, columns={
        #         0: ("name", 80),
        #         1: ("ncall", 10),
        #         2: ("tsub", 8),
        #         3: ("ttot", 8),
        #         4: ("tavg",8)
        #     })
        #
        # pstats = yappi.convert2pstats(stats)
        # pstats.dump_stats("func-stats.prof")

    def get_recent(self):
        return self._recent

    def _emit(self):
        cpu_times = self.process.cpu_times()
        ram_usage = self.process.memory_info()
        stats = {
            'ram_percent': self.process.memory_percent(),
            'ram_rss': ram_usage.rss,
            'ram_vms': ram_usage.vms,
            'cpu_percent': self.process.cpu_percent(),
            'cpu_time_user': cpu_times.user,
            'cpu_time_system': cpu_times.system,
            'num_fds': self.process.num_fds(),
            'context_switches': self.process.num_ctx_switches(),
            'num_threads': self.process.num_threads(),
            'server_time': int(time.time()*1000),
        }
        self._recent = stats
        self.signals.send_catch_log(self.signal_updated, stats=stats)
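
For reference, the same figures come straight from psutil and need no event loop at all; a standalone sketch:

import os
import time

import psutil

p = psutil.Process(os.getpid())
mem = p.memory_info()
print({
    'ram_percent': p.memory_percent(),
    'ram_rss': mem.rss,
    'ram_vms': mem.vms,
    'cpu_percent': p.cpu_percent(interval=0.1),  # blocking one-off reading
    'num_threads': p.num_threads(),
    'server_time': int(time.time() * 1000),
})
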
Example #5
class ProcessStatsMonitor(object):
    """查看进程状态, 每秒发布一次"""
    signal_updated = object()

    def __init__(self, interval=1.0):
        self.signals = SignalManager(self)
        self.process = psutil.Process(os.getpid())
        self.interval = interval
        self._task = PeriodicCallback(self._emit, self.interval * 1000)
        self._recent = {}

    def start(self):
        """启动进程"""
        self._task.start()

    def stop(self):
        """停止进程"""
        self._task.stop()

    def get_recent(self):
        """当前进程信息"""
        return self._recent

    def _emit(self):
        """进程属性"""
        cpu_times = self.process.cpu_times()
        ram_usage = self.process.memory_info()
        stats = {
            # memory usage percent
            'ram_percent': self.process.memory_percent(),
            # resident set size (RSS)
            'ram_rss': ram_usage.rss,
            # virtual memory size (VMS)
            'ram_vms': ram_usage.vms,
            # CPU percent
            'cpu_percent': self.process.cpu_percent(),
            # user CPU time
            'cpu_time_user': cpu_times.user,
            # system CPU time
            'cpu_time_system': cpu_times.system,
            # context switches
            'context_switches': self.process.num_ctx_switches(),
            # number of threads
            'num_threads': self.process.num_threads(),
            # current server time (ms)
            'server_time': int(time.time() * 1000)
        }
        # most recent snapshot
        self._recent = stats
        self.signals.send_catch_log(self.signal_updated, stats=stats)
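
Code that already runs on the same IOLoop can also skip signals entirely and poll the most recent snapshot; a short hedged sketch (variable names are illustrative):

monitor = ProcessStatsMonitor()
monitor.start()
# ... later, e.g. from a Tornado request handler on the same IOLoop:
latest = monitor.get_recent()  # empty dict until the first interval has elapsed
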
Example #6
class MongoStorage(object):
    """
    Utility class for working with MongoDB data.
    It supports CRUD operations and allows subscribing to
    created/updated/deleted events.
    """
    def __init__(self, mongo_uri, cache=False):
        self.mongo_uri = mongo_uri
        _, _, _, _, self.col = motor_from_uri(mongo_uri)
        self.signal_manager = SignalManager()
        # Used for unsubscribe
        # disconnect() requires reference to original callback
        self._callbacks = {}
        self.fetching = False
        self.signals = {
            'created': object(),
            'updated': object(),
            'deleted': object(),
        }
        # XXX: cache is used in arachnado.cron and arachnado.site_checker.
        # Is it needed?
        self.cache_flag = cache
        if cache:
            self.cache = defaultdict(dict)
        else:
            self.cache = None

    def subscribe(self, events=None, callback=None):
        if events is None:
            events = self.available_events
        if not isinstance(events, list):
            events = [events]
        for event_name in events:
            if event_name not in self.signals:
                raise ValueError('Invalid event name: {}'.format(event_name))
            self.signal_manager.connect(callback,
                                        self.signals[event_name],
                                        weak=False)
            self._callbacks[event_name] = callback

    def unsubscribe(self, events=None):
        if events is None:
            events = self.available_events
        if not isinstance(events, list):
            events = [events]
        for event_name in events:
            try:
                self.signal_manager.disconnect(
                    self._callbacks[event_name],
                    self.signals[event_name],
                    weak=False
                )
                self._callbacks.pop(event_name, None)
            except KeyError:
                # FIXME: when can it happen?
                pass

    @property
    def available_events(self):
        return list(self.signals.keys())

    @coroutine
    def fetch(self, query=None):
        if self.fetching:
            return
        self.fetching = True
        docs = []
        cursor = self.col.find(query)
        while (yield cursor.fetch_next):
            doc = cursor.next_object()
            docs.append(doc)
            #if self.cache is not None:
            #    self.cache[str(doc['_id'])] = doc
            #    if str(doc['_id']) not in self.cache:
            #        self.signal_manager.send_catch_log(
            #            self.signals['created'], data=doc
            #        )
        self.fetching = False
        raise Return(docs)

    @coroutine
    def create(self, doc):
        doc = replace_dots(doc)
        result = yield self.col.insert(doc)
        if self.cache is not None:
            self.cache[str(doc['_id'])] = doc
        self.signal_manager.send_catch_log(self.signals['created'], data=doc)
        raise Return(result)

    @coroutine
    def ensure_index(self, key_or_list):
        result = yield self.col.ensure_index(key_or_list)
        raise Return(result)

    @coroutine
    def update(self, doc):
        doc = replace_dots(doc)
        doc_copy = deepcopy(doc)
        doc_copy.pop('_id')
        result = yield self.col.update({
            '_id': ObjectId(doc['_id'])
        }, {
            '$set': doc_copy
        })
        if self.cache is not None:
            self.cache[str(doc['_id'])].update(doc)
        self.signal_manager.send_catch_log(self.signals['updated'], data=doc)
        raise Return(result)

    @coroutine
    def delete(self, doc):
        result = yield self.col.remove({'_id': ObjectId(doc['_id'])})
        if self.cache is not None:
            self.cache.pop(str(doc['_id']), None)
        self.signal_manager.send_catch_log(self.signals['deleted'], data=doc)
        raise Return(result)
class ArachnadoCrawlerProcess(CrawlerProcess):
    """
    CrawlerProcess which sets up a global signals manager,
    assigns unique ids to each spider job, works around some Scrapy
    issues and provides extra stats.
    """
    crawl_ids = itertools.count(start=1)

    def __init__(self, settings=None):
        self.signals = SignalManager(self)
        self.signals.connect(self.on_spider_closed,
                             CrawlerProcessSignals.spider_closed)
        self._finished_jobs = []
        self._paused_jobs = set()
        self.procmon = ProcessStatsMonitor()
        self.procmon.start()

        super(ArachnadoCrawlerProcess, self).__init__(settings or {})

        # don't log DepthMiddleware messages
        # see https://github.com/scrapy/scrapy/issues/1308
        logging.getLogger("scrapy.spidermiddlewares.depth").setLevel(
            logging.INFO)

    def crawl(self, crawler_or_spidercls, *args, **kwargs):
        kwargs['crawl_id'] = next(self.crawl_ids)

        crawler = crawler_or_spidercls
        if not isinstance(crawler_or_spidercls, Crawler):
            crawler = self._create_crawler(crawler_or_spidercls)

        # aggregate all crawler signals
        for name in SCRAPY_SIGNAL_NAMES:
            crawler.signals.connect(self._resend_signal,
                                    getattr(signals, name))

        # aggregate signals from crawler EventedStatsCollectors
        if hasattr(crawler.stats, "signals"):
            crawler.stats.signals.connect(self._resend_signal,
                                          stats.stats_changed)

        d = super(ArachnadoCrawlerProcess, self).crawl(crawler_or_spidercls,
                                                       *args, **kwargs)
        return d

    def _create_crawler(self, spidercls):
        if isinstance(spidercls, six.string_types):
            spidercls = self.spider_loader.load(spidercls)
        return ArachnadoCrawler(spidercls, self.settings)

    def stop_job(self, crawl_id):
        """ Stop a single crawl job """
        self.get_crawler(crawl_id).stop()

    def pause_job(self, crawl_id):
        """ Pause a crawling job """
        self._paused_jobs.add(crawl_id)
        self.get_crawler(crawl_id).engine.pause()

    def resume_job(self, crawl_id):
        """ Resume a crawling job """
        self._paused_jobs.remove(crawl_id)
        self.get_crawler(crawl_id).engine.unpause()

    def get_crawler(self, crawl_id):
        for crawler in self.crawlers:
            if getattr(crawler.spider, "crawl_id") == crawl_id:
                return crawler
        raise KeyError("Job is not known: %s" % crawl_id)

    def _resend_signal(self, **kwargs):
        # FIXME: this is a mess. Signal handling should be unified somehow:
        # there shouldn't be two separate code paths
        # for CrawlerProcessSignals and STAT_SIGNALS.
        signal = kwargs['signal']
        if signal in STAT_SIGNALS:
            signal = STAT_SIGNALS[signal]
            kwargs['crawler'] = kwargs.pop('sender').crawler
        else:
            signal = CrawlerProcessSignals.signal(signal)
            kwargs['crawler'] = kwargs.pop('sender')

        kwargs['signal'] = signal
        if signal.supports_defer:
            return self.signals.send_catch_log_deferred(**kwargs)
        else:
            return self.signals.send_catch_log(**kwargs)

    def stop(self):
        """ Terminate the process (exit from application). """
        self.procmon.stop()
        return super(ArachnadoCrawlerProcess, self).stop()

    def on_spider_closed(self, spider, reason):
        # spiders are closed not that often, insert(0,...) should be fine
        self._finished_jobs.insert(
            0, {
                'id': spider.crawl_id,
                'job_id': getattr(spider, 'motor_job_id'),
                'seed': spider.domain,
                'status': reason,
                'stats': spider.crawler.stats.get_stats(spider),
                'downloads': self._downloader_stats(spider.crawler)
            })

    # FIXME: methods below are ugly for two reasons:
    # 1. they assume spiders have certain attributes;
    # 2. they try to get crawling status based on auxiliary information.

    def get_jobs(self):
        """ Return a list of active jobs """
        crawlers = [
            crawler for crawler in self.crawlers if crawler.spider is not None
        ]
        return [
            {
                'id': crawler.spider.crawl_id,
                'job_id': getattr(crawler.spider, 'motor_job_id'),
                'seed': crawler.spider.domain,
                'status': self._get_crawler_status(crawler),
                'stats':
                crawler.spider.crawler.stats.get_stats(crawler.spider),
                'downloads': self._downloader_stats(crawler)
                # 'engine_info': dict(get_engine_status(crawler.engine))
            } for crawler in crawlers
        ]

    @classmethod
    def _downloader_stats(cls, crawler):
        downloader = crawler.engine.downloader
        return {
            'active': [cls._request_info(req) for req in downloader.active],
            'slots':
            sorted([
                cls._slot_info(key, slot)
                for key, slot in downloader.slots.items()
            ],
                   key=operator.itemgetter('key'))
        }

    @classmethod
    def _request_info(cls, request):
        return {'url': request.url, 'method': request.method}

    @classmethod
    def _slot_info(cls, key, slot):
        return {
            'key': key,
            'concurrency': slot.concurrency,
            'delay': slot.delay,
            'lastseen': slot.lastseen,
            'len(queue)': len(slot.queue),
            'transferring':
            [cls._request_info(req) for req in slot.transferring],
            'active': [cls._request_info(req) for req in slot.active],
        }

    def _get_crawler_status(self, crawler):
        if crawler.spider is None:
            return "unknown"
        if not crawler.crawling:
            return "stopping"
        if int(crawler.spider.crawl_id) in self._paused_jobs:
            return "suspended"
        return "crawling"

    @property
    def jobs(self):
        """ Current crawl state """
        # filter out active jobs which are in fact finished
        finished_ids = {job['id'] for job in self._finished_jobs}
        active_jobs = [
            job for job in self.get_jobs() if job['id'] not in finished_ids
        ]

        return active_jobs + self._finished_jobs
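
A hedged sketch of driving ArachnadoCrawlerProcess end to end. The spider name, spider arguments and settings dict are assumptions; get_jobs() and on_spider_closed expect spiders to expose ``crawl_id`` (assigned automatically by crawl()), ``domain`` and ``motor_job_id``, which is why the latter two are passed as spider arguments here.

process = ArachnadoCrawlerProcess({"TELNETCONSOLE_ENABLED": False})  # settings are illustrative
process.crawl("myspider", domain="example.com", motor_job_id="job-1")
process.start()       # blocks until all crawls finish
print(process.jobs)   # finished job summaries remain available after the run
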
Example #8
class MyselfCrawlerProcess(CrawlerProcess):
    """signals管理, spider管理"""
    crawl_ids = itertools.count(start=1)

    def __init__(self, settings=None):
        self.signals = SignalManager(self)
        self.signals.connect(self.on_spider_closed,
                             CrawlerProcessSignals.spider_closed)
        self._finished_jobs = []
        self._paused_jobs = set()
        self.procmon = ProcessStatsMonitor()
        self.procmon.start()

        super(MyselfCrawlerProcess, self).__init__(settings or {})
        logging.getLogger('scrapy.spidermiddlewares.depth').setLevel(
            logging.INFO)

    def crawl(self, crawler_or_spidercls, *args, **kwargs):
        kwargs['crawl_id'] = next(self.crawl_ids)
        crawler = crawler_or_spidercls
        if not isinstance(crawler_or_spidercls, Crawler):
            crawler = self._create_crawler(crawler_or_spidercls)

        # aggregate crawler signals
        for name in SCRAPY_SIGNAL_NAMES:
            crawler.signals.connect(self._resend_signal,
                                    getattr(signals, name))

        if hasattr(crawler.stats, "signals"):
            crawler.stats.signals.connect(self._resend_signal,
                                          stats.stats_changed)

        d = super(MyselfCrawlerProcess, self).crawl(crawler_or_spidercls,
                                                    *args, **kwargs)
        return d

    def _create_crawler(self, spidercls):
        """新建crawler"""
        if isinstance(spidercls, six.string_types):
            spidercls = self.spider_loader.load(spidercls)
        # construct a Crawler for this spider class (not another CrawlerProcess)
        return Crawler(spidercls, self.settings)

    def stop_job(self, crawl_id):
        """crawl job停止信号"""
        self.get_crawler(crawl_id).stop()

    def pause_job(self, crawl_id):
        """crawl job暂停"""
        self._paused_jobs.add(crawl_id)
        self.get_crawler(crawl_id).engine.pause()

    def resume_job(self, crawl_id):
        """crawl job恢复"""
        self._paused_jobs.remove(crawl_id)
        self.get_crawler(crawl_id).engine.unpause()

    def get_crawler(self, crawl_id):
        """获取crawl"""
        for crawler in self.crawlers:
            if getattr(crawler.spider, "crawl_id") == crawl_id:
                return crawler
        raise KeyError("Job is not known: %s" % crawl_id)

    def _resend_signal(self, **kwargs):
        # FIXME: signal handling here is a mess.
        signal = kwargs['signal']
        if signal in STAT_SIGNALS:
            signal = STAT_SIGNALS[signal]
            kwargs['crawler'] = kwargs.pop('sender').crawler
        else:
            signal = CrawlerProcessSignals.signal(signal)
            kwargs['crawler'] = kwargs.pop('sender')

        kwargs['signal'] = signal
        if signal.supports_defer:
            return self.signals.send_catch_log_deferred(**kwargs)
        else:
            return self.signals.send_catch_log(**kwargs)

    def stop(self):
        """停止crawl process"""
        self.procmon.stop()
        return super(MyselfCrawlerProcess, self).stop()

    def on_spider_closed(self, spider, reason):
        """spider关闭时写入"""
        self._finished_jobs.insert(
            0, {
                'id': spider.crawl_id,
                'job_id': getattr(spider, 'motor_job_id'),
                'seed': spider.domain,
                'status': reason,
                'stats': spider.crawler.stats.get_stats(spider),
                'downloads': self._downloader_stats(spider.crawler)
            })

    def get_jobs(self):
        """获取运行中的job"""
        crawlers = [
            crawler for crawler in self.crawlers if crawler.spider is not None
        ]
        return [{
            'id': crawler.spider.crawl_id,
            'job_id': getattr(crawler.spider, 'motor_job_id'),
            'seed': crawler.spider.domain,
            'status': self._get_crawler_status(crawler),
            'stats': crawler.spider.crawler.stats.get_stats(crawler.spider),
            'downloads': self._downloader_stats(crawler)
        } for crawler in crawlers]

    @classmethod
    def _downloader_stats(cls, crawler):
        """下载器状态"""
        downloader = crawler.engine.downloader
        return {
            'active': [cls._request_info(req) for req in downloader.active],
            'slots':
            sorted([
                cls._slot_info(key, slot)
                for key, slot in downloader.slots.items()
            ],
                   key=operator.itemgetter('key'))
        }

    @classmethod
    def _request_info(cls, request):
        """request消息"""
        return {'url': request.url, 'method': request.method}

    @classmethod
    def _slot_info(cls, key, slot):
        """slot消息"""
        return {
            'key': key,
            'concurrency': slot.concurrency,
            'delay': slot.delay,
            'lastseen': slot.lastseen,
            'len(queue)': len(slot.queue),
            'transferring':
            [cls._request_info(req) for req in slot.transferring],
            'active': [cls._request_info(req) for req in slot.active]
        }

    def _get_crawler_status(self, crawler):
        """crawler运行状态"""
        if crawler.spider is None:
            return "unknown"
        if not crawler.crawling:
            return "stopping"
        if int(crawler.spider.crawl_id) in self._paused_jobs:
            return "suspended"
        return "crawling"

    @property
    def jobs(self):
        """完成和未完成crawl状态"""
        finished_ids = {job['id'] for job in self._finished_jobs}
        active_jobs = [
            job for job in self.get_jobs() if job['id'] not in finished_ids
        ]

        return active_jobs + self._finished_jobs
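
Pausing and resuming works by crawl id on the process object; a short hedged sketch (the spider name is an assumption, and ids start at 1 because of ``crawl_ids = itertools.count(start=1)``):

process = MyselfCrawlerProcess()
process.crawl("myspider")
# later, while the reactor is running (e.g. from a web handler):
process.pause_job(1)    # engine.pause() stops scheduling new requests
process.resume_job(1)   # engine.unpause() continues the crawl
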
class ArachnadoCrawlerProcess(CrawlerProcess):
    """
    CrawlerProcess which sets up a global signals manager,
    assigns unique ids to each spider job, works around some Scrapy
    issues and provides extra stats.
    """

    crawl_ids = itertools.count(start=1)

    def __init__(self, settings=None):
        self.signals = SignalManager(self)
        self.signals.connect(self.on_spider_closed, CrawlerProcessSignals.spider_closed)
        self._finished_jobs = []
        self._paused_jobs = set()
        self.procmon = ProcessStatsMonitor()
        self.procmon.start()

        super(ArachnadoCrawlerProcess, self).__init__(settings or {})

        # don't log DepthMiddleware messages
        # see https://github.com/scrapy/scrapy/issues/1308
        logging.getLogger("scrapy.spidermiddlewares.depth").setLevel(logging.INFO)

    def crawl(self, crawler_or_spidercls, *args, **kwargs):
        kwargs["crawl_id"] = next(self.crawl_ids)

        crawler = crawler_or_spidercls
        if not isinstance(crawler_or_spidercls, Crawler):
            crawler = self._create_crawler(crawler_or_spidercls)

        # aggregate all crawler signals
        for name in SCRAPY_SIGNAL_NAMES:
            crawler.signals.connect(self._resend_signal, getattr(signals, name))

        # aggregate signals from crawler EventedStatsCollectors
        if hasattr(crawler.stats, "signals"):
            crawler.stats.signals.connect(self._resend_signal, stats.stats_changed)

        d = super(ArachnadoCrawlerProcess, self).crawl(crawler_or_spidercls, *args, **kwargs)
        return d

    def _create_crawler(self, spidercls):
        if isinstance(spidercls, six.string_types):
            spidercls = self.spider_loader.load(spidercls)
        return ArachnadoCrawler(spidercls, self.settings)

    def stop_job(self, crawl_id):
        """ Stop a single crawl job """
        self.get_crawler(crawl_id).stop()

    def pause_job(self, crawl_id):
        """ Pause a crawling job """
        self._paused_jobs.add(crawl_id)
        self.get_crawler(crawl_id).engine.pause()

    def resume_job(self, crawl_id):
        """ Resume a crawling job """
        self._paused_jobs.remove(crawl_id)
        self.get_crawler(crawl_id).engine.unpause()

    def get_crawler(self, crawl_id):
        for crawler in self.crawlers:
            if getattr(crawler.spider, "crawl_id") == crawl_id:
                return crawler
        raise KeyError("Job is not known: %s" % crawl_id)

    def _resend_signal(self, **kwargs):
        # FIXME: this is a mess. Signal handling should be unified somehow:
        # there shouldn't be two separate code paths
        # for CrawlerProcessSignals and STAT_SIGNALS.
        signal = kwargs["signal"]
        if signal in STAT_SIGNALS:
            signal = STAT_SIGNALS[signal]
            kwargs["crawler"] = kwargs.pop("sender").crawler
        else:
            signal = CrawlerProcessSignals.signal(signal)
            kwargs["crawler"] = kwargs.pop("sender")

        kwargs["signal"] = signal
        if signal.supports_defer:
            return self.signals.send_catch_log_deferred(**kwargs)
        else:
            return self.signals.send_catch_log(**kwargs)

    def stop(self):
        """ Terminate the process (exit from application). """
        self.procmon.stop()
        return super(ArachnadoCrawlerProcess, self).stop()

    def on_spider_closed(self, spider, reason):
        # spiders are closed not that often, insert(0,...) should be fine
        self._finished_jobs.insert(
            0,
            {
                "id": spider.crawl_id,
                "job_id": getattr(spider, "motor_job_id"),
                "seed": spider.domain,
                "status": reason,
                "stats": spider.crawler.stats.get_stats(spider),
                "downloads": self._downloader_stats(spider.crawler),
            },
        )

    # FIXME: methods below are ugly for two reasons:
    # 1. they assume spiders have certain attributes;
    # 2. they try to get crawling status based on auxilary information.

    def get_jobs(self):
        """ Return a list of active jobs """
        crawlers = [crawler for crawler in self.crawlers if crawler.spider is not None]
        return [
            {
                "id": crawler.spider.crawl_id,
                "job_id": getattr(crawler.spider, "motor_job_id"),
                "seed": crawler.spider.domain,
                "status": self._get_crawler_status(crawler),
                "stats": crawler.spider.crawler.stats.get_stats(crawler.spider),
                "downloads": self._downloader_stats(crawler)
                # 'engine_info': dict(get_engine_status(crawler.engine))
            }
            for crawler in crawlers
        ]

    @classmethod
    def _downloader_stats(cls, crawler):
        downloader = crawler.engine.downloader
        return {
            "active": [cls._request_info(req) for req in downloader.active],
            "slots": sorted(
                [cls._slot_info(key, slot) for key, slot in downloader.slots.items()], key=operator.itemgetter("key")
            ),
        }

    @classmethod
    def _request_info(cls, request):
        return {"url": request.url, "method": request.method}

    @classmethod
    def _slot_info(cls, key, slot):
        return {
            "key": key,
            "concurrency": slot.concurrency,
            "delay": slot.delay,
            "lastseen": slot.lastseen,
            "len(queue)": len(slot.queue),
            "transferring": [cls._request_info(req) for req in slot.transferring],
            "active": [cls._request_info(req) for req in slot.active],
        }

    def _get_crawler_status(self, crawler):
        if crawler.spider is None:
            return "unknown"
        if not crawler.crawling:
            return "stopping"
        if int(crawler.spider.crawl_id) in self._paused_jobs:
            return "suspended"
        return "crawling"

    @property
    def jobs(self):
        """ Current crawl state """
        # filter out active jobs which are in fact finished
        finished_ids = {job["id"] for job in self._finished_jobs}
        active_jobs = [job for job in self.get_jobs() if job["id"] not in finished_ids]

        return active_jobs + self._finished_jobs
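
Because _resend_signal re-dispatches every crawler's signals on the process-level SignalManager, application code can observe all spiders in one place and receives ``crawler`` instead of ``sender``. A hedged sketch (the handler name is illustrative):

def on_any_spider_closed(spider, reason, crawler, **kwargs):
    # fired for every spider managed by this process
    print("job %s finished with status %r" % (spider.crawl_id, reason))

process = ArachnadoCrawlerProcess()
process.signals.connect(on_any_spider_closed, CrawlerProcessSignals.spider_closed)
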