Example #1
class CustomCrawler(Crawler):
    def __init__(self, spidercls, settings=None):
        if isinstance(settings, dict) or settings is None:
            settings = Settings(settings)

        self.spidercls = spidercls
        self.settings = settings.copy()
        self.spidercls.update_settings(self.settings)

        d = dict(overridden_settings(self.settings))
        logger.info("Overridden settings: %(settings)r", {'settings': d})

        self.signals = SignalManager(self)
        self.stats = load_object(self.settings['STATS_CLASS'])(self)

        handler = LogCounterHandler(self, level=self.settings.get('LOG_LEVEL'))
        logging.root.addHandler(handler)
        if custom_get_scrapy_root_handler() is not None:
            # scrapy root handler already installed: update it with new settings
            # install_scrapy_root_handler(self.settings)
            custom_install_scrapy_root_handler(self.settings, spidercls.name)
        # lambda is assigned to Crawler attribute because this way it is not
        # garbage collected after leaving __init__ scope
        self.__remove_handler = lambda: logging.root.removeHandler(handler)
        self.signals.connect(self.__remove_handler, signals.engine_stopped)

        lf_cls = load_object(self.settings['LOG_FORMATTER'])
        self.logformatter = lf_cls.from_crawler(self)
        self.extensions = ExtensionManager.from_crawler(self)

        self.settings.freeze()
        self.crawling = False
        self.spider = None
        self.engine = None
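Example #1 departs from stock Scrapy only in the custom_get_scrapy_root_handler / custom_install_scrapy_root_handler calls, whose definitions are not part of the snippet. Below is a hypothetical sketch of what such helpers could look like if the intent is to reinstall the root handler with a per-spider log file; the module-level handler reference and the spider_name-based file name are assumptions, not the original implementation.

import logging

_scrapy_root_handler = None  # module-level reference, assumed by this sketch

def custom_get_scrapy_root_handler():
    return _scrapy_root_handler

def custom_install_scrapy_root_handler(settings, spider_name):
    # replace the current root handler with one that logs to a per-spider file
    global _scrapy_root_handler
    if _scrapy_root_handler is not None:
        logging.root.removeHandler(_scrapy_root_handler)
    log_file = settings.get('LOG_FILE') or '%s.log' % spider_name
    handler = logging.FileHandler(log_file, encoding='utf-8')
    handler.setLevel(settings.get('LOG_LEVEL', 'DEBUG'))
    handler.setFormatter(logging.Formatter(settings.get(
        'LOG_FORMAT', '%(asctime)s [%(name)s] %(levelname)s: %(message)s')))
    logging.root.addHandler(handler)
    _scrapy_root_handler = handler
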
Example #2
    def __init__(self,**kwargs):
        
        if not 'config' in kwargs:
            err =  'failed to find seed file (config=*.conf)'
            print err
#         if not 'keywords' in kwargs:
#             err =  'failed to find seed file (keywords=*.dat)'
#             print err
        config = kwargs['config']
#         self.keywords = kwargs['keywords']
        self.load_conf(config)
        if self.Sleep_Flag=='SEARCH_ENGINE_SLEEP' or self.Sleep_Flag=='true' or not self.Sleep_Flag:
            settings.set('RANDOMIZE_DOWNLOAD_DELAY', True, priority='cmdline')
            settings.set('DOWNLOAD_DELAY', float(self.SE_Sleep_Base), priority='cmdline')
        else:
            settings.set('RANDOMIZE_DOWNLOAD_DELAY', False, priority='cmdline')
        
        log_filename = self.conf_name.replace('.conf','')+'.log'
        settings.set('LOG_FILE', log_filename, priority='cmdline')
        #redis key
        self.meta_next_url = meta_redis_key()
        # initialize redis
        self.init_redis()
        self.redis_keyword = get_redis_key(self.conf_name)
        # register signal handlers
        sig = SignalManager(dispatcher.Any)
        sig.connect(self.idle,signal=signals.spider_idle)
        self.metatime = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        # store the URLs fetched in this run, used to check whether this crawl duplicates the previous one: {keyword: md5(url)}
        self.urlmd5 = dict()
        self.log_writer = open('log.dat','a+') 
        self.date_from_url_re = re.compile("[-_/][a-zA-Z]*[-_]?(?P<year>(20)?([0-1][0-9]))([-_/])?(?P<m>(10|11|12|(0?[1-9])){1})([-_/])?(?P<day>(10|20|30|31|([0-2]?[1-9])){1})([-_/])")
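Example #2 ends with a fairly dense regular expression for pulling a date out of a URL path. A quick illustration of what it captures (the example URL is made up):

import re

date_from_url_re = re.compile(
    "[-_/][a-zA-Z]*[-_]?(?P<year>(20)?([0-1][0-9]))([-_/])?"
    "(?P<m>(10|11|12|(0?[1-9])){1})([-_/])?"
    "(?P<day>(10|20|30|31|([0-2]?[1-9])){1})([-_/])")

m = date_from_url_re.search('/article_2015-06-12/')
if m:
    print(m.group('year'), m.group('m'), m.group('day'))  # 2015 06 12
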
Example #3
    def __init__(self, **kwargs):

        if not 'config' in kwargs:
            err = 'failed to find seed file (config=*.conf)'
            print err
        if 'startdate' in kwargs:
            self.startdate = kwargs['startdate']
        else:
            self.startdate = (
                datetime.datetime.now() -
                datetime.timedelta(days=2)).strftime('%Y-%m-%d %H:%M:%S')
        if 'enddate' in kwargs:
            self.enddate = kwargs['enddate']
        else:
            self.enddate = datetime.datetime.now().strftime(
                '%Y-%m-%d %H:%M:%S')
#         if not 'keywords' in kwargs:
#             err =  'failed to find seed file (keywords=*.dat)'
#             print err
        config = kwargs['config']
        self.load_conf(config)
        if self.Sleep_Flag == 'SEARCH_ENGINE_SLEEP' or self.Sleep_Flag == 'true' or not self.Sleep_Flag:
            settings.set('RANDOMIZE_DOWNLOAD_DELAY', True, priority='cmdline')
            settings.set('DOWNLOAD_DELAY',
                         float(self.SE_Sleep_Base),
                         priority='cmdline')
        else:
            settings.set('RANDOMIZE_DOWNLOAD_DELAY', False, priority='cmdline')

        log_filename = self.conf_name.replace('.conf', '') + '.log'
        settings.set('LOG_FILE', log_filename, priority='cmdline')
        # initialize redis
        self.init_redis()
        self.redis_keyword = get_redis_key()
        # register signal handlers
        sig = SignalManager(dispatcher.Any)
        sig.connect(self.idle, signal=signals.spider_idle)
        sig.connect(self.close, signal=signals.spider_closed)
        self.metatime = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')

        self.conn_local = mysql.connect('meta', host='localhost')
        self.conn_local_cursor = self.conn_local.cursor()
        #        self.conn_local_cursor.execute('set global autocommit=1')
        try:
            self.meta_ip = get_meta_ip(network_card='enp7s0')
        except:
            self.meta_ip = get_meta_ip(network_card='eth0')
        # initialize the state table in the meta database
        self.init_state()
Example #4
    def __init__(self,**kwargs):
        
        if not 'config' in kwargs:
            err =  'failed to find seed file (config=*.conf)'
            print err
        if 'startdate' in kwargs:
            self.startdate = kwargs['startdate']
        else:
            self.startdate = (datetime.datetime.now()-datetime.timedelta(days=2)).strftime('%Y-%m-%d %H:%M:%S')
        if 'enddate' in kwargs:
            self.enddate = kwargs['enddate']
        else:
            self.enddate = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
#         if not 'keywords' in kwargs:
#             err =  'failed to find seed file (keywords=*.dat)'
#             print err
        config = kwargs['config']
        self.load_conf(config)
        if self.Sleep_Flag=='SEARCH_ENGINE_SLEEP' or self.Sleep_Flag=='true' or not self.Sleep_Flag:
            settings.set('RANDOMIZE_DOWNLOAD_DELAY', True, priority='cmdline')
            settings.set('DOWNLOAD_DELAY', float(self.SE_Sleep_Base), priority='cmdline')
        else:
            settings.set('RANDOMIZE_DOWNLOAD_DELAY', False, priority='cmdline')
        
        log_filename = self.conf_name.replace('.conf','')+'.log'
        settings.set('LOG_FILE', log_filename, priority='cmdline')
        # initialize redis
        self.init_redis()
        self.redis_keyword = get_redis_key()
        # register signal handlers
        sig = SignalManager(dispatcher.Any)
        sig.connect(self.idle,signal=signals.spider_idle)
        sig.connect(self.close,signal=signals.spider_closed)
        self.metatime = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        
        self.conn_local = mysql.connect('meta',host='localhost')
        self.conn_local_cursor = self.conn_local.cursor()
#        self.conn_local_cursor.execute('set global autocommit=1')
        try:
            self.meta_ip = get_meta_ip(network_card='enp7s0')
        except:
            self.meta_ip = get_meta_ip(network_card='eth0')
        # initialize the state table in the meta database
        self.init_state()
Example #5
class Crawler(object):
    def __init__(self, spidercls, settings=None):
        if isinstance(settings, dict) or settings is None:
            settings = Settings(settings)

        self.spidercls = spidercls
        self.settings = settings.copy()
        self.spidercls.update_settings(self.settings)

        self.signals = SignalManager(self)
        self.stats = load_object(self.settings['STATS_CLASS'])(self)

        handler = LogCounterHandler(self, level=settings.get('LOG_LEVEL'))
        logging.root.addHandler(handler)
        # lambda is assigned to Crawler attribute because this way it is not
        # garbage collected after leaving __init__ scope
        self.__remove_handler = lambda: logging.root.removeHandler(handler)
        self.signals.connect(self.__remove_handler, signals.engine_stopped)

        lf_cls = load_object(self.settings['LOG_FORMATTER'])
        self.logformatter = lf_cls.from_crawler(self)
        self.extensions = ExtensionManager.from_crawler(self)

        self.settings.freeze()
        self.crawling = False
        self.spider = None
        self.engine = None

    @property
    def spiders(self):
        if not hasattr(self, '_spiders'):
            warnings.warn(
                "Crawler.spiders is deprecated, use "
                "CrawlerRunner.spider_loader or instantiate "
                "scrapy.spiderloader.SpiderLoader with your "
                "settings.",
                category=ScrapyDeprecationWarning,
                stacklevel=2)
            self._spiders = _get_spider_loader(self.settings.frozencopy())
        return self._spiders

    @defer.inlineCallbacks
    def crawl(self, *args, **kwargs):
        assert not self.crawling, "Crawling already taking place"
        self.crawling = True

        try:
            self.spider = self._create_spider(*args, **kwargs)
            self.engine = self._create_engine()
            start_requests = iter(self.spider.start_requests())
            yield self.engine.open_spider(self.spider, start_requests)
            yield defer.maybeDeferred(self.engine.start)
        except Exception:
            exc = defer.fail()
            self.crawling = False
            if self.engine is not None:
                yield self.engine.close()
            yield exc

    def _create_spider(self, *args, **kwargs):
        return self.spidercls.from_crawler(self, *args, **kwargs)

    def _create_engine(self):
        return ExecutionEngine(self, lambda _: self.stop())

    @defer.inlineCallbacks
    def stop(self):
        if self.crawling:
            self.crawling = False
            yield defer.maybeDeferred(self.engine.stop)
Example #6
class Crawler(object):

    def __init__(self, spidercls, settings=None):
        if isinstance(settings, dict) or settings is None:
            settings = Settings(settings)

        self.spidercls = spidercls
        self.settings = settings.copy()
        self.spidercls.update_settings(self.settings)

        d = dict(overridden_settings(self.settings))
        logger.info("Overridden settings: %(settings)r", {'settings': d})

        self.signals = SignalManager(self)
        self.stats = load_object(self.settings['STATS_CLASS'])(self)

        handler = LogCounterHandler(self, level=self.settings.get('LOG_LEVEL'))
        logging.root.addHandler(handler)
        if get_scrapy_root_handler() is not None:
            # scrapy root handler already installed: update it with new settings
            install_scrapy_root_handler(self.settings)
        # lambda is assigned to Crawler attribute because this way it is not
        # garbage collected after leaving __init__ scope
        self.__remove_handler = lambda: logging.root.removeHandler(handler)
        self.signals.connect(self.__remove_handler, signals.engine_stopped)

        lf_cls = load_object(self.settings['LOG_FORMATTER'])
        self.logformatter = lf_cls.from_crawler(self)
        self.extensions = ExtensionManager.from_crawler(self)

        self.settings.freeze()
        self.crawling = False
        self.spider = None
        self.engine = None

    @property
    def spiders(self):
        if not hasattr(self, '_spiders'):
            warnings.warn("Crawler.spiders is deprecated, use "
                          "CrawlerRunner.spider_loader or instantiate "
                          "scrapy.spiderloader.SpiderLoader with your "
                          "settings.",
                          category=ScrapyDeprecationWarning, stacklevel=2)
            self._spiders = _get_spider_loader(self.settings.frozencopy())
        return self._spiders

    @defer.inlineCallbacks
    def crawl(self, *args, **kwargs):
        assert not self.crawling, "Crawling already taking place"
        self.crawling = True

        try:
            self.spider = self._create_spider(*args, **kwargs)
            self.engine = self._create_engine()
            start_requests = iter(self.spider.start_requests())
            yield self.engine.open_spider(self.spider, start_requests)
            yield defer.maybeDeferred(self.engine.start)
        except Exception:
            # In Python 2 reraising an exception after yield discards
            # the original traceback (see https://bugs.python.org/issue7563),
            # so sys.exc_info() workaround is used.
            # This workaround also works in Python 3, but it is not needed,
            # and it is slower, so in Python 3 we use native `raise`.
            if six.PY2:
                exc_info = sys.exc_info()

            self.crawling = False
            if self.engine is not None:
                yield self.engine.close()

            if six.PY2:
                six.reraise(*exc_info)
            raise

    def _create_spider(self, *args, **kwargs):
        return self.spidercls.from_crawler(self, *args, **kwargs)

    def _create_engine(self):
        return ExecutionEngine(self, lambda _: self.stop())

    @defer.inlineCallbacks
    def stop(self):
        if self.crawling:
            self.crawling = False
            yield defer.maybeDeferred(self.engine.stop)
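The except block in Example #6 (and in Examples #8, #12, #13 and #16) exists because re-raising after a yield discards the original traceback on Python 2. The toy function below shows the same capture/cleanup/reraise pattern in isolation; it assumes six is installed, and cleanup() is a stand-in for engine.close().

import sys
import six

def cleanup():
    pass  # stand-in for the real cleanup (engine.close() in the snippets above)

def reraise_with_traceback():
    try:
        raise ValueError('boom')
    except Exception:
        if six.PY2:
            exc_info = sys.exc_info()  # capture before doing anything else
        cleanup()
        if six.PY2:
            six.reraise(*exc_info)  # re-raise with the original traceback
        raise  # on Python 3 a bare raise already preserves the traceback
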
Example #7
class Crawler(object):

    def __init__(self, spidercls, settings=None):
        if isinstance(settings, dict) or settings is None:
            settings = Settings(settings)

        self.spidercls = spidercls
        self.settings = settings.copy()
        self.spidercls.update_settings(self.settings)

        self.signals = SignalManager(self)
        self.stats = load_object(self.settings['STATS_CLASS'])(self)

        handler = LogCounterHandler(self, level=settings.get('LOG_LEVEL'))
        logging.root.addHandler(handler)
        # lambda is assigned to Crawler attribute because this way it is not
        # garbage collected after leaving __init__ scope
        self.__remove_handler = lambda: logging.root.removeHandler(handler)
        self.signals.connect(self.__remove_handler, signals.engine_stopped)

        lf_cls = load_object(self.settings['LOG_FORMATTER'])
        self.logformatter = lf_cls.from_crawler(self)
        self.extensions = ExtensionManager.from_crawler(self)

        self.settings.freeze()
        self.crawling = False
        self.spider = None
        self.engine = None

    @property
    def spiders(self):
        if not hasattr(self, '_spiders'):
            warnings.warn("Crawler.spiders is deprecated, use "
                          "CrawlerRunner.spider_loader or instantiate "
                          "scrapy.spiderloader.SpiderLoader with your "
                          "settings.",
                          category=ScrapyDeprecationWarning, stacklevel=2)
            self._spiders = _get_spider_loader(self.settings.frozencopy())
        return self._spiders

    @defer.inlineCallbacks
    def crawl(self, *args, **kwargs):
        assert not self.crawling, "Crawling already taking place"
        self.crawling = True

        try:
            self.spider = self._create_spider(*args, **kwargs)
            self.engine = self._create_engine()
            start_requests = iter(self.spider.start_requests())
            yield self.engine.open_spider(self.spider, start_requests)
            yield defer.maybeDeferred(self.engine.start)
        except Exception:
            self.crawling = False
            raise

    def _create_spider(self, *args, **kwargs):
        return self.spidercls.from_crawler(self, *args, **kwargs)

    def _create_engine(self):
        return ExecutionEngine(self, lambda _: self.stop())

    @defer.inlineCallbacks
    def stop(self):
        if self.crawling:
            self.crawling = False
            yield defer.maybeDeferred(self.engine.stop)
Example #8
class Crawler(object):

    def __init__(self, spidercls, settings=None):
        if isinstance(spidercls, Spider):
            raise ValueError(
                'The spidercls argument must be a class, not an object')

        if isinstance(settings, dict) or settings is None:
            settings = Settings(settings)

        self.spidercls = spidercls
        self.settings = settings.copy()
        self.spidercls.update_settings(self.settings)

        d = dict(overridden_settings(self.settings))
        logger.info("Overridden settings: %(settings)r", {'settings': d})

        self.signals = SignalManager(self)
        self.stats = load_object(self.settings['STATS_CLASS'])(self)

        handler = LogCounterHandler(self, level=self.settings.get('LOG_LEVEL'))
        logging.root.addHandler(handler)
        if get_scrapy_root_handler() is not None:
            # scrapy root handler already installed: update it with new settings
            install_scrapy_root_handler(self.settings)
        # lambda is assigned to Crawler attribute because this way it is not
        # garbage collected after leaving __init__ scope
        self.__remove_handler = lambda: logging.root.removeHandler(handler)
        self.signals.connect(self.__remove_handler, signals.engine_stopped)

        lf_cls = load_object(self.settings['LOG_FORMATTER'])
        self.logformatter = lf_cls.from_crawler(self)
        self.extensions = ExtensionManager.from_crawler(self)

        self.settings.freeze()
        self.crawling = False
        self.spider = None
        self.engine = None

    @property
    def spiders(self):
        if not hasattr(self, '_spiders'):
            warnings.warn("Crawler.spiders is deprecated, use "
                          "CrawlerRunner.spider_loader or instantiate "
                          "scrapy.spiderloader.SpiderLoader with your "
                          "settings.",
                          category=ScrapyDeprecationWarning, stacklevel=2)
            self._spiders = _get_spider_loader(self.settings.frozencopy())
        return self._spiders

    @defer.inlineCallbacks
    def crawl(self, *args, **kwargs):
        assert not self.crawling, "Crawling already taking place"
        self.crawling = True

        try:
            self.spider = self._create_spider(*args, **kwargs)
            self.engine = self._create_engine()
            start_requests = iter(self.spider.start_requests())
            yield self.engine.open_spider(self.spider, start_requests)
            yield defer.maybeDeferred(self.engine.start)
        except Exception:
            # In Python 2 reraising an exception after yield discards
            # the original traceback (see https://bugs.python.org/issue7563),
            # so sys.exc_info() workaround is used.
            # This workaround also works in Python 3, but it is not needed,
            # and it is slower, so in Python 3 we use native `raise`.
            if six.PY2:
                exc_info = sys.exc_info()

            self.crawling = False
            if self.engine is not None:
                yield self.engine.close()

            if six.PY2:
                six.reraise(*exc_info)
            raise

    def _create_spider(self, *args, **kwargs):
        return self.spidercls.from_crawler(self, *args, **kwargs)

    def _create_engine(self):
        return ExecutionEngine(self, lambda _: self.stop())

    @defer.inlineCallbacks
    def stop(self):
        if self.crawling:
            self.crawling = False
            yield defer.maybeDeferred(self.engine.stop)
Example #9
    def __init__(self):
        self.conn = None
        sig = SignalManager(sender=dispatcher.Any)
        sig.connect(self.initialize, signals.engine_started)
        sig.connect(self.finalize, signals.engine_stopped)

    def from_crawler(cls, crawler):
        instance = cls(db_config=crawler.settings.get('MYSQL_DB_KWARGS'))
        sm = SignalManager()
        sm.connect(instance.add_to_db_and_pop, signal=signals.email_sent_ok)
        return instance
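The fragments in Example #9 only hint at how SignalManager is wired up. The sketch below shows the same idea in isolation: connecting receivers to a built-in and a custom signal and firing the custom one with send_catch_log. The item_processed signal and both handlers are invented for illustration.

from scrapy import signals
from scrapy.signalmanager import SignalManager

# a custom signal is just a unique sentinel object
item_processed = object()

def on_engine_started():
    print('engine started')

def on_item_processed(item, **kwargs):
    print('processed item:', item)

sig = SignalManager()
sig.connect(on_engine_started, signal=signals.engine_started)
sig.connect(on_item_processed, signal=item_processed)

# send_catch_log calls every connected receiver and logs (rather than raises)
# any exception a receiver throws
sig.send_catch_log(signal=item_processed, item={'url': 'http://example.com'})
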
Example #11
class MongoStorage(object):
    """
    Utility class for working with MongoDB data.
    It supports CRUD operations and allows subscribing to
    created/updated/deleted events.
    """
    def __init__(self, mongo_uri, cache=False):
        self.mongo_uri = mongo_uri
        _, _, _, _, self.col = motor_from_uri(mongo_uri)
        self.signal_manager = SignalManager()
        # Used for unsubscribe
        # disconnect() requires reference to original callback
        self._callbacks = {}
        self.fetching = False
        self.signals = {
            'created': object(),
            'updated': object(),
            'deleted': object(),
        }
        # XXX: cache is used in arachnado.cron and arachnado.site_checker.
        # Is it needed?
        self.cache_flag = cache
        if cache:
            self.cache = defaultdict(dict)
        else:
            self.cache = None

    def subscribe(self, events=None, callback=None):
        if events is None:
            events = self.available_events
        if not isinstance(events, list):
            events = [events]
        for event_name in events:
            if event_name not in self.signals:
                raise ValueError('Invalid event name: {}'.format(event_name))
            self.signal_manager.connect(callback,
                                        self.signals[event_name],
                                        weak=False)
            self._callbacks[event_name] = callback

    def unsubscribe(self, events=None):
        if events is None:
            events = self.available_events
        if not isinstance(events, list):
            events = [events]
        for event_name in events:
            try:
                self.signal_manager.disconnect(
                    self._callbacks[event_name],
                    self.signals[event_name],
                    weak=False
                )
                self._callbacks.pop(event_name, None)
            except KeyError:
                # FIXME: when can this happen?
                pass

    @property
    def available_events(self):
        return list(self.signals.keys())

    @coroutine
    def fetch(self, query=None):
        if self.fetching:
            return
        self.fetching = True
        docs = []
        cursor = self.col.find(query)
        while (yield cursor.fetch_next):
            doc = cursor.next_object()
            docs.append(doc)
            #if self.cache is not None:
            #    self.cache[str(doc['_id'])] = doc
            #    if str(doc['_id']) not in self.cache:
            #        self.signal_manager.send_catch_log(
            #            self.signals['created'], data=doc
            #        )
        self.fetching = False
        raise Return(docs)

    @coroutine
    def create(self, doc):
        doc = replace_dots(doc)
        result = yield self.col.insert(doc)
        if self.cache is not None:
            self.cache[str(doc['_id'])] = doc
        self.signal_manager.send_catch_log(self.signals['created'], data=doc)
        raise Return(result)

    @coroutine
    def ensure_index(self, key_or_list):
        result = yield self.col.ensure_index(key_or_list)
        raise Return(result)

    @coroutine
    def update(self, doc):
        doc = replace_dots(doc)
        doc_copy = deepcopy(doc)
        doc_copy.pop('_id')
        result = yield self.col.update({
            '_id': ObjectId(doc['_id'])
        }, {
            '$set': doc_copy
        })
        if self.cache is not None:
            self.cache[str(doc['_id'])].update(doc)
        self.signal_manager.send_catch_log(self.signals['updated'], data=doc)
        raise Return(result)

    @coroutine
    def delete(self, doc):
        result = yield self.col.remove({'_id': ObjectId(doc['_id'])})
        if self.cache is not None:
            self.cache.pop(str(doc['_id']), None)
        self.signal_manager.send_catch_log(self.signals['deleted'], data=doc)
        raise Return(result)
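MongoStorage in Example #11 allocates one anonymous signal object per event name, so 'created', 'updated' and 'deleted' can be subscribed to independently through a single SignalManager. A short usage sketch, assuming an Arachnado-style mongo_uri that resolves to a collection; the URI and the callback are illustrative only.

def on_created(data, **kwargs):
    # `data` is the document that was just inserted
    print('new document:', data.get('_id'))

# hypothetical URI; motor_from_uri is expected to resolve it to a collection
storage = MongoStorage('mongodb://localhost:27017/arachnado/items')
storage.subscribe(['created'], callback=on_created)

# inside a Tornado coroutine the CRUD helpers are awaited with yield:
#     result = yield storage.create({'url': 'http://example.com'})
# and a subscriber is removed with:
#     storage.unsubscribe(['created'])
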
Example #12
class Crawler(object):
    def __init__(self, spidercls, settings=None):
        if isinstance(settings, dict) or settings is None:  # at this point this check should simply evaluate to False
            settings = Settings(settings)

        # the spider class is stored here but not instantiated yet; before
        # instantiation, update_settings is called once to apply its custom_settings
        self.spidercls = spidercls
        self.settings = settings.copy()
        # this is where custom_settings take effect: the spider's custom_settings
        # are merged directly into the shared settings object
        self.spidercls.update_settings(self.settings)
        # so before instantiation only the settings are updated; assigning
        # custom_settings inside the spider's __init__ has no effect, because
        # update_settings runs before the spider (self.spidercls) is instantiated
        # collect the settings whose values differ from the defaults
        d = dict(overridden_settings(self.settings))
        # this is where the "Overridden settings" log line comes from: the default
        # settings are walked and every changed value is reported
        logger.info("Overridden settings: %(settings)r", {'settings': d})

        self.signals = SignalManager(self)
        # STATS_CLASS defaults to 'scrapy.statscollectors.MemoryStatsCollector' -- the stats collection mechanism
        self.stats = load_object(self.settings['STATS_CLASS'])(self)

        handler = LogCounterHandler(
            self, level=self.settings.get('LOG_LEVEL'))  # LOG_LEVEL = 'DEBUG'
        logging.root.addHandler(handler)
        if get_scrapy_root_handler() is not None:
            # scrapy root handler already installed: update it with new settings
            install_scrapy_root_handler(self.settings)
        # lambda is assigned to Crawler attribute because this way it is not
        # garbage collected after leaving __init__ scope
        self.__remove_handler = lambda: logging.root.removeHandler(handler)
        self.signals.connect(self.__remove_handler, signals.engine_stopped)

        # LOG_FORMATTER defaults to 'scrapy.logformatter.LogFormatter'
        lf_cls = load_object(self.settings['LOG_FORMATTER'])
        self.logformatter = lf_cls.from_crawler(self)
        # load the enabled extensions (nothing happens here if no extensions are used)
        self.extensions = ExtensionManager.from_crawler(self)

        self.settings.freeze()
        self.crawling = False
        self.spider = None
        self.engine = None

    @property
    def spiders(self):
        if not hasattr(self, '_spiders'):
            warnings.warn(
                "Crawler.spiders is deprecated, use "
                "CrawlerRunner.spider_loader or instantiate "
                "scrapy.spiderloader.SpiderLoader with your "
                "settings.",
                category=ScrapyDeprecationWarning,
                stacklevel=2)
            self._spiders = _get_spider_loader(self.settings.frozencopy())
        return self._spiders  # a SpiderLoader instance, not the spider class itself

    @defer.inlineCallbacks
    def crawl(self, *args, **kwargs):
        assert not self.crawling, "Crawling already taking place"
        self.crawling = True

        try:  # the spider is finally instantiated here
            # this spider is the one we wrote ourselves, e.g. MySpider
            self.spider = self._create_spider(*args, **kwargs)
            self.engine = self._create_engine()
            # Request / FormRequest: the first step is to fetch the whole
            # start_requests iterator here
            start_requests = iter(self.spider.start_requests())
            yield self.engine.open_spider(self.spider, start_requests)
            # TODO: add logging here to check which part runs first and whether
            # the order of these yields can be changed
            yield defer.maybeDeferred(self.engine.start)
        except Exception:
            # In Python 2 reraising an exception after yield discards
            # the original traceback (see https://bugs.python.org/issue7563),
            # so sys.exc_info() workaround is used.
            # This workaround also works in Python 3, but it is not needed,
            # and it is slower, so in Python 3 we use native `raise`.
            if six.PY2:
                exc_info = sys.exc_info()

            self.crawling = False
            if self.engine is not None:
                yield self.engine.close()

            if six.PY2:
                six.reraise(*exc_info)
            raise

    def _create_spider(self, *args, **kwargs):
        # this is where the spider is instantiated via from_crawler, which mainly
        # wires up the crawler and the settings
        return self.spidercls.from_crawler(self, *args, **kwargs)

    def _create_engine(self):
        return ExecutionEngine(self, lambda _: self.stop())

    @defer.inlineCallbacks
    def stop(self):
        if self.crawling:
            self.crawling = False
            yield defer.maybeDeferred(self.engine.stop)
Example #13
class Crawler(object):

    def __init__(self, spidercls, settings=None):
        if isinstance(spidercls, Spider):  # check that spidercls is a Spider subclass, not an instance
            raise ValueError(
                'The spidercls argument must be a class, not an object')

        if isinstance(settings, dict) or settings is None:
            settings = Settings(settings)

        self.spidercls = spidercls
        self.settings = settings.copy()  # another deep copy of the settings is made here
        self.spidercls.update_settings(self.settings)

        self.signals = SignalManager(self)
        self.stats = load_object(self.settings['STATS_CLASS'])(self)

        handler = LogCounterHandler(self, level=self.settings.get('LOG_LEVEL'))
        logging.root.addHandler(handler)

        d = dict(overridden_settings(self.settings))
        logger.info("Overridden settings: %(settings)r", {'settings': d})

        if get_scrapy_root_handler() is not None:
            # scrapy root handler already installed: update it with new settings
            install_scrapy_root_handler(self.settings)
        # lambda is assigned to Crawler attribute because this way it is not
        # garbage collected after leaving __init__ scope
        self.__remove_handler = lambda: logging.root.removeHandler(handler)
        self.signals.connect(self.__remove_handler, signals.engine_stopped)

        lf_cls = load_object(self.settings['LOG_FORMATTER'])
        self.logformatter = lf_cls.from_crawler(self)
        self.extensions = ExtensionManager.from_crawler(self)

        self.settings.freeze()  # the settings can no longer be modified
        self.crawling = False
        self.spider = None
        self.engine = None

    @property
    def spiders(self):
        if not hasattr(self, '_spiders'):
            warnings.warn("Crawler.spiders is deprecated, use "
                          "CrawlerRunner.spider_loader or instantiate "
                          "scrapy.spiderloader.SpiderLoader with your "
                          "settings.",
                          category=ScrapyDeprecationWarning, stacklevel=2)
            self._spiders = _get_spider_loader(self.settings.frozencopy())
        return self._spiders

    """
    在yield_from中也定义了一个Crawler类,其中有个fetch方法就类似于此处的crawl方法。
    @defer.inlineCallbacks可以理解为asyncio的async关键字,即声明为一个协程,当然
    这个装饰器干了不少事。
    """
    @defer.inlineCallbacks  
    def crawl(self, *args, **kwargs):  # defer就相当于future,有点不一样
        assert not self.crawling, "Crawling already taking place"
        self.crawling = True

        try:
            self.spider = self._create_spider(*args, **kwargs)  # instantiate the spider class (i.e. our own spider)
            self.engine = self._create_engine()  # create the engine
            start_requests = iter(self.spider.start_requests())  # call the spider's start_requests method
            yield self.engine.open_spider(self.spider, start_requests)  # open the spider on the engine, passing the spider instance and the initial requests
            yield defer.maybeDeferred(self.engine.start)
        except Exception:
            # In Python 2 reraising an exception after yield discards
            # the original traceback (see https://bugs.python.org/issue7563),
            # so sys.exc_info() workaround is used.
            # This workaround also works in Python 3, but it is not needed,
            # and it is slower, so in Python 3 we use native `raise`.
            if six.PY2:
                exc_info = sys.exc_info()

            self.crawling = False
            if self.engine is not None:
                yield self.engine.close()

            if six.PY2:
                six.reraise(*exc_info)
            raise

    def _create_spider(self, *args, **kwargs):
        return self.spidercls.from_crawler(self, *args, **kwargs)

    def _create_engine(self):
        return ExecutionEngine(self, lambda _: self.stop())

    @defer.inlineCallbacks
    def stop(self):
        """Starts a graceful stop of the crawler and returns a deferred that is
        fired when the crawler is stopped."""
        if self.crawling:
            self.crawling = False
            yield defer.maybeDeferred(self.engine.stop)
Example #14
class ArachnadoCrawlerProcess(CrawlerProcess):
    """
    CrawlerProcess which sets up a global signals manager,
    assigns unique ids to each spider job, works around some Scrapy
    issues and provides extra stats.
    """
    crawl_ids = itertools.count(start=1)

    def __init__(self, settings=None):
        self.signals = SignalManager(self)
        self.signals.connect(self.on_spider_closed,
                             CrawlerProcessSignals.spider_closed)
        self._finished_jobs = []
        self._paused_jobs = set()
        self.procmon = ProcessStatsMonitor()
        self.procmon.start()

        super(ArachnadoCrawlerProcess, self).__init__(settings or {})

        # don't log DepthMiddleware messages
        # see https://github.com/scrapy/scrapy/issues/1308
        logging.getLogger("scrapy.spidermiddlewares.depth").setLevel(
            logging.INFO)

    def crawl(self, crawler_or_spidercls, *args, **kwargs):
        kwargs['crawl_id'] = next(self.crawl_ids)

        crawler = crawler_or_spidercls
        if not isinstance(crawler_or_spidercls, Crawler):
            crawler = self._create_crawler(crawler_or_spidercls)

        # aggregate all crawler signals
        for name in SCRAPY_SIGNAL_NAMES:
            crawler.signals.connect(self._resend_signal,
                                    getattr(signals, name))

        # aggregate signals from crawler EventedStatsCollectors
        if hasattr(crawler.stats, "signals"):
            crawler.stats.signals.connect(self._resend_signal,
                                          stats.stats_changed)

        d = super(ArachnadoCrawlerProcess, self).crawl(crawler_or_spidercls,
                                                       *args, **kwargs)
        return d

    def _create_crawler(self, spidercls):
        if isinstance(spidercls, six.string_types):
            spidercls = self.spider_loader.load(spidercls)
        return ArachnadoCrawler(spidercls, self.settings)

    def stop_job(self, crawl_id):
        """ Stop a single crawl job """
        self.get_crawler(crawl_id).stop()

    def pause_job(self, crawl_id):
        """ Pause a crawling job """
        self._paused_jobs.add(crawl_id)
        self.get_crawler(crawl_id).engine.pause()

    def resume_job(self, crawl_id):
        """ Resume a crawling job """
        self._paused_jobs.remove(crawl_id)
        self.get_crawler(crawl_id).engine.unpause()

    def get_crawler(self, crawl_id):
        for crawler in self.crawlers:
            if getattr(crawler.spider, "crawl_id") == crawl_id:
                return crawler
        raise KeyError("Job is not known: %s" % crawl_id)

    def _resend_signal(self, **kwargs):
        # FIXME: this is a mess. Signal handling should be unified somehow:
        # there shouldn't be two separate code paths
        # for CrawlerProcessSignals and STAT_SIGNALS.
        signal = kwargs['signal']
        if signal in STAT_SIGNALS:
            signal = STAT_SIGNALS[signal]
            kwargs['crawler'] = kwargs.pop('sender').crawler
        else:
            signal = CrawlerProcessSignals.signal(signal)
            kwargs['crawler'] = kwargs.pop('sender')

        kwargs['signal'] = signal
        if signal.supports_defer:
            return self.signals.send_catch_log_deferred(**kwargs)
        else:
            return self.signals.send_catch_log(**kwargs)

    def stop(self):
        """ Terminate the process (exit from application). """
        self.procmon.stop()
        return super(ArachnadoCrawlerProcess, self).stop()

    def on_spider_closed(self, spider, reason):
        # spiders are not closed that often, so insert(0, ...) should be fine
        self._finished_jobs.insert(
            0, {
                'id': spider.crawl_id,
                'job_id': getattr(spider, 'motor_job_id'),
                'seed': spider.domain,
                'status': reason,
                'stats': spider.crawler.stats.get_stats(spider),
                'downloads': self._downloader_stats(spider.crawler)
            })

    # FIXME: methods below are ugly for two reasons:
    # 1. they assume spiders have certain attributes;
    # 2. they try to get crawling status based on auxiliary information.

    def get_jobs(self):
        """ Return a list of active jobs """
        crawlers = [
            crawler for crawler in self.crawlers if crawler.spider is not None
        ]
        return [
            {
                'id': crawler.spider.crawl_id,
                'job_id': getattr(crawler.spider, 'motor_job_id'),
                'seed': crawler.spider.domain,
                'status': self._get_crawler_status(crawler),
                'stats':
                crawler.spider.crawler.stats.get_stats(crawler.spider),
                'downloads': self._downloader_stats(crawler)
                # 'engine_info': dict(get_engine_status(crawler.engine))
            } for crawler in crawlers
        ]

    @classmethod
    def _downloader_stats(cls, crawler):
        downloader = crawler.engine.downloader
        return {
            'active': [cls._request_info(req) for req in downloader.active],
            'slots':
            sorted([
                cls._slot_info(key, slot)
                for key, slot in downloader.slots.items()
            ],
                   key=operator.itemgetter('key'))
        }

    @classmethod
    def _request_info(cls, request):
        return {'url': request.url, 'method': request.method}

    @classmethod
    def _slot_info(cls, key, slot):
        return {
            'key': key,
            'concurrency': slot.concurrency,
            'delay': slot.delay,
            'lastseen': slot.lastseen,
            'len(queue)': len(slot.queue),
            'transferring':
            [cls._request_info(req) for req in slot.transferring],
            'active': [cls._request_info(req) for req in slot.active],
        }

    def _get_crawler_status(self, crawler):
        if crawler.spider is None:
            return "unknown"
        if not crawler.crawling:
            return "stopping"
        if int(crawler.spider.crawl_id) in self._paused_jobs:
            return "suspended"
        return "crawling"

    @property
    def jobs(self):
        """ Current crawl state """
        # filter out active jobs which are in fact finished
        finished_ids = {job['id'] for job in self._finished_jobs}
        active_jobs = [
            job for job in self.get_jobs() if job['id'] not in finished_ids
        ]

        return active_jobs + self._finished_jobs
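Examples #14, #18 and #19 all rely on the same aggregation trick: each per-crawler signal is re-sent through a process-level SignalManager with the crawler substituted for the original sender. A stripped-down sketch of that pattern, leaving out the Arachnado-specific CrawlerProcessSignals and STAT_SIGNALS machinery (the SignalAggregator class is hypothetical):

from scrapy import signals
from scrapy.signalmanager import SignalManager

class SignalAggregator(object):
    # hypothetical helper: forwards selected crawler signals to one shared manager
    def __init__(self):
        self.signals = SignalManager(self)

    def watch(self, crawler, signal_names=('spider_opened', 'spider_closed')):
        for name in signal_names:
            crawler.signals.connect(self._resend, getattr(signals, name))

    def _resend(self, **kwargs):
        # replace the sender with the crawler and re-emit on the shared manager
        kwargs['crawler'] = kwargs.pop('sender', None)
        return self.signals.send_catch_log(**kwargs)
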
Example #15
class Crawler:
    def __init__(self, spidercls, settings=None, init_reactor: bool = False):
        if isinstance(spidercls, Spider):
            raise ValueError(
                'The spidercls argument must be a class, not an object')

        if isinstance(settings, dict) or settings is None:
            settings = Settings(settings)

        self.spidercls = spidercls
        self.settings = settings.copy()
        self.spidercls.update_settings(self.settings)

        self.signals = SignalManager(self)

        self.stats = load_object(self.settings['STATS_CLASS'])(self)

        handler = LogCounterHandler(self, level=self.settings.get('LOG_LEVEL'))
        logging.root.addHandler(handler)

        d = dict(overridden_settings(self.settings))
        logger.info("Overridden settings:\n%(settings)s",
                    {'settings': pprint.pformat(d)})

        if get_scrapy_root_handler() is not None:
            # scrapy root handler already installed: update it with new settings
            install_scrapy_root_handler(self.settings)
        # lambda is assigned to Crawler attribute because this way it is not
        # garbage collected after leaving __init__ scope
        self.__remove_handler = lambda: logging.root.removeHandler(handler)
        self.signals.connect(self.__remove_handler, signals.engine_stopped)

        lf_cls = load_object(self.settings['LOG_FORMATTER'])
        self.logformatter = lf_cls.from_crawler(self)

        self.request_fingerprinter = create_instance(
            load_object(self.settings['REQUEST_FINGERPRINTER_CLASS']),
            settings=self.settings,
            crawler=self,
        )

        reactor_class = self.settings.get("TWISTED_REACTOR")
        if init_reactor:
            # this needs to be done after the spider settings are merged,
            # but before something imports twisted.internet.reactor
            if reactor_class:
                install_reactor(reactor_class,
                                self.settings["ASYNCIO_EVENT_LOOP"])
            else:
                from twisted.internet import reactor  # noqa: F401
            log_reactor_info()
        if reactor_class:
            verify_installed_reactor(reactor_class)

        self.extensions = ExtensionManager.from_crawler(self)

        self.settings.freeze()
        self.crawling = False
        self.spider = None
        self.engine = None

    @defer.inlineCallbacks
    def crawl(self, *args, **kwargs):
        if self.crawling:
            raise RuntimeError("Crawling already taking place")
        self.crawling = True

        try:
            self.spider = self._create_spider(*args, **kwargs)
            self.engine = self._create_engine()
            start_requests = iter(self.spider.start_requests())
            yield self.engine.open_spider(self.spider, start_requests)
            yield defer.maybeDeferred(self.engine.start)
        except Exception:
            self.crawling = False
            if self.engine is not None:
                yield self.engine.close()
            raise

    def _create_spider(self, *args, **kwargs):
        return self.spidercls.from_crawler(self, *args, **kwargs)

    def _create_engine(self):
        return ExecutionEngine(self, lambda _: self.stop())

    @defer.inlineCallbacks
    def stop(self):
        """Starts a graceful stop of the crawler and returns a deferred that is
        fired when the crawler is stopped."""
        if self.crawling:
            self.crawling = False
            yield defer.maybeDeferred(self.engine.stop)
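Crawler.crawl() in Example #15 returns a Twisted Deferred, so in practice it is driven through CrawlerRunner or CrawlerProcess rather than called by hand. A minimal sketch of the usual entry point; the QuotesSpider class and its start URL are placeholders.

import scrapy
from scrapy.crawler import CrawlerProcess

class QuotesSpider(scrapy.Spider):
    name = 'quotes'
    start_urls = ['http://quotes.toscrape.com/']

    def parse(self, response):
        for text in response.css('div.quote span.text::text').getall():
            yield {'text': text}

# CrawlerProcess creates a Crawler for the spider class, calls crawler.crawl()
# and runs the Twisted reactor until every crawl has finished.
process = CrawlerProcess(settings={'LOG_LEVEL': 'INFO'})
process.crawl(QuotesSpider)
process.start()  # blocks here until the crawl is done
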
Example #16
class Crawler(object):
    """ 爬虫启动主类: 实例化各个spider; 创建engine; 启动指定的爬虫; 开启engine """

    def __init__(self, spidercls, settings=None):
        if isinstance(spidercls, Spider):
            raise ValueError(
                'The spidercls argument must be a class, not an object')

        if isinstance(settings, dict) or settings is None:
            settings = Settings(settings)

        self.spidercls = spidercls
        self.settings = settings.copy()
        self.spidercls.update_settings(self.settings)

        self.signals = SignalManager(self)
        self.stats = load_object(self.settings['STATS_CLASS'])(self)

        handler = LogCounterHandler(self, level=self.settings.get('LOG_LEVEL'))
        logging.root.addHandler(handler)

        d = dict(overridden_settings(self.settings))
        logger.info("Overridden settings: %(settings)r", {'settings': d})

        if get_scrapy_root_handler() is not None:
            # scrapy root handler already installed: update it with new
            # settings
            install_scrapy_root_handler(self.settings)
        # lambda is assigned to Crawler attribute because this way it is not
        # garbage collected after leaving __init__ scope
        self.__remove_handler = lambda: logging.root.removeHandler(handler)
        self.signals.connect(self.__remove_handler, signals.engine_stopped)

        lf_cls = load_object(self.settings['LOG_FORMATTER'])
        self.logformatter = lf_cls.from_crawler(self)
        self.extensions = ExtensionManager.from_crawler(self)

        self.settings.freeze()
        self.crawling = False
        self.spider = None
        self.engine = None

    @property
    def spiders(self):
        if not hasattr(self, '_spiders'):
            warnings.warn("Crawler.spiders is deprecated, use "
                          "CrawlerRunner.spider_loader or instantiate "
                          "scrapy.spiderloader.SpiderLoader with your "
                          "settings.",
                          category=ScrapyDeprecationWarning, stacklevel=2)
            self._spiders = _get_spider_loader(self.settings.frozencopy())
        return self._spiders

    @defer.inlineCallbacks
    def crawl(self, *args, **kwargs):
        """ 命令入口执行最后调用函数, 功能: 创建爬虫, 启动引擎"""
        # inlineCallbacks: 帮助你使用看起来像有规整的顺序的函数代码去写回调函数, 要求函数返回迭代器
        #   其会将一系列的生成器变为一系列的callbacks并执行
        assert not self.crawling, "Crawling already taking place"
        self.crawling = True

        try:
            # look up the spider class via the spider loader and instantiate it
            # (the base Spider class lives in scrapy/spiders/__init__.py)
            self.spider = self._create_spider(*args, **kwargs)
            # instantiate the engine object, see scrapy/core/engine.py
            self.engine = self._create_engine()
            # call start_requests to obtain the seed URLs
            start_requests = iter(self.spider.start_requests())
            # call open_spider with the spider instance and the initial requests
            # (see scrapy/core/engine.py); from here on the engine drives the crawl
            yield self.engine.open_spider(self.spider, start_requests)
            yield defer.maybeDeferred(self.engine.start)
        except Exception:
            # In Python 2 reraising an exception after yield discards
            # the original traceback (see https://bugs.python.org/issue7563),
            # so sys.exc_info() workaround is used.
            # This workaround also works in Python 3, but it is not needed,
            # and it is slower, so in Python 3 we use native `raise`.
            if six.PY2:
                exc_info = sys.exc_info()

            self.crawling = False
            if self.engine is not None:
                yield self.engine.close()

            if six.PY2:
                six.reraise(*exc_info)
            raise

    def _create_spider(self, *args, **kwargs):
        # every spider is instantiated via from_crawler, see scrapy/spiders/__init__.py
        return self.spidercls.from_crawler(self, *args, **kwargs)

    def _create_engine(self):
        return ExecutionEngine(self, lambda _: self.stop())

    @defer.inlineCallbacks
    def stop(self):
        """Starts a graceful stop of the crawler and returns a deferred that is
        fired when the crawler is stopped."""
        if self.crawling:
            self.crawling = False
            yield defer.maybeDeferred(self.engine.stop)
Example #17
class Crawler:
    # this class effectively acts as the global context object for a crawl
    def __init__(self, spidercls, settings=None):
        if isinstance(spidercls, Spider):
            # the spidercls argument must be a class, not an instance
            raise ValueError(
                'The spidercls argument must be a class, not an object')

        if isinstance(settings, dict) or settings is None:
            settings = Settings(settings)

        self.spidercls = spidercls
        # load the configuration
        self.settings = settings.copy()
        # update_settings is called here to merge the spider's custom settings
        self.spidercls.update_settings(self.settings)

        # initialize the signal manager
        self.signals = SignalManager(self)
        # initialize the stats collector
        self.stats = load_object(self.settings['STATS_CLASS'])(self)

        handler = LogCounterHandler(self, level=self.settings.get('LOG_LEVEL'))
        logging.root.addHandler(handler)

        d = dict(overridden_settings(self.settings))
        logger.info("Overridden settings:\n%(settings)s",
                    {'settings': pprint.pformat(d)})

        # this part deals with the root log handler
        if get_scrapy_root_handler() is not None:
            # scrapy root handler already installed: update it with new settings
            install_scrapy_root_handler(self.settings)
        # lambda is assigned to Crawler attribute because this way it is not
        # garbage collected after leaving __init__ scope
        # is the lambda kept as an attribute to guard against garbage collection?
        self.__remove_handler = lambda: logging.root.removeHandler(handler)
        self.signals.connect(self.__remove_handler, signals.engine_stopped)

        # load the log formatter
        lf_cls = load_object(self.settings['LOG_FORMATTER'])
        self.logformatter = lf_cls.from_crawler(self)
        # this appears to load the extensions
        self.extensions = ExtensionManager.from_crawler(self)

        # settings are fully initialized; no further modification is allowed
        self.settings.freeze()
        # crawling has not started yet
        self.crawling = False
        # placeholders; these are assigned later in crawl()
        self.spider = None
        self.engine = None

    @defer.inlineCallbacks
    def crawl(self, *args, **kwargs):
        # step 7: this method is called and returns a Deferred
        if self.crawling:
            raise RuntimeError("Crawling already taking place")
        self.crawling = True

        try:
            # step 8: instantiate the spider
            self.spider = self._create_spider(*args, **kwargs)
            # step 10: create the engine
            self.engine = self._create_engine()

            # step 12: build the initial requests and wrap them in an iterator
            start_requests = iter(self.spider.start_requests())
            # step 13: hand the initial requests to the engine
            yield self.engine.open_spider(self.spider, start_requests)
            # step 23: all preparation done, start the engine
            yield defer.maybeDeferred(self.engine.start)
        except Exception:
            self.crawling = False
            if self.engine is not None:
                yield self.engine.close()
            raise

    def _create_spider(self, *args, **kwargs):
        # step 9: call the spider class's from_crawler method to instantiate it
        return self.spidercls.from_crawler(self, *args, **kwargs)

    def _create_engine(self):
        # step 11: instantiate the engine and attach the callback for a graceful stop
        return ExecutionEngine(self, lambda _: self.stop())

    @defer.inlineCallbacks
    def stop(self):
        """Starts a graceful stop of the crawler and returns a deferred that is
        fired when the crawler is stopped.
        """
        if self.crawling:
            self.crawling = False
            yield defer.maybeDeferred(self.engine.stop)
Example #18
class MyselfCrawlerProcess(CrawlerProcess):
    """signals管理, spider管理"""
    crawl_ids = itertools.count(start=1)

    def __init__(self, settings=None):
        self.signals = SignalManager(self)
        self.signals.connect(self.on_spider_closed,
                             CrawlerProcessSignals.spider_closed)
        self._finished_jobs = []
        self._paused_jobs = set()
        self.procmon = ProcessStatsMonitor()
        self.procmon.start()

        super(MyselfCrawlerProcess, self).__init__(settings or {})
        logging.getLogger('scrapy.spidermiddlewares.depth').setLevel(
            logging.INFO)

    def crawl(self, crawler_or_spidercls, *args, **kwargs):
        kwargs['crawl_id'] = next(self.crawl_ids)
        crawler = crawler_or_spidercls
        if not isinstance(crawler_or_spidercls, Crawler):
            crawler = self._create_crawler(crawler_or_spidercls)

        # aggregate all crawler signals
        for name in SCRAPY_SIGNAL_NAMES:
            crawler.signals.connect(self._resend_signal,
                                    getattr(signals, name))

        if hasattr(crawler.stats, "signals"):
            crawler.stats.signals.connect(self._resend_signal,
                                          stats.stats_changed)

        d = super(MyselfCrawlerProcess, self).crawl(crawler_or_spidercls,
                                                    *args, **kwargs)
        return d

    def _create_crawler(self, spidercls):
        """新建crawler"""
        if isinstance(spidercls, six.string_types):
            spidercls = self.spider_loader.load(spidercls)
        return MyselfCrawlerProcess(spidercls, self.settings)

    def stop_job(self, crawl_id):
        """crawl job停止信号"""
        self.get_crawler(crawl_id).stop()

    def pause_job(self, crawl_id):
        """crawl job暂停"""
        self._paused_jobs.add(crawl_id)
        self.get_crawler(crawl_id).engine.pause()

    def resume_job(self, crawl_id):
        """crawl job恢复"""
        self._paused_jobs.remove(crawl_id)
        self.get_crawler(crawl_id).engine.unpause()

    def get_crawler(self, crawl_id):
        """获取crawl"""
        for crawler in self.crawlers:
            if getattr(crawler.spider, "crawl_id") == crawl_id:
                return crawler
        raise KeyError("Job is not known: %s" % crawl_id)

    def _resend_signal(self, **kwargs):
        # FIXME: signal handling here is a mess.
        signal = kwargs['signal']
        if signal in STAT_SIGNALS:
            signal = STAT_SIGNALS[signal]
            kwargs['crawler'] = kwargs.pop('sender').crawler
        else:
            signal = CrawlerProcessSignals.signal(signal)
            kwargs['crawler'] = kwargs.pop('sender')

        kwargs['signal'] = signal
        if signal.supports_defer:
            return self.signals.send_catch_log_deferred(**kwargs)
        else:
            return self.signals.send_catch_log(**kwargs)

    def stop(self):
        """停止crawl process"""
        self.procmon.stop()
        return super(MyselfCrawlerProcess, self).stop()

    def on_spider_closed(self, spider, reason):
        """spider关闭时写入"""
        self._finished_jobs.insert(
            0, {
                'id': spider.crawl_id,
                'job_id': getattr(spider, 'motor_job_id'),
                'seed': spider.domain,
                'status': reason,
                'stats': spider.crawler.stats.get_stats(spider),
                'downloads': self._downloader_stats(spider.crawler)
            })

    def get_jobs(self):
        """获取运行中的job"""
        crawlers = [
            crawler for crawler in self.crawlers if crawler.spider is not None
        ]
        return [{
            'id': crawler.spider.crawl_id,
            'job_id': getattr(crawler.spider, 'motor_job_id'),
            'seed': crawler.spider.domain,
            'status': self._get_crawler_status(crawler),
            'stats': crawler.spider.crawler.stats.get_stats(crawler.spider),
            'downloads': self._downloader_stats(crawler)
        } for crawler in crawlers]

    @classmethod
    def _downloader_stats(cls, crawler):
        """下载器状态"""
        downloader = crawler.engine.downloader
        return {
            'active': [cls._request_info(req) for req in downloader.active],
            'slots':
            sorted([
                cls._slot_info(key, slot)
                for key, slot in downloader.slots.items()
            ],
                   key=operator.itemgetter('key'))
        }

    @classmethod
    def _request_info(cls, request):
        """request消息"""
        return {'url': request.url, 'method': request.method}

    @classmethod
    def _slot_info(cls, key, slot):
        """slot消息"""
        return {
            'key': key,
            'concurrency': slot.concurrency,
            'delay': slot.delay,
            'lastseen': slot.lastseen,
            'len(queue)': len(slot.queue),
            'transferring':
            [cls._request_info(req) for req in slot.transferring],
            'active': [cls._request_info(req) for req in slot.active]
        }

    def _get_crawler_status(self, crawler):
        """crawler运行状态"""
        if crawler.spider is None:
            return "unknown"
        if not crawler.crawling:
            return "stopping"
        if int(crawler.spider.crawl_id) in self._paused_jobs:
            return "suspended"
        return "crawling"

    @property
    def jobs(self):
        """完成和未完成crawl状态"""
        finished_ids = {job['id'] for job in self._finished_jobs}
        active_jobs = [
            job for job in self.get_jobs() if job['id'] not in finished_ids
        ]

        return active_jobs + self._finished_jobs
Example #19
class ArachnadoCrawlerProcess(CrawlerProcess):
    """
    CrawlerProcess which sets up a global signals manager,
    assigns unique ids to each spider job, works around some Scrapy
    issues and provides extra stats.
    """

    crawl_ids = itertools.count(start=1)

    def __init__(self, settings=None):
        self.signals = SignalManager(self)
        self.signals.connect(self.on_spider_closed, CrawlerProcessSignals.spider_closed)
        self._finished_jobs = []
        self._paused_jobs = set()
        self.procmon = ProcessStatsMonitor()
        self.procmon.start()

        super(ArachnadoCrawlerProcess, self).__init__(settings or {})

        # don't log DepthMiddleware messages
        # see https://github.com/scrapy/scrapy/issues/1308
        logging.getLogger("scrapy.spidermiddlewares.depth").setLevel(logging.INFO)

    def crawl(self, crawler_or_spidercls, *args, **kwargs):
        kwargs["crawl_id"] = next(self.crawl_ids)

        crawler = crawler_or_spidercls
        if not isinstance(crawler_or_spidercls, Crawler):
            crawler = self._create_crawler(crawler_or_spidercls)

        # aggregate all crawler signals
        for name in SCRAPY_SIGNAL_NAMES:
            crawler.signals.connect(self._resend_signal, getattr(signals, name))

        # aggregate signals from crawler EventedStatsCollectors
        if hasattr(crawler.stats, "signals"):
            crawler.stats.signals.connect(self._resend_signal, stats.stats_changed)

        # pass the crawler configured above, so the signal hookups are kept
        d = super(ArachnadoCrawlerProcess, self).crawl(crawler, *args, **kwargs)
        return d

    def _create_crawler(self, spidercls):
        if isinstance(spidercls, six.string_types):
            spidercls = self.spider_loader.load(spidercls)
        return ArachnadoCrawler(spidercls, self.settings)

    def stop_job(self, crawl_id):
        """ Stop a single crawl job """
        self.get_crawler(crawl_id).stop()

    def pause_job(self, crawl_id):
        """ Pause a crawling job """
        self._paused_jobs.add(crawl_id)
        self.get_crawler(crawl_id).engine.pause()

    def resume_job(self, crawl_id):
        """ Resume a crawling job """
        self._paused_jobs.remove(crawl_id)
        self.get_crawler(crawl_id).engine.unpause()

    def get_crawler(self, crawl_id):
        for crawler in self.crawlers:
            if getattr(crawler.spider, "crawl_id") == crawl_id:
                return crawler
        raise KeyError("Job is not known: %s" % crawl_id)

    def _resend_signal(self, **kwargs):
        # FIXME: this is a mess. Signal handling should be unified somehow:
        # there shouldn't be two separate code paths
        # for CrawlerProcessSignals and STAT_SIGNALS.
        signal = kwargs["signal"]
        if signal in STAT_SIGNALS:
            signal = STAT_SIGNALS[signal]
            kwargs["crawler"] = kwargs.pop("sender").crawler
        else:
            signal = CrawlerProcessSignals.signal(signal)
            kwargs["crawler"] = kwargs.pop("sender")

        kwargs["signal"] = signal
        if signal.supports_defer:
            return self.signals.send_catch_log_deferred(**kwargs)
        else:
            return self.signals.send_catch_log(**kwargs)

    def stop(self):
        """ Terminate the process (exit from application). """
        self.procmon.stop()
        return super(ArachnadoCrawlerProcess, self).stop()

    def on_spider_closed(self, spider, reason):
        # spiders are closed not that often, insert(0,...) should be fine
        self._finished_jobs.insert(
            0,
            {
                "id": spider.crawl_id,
                "job_id": getattr(spider, "motor_job_id"),
                "seed": spider.domain,
                "status": reason,
                "stats": spider.crawler.stats.get_stats(spider),
                "downloads": self._downloader_stats(spider.crawler),
            },
        )

    # FIXME: methods below are ugly for two reasons:
    # 1. they assume spiders have certain attributes;
    # 2. they try to get crawling status based on auxiliary information.

    def get_jobs(self):
        """ Return a list of active jobs """
        crawlers = [crawler for crawler in self.crawlers if crawler.spider is not None]
        return [
            {
                "id": crawler.spider.crawl_id,
                "job_id": getattr(crawler.spider, "motor_job_id"),
                "seed": crawler.spider.domain,
                "status": self._get_crawler_status(crawler),
                "stats": crawler.spider.crawler.stats.get_stats(crawler.spider),
                "downloads": self._downloader_stats(crawler)
                # 'engine_info': dict(get_engine_status(crawler.engine))
            }
            for crawler in crawlers
        ]

    @classmethod
    def _downloader_stats(cls, crawler):
        downloader = crawler.engine.downloader
        return {
            "active": [cls._request_info(req) for req in downloader.active],
            "slots": sorted(
                [cls._slot_info(key, slot) for key, slot in downloader.slots.items()], key=operator.itemgetter("key")
            ),
        }

    @classmethod
    def _request_info(cls, request):
        return {"url": request.url, "method": request.method}

    @classmethod
    def _slot_info(cls, key, slot):
        return {
            "key": key,
            "concurrency": slot.concurrency,
            "delay": slot.delay,
            "lastseen": slot.lastseen,
            "len(queue)": len(slot.queue),
            "transferring": [cls._request_info(req) for req in slot.transferring],
            "active": [cls._request_info(req) for req in slot.active],
        }

    def _get_crawler_status(self, crawler):
        if crawler.spider is None:
            return "unknown"
        if not crawler.crawling:
            return "stopping"
        if int(crawler.spider.crawl_id) in self._paused_jobs:
            return "suspended"
        return "crawling"

    @property
    def jobs(self):
        """ Current crawl state """
        # filter out active jobs which are in fact finished
        finished_ids = {job["id"] for job in self._finished_jobs}
        active_jobs = [job for job in self.get_jobs() if job["id"] not in finished_ids]

        return active_jobs + self._finished_jobs
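
A minimal usage sketch of the process defined above; `MySpider` and the settings values are hypothetical, and the crawl ids used for pausing come from the `jobs` property:

from twisted.internet import reactor, task

process = ArachnadoCrawlerProcess({'LOG_LEVEL': 'INFO'})
process.crawl(MySpider, domain='example.com')   # MySpider is hypothetical

def pause_running_jobs():
    # pause every job that is still crawling, using ids from `jobs`
    for job in process.jobs:
        if job['status'] == 'crawling':
            process.pause_job(job['id'])

task.deferLater(reactor, 30, pause_running_jobs)  # pause after 30 seconds
process.start()   # runs the reactor until all crawlers finish
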
Ejemplo n.º 20
0
class Crawler(object):
    def __init__(self, spidercls, settings=None):
        ## A Crawler object must be instantiated with a scrapy.spiders.Spider
        ## subclass and a scrapy.settings.Settings object

        if isinstance(spidercls, Spider):
            raise ValueError(
                'The spidercls argument must be a class, not an object')

        if isinstance(settings, dict) or settings is None:
            settings = Settings(settings)

        ## The custom spider class
        self.spidercls = spidercls
        ## The crawler's settings manager, which gives extensions and middlewares
        ## access to this crawler's Scrapy settings
        self.settings = settings.copy()
        ## Apply the custom_settings attribute the spider class may define,
        ## at 'spider' priority
        self.spidercls.update_settings(self.settings)

        ## Only the settings that were overridden, converted to a dict
        d = dict(overridden_settings(self.settings))
        logger.info("Overridden settings: %(settings)r", {'settings': d})

        ## The crawler's signal manager, used by extensions and middlewares to
        ## hook themselves into Scrapy functionality
        self.signals = SignalManager(self)
        ## The crawler's stats collector, used by extensions and middlewares to
        ## record their behaviour and to access data collected by other extensions
        self.stats = load_object(self.settings['STATS_CLASS'])(self)

        ## Counts the number of log records emitted at each level during the crawl
        handler = LogCounterHandler(self, level=self.settings.get('LOG_LEVEL'))
        logging.root.addHandler(handler)
        if get_scrapy_root_handler() is not None:
            # scrapy root handler already installed: update it with new settings
            install_scrapy_root_handler(self.settings)
        # lambda is assigned to Crawler attribute because this way it is not
        # garbage collected after leaving __init__ scope
        self.__remove_handler = lambda: logging.root.removeHandler(handler)
        ## Register __remove_handler for the engine_stopped signal, so the log
        ## handler is removed once the engine stops
        self.signals.connect(self.__remove_handler, signals.engine_stopped)

        lf_cls = load_object(self.settings['LOG_FORMATTER'])
        ## Instantiate the log formatter
        self.logformatter = lf_cls.from_crawler(self)
        ## The extension manager that keeps track of enabled extensions
        self.extensions = ExtensionManager.from_crawler(self)

        self.settings.freeze()
        ## Flag marking whether a crawl is in progress
        self.crawling = False
        ## The spider currently being crawled
        self.spider = None
        ## The execution engine, which coordinates crawling between the
        ## scheduler, the downloader and the spiders
        self.engine = None

    @property
    def spiders(self):
        if not hasattr(self, '_spiders'):
            warnings.warn(
                "Crawler.spiders is deprecated, use "
                "CrawlerRunner.spider_loader or instantiate "
                "scrapy.spiderloader.SpiderLoader with your "
                "settings.",
                category=ScrapyDeprecationWarning,
                stacklevel=2)
            self._spiders = _get_spider_loader(self.settings.frozencopy())
        return self._spiders

    @defer.inlineCallbacks
    def crawl(self, *args, **kwargs):
        ## Starts the crawler by instantiating its spider class with the given
        ## args and kwargs arguments, while setting the execution engine in motion.

        assert not self.crawling, "Crawling already taking place"
        ## Mark the crawl as running
        self.crawling = True

        try:
            ## Create the spider instance
            self.spider = self._create_spider(*args, **kwargs)
            ## Create the execution engine
            self.engine = self._create_engine()
            ## Call the spider's start_requests method to get the seed requests
            start_requests = iter(self.spider.start_requests())
            ## Hand the spider and its initial requests to the engine, which
            ## takes over scheduling from here
            yield self.engine.open_spider(self.spider, start_requests)
            yield defer.maybeDeferred(self.engine.start)
        except Exception:
            # In Python 2 reraising an exception after yield discards
            # the original traceback (see https://bugs.python.org/issue7563),
            # so sys.exc_info() workaround is used.
            # This workaround also works in Python 3, but it is not needed,
            # and it is slower, so in Python 3 we use native `raise`.
            if six.PY2:
                exc_info = sys.exc_info()

            self.crawling = False
            if self.engine is not None:
                yield self.engine.close()

            if six.PY2:
                six.reraise(*exc_info)
            raise

    def _create_spider(self, *args, **kwargs):
        ## Instantiate the custom spider class via its from_crawler method
        return self.spidercls.from_crawler(self, *args, **kwargs)

    def _create_engine(self):
        ## Return an instance of the execution engine
        return ExecutionEngine(self, lambda _: self.stop())

    @defer.inlineCallbacks
    def stop(self):
        if self.crawling:
            self.crawling = False
            yield defer.maybeDeferred(self.engine.stop)
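
The `crawl()` coroutine above returns a Deferred, so the Crawler can be driven directly from a Twisted reactor; this is essentially what CrawlerRunner/CrawlerProcess do on your behalf. A minimal sketch, assuming a hypothetical `MySpider` class:

from twisted.internet import reactor

crawler = Crawler(MySpider, {'LOG_LEVEL': 'INFO'})   # MySpider is hypothetical
d = crawler.crawl()                      # Deferred that fires when the crawl ends
d.addBoth(lambda _: reactor.stop())      # shut the reactor down afterwards
reactor.run()
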
Ejemplo n.º 21
0
class Crawler:
    def __init__(self, spidercls, settings=None):
        if isinstance(spidercls, Spider):
            raise ValueError(
                'The spidercls argument must be a class, not an object')

        if isinstance(settings, dict) or settings is None:
            settings = Settings(settings)

        self.spidercls = spidercls
        self.settings = settings.copy()
        self.spidercls.update_settings(self.settings)

        self.signals = SignalManager(self)
        self.stats = load_object(self.settings['STATS_CLASS'])(self)

        handler = LogCounterHandler(self, level=self.settings.get('LOG_LEVEL'))
        logging.root.addHandler(handler)

        d = dict(overridden_settings(self.settings))
        logger.info("Overridden settings:\n%(settings)s",
                    {'settings': pprint.pformat(d)})

        if get_scrapy_root_handler() is not None:
            # scrapy root handler already installed: update it with new settings
            install_scrapy_root_handler(self.settings)
        # lambda is assigned to Crawler attribute because this way it is not
        # garbage collected after leaving __init__ scope
        self.__remove_handler = lambda: logging.root.removeHandler(handler)
        self.signals.connect(self.__remove_handler, signals.engine_stopped)

        lf_cls = load_object(self.settings['LOG_FORMATTER'])
        self.logformatter = lf_cls.from_crawler(self)
        self.extensions = ExtensionManager.from_crawler(self)

        self.settings.freeze()
        self.crawling = False
        self.spider = None
        self.engine = None

    @defer.inlineCallbacks
    def crawl(self, *args, **kwargs):
        assert not self.crawling, "Crawling already taking place"
        self.crawling = True

        try:
            self.spider = self._create_spider(*args, **kwargs)
            self.engine = self._create_engine()
            start_requests = iter(self.spider.start_requests())
            yield self.engine.open_spider(self.spider, start_requests)
            yield defer.maybeDeferred(self.engine.start)
        except Exception:
            self.crawling = False
            if self.engine is not None:
                yield self.engine.close()
            raise

    def _create_spider(self, *args, **kwargs):
        return self.spidercls.from_crawler(self, *args, **kwargs)

    def _create_engine(self):
        return ExecutionEngine(self, lambda _: self.stop())

    @defer.inlineCallbacks
    def stop(self):
        """Starts a graceful stop of the crawler and returns a deferred that is
        fired when the crawler is stopped."""
        if self.crawling:
            self.crawling = False
            yield defer.maybeDeferred(self.engine.stop)
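
This variant differs from the previous one mainly in how it logs the overridden settings (pprint) and in dropping the Python 2 re-raise workaround; stopping it works the same way. A graceful-shutdown sketch, again assuming a hypothetical `MySpider`:

from twisted.internet import reactor, task

crawler = Crawler(MySpider, {'LOG_LEVEL': 'INFO'})
d = crawler.crawl()
d.addBoth(lambda _: reactor.stop())          # leave the reactor once the crawl ends
task.deferLater(reactor, 60, crawler.stop)   # request a graceful stop after 60 s
reactor.run()
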
Ejemplo n.º 22
0
    def __init__(self):
        super(StoreToMongoDB, self).__init__()
        manager = SignalManager()
        manager.connect(self.initialize, scrapy.signals.spider_opened)
        manager.connect(self.finalize, scrapy.signals.spider_idle)
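
The fragment above only wires the signal handlers; a possible completion is sketched below. Everything beyond that wiring -- pymongo, the database/collection names, and the bodies of initialize, finalize and process_item -- is an assumption, not part of the original code:

import pymongo
import scrapy.signals
from scrapy.signalmanager import SignalManager

class StoreToMongoDB(object):
    def __init__(self):
        super(StoreToMongoDB, self).__init__()
        manager = SignalManager()
        manager.connect(self.initialize, scrapy.signals.spider_opened)
        manager.connect(self.finalize, scrapy.signals.spider_idle)

    def initialize(self, spider):
        # spider_opened handler: open the connection (hypothetical db/collection names)
        self.client = pymongo.MongoClient()
        self.collection = self.client['scrapy_db']['items']

    def finalize(self, spider):
        # spider_idle handler: close the connection
        self.client.close()

    def process_item(self, item, spider):
        # standard item-pipeline hook: persist each scraped item
        self.collection.insert_one(dict(item))
        return item
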