class CustomCrawler(Crawler):
    """Crawler variant that installs a per-spider root log handler.

    Mirrors the stock scrapy Crawler.__init__ but swaps the root-handler
    helpers for custom_* variants that also receive the spider name.
    """

    def __init__(self, spidercls, settings=None):
        # Accept a plain dict (or None) and promote it to a Settings object.
        if isinstance(settings, dict) or settings is None:
            settings = Settings(settings)
        self.spidercls = spidercls
        self.settings = settings.copy()
        # Merge the spider class' custom_settings into our copy BEFORE
        # anything below reads settings (stats class, log level, ...).
        self.spidercls.update_settings(self.settings)
        # Log which settings differ from the defaults.
        d = dict(overridden_settings(self.settings))
        logger.info("Overridden settings: %(settings)r", {'settings': d})
        self.signals = SignalManager(self)
        # STATS_CLASS is a dotted path resolved at runtime.
        self.stats = load_object(self.settings['STATS_CLASS'])(self)
        handler = LogCounterHandler(self, level=self.settings.get('LOG_LEVEL'))
        logging.root.addHandler(handler)
        if custom_get_scrapy_root_handler() is not None:
            # scrapy root handler already installed: update it with new settings
            # install_scrapy_root_handler(self.settings)
            custom_install_scrapy_root_handler(self.settings, spidercls.name)
        # lambda is assigned to Crawler attribute because this way it is not
        # garbage collected after leaving __init__ scope
        self.__remove_handler = lambda: logging.root.removeHandler(handler)
        self.signals.connect(self.__remove_handler, signals.engine_stopped)
        lf_cls = load_object(self.settings['LOG_FORMATTER'])
        self.logformatter = lf_cls.from_crawler(self)
        self.extensions = ExtensionManager.from_crawler(self)
        # Freeze only after extensions are built; settings are immutable
        # from here on.
        self.settings.freeze()
        self.crawling = False
        self.spider = None
        self.engine = None
def __init__(self, **kwargs):
    """Initialise the spider from a *.conf seed file.

    Keyword arguments:
        config: path to the *.conf file describing this crawl (required;
                a missing value is reported and then raises KeyError below,
                matching the original behaviour).
    """
    # FIX: `print err` is a Python 2 statement; the single-argument call
    # form behaves identically on both Python 2 and 3.
    if 'config' not in kwargs:
        err = 'failed to find seed file (config=*.conf)'
        print(err)
    config = kwargs['config']
    self.load_conf(config)
    # Honour the sleep flag from the conf file: randomised download delay
    # unless it is explicitly disabled.
    if self.Sleep_Flag in ('SEARCH_ENGINE_SLEEP', 'true') or not self.Sleep_Flag:
        settings.set('RANDOMIZE_DOWNLOAD_DELAY', True, priority='cmdline')
        settings.set('DOWNLOAD_DELAY', float(self.SE_Sleep_Base),
                     priority='cmdline')
    else:
        settings.set('RANDOMIZE_DOWNLOAD_DELAY', False, priority='cmdline')
    # Log file is named after the conf file (foo.conf -> foo.log).
    log_filename = self.conf_name.replace('.conf', '') + '.log'
    settings.set('LOG_FILE', log_filename, priority='cmdline')
    # redis key for the next-url queue
    self.meta_next_url = meta_redis_key()
    # initialise the redis connection
    self.init_redis()
    self.redis_keyword = get_redis_key(self.conf_name)
    # register the spider_idle signal handler
    sig = SignalManager(dispatcher.Any)
    sig.connect(self.idle, signal=signals.spider_idle)
    self.metatime = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    # URLs fetched during this run, used to detect duplicates against the
    # previous run: {keyword: md5(url)}
    self.urlmd5 = dict()
    self.log_writer = open('log.dat', 'a+')
    self.date_from_url_re = re.compile("[-_/][a-zA-Z]*[-_]?(?P<year>(20)?([0-1][0-9]))([-_/])?(?P<m>(10|11|12|(0?[1-9])){1})([-_/])?(?P<day>(10|20|30|31|([0-2]?[1-9])){1})([-_/])")
def __init__(self, **kwargs):
    """Initialise the spider from a *.conf seed file.

    Keyword arguments:
        config:    path to the *.conf file (required; a missing value is
                   reported and then raises KeyError below, matching the
                   original behaviour).
        startdate: optional 'YYYY-mm-dd HH:MM:SS'; defaults to two days ago.
        enddate:   optional 'YYYY-mm-dd HH:MM:SS'; defaults to now.
    """
    # FIX: `print err` is a Python 2 statement; the single-argument call
    # form behaves identically on both Python 2 and 3.
    if 'config' not in kwargs:
        err = 'failed to find seed file (config=*.conf)'
        print(err)
    if 'startdate' in kwargs:
        self.startdate = kwargs['startdate']
    else:
        self.startdate = (datetime.datetime.now() -
                          datetime.timedelta(days=2)).strftime('%Y-%m-%d %H:%M:%S')
    if 'enddate' in kwargs:
        self.enddate = kwargs['enddate']
    else:
        self.enddate = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    config = kwargs['config']
    self.load_conf(config)
    # Honour the sleep flag from the conf file: randomised download delay
    # unless it is explicitly disabled.
    if self.Sleep_Flag in ('SEARCH_ENGINE_SLEEP', 'true') or not self.Sleep_Flag:
        settings.set('RANDOMIZE_DOWNLOAD_DELAY', True, priority='cmdline')
        settings.set('DOWNLOAD_DELAY', float(self.SE_Sleep_Base),
                     priority='cmdline')
    else:
        settings.set('RANDOMIZE_DOWNLOAD_DELAY', False, priority='cmdline')
    # Log file is named after the conf file (foo.conf -> foo.log).
    log_filename = self.conf_name.replace('.conf', '') + '.log'
    settings.set('LOG_FILE', log_filename, priority='cmdline')
    # initialise the redis connection
    self.init_redis()
    self.redis_keyword = get_redis_key()
    # register signal handlers
    sig = SignalManager(dispatcher.Any)
    sig.connect(self.idle, signal=signals.spider_idle)
    sig.connect(self.close, signal=signals.spider_closed)
    self.metatime = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    self.conn_local = mysql.connect('meta', host='localhost')
    self.conn_local_cursor = self.conn_local.cursor()
    # Determine this host's IP; fall back to eth0 when enp7s0 is absent.
    # FIX: a bare `except:` also swallowed KeyboardInterrupt/SystemExit.
    try:
        self.meta_ip = get_meta_ip(network_card='enp7s0')
    except Exception:
        self.meta_ip = get_meta_ip(network_card='eth0')
    # initialise the state rows in the meta database
    self.init_state()
def __init__(self, **kwargs):
    """Initialise the spider from a *.conf seed file.

    Keyword arguments:
        config:    path to the *.conf file (required; a missing value is
                   reported and then raises KeyError below, matching the
                   original behaviour).
        startdate: optional 'YYYY-mm-dd HH:MM:SS'; defaults to two days ago.
        enddate:   optional 'YYYY-mm-dd HH:MM:SS'; defaults to now.
    """
    # FIX: `print err` is a Python 2 statement; the single-argument call
    # form behaves identically on both Python 2 and 3.
    if 'config' not in kwargs:
        err = 'failed to find seed file (config=*.conf)'
        print(err)
    if 'startdate' in kwargs:
        self.startdate = kwargs['startdate']
    else:
        self.startdate = (datetime.datetime.now() -
                          datetime.timedelta(days=2)).strftime('%Y-%m-%d %H:%M:%S')
    if 'enddate' in kwargs:
        self.enddate = kwargs['enddate']
    else:
        self.enddate = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    config = kwargs['config']
    self.load_conf(config)
    # Honour the sleep flag from the conf file: randomised download delay
    # unless it is explicitly disabled.
    if self.Sleep_Flag in ('SEARCH_ENGINE_SLEEP', 'true') or not self.Sleep_Flag:
        settings.set('RANDOMIZE_DOWNLOAD_DELAY', True, priority='cmdline')
        settings.set('DOWNLOAD_DELAY', float(self.SE_Sleep_Base),
                     priority='cmdline')
    else:
        settings.set('RANDOMIZE_DOWNLOAD_DELAY', False, priority='cmdline')
    # Log file is named after the conf file (foo.conf -> foo.log).
    log_filename = self.conf_name.replace('.conf', '') + '.log'
    settings.set('LOG_FILE', log_filename, priority='cmdline')
    # initialise the redis connection
    self.init_redis()
    self.redis_keyword = get_redis_key()
    # register signal handlers
    sig = SignalManager(dispatcher.Any)
    sig.connect(self.idle, signal=signals.spider_idle)
    sig.connect(self.close, signal=signals.spider_closed)
    self.metatime = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    self.conn_local = mysql.connect('meta', host='localhost')
    self.conn_local_cursor = self.conn_local.cursor()
    # Determine this host's IP; fall back to eth0 when enp7s0 is absent.
    # FIX: a bare `except:` also swallowed KeyboardInterrupt/SystemExit.
    try:
        self.meta_ip = get_meta_ip(network_card='enp7s0')
    except Exception:
        self.meta_ip = get_meta_ip(network_card='eth0')
    # initialise the state rows in the meta database
    self.init_state()
class Crawler(object):
    """Ties a spider class together with its settings, signals, stats,
    log handling, extensions and the execution engine for a single crawl.
    """

    def __init__(self, spidercls, settings=None):
        if isinstance(settings, dict) or settings is None:
            settings = Settings(settings)
        self.spidercls = spidercls
        self.settings = settings.copy()
        # Merge the spider's custom_settings before anything below reads them.
        self.spidercls.update_settings(self.settings)
        self.signals = SignalManager(self)
        self.stats = load_object(self.settings['STATS_CLASS'])(self)
        # BUGFIX: read LOG_LEVEL from self.settings (which includes the
        # spider's custom_settings merged above), not from the pre-merge
        # `settings` argument the original used — otherwise a spider-level
        # LOG_LEVEL override was ignored.
        handler = LogCounterHandler(self, level=self.settings.get('LOG_LEVEL'))
        logging.root.addHandler(handler)
        # lambda is assigned to Crawler attribute because this way it is not
        # garbage collected after leaving __init__ scope
        self.__remove_handler = lambda: logging.root.removeHandler(handler)
        self.signals.connect(self.__remove_handler, signals.engine_stopped)
        lf_cls = load_object(self.settings['LOG_FORMATTER'])
        self.logformatter = lf_cls.from_crawler(self)
        self.extensions = ExtensionManager.from_crawler(self)
        # Settings become immutable once everything above has read them.
        self.settings.freeze()
        self.crawling = False
        self.spider = None
        self.engine = None

    @property
    def spiders(self):
        # Deprecated accessor kept for backwards compatibility.
        if not hasattr(self, '_spiders'):
            warnings.warn(
                "Crawler.spiders is deprecated, use "
                "CrawlerRunner.spider_loader or instantiate "
                "scrapy.spiderloader.SpiderLoader with your "
                "settings.",
                category=ScrapyDeprecationWarning, stacklevel=2)
            self._spiders = _get_spider_loader(self.settings.frozencopy())
        return self._spiders

    @defer.inlineCallbacks
    def crawl(self, *args, **kwargs):
        """Instantiate the spider and engine and start the crawl."""
        assert not self.crawling, "Crawling already taking place"
        self.crawling = True
        try:
            self.spider = self._create_spider(*args, **kwargs)
            self.engine = self._create_engine()
            start_requests = iter(self.spider.start_requests())
            yield self.engine.open_spider(self.spider, start_requests)
            yield defer.maybeDeferred(self.engine.start)
        except Exception:
            # Capture the failure immediately: yielding engine.close() below
            # would otherwise lose the active exception context.
            exc = defer.fail()
            self.crawling = False
            if self.engine is not None:
                yield self.engine.close()
            yield exc

    def _create_spider(self, *args, **kwargs):
        # Spiders are built via from_crawler so they get a crawler reference.
        return self.spidercls.from_crawler(self, *args, **kwargs)

    def _create_engine(self):
        return ExecutionEngine(self, lambda _: self.stop())

    @defer.inlineCallbacks
    def stop(self):
        """Gracefully stop the running engine, if a crawl is in progress."""
        if self.crawling:
            self.crawling = False
            yield defer.maybeDeferred(self.engine.stop)
class Crawler(object):
    """Ties a spider class together with its settings, signals, stats,
    log handling, extensions and the execution engine for a single crawl.
    """

    def __init__(self, spidercls, settings=None):
        # Accept a plain dict (or None) and promote it to a Settings object.
        if isinstance(settings, dict) or settings is None:
            settings = Settings(settings)
        self.spidercls = spidercls
        self.settings = settings.copy()
        # Merge the spider class' custom_settings into our copy BEFORE
        # anything below reads settings (stats class, log level, ...).
        self.spidercls.update_settings(self.settings)
        # Log which settings differ from the defaults.
        d = dict(overridden_settings(self.settings))
        logger.info("Overridden settings: %(settings)r", {'settings': d})
        self.signals = SignalManager(self)
        # STATS_CLASS is a dotted path resolved at runtime.
        self.stats = load_object(self.settings['STATS_CLASS'])(self)
        handler = LogCounterHandler(self, level=self.settings.get('LOG_LEVEL'))
        logging.root.addHandler(handler)
        if get_scrapy_root_handler() is not None:
            # scrapy root handler already installed: update it with new settings
            install_scrapy_root_handler(self.settings)
        # lambda is assigned to Crawler attribute because this way it is not
        # garbage collected after leaving __init__ scope
        self.__remove_handler = lambda: logging.root.removeHandler(handler)
        self.signals.connect(self.__remove_handler, signals.engine_stopped)
        lf_cls = load_object(self.settings['LOG_FORMATTER'])
        self.logformatter = lf_cls.from_crawler(self)
        self.extensions = ExtensionManager.from_crawler(self)
        # Settings become immutable once everything above has read them.
        self.settings.freeze()
        self.crawling = False
        self.spider = None
        self.engine = None

    @property
    def spiders(self):
        # Deprecated accessor kept for backwards compatibility; builds the
        # spider loader lazily on first use.
        if not hasattr(self, '_spiders'):
            warnings.warn("Crawler.spiders is deprecated, use "
                          "CrawlerRunner.spider_loader or instantiate "
                          "scrapy.spiderloader.SpiderLoader with your "
                          "settings.",
                          category=ScrapyDeprecationWarning, stacklevel=2)
            self._spiders = _get_spider_loader(self.settings.frozencopy())
        return self._spiders

    @defer.inlineCallbacks
    def crawl(self, *args, **kwargs):
        """Instantiate the spider and engine and start the crawl.

        *args/**kwargs are forwarded to the spider's from_crawler constructor.
        """
        assert not self.crawling, "Crawling already taking place"
        self.crawling = True
        try:
            self.spider = self._create_spider(*args, **kwargs)
            self.engine = self._create_engine()
            start_requests = iter(self.spider.start_requests())
            yield self.engine.open_spider(self.spider, start_requests)
            yield defer.maybeDeferred(self.engine.start)
        except Exception:
            # In Python 2 reraising an exception after yield discards
            # the original traceback (see https://bugs.python.org/issue7563),
            # so sys.exc_info() workaround is used.
            # This workaround also works in Python 3, but it is not needed,
            # and it is slower, so in Python 3 we use native `raise`.
            if six.PY2:
                exc_info = sys.exc_info()
            self.crawling = False
            if self.engine is not None:
                yield self.engine.close()
            if six.PY2:
                six.reraise(*exc_info)
            raise

    def _create_spider(self, *args, **kwargs):
        # Spiders are built via from_crawler so they get a crawler reference.
        return self.spidercls.from_crawler(self, *args, **kwargs)

    def _create_engine(self):
        return ExecutionEngine(self, lambda _: self.stop())

    @defer.inlineCallbacks
    def stop(self):
        # Gracefully stop the running engine, if a crawl is in progress.
        if self.crawling:
            self.crawling = False
            yield defer.maybeDeferred(self.engine.stop)
class Crawler(object):
    """Ties a spider class together with its settings, signals, stats,
    log handling, extensions and the execution engine for a single crawl.
    """

    def __init__(self, spidercls, settings=None):
        if isinstance(settings, dict) or settings is None:
            settings = Settings(settings)
        self.spidercls = spidercls
        self.settings = settings.copy()
        # Merge the spider's custom_settings before anything below reads them.
        self.spidercls.update_settings(self.settings)
        self.signals = SignalManager(self)
        self.stats = load_object(self.settings['STATS_CLASS'])(self)
        # BUGFIX: read LOG_LEVEL from self.settings (which includes the
        # spider's custom_settings merged above), not from the pre-merge
        # `settings` argument the original used — otherwise a spider-level
        # LOG_LEVEL override was ignored.
        handler = LogCounterHandler(self, level=self.settings.get('LOG_LEVEL'))
        logging.root.addHandler(handler)
        # lambda is assigned to Crawler attribute because this way it is not
        # garbage collected after leaving __init__ scope
        self.__remove_handler = lambda: logging.root.removeHandler(handler)
        self.signals.connect(self.__remove_handler, signals.engine_stopped)
        lf_cls = load_object(self.settings['LOG_FORMATTER'])
        self.logformatter = lf_cls.from_crawler(self)
        self.extensions = ExtensionManager.from_crawler(self)
        # Settings become immutable once everything above has read them.
        self.settings.freeze()
        self.crawling = False
        self.spider = None
        self.engine = None

    @property
    def spiders(self):
        # Deprecated accessor kept for backwards compatibility.
        if not hasattr(self, '_spiders'):
            warnings.warn("Crawler.spiders is deprecated, use "
                          "CrawlerRunner.spider_loader or instantiate "
                          "scrapy.spiderloader.SpiderLoader with your "
                          "settings.",
                          category=ScrapyDeprecationWarning, stacklevel=2)
            self._spiders = _get_spider_loader(self.settings.frozencopy())
        return self._spiders

    @defer.inlineCallbacks
    def crawl(self, *args, **kwargs):
        """Instantiate the spider and engine and start the crawl."""
        assert not self.crawling, "Crawling already taking place"
        self.crawling = True
        try:
            self.spider = self._create_spider(*args, **kwargs)
            self.engine = self._create_engine()
            start_requests = iter(self.spider.start_requests())
            yield self.engine.open_spider(self.spider, start_requests)
            yield defer.maybeDeferred(self.engine.start)
        except Exception:
            self.crawling = False
            raise

    def _create_spider(self, *args, **kwargs):
        # Spiders are built via from_crawler so they get a crawler reference.
        return self.spidercls.from_crawler(self, *args, **kwargs)

    def _create_engine(self):
        return ExecutionEngine(self, lambda _: self.stop())

    @defer.inlineCallbacks
    def stop(self):
        """Gracefully stop the running engine, if a crawl is in progress."""
        if self.crawling:
            self.crawling = False
            yield defer.maybeDeferred(self.engine.stop)
class Crawler(object):
    """Ties a spider class together with its settings, signals, stats,
    log handling, extensions and the execution engine for a single crawl.
    """

    def __init__(self, spidercls, settings=None):
        # A spider *class* is required, not an instance — instantiation is
        # deferred to crawl() via from_crawler.
        if isinstance(spidercls, Spider):
            raise ValueError(
                'The spidercls argument must be a class, not an object')
        # Accept a plain dict (or None) and promote it to a Settings object.
        if isinstance(settings, dict) or settings is None:
            settings = Settings(settings)
        self.spidercls = spidercls
        self.settings = settings.copy()
        # Merge the spider class' custom_settings into our copy BEFORE
        # anything below reads settings (stats class, log level, ...).
        self.spidercls.update_settings(self.settings)
        # Log which settings differ from the defaults.
        d = dict(overridden_settings(self.settings))
        logger.info("Overridden settings: %(settings)r", {'settings': d})
        self.signals = SignalManager(self)
        # STATS_CLASS is a dotted path resolved at runtime.
        self.stats = load_object(self.settings['STATS_CLASS'])(self)
        handler = LogCounterHandler(self, level=self.settings.get('LOG_LEVEL'))
        logging.root.addHandler(handler)
        if get_scrapy_root_handler() is not None:
            # scrapy root handler already installed: update it with new settings
            install_scrapy_root_handler(self.settings)
        # lambda is assigned to Crawler attribute because this way it is not
        # garbage collected after leaving __init__ scope
        self.__remove_handler = lambda: logging.root.removeHandler(handler)
        self.signals.connect(self.__remove_handler, signals.engine_stopped)
        lf_cls = load_object(self.settings['LOG_FORMATTER'])
        self.logformatter = lf_cls.from_crawler(self)
        self.extensions = ExtensionManager.from_crawler(self)
        # Settings become immutable once everything above has read them.
        self.settings.freeze()
        self.crawling = False
        self.spider = None
        self.engine = None

    @property
    def spiders(self):
        # Deprecated accessor kept for backwards compatibility; builds the
        # spider loader lazily on first use.
        if not hasattr(self, '_spiders'):
            warnings.warn("Crawler.spiders is deprecated, use "
                          "CrawlerRunner.spider_loader or instantiate "
                          "scrapy.spiderloader.SpiderLoader with your "
                          "settings.",
                          category=ScrapyDeprecationWarning, stacklevel=2)
            self._spiders = _get_spider_loader(self.settings.frozencopy())
        return self._spiders

    @defer.inlineCallbacks
    def crawl(self, *args, **kwargs):
        """Instantiate the spider and engine and start the crawl.

        *args/**kwargs are forwarded to the spider's from_crawler constructor.
        """
        assert not self.crawling, "Crawling already taking place"
        self.crawling = True
        try:
            self.spider = self._create_spider(*args, **kwargs)
            self.engine = self._create_engine()
            start_requests = iter(self.spider.start_requests())
            yield self.engine.open_spider(self.spider, start_requests)
            yield defer.maybeDeferred(self.engine.start)
        except Exception:
            # In Python 2 reraising an exception after yield discards
            # the original traceback (see https://bugs.python.org/issue7563),
            # so sys.exc_info() workaround is used.
            # This workaround also works in Python 3, but it is not needed,
            # and it is slower, so in Python 3 we use native `raise`.
            if six.PY2:
                exc_info = sys.exc_info()
            self.crawling = False
            if self.engine is not None:
                yield self.engine.close()
            if six.PY2:
                six.reraise(*exc_info)
            raise

    def _create_spider(self, *args, **kwargs):
        # Spiders are built via from_crawler so they get a crawler reference.
        return self.spidercls.from_crawler(self, *args, **kwargs)

    def _create_engine(self):
        return ExecutionEngine(self, lambda _: self.stop())

    @defer.inlineCallbacks
    def stop(self):
        # Gracefully stop the running engine, if a crawl is in progress.
        if self.crawling:
            self.crawling = False
            yield defer.maybeDeferred(self.engine.stop)
def __init__(self):
    """Register engine lifecycle callbacks; the DB connection opens lazily."""
    self.conn = None
    # Hook initialize/finalize to engine start/stop.
    manager = SignalManager(sender=dispatcher.Any)
    manager.connect(self.initialize, signals.engine_started)
    manager.connect(self.finalize, signals.engine_stopped)
def from_crawler(cls, crawler):
    """Build an instance from crawler settings and wire the email_sent_ok signal."""
    db_kwargs = crawler.settings.get('MYSQL_DB_KWARGS')
    instance = cls(db_config=db_kwargs)
    signal_manager = SignalManager()
    signal_manager.connect(instance.add_to_db_and_pop,
                           signal=signals.email_sent_ok)
    return instance
class MongoStorage(object):
    """
    Utility class for working with MongoDB data.
    It supports CRUD operations and allows to subscribe to
    created/updated/deleted events.
    """

    def __init__(self, mongo_uri, cache=False):
        self.mongo_uri = mongo_uri
        _, _, _, _, self.col = motor_from_uri(mongo_uri)
        self.signal_manager = SignalManager()
        # Used for unsubscribe:
        # disconnect() requires a reference to the original callback.
        self._callbacks = {}
        self.fetching = False
        # Unique sentinel objects serve as the signal identities.
        self.signals = {
            'created': object(),
            'updated': object(),
            'deleted': object(),
        }
        # XXX: cache is used in arachnado.cron and arachnado.site_checker.
        # Is it needed?
        self.cache_flag = cache
        if cache:
            self.cache = defaultdict(dict)
        else:
            self.cache = None

    def subscribe(self, events=None, callback=None):
        """Connect *callback* to the given event name(s) (default: all).

        Raises ValueError for unknown event names.
        """
        if events is None:
            events = self.available_events
        if not isinstance(events, list):
            events = [events]
        for event_name in events:
            if event_name not in self.signals:
                raise ValueError('Invalid event name: {}'.format(event_name))
            self.signal_manager.connect(callback, self.signals[event_name],
                                        weak=False)
            self._callbacks[event_name] = callback

    def unsubscribe(self, events=None):
        """Disconnect callbacks for the given event name(s) (default: all).

        Events that were never subscribed (or already unsubscribed) are
        silently skipped.
        """
        if events is None:
            events = self.available_events
        if not isinstance(events, list):
            events = [events]
        for event_name in events:
            # FIX: the original wrapped this in `except KeyError` with a
            # "when can it happen?" FIXME — it happened when an event was
            # never subscribed. Pop first and skip missing entries explicitly.
            callback = self._callbacks.pop(event_name, None)
            if callback is None:
                continue
            self.signal_manager.disconnect(
                callback, self.signals[event_name], weak=False)

    @property
    def available_events(self):
        """All valid event names."""
        return list(self.signals.keys())

    @coroutine
    def fetch(self, query=None):
        """Return all documents matching *query*; no-op if a fetch is running."""
        if self.fetching:
            return
        self.fetching = True
        docs = []
        try:
            cursor = self.col.find(query)
            while (yield cursor.fetch_next):
                docs.append(cursor.next_object())
        finally:
            # FIX: reset the flag even on error, otherwise a failed fetch
            # left self.fetching == True and all later fetches became no-ops.
            self.fetching = False
        raise Return(docs)

    @coroutine
    def create(self, doc):
        """Insert *doc* and emit the 'created' signal."""
        doc = replace_dots(doc)
        result = yield self.col.insert(doc)
        if self.cache is not None:
            self.cache[str(doc['_id'])] = doc
        self.signal_manager.send_catch_log(self.signals['created'], data=doc)
        raise Return(result)

    @coroutine
    def ensure_index(self, key_or_list):
        """Proxy to the collection's ensure_index."""
        result = yield self.col.ensure_index(key_or_list)
        raise Return(result)

    @coroutine
    def update(self, doc):
        """$set all fields of *doc* (matched by _id) and emit 'updated'."""
        doc = replace_dots(doc)
        # _id is immutable in MongoDB; strip it from the update document.
        doc_copy = deepcopy(doc)
        doc_copy.pop('_id')
        result = yield self.col.update({
            '_id': ObjectId(doc['_id'])
        }, {
            '$set': doc_copy
        })
        if self.cache is not None:
            self.cache[str(doc['_id'])].update(doc)
        self.signal_manager.send_catch_log(self.signals['updated'], data=doc)
        raise Return(result)

    @coroutine
    def delete(self, doc):
        """Remove the document matching doc['_id'] and emit 'deleted'."""
        result = yield self.col.remove({'_id': ObjectId(doc['_id'])})
        if self.cache is not None:
            self.cache.pop(str(doc['_id']), None)
        self.signal_manager.send_catch_log(self.signals['deleted'], data=doc)
        raise Return(result)
class Crawler(object):
    """Annotated copy of scrapy's Crawler (study notes translated to English)."""

    def __init__(self, spidercls, settings=None):
        # By the time we get here `settings` is normally already a Settings
        # instance, so this branch is usually False.
        if isinstance(settings, dict) or settings is None:
            settings = Settings(settings)
        # The spider *class* is stored here — it has not been instantiated yet.
        self.spidercls = spidercls
        self.settings = settings.copy()
        # update_settings merges the spider class' custom_settings into this
        # shared Settings object. NOTE: this runs BEFORE the spider is
        # instantiated, which is why assigning custom_settings inside the
        # spider's __init__ has no effect — the merge has already happened.
        self.spidercls.update_settings(
            self.settings
        )
        # Collect settings whose values differ from the defaults and log them
        # (this produces the familiar "Overridden settings:" start-up line).
        d = dict(overridden_settings(self.settings))
        logger.info(
            "Overridden settings: %(settings)r",
            {'settings': d})
        self.signals = SignalManager(self)
        # STATS_CLASS = 'scrapy.statscollectors.MemoryStatsCollector' by
        # default — the stats collection machinery.
        self.stats = load_object(self.settings['STATS_CLASS'])(
            self
        )
        # LOG_LEVEL defaults to 'DEBUG'.
        handler = LogCounterHandler(
            self, level=self.settings.get('LOG_LEVEL'))
        logging.root.addHandler(handler)
        if get_scrapy_root_handler() is not None:
            # scrapy root handler already installed: update it with new settings
            install_scrapy_root_handler(self.settings)
        # lambda is assigned to Crawler attribute because this way it is not
        # garbage collected after leaving __init__ scope
        self.__remove_handler = lambda: logging.root.removeHandler(handler)
        self.signals.connect(self.__remove_handler, signals.engine_stopped)
        # LOG_FORMATTER = 'scrapy.logformatter.LogFormatter' by default.
        lf_cls = load_object(
            self.settings['LOG_FORMATTER']
        )
        self.logformatter = lf_cls.from_crawler(self)
        # Extensions manager; builds the enabled extensions from settings.
        self.extensions = ExtensionManager.from_crawler(
            self)
        self.settings.freeze()
        self.crawling = False
        self.spider = None
        self.engine = None

    @property
    def spiders(self):
        # Deprecated accessor; returns the SpiderLoader (not a spider class).
        if not hasattr(self, '_spiders'):
            warnings.warn(
                "Crawler.spiders is deprecated, use "
                "CrawlerRunner.spider_loader or instantiate "
                "scrapy.spiderloader.SpiderLoader with your "
                "settings.",
                category=ScrapyDeprecationWarning, stacklevel=2)
            self._spiders = _get_spider_loader(self.settings.frozencopy())
        return self._spiders

    @defer.inlineCallbacks
    def crawl(self, *args, **kwargs):
        assert not self.crawling, "Crawling already taking place"
        self.crawling = True
        try:
            # The spider is finally instantiated here (e.g. our MySpider).
            self.spider = self._create_spider(
                *args, **kwargs)
            self.engine = self._create_engine()
            # First step: collect the Request/FormRequest objects yielded by
            # the spider's start_requests().
            start_requests = iter(self.spider.start_requests(
            ))
            yield self.engine.open_spider(self.spider, start_requests)
            yield defer.maybeDeferred(
                self.engine.start)
        except Exception:
            # In Python 2 reraising an exception after yield discards
            # the original traceback (see https://bugs.python.org/issue7563),
            # so sys.exc_info() workaround is used.
            # This workaround also works in Python 3, but it is not needed,
            # and it is slower, so in Python 3 we use native `raise`.
            if six.PY2:
                exc_info = sys.exc_info()
            self.crawling = False
            if self.engine is not None:
                yield self.engine.close()
            if six.PY2:
                six.reraise(*exc_info)
            raise

    def _create_spider(self, *args, **kwargs):
        # Instantiation goes through from_crawler, which mainly initialises
        # the spider's crawler and settings references.
        return self.spidercls.from_crawler(
            self, *args, **kwargs
        )

    def _create_engine(self):
        return ExecutionEngine(self, lambda _: self.stop())

    @defer.inlineCallbacks
    def stop(self):
        # Gracefully stop the running engine, if a crawl is in progress.
        if self.crawling:
            self.crawling = False
            yield defer.maybeDeferred(self.engine.stop)
class Crawler(object):
    """Annotated copy of scrapy's Crawler (study notes translated to English)."""

    def __init__(self, spidercls, settings=None):
        # spidercls must be a Spider *subclass*, not an instance; this check
        # rejects already-instantiated spiders.
        if isinstance(spidercls, Spider):
            raise ValueError(
                'The spidercls argument must be a class, not an object')
        if isinstance(settings, dict) or settings is None:
            settings = Settings(settings)
        self.spidercls = spidercls
        self.settings = settings.copy()  # another copy of the settings here
        # Merge the spider class' custom_settings into this Settings copy.
        self.spidercls.update_settings(self.settings)
        self.signals = SignalManager(self)
        self.stats = load_object(self.settings['STATS_CLASS'])(self)
        handler = LogCounterHandler(self,
                                    level=self.settings.get('LOG_LEVEL'))
        logging.root.addHandler(handler)
        # Log which settings differ from the defaults.
        d = dict(overridden_settings(self.settings))
        logger.info("Overridden settings: %(settings)r", {'settings': d})
        if get_scrapy_root_handler() is not None:
            # scrapy root handler already installed: update it with new settings
            install_scrapy_root_handler(self.settings)
        # lambda is assigned to Crawler attribute because this way it is not
        # garbage collected after leaving __init__ scope
        self.__remove_handler = lambda: logging.root.removeHandler(handler)
        self.signals.connect(self.__remove_handler, signals.engine_stopped)
        lf_cls = load_object(self.settings['LOG_FORMATTER'])
        self.logformatter = lf_cls.from_crawler(self)
        self.extensions = ExtensionManager.from_crawler(self)
        self.settings.freeze()  # settings may no longer be modified from here
        self.crawling = False
        self.spider = None
        self.engine = None

    @property
    def spiders(self):
        # Deprecated accessor; returns the SpiderLoader, built lazily.
        if not hasattr(self, '_spiders'):
            warnings.warn("Crawler.spiders is deprecated, use "
                          "CrawlerRunner.spider_loader or instantiate "
                          "scrapy.spiderloader.SpiderLoader with your "
                          "settings.",
                          category=ScrapyDeprecationWarning, stacklevel=2)
            self._spiders = _get_spider_loader(self.settings.frozencopy())
        return self._spiders

    """
    (translated note) yield_from also defines a Crawler class whose fetch
    method is similar to crawl() here.  @defer.inlineCallbacks can be thought
    of as asyncio's `async` keyword, i.e. it declares the function as a
    coroutine — though the decorator does quite a bit more work than that.
    """

    @defer.inlineCallbacks
    def crawl(self, *args, **kwargs):
        # A Deferred is roughly a Future, with some differences.
        assert not self.crawling, "Crawling already taking place"
        self.crawling = True
        try:
            self.spider = self._create_spider(*args, **kwargs)  # instantiate the spider class (our custom spider)
            self.engine = self._create_engine()  # create the engine
            start_requests = iter(self.spider.start_requests())  # call the spider's start_requests method
            yield self.engine.open_spider(self.spider, start_requests)  # open_spider with the spider instance and initial requests
            yield defer.maybeDeferred(self.engine.start)
        except Exception:
            # In Python 2 reraising an exception after yield discards
            # the original traceback (see https://bugs.python.org/issue7563),
            # so sys.exc_info() workaround is used.
            # This workaround also works in Python 3, but it is not needed,
            # and it is slower, so in Python 3 we use native `raise`.
            if six.PY2:
                exc_info = sys.exc_info()
            self.crawling = False
            if self.engine is not None:
                yield self.engine.close()
            if six.PY2:
                six.reraise(*exc_info)
            raise

    def _create_spider(self, *args, **kwargs):
        # Instantiation goes through from_crawler so the spider receives the
        # crawler reference.
        return self.spidercls.from_crawler(self, *args, **kwargs)

    def _create_engine(self):
        return ExecutionEngine(self, lambda _: self.stop())

    @defer.inlineCallbacks
    def stop(self):
        """Starts a graceful stop of the crawler and returns a deferred that is
        fired when the crawler is stopped."""
        if self.crawling:
            self.crawling = False
            yield defer.maybeDeferred(self.engine.stop)
class ArachnadoCrawlerProcess(CrawlerProcess):
    """
    CrawlerProcess which sets up a global signals manager,
    assigns unique ids to each spider job, workarounds some Scrapy
    issues and provides extra stats.
    """
    # Monotonically increasing id assigned to each crawl job.
    crawl_ids = itertools.count(start=1)

    def __init__(self, settings=None):
        self.signals = SignalManager(self)
        self.signals.connect(self.on_spider_closed,
                             CrawlerProcessSignals.spider_closed)
        self._finished_jobs = []
        self._paused_jobs = set()
        self.procmon = ProcessStatsMonitor()
        self.procmon.start()
        super(ArachnadoCrawlerProcess, self).__init__(settings or {})
        # don't log DepthMiddleware messages
        # see https://github.com/scrapy/scrapy/issues/1308
        logging.getLogger("scrapy.spidermiddlewares.depth").setLevel(
            logging.INFO)

    def crawl(self, crawler_or_spidercls, *args, **kwargs):
        """Start a crawl, tagging it with a fresh crawl_id and aggregating
        its signals into this process' global signal manager."""
        kwargs['crawl_id'] = next(self.crawl_ids)
        crawler = crawler_or_spidercls
        if not isinstance(crawler_or_spidercls, Crawler):
            crawler = self._create_crawler(crawler_or_spidercls)
        # aggregate all crawler signals
        for name in SCRAPY_SIGNAL_NAMES:
            crawler.signals.connect(self._resend_signal,
                                    getattr(signals, name))
        # aggregate signals from crawler EventedStatsCollectors
        if hasattr(crawler.stats, "signals"):
            crawler.stats.signals.connect(self._resend_signal,
                                          stats.stats_changed)
        # BUGFIX: pass the crawler we just wired up, not the raw argument.
        # Passing `crawler_or_spidercls` made the parent create a *second*
        # crawler whenever a spider class was given, discarding the one with
        # the connected signals, so no signals/stats were ever resent.
        d = super(ArachnadoCrawlerProcess, self).crawl(crawler, *args,
                                                       **kwargs)
        return d

    def _create_crawler(self, spidercls):
        # Accept a dotted name or a spider class.
        if isinstance(spidercls, six.string_types):
            spidercls = self.spider_loader.load(spidercls)
        return ArachnadoCrawler(spidercls, self.settings)

    def stop_job(self, crawl_id):
        """ Stop a single crawl job """
        self.get_crawler(crawl_id).stop()

    def pause_job(self, crawl_id):
        """ Pause a crawling job """
        self._paused_jobs.add(crawl_id)
        self.get_crawler(crawl_id).engine.pause()

    def resume_job(self, crawl_id):
        """ Resume a crawling job """
        self._paused_jobs.remove(crawl_id)
        self.get_crawler(crawl_id).engine.unpause()

    def get_crawler(self, crawl_id):
        """Return the crawler for crawl_id; raises KeyError if unknown."""
        for crawler in self.crawlers:
            if getattr(crawler.spider, "crawl_id") == crawl_id:
                return crawler
        raise KeyError("Job is not known: %s" % crawl_id)

    def _resend_signal(self, **kwargs):
        # FIXME: this is a mess. Signal handling should be unified somehow:
        # there shouldn't be two separate code paths
        # for CrawlerProcessSignals and STAT_SIGNALS.
        signal = kwargs['signal']
        if signal in STAT_SIGNALS:
            signal = STAT_SIGNALS[signal]
            kwargs['crawler'] = kwargs.pop('sender').crawler
        else:
            signal = CrawlerProcessSignals.signal(signal)
            kwargs['crawler'] = kwargs.pop('sender')
        kwargs['signal'] = signal
        if signal.supports_defer:
            return self.signals.send_catch_log_deferred(**kwargs)
        else:
            return self.signals.send_catch_log(**kwargs)

    def stop(self):
        """ Terminate the process (exit from application). """
        self.procmon.stop()
        return super(ArachnadoCrawlerProcess, self).stop()

    def on_spider_closed(self, spider, reason):
        # spiders are closed not that often, insert(0,...) should be fine
        self._finished_jobs.insert(0, {
            'id': spider.crawl_id,
            'job_id': getattr(spider, 'motor_job_id'),
            'seed': spider.domain,
            'status': reason,
            'stats': spider.crawler.stats.get_stats(spider),
            'downloads': self._downloader_stats(spider.crawler)
        })

    # FIXME: methods below are ugly for two reasons:
    # 1. they assume spiders have certain attributes;
    # 2. they try to get crawling status based on auxilary information.

    def get_jobs(self):
        """ Return a list of active jobs """
        crawlers = [crawler for crawler in self.crawlers
                    if crawler.spider is not None]
        return [{
            'id': crawler.spider.crawl_id,
            'job_id': getattr(crawler.spider, 'motor_job_id'),
            'seed': crawler.spider.domain,
            'status': self._get_crawler_status(crawler),
            'stats': crawler.spider.crawler.stats.get_stats(crawler.spider),
            'downloads': self._downloader_stats(crawler)
            # 'engine_info': dict(get_engine_status(crawler.engine))
        } for crawler in crawlers]

    @classmethod
    def _downloader_stats(cls, crawler):
        # Snapshot of the downloader's active requests and per-slot state.
        downloader = crawler.engine.downloader
        return {
            'active': [cls._request_info(req) for req in downloader.active],
            'slots': sorted([
                cls._slot_info(key, slot)
                for key, slot in downloader.slots.items()
            ], key=operator.itemgetter('key'))
        }

    @classmethod
    def _request_info(cls, request):
        return {'url': request.url, 'method': request.method}

    @classmethod
    def _slot_info(cls, key, slot):
        return {
            'key': key,
            'concurrency': slot.concurrency,
            'delay': slot.delay,
            'lastseen': slot.lastseen,
            'len(queue)': len(slot.queue),
            'transferring': [cls._request_info(req)
                             for req in slot.transferring],
            'active': [cls._request_info(req) for req in slot.active],
        }

    def _get_crawler_status(self, crawler):
        # Derived status: order matters (stopping beats suspended).
        if crawler.spider is None:
            return "unknown"
        if not crawler.crawling:
            return "stopping"
        if int(crawler.spider.crawl_id) in self._paused_jobs:
            return "suspended"
        return "crawling"

    @property
    def jobs(self):
        """ Current crawl state """
        # filter out active jobs which are in fact finished
        finished_ids = {job['id'] for job in self._finished_jobs}
        active_jobs = [job for job in self.get_jobs()
                       if job['id'] not in finished_ids]
        return active_jobs + self._finished_jobs
class Crawler:
    """Ties a spider class to its settings, signals, stats, log handling,
    extensions and (optionally) the Twisted reactor, and drives one crawl
    through the execution engine."""

    def __init__(self, spidercls, settings=None, init_reactor: bool = False):
        # The spider must be passed as a class; the crawler instantiates it
        # itself later via from_crawler().
        if isinstance(spidercls, Spider):
            raise ValueError(
                'The spidercls argument must be a class, not an object')

        if isinstance(settings, dict) or settings is None:
            settings = Settings(settings)

        self.spidercls = spidercls
        # Work on a private copy so per-crawler changes don't leak out.
        self.settings = settings.copy()
        # Merge the spider class's custom_settings into the copy.
        self.spidercls.update_settings(self.settings)

        self.signals = SignalManager(self)
        self.stats = load_object(self.settings['STATS_CLASS'])(self)

        # Counts log records per level for the stats collector.
        handler = LogCounterHandler(self, level=self.settings.get('LOG_LEVEL'))
        logging.root.addHandler(handler)

        d = dict(overridden_settings(self.settings))
        logger.info("Overridden settings:\n%(settings)s",
                    {'settings': pprint.pformat(d)})

        if get_scrapy_root_handler() is not None:
            # scrapy root handler already installed: update it with new settings
            install_scrapy_root_handler(self.settings)
        # lambda is assigned to Crawler attribute because this way it is not
        # garbage collected after leaving __init__ scope
        self.__remove_handler = lambda: logging.root.removeHandler(handler)
        self.signals.connect(self.__remove_handler, signals.engine_stopped)

        lf_cls = load_object(self.settings['LOG_FORMATTER'])
        self.logformatter = lf_cls.from_crawler(self)

        self.request_fingerprinter = create_instance(
            load_object(self.settings['REQUEST_FINGERPRINTER_CLASS']),
            settings=self.settings,
            crawler=self,
        )

        reactor_class = self.settings.get("TWISTED_REACTOR")
        if init_reactor:
            # this needs to be done after the spider settings are merged,
            # but before something imports twisted.internet.reactor
            if reactor_class:
                install_reactor(reactor_class,
                                self.settings["ASYNCIO_EVENT_LOOP"])
            else:
                # Importing the default reactor installs it as a side effect.
                from twisted.internet import reactor  # noqa: F401
            log_reactor_info()
        if reactor_class:
            # Fail fast if an incompatible reactor was installed earlier.
            verify_installed_reactor(reactor_class)

        self.extensions = ExtensionManager.from_crawler(self)
        # Settings are immutable from this point on.
        self.settings.freeze()
        self.crawling = False
        # Populated by crawl().
        self.spider = None
        self.engine = None

    @defer.inlineCallbacks
    def crawl(self, *args, **kwargs):
        """Instantiate the spider with *args/**kwargs and start the engine.

        Returns a Deferred fired when the crawl finishes; on failure the
        engine is closed and the exception re-raised."""
        if self.crawling:
            raise RuntimeError("Crawling already taking place")
        self.crawling = True

        try:
            self.spider = self._create_spider(*args, **kwargs)
            self.engine = self._create_engine()
            # Seed requests come from the spider itself.
            start_requests = iter(self.spider.start_requests())
            yield self.engine.open_spider(self.spider, start_requests)
            yield defer.maybeDeferred(self.engine.start)
        except Exception:
            self.crawling = False
            if self.engine is not None:
                yield self.engine.close()
            raise

    def _create_spider(self, *args, **kwargs):
        # Spiders are always built via from_crawler so they get a crawler ref.
        return self.spidercls.from_crawler(self, *args, **kwargs)

    def _create_engine(self):
        # The engine calls back into stop() when it finishes on its own.
        return ExecutionEngine(self, lambda _: self.stop())

    @defer.inlineCallbacks
    def stop(self):
        """Starts a graceful stop of the crawler and returns a deferred that is
        fired when the crawler is stopped."""
        if self.crawling:
            self.crawling = False
            yield defer.maybeDeferred(self.engine.stop)
class Crawler(object):
    """Crawler bootstrap class: instantiates each spider, creates the
    engine, starts the requested spider and kicks off the engine."""

    def __init__(self, spidercls, settings=None):
        if isinstance(spidercls, Spider):
            raise ValueError(
                'The spidercls argument must be a class, not an object')

        if isinstance(settings, dict) or settings is None:
            settings = Settings(settings)

        self.spidercls = spidercls
        # Private copy of the settings for this crawler.
        self.settings = settings.copy()
        # Merge the spider class's optional custom_settings into the copy.
        self.spidercls.update_settings(self.settings)

        self.signals = SignalManager(self)
        self.stats = load_object(self.settings['STATS_CLASS'])(self)

        # Counts log records per level while the crawl runs.
        handler = LogCounterHandler(self, level=self.settings.get('LOG_LEVEL'))
        logging.root.addHandler(handler)

        # Only the settings that were actually overridden, as a plain dict.
        d = dict(overridden_settings(self.settings))
        logger.info("Overridden settings: %(settings)r", {'settings': d})

        if get_scrapy_root_handler() is not None:
            # scrapy root handler already installed: update it with new
            # settings
            install_scrapy_root_handler(self.settings)
        # lambda is assigned to Crawler attribute because this way it is not
        # garbage collected after leaving __init__ scope
        self.__remove_handler = lambda: logging.root.removeHandler(handler)
        self.signals.connect(self.__remove_handler, signals.engine_stopped)

        lf_cls = load_object(self.settings['LOG_FORMATTER'])
        self.logformatter = lf_cls.from_crawler(self)
        self.extensions = ExtensionManager.from_crawler(self)

        # Settings are immutable from this point on.
        self.settings.freeze()
        self.crawling = False
        self.spider = None
        self.engine = None

    @property
    def spiders(self):
        # Deprecated accessor kept for backwards compatibility; lazily builds
        # a spider loader on first use.
        if not hasattr(self, '_spiders'):
            warnings.warn("Crawler.spiders is deprecated, use "
                          "CrawlerRunner.spider_loader or instantiate "
                          "scrapy.spiderloader.SpiderLoader with your "
                          "settings.",
                          category=ScrapyDeprecationWarning, stacklevel=2)
            self._spiders = _get_spider_loader(self.settings.frozencopy())
        return self._spiders

    @defer.inlineCallbacks
    def crawl(self, *args, **kwargs):
        """Final entry point invoked by the command machinery: create the
        spider and start the engine."""
        # inlineCallbacks lets sequential-looking generator code drive a
        # chain of Deferred callbacks; it requires the function to be a
        # generator.
        assert not self.crawling, "Crawling already taking place"
        self.crawling = True

        try:
            # Find the spider class via the spider loader and instantiate it
            # (parent Spider class lives in scrapy/spiders/__init__.py).
            self.spider = self._create_spider(*args, **kwargs)
            # Build the execution engine (see scrapy/core/engine.py).
            self.engine = self._create_engine()
            # Call start_requests() to obtain the seed URLs.
            start_requests = iter(self.spider.start_requests())
            # Hand the spider instance and initial requests to the engine's
            # open_spider() (scrapy/core/engine.py) for scheduling.
            yield self.engine.open_spider(self.spider, start_requests)
            yield defer.maybeDeferred(self.engine.start)
        except Exception:
            # In Python 2 reraising an exception after yield discards
            # the original traceback (see https://bugs.python.org/issue7563),
            # so sys.exc_info() workaround is used.
            # This workaround also works in Python 3, but it is not needed,
            # and it is slower, so in Python 3 we use native `raise`.
            if six.PY2:
                exc_info = sys.exc_info()

            self.crawling = False
            if self.engine is not None:
                yield self.engine.close()

            if six.PY2:
                six.reraise(*exc_info)
            raise

    def _create_spider(self, *args, **kwargs):
        # All spiders are instantiated through from_crawler
        # (see scrapy/spiders/__init__.py).
        return self.spidercls.from_crawler(self, *args, **kwargs)

    def _create_engine(self):
        return ExecutionEngine(self, lambda _: self.stop())

    @defer.inlineCallbacks
    def stop(self):
        """Starts a graceful stop of the crawler and returns a deferred that is
        fired when the crawler is stopped."""
        if self.crawling:
            self.crawling = False
            yield defer.maybeDeferred(self.engine.stop)
class Crawler:
    # Acts as a process-wide holder for crawl state (spider, engine,
    # settings); one instance per crawl.

    def __init__(self, spidercls, settings=None):
        if isinstance(spidercls, Spider):
            # spidercls must be a class, not an instance
            raise ValueError(
                'The spidercls argument must be a class, not an object')

        if isinstance(settings, dict) or settings is None:
            settings = Settings(settings)

        # Spider class and a private copy of the configuration.
        self.spidercls = spidercls
        self.settings = settings.copy()
        # Merge the spider's custom_settings into the copy.
        self.spidercls.update_settings(self.settings)

        # Signal manager for this crawler.
        self.signals = SignalManager(self)
        # Stats collector, loaded from STATS_CLASS.
        self.stats = load_object(self.settings['STATS_CLASS'])(self)

        # Counts log records per level while the crawl runs.
        handler = LogCounterHandler(self, level=self.settings.get('LOG_LEVEL'))
        logging.root.addHandler(handler)

        d = dict(overridden_settings(self.settings))
        logger.info("Overridden settings:\n%(settings)s",
                    {'settings': pprint.pformat(d)})

        # Keep the scrapy root log handler in sync with these settings.
        if get_scrapy_root_handler() is not None:
            # scrapy root handler already installed: update it with new settings
            install_scrapy_root_handler(self.settings)
        # lambda is assigned to Crawler attribute because this way it is not
        # garbage collected after leaving __init__ scope
        self.__remove_handler = lambda: logging.root.removeHandler(handler)
        self.signals.connect(self.__remove_handler, signals.engine_stopped)

        # Log formatter, loaded from LOG_FORMATTER.
        lf_cls = load_object(self.settings['LOG_FORMATTER'])
        self.logformatter = lf_cls.from_crawler(self)
        # Extension manager.
        self.extensions = ExtensionManager.from_crawler(self)

        # Settings are final from here on; further writes are rejected.
        self.settings.freeze()
        # Not crawling yet; spider and engine are assigned in crawl().
        self.crawling = False
        self.spider = None
        self.engine = None

    @defer.inlineCallbacks
    def crawl(self, *args, **kwargs):
        # step 7: called to produce a Deferred for this crawl
        if self.crawling:
            raise RuntimeError("Crawling already taking place")
        self.crawling = True

        try:
            # step 8: instantiate the spider
            self.spider = self._create_spider(*args, **kwargs)
            # step 10: create the engine
            self.engine = self._create_engine()
            # step 12: build the initial requests and wrap them in an iterator
            start_requests = iter(self.spider.start_requests())
            # step 13: open the spider on the engine with its seed requests
            yield self.engine.open_spider(self.spider, start_requests)
            # step 23: everything is wired up -- start the engine
            yield defer.maybeDeferred(self.engine.start)
        except Exception:
            self.crawling = False
            if self.engine is not None:
                yield self.engine.close()
            raise

    def _create_spider(self, *args, **kwargs):
        # step 9: instantiate via the spider class's from_crawler hook
        return self.spidercls.from_crawler(self, *args, **kwargs)

    def _create_engine(self):
        # step 11: build the engine, wiring a callback that stops this
        # crawler when the engine finishes normally
        return ExecutionEngine(self, lambda _: self.stop())

    @defer.inlineCallbacks
    def stop(self):
        """Starts a graceful stop of the crawler and returns a deferred that
        is fired when the crawler is stopped.
        """
        if self.crawling:
            self.crawling = False
            yield defer.maybeDeferred(self.engine.stop)
class MyselfCrawlerProcess(CrawlerProcess):
    """CrawlerProcess with a global signal manager and per-job bookkeeping:
    unique crawl ids, pause/resume, and finished-job statistics."""

    # Monotonically increasing ids handed to crawl jobs.
    crawl_ids = itertools.count(start=1)

    def __init__(self, settings=None):
        self.signals = SignalManager(self)
        self.signals.connect(self.on_spider_closed,
                             CrawlerProcessSignals.spider_closed)
        self._finished_jobs = []
        self._paused_jobs = set()
        self.procmon = ProcessStatsMonitor()
        self.procmon.start()

        super(MyselfCrawlerProcess, self).__init__(settings or {})

        # Silence noisy DepthMiddleware debug output.
        logging.getLogger('scrapy.spidermiddlewares.depth').setLevel(
            logging.INFO)

    def crawl(self, crawler_or_spidercls, *args, **kwargs):
        """Start a crawl, tagging it with a fresh crawl_id and re-broadcasting
        its signals through this process's signal manager."""
        kwargs['crawl_id'] = next(self.crawl_ids)

        crawler = crawler_or_spidercls
        if not isinstance(crawler_or_spidercls, Crawler):
            crawler = self._create_crawler(crawler_or_spidercls)

        # Aggregate all spider signal states through this process.
        for name in SCRAPY_SIGNAL_NAMES:
            crawler.signals.connect(self._resend_signal,
                                    getattr(signals, name))

        if hasattr(crawler.stats, "signals"):
            crawler.stats.signals.connect(self._resend_signal,
                                          stats.stats_changed)

        # BUG FIX: pass the crawler we just wired up. Passing
        # crawler_or_spidercls made the parent build a brand-new crawler
        # whenever a spider class was given, discarding every signal
        # connection made above.
        d = super(MyselfCrawlerProcess, self).crawl(crawler, *args, **kwargs)
        return d

    def _create_crawler(self, spidercls):
        """Build a new Crawler for the given spider class (or spider name)."""
        if isinstance(spidercls, six.string_types):
            spidercls = self.spider_loader.load(spidercls)
        # BUG FIX: previously returned MyselfCrawlerProcess(spidercls,
        # self.settings), i.e. instantiated the process class itself with a
        # spidercls positional argument -- a TypeError waiting to happen.
        # A Crawler must be returned here.
        return Crawler(spidercls, self.settings)

    def stop_job(self, crawl_id):
        """Send a stop signal to a single crawl job."""
        self.get_crawler(crawl_id).stop()

    def pause_job(self, crawl_id):
        """Pause a crawling job."""
        self._paused_jobs.add(crawl_id)
        self.get_crawler(crawl_id).engine.pause()

    def resume_job(self, crawl_id):
        """Resume a previously paused crawling job."""
        self._paused_jobs.remove(crawl_id)
        self.get_crawler(crawl_id).engine.unpause()

    def get_crawler(self, crawl_id):
        """Return the crawler whose spider carries crawl_id.

        Raises KeyError when the job is unknown."""
        for crawler in self.crawlers:
            if getattr(crawler.spider, "crawl_id") == crawl_id:
                return crawler
        raise KeyError("Job is not known: %s" % crawl_id)

    def _resend_signal(self, **kwargs):
        # FIXME: signal and crawl are mess.
        signal = kwargs['signal']
        if signal in STAT_SIGNALS:
            signal = STAT_SIGNALS[signal]
            kwargs['crawler'] = kwargs.pop('sender').crawler
        else:
            signal = CrawlerProcessSignals.signal(signal)
            kwargs['crawler'] = kwargs.pop('sender')

        kwargs['signal'] = signal
        if signal.supports_defer:
            return self.signals.send_catch_log_deferred(**kwargs)
        else:
            return self.signals.send_catch_log(**kwargs)

    def stop(self):
        """Stop the whole crawl process."""
        self.procmon.stop()
        return super(MyselfCrawlerProcess, self).stop()

    def on_spider_closed(self, spider, reason):
        """Record a finished job when its spider closes."""
        self._finished_jobs.insert(0, {
            'id': spider.crawl_id,
            'job_id': getattr(spider, 'motor_job_id'),
            'seed': spider.domain,
            'status': reason,
            'stats': spider.crawler.stats.get_stats(spider),
            'downloads': self._downloader_stats(spider.crawler)
        })

    def get_jobs(self):
        """Return a list of currently running jobs."""
        crawlers = [
            crawler for crawler in self.crawlers
            if crawler.spider is not None
        ]
        return [{
            'id': crawler.spider.crawl_id,
            'job_id': getattr(crawler.spider, 'motor_job_id'),
            'seed': crawler.spider.domain,
            'status': self._get_crawler_status(crawler),
            'stats': crawler.spider.crawler.stats.get_stats(crawler.spider),
            'downloads': self._downloader_stats(crawler)
        } for crawler in crawlers]

    @classmethod
    def _downloader_stats(cls, crawler):
        """Downloader snapshot: in-flight requests plus per-slot details."""
        downloader = crawler.engine.downloader
        return {
            'active': [cls._request_info(req) for req in downloader.active],
            'slots': sorted([
                cls._slot_info(key, slot)
                for key, slot in downloader.slots.items()
            ], key=operator.itemgetter('key'))
        }

    @classmethod
    def _request_info(cls, request):
        """Minimal JSON-friendly view of a request."""
        return {'url': request.url, 'method': request.method}

    @classmethod
    def _slot_info(cls, key, slot):
        """JSON-friendly view of one downloader slot."""
        return {
            'key': key,
            'concurrency': slot.concurrency,
            'delay': slot.delay,
            'lastseen': slot.lastseen,
            'len(queue)': len(slot.queue),
            'transferring': [cls._request_info(req)
                             for req in slot.transferring],
            'active': [cls._request_info(req) for req in slot.active]
        }

    def _get_crawler_status(self, crawler):
        """Coarse status string for a crawler."""
        if crawler.spider is None:
            return "unknown"
        if not crawler.crawling:
            return "stopping"
        if int(crawler.spider.crawl_id) in self._paused_jobs:
            return "suspended"
        return "crawling"

    @property
    def jobs(self):
        """Finished plus still-active crawl state."""
        # Filter out active jobs that have in fact already finished.
        finished_ids = {job['id'] for job in self._finished_jobs}
        active_jobs = [
            job for job in self.get_jobs()
            if job['id'] not in finished_ids
        ]
        return active_jobs + self._finished_jobs
class ArachnadoCrawlerProcess(CrawlerProcess):
    """
    CrawlerProcess which sets up a global signals manager,
    assigns unique ids to each spider job, workarounds some Scrapy
    issues and provides extra stats.
    """

    # Monotonically increasing ids handed to crawl jobs.
    crawl_ids = itertools.count(start=1)

    def __init__(self, settings=None):
        self.signals = SignalManager(self)
        self.signals.connect(self.on_spider_closed,
                             CrawlerProcessSignals.spider_closed)
        self._finished_jobs = []
        self._paused_jobs = set()
        self.procmon = ProcessStatsMonitor()
        self.procmon.start()
        super(ArachnadoCrawlerProcess, self).__init__(settings or {})

        # don't log DepthMiddleware messages
        # see https://github.com/scrapy/scrapy/issues/1308
        logging.getLogger("scrapy.spidermiddlewares.depth").setLevel(
            logging.INFO)

    def crawl(self, crawler_or_spidercls, *args, **kwargs):
        """Start a crawl, tagging it with a fresh crawl_id."""
        kwargs["crawl_id"] = next(self.crawl_ids)

        crawler = crawler_or_spidercls
        if not isinstance(crawler_or_spidercls, Crawler):
            crawler = self._create_crawler(crawler_or_spidercls)

        # aggregate all crawler signals
        for name in SCRAPY_SIGNAL_NAMES:
            crawler.signals.connect(self._resend_signal,
                                    getattr(signals, name))

        # aggregate signals from crawler EventedStatsCollectors
        if hasattr(crawler.stats, "signals"):
            crawler.stats.signals.connect(self._resend_signal,
                                          stats.stats_changed)

        # BUG FIX: pass the wired-up crawler, not crawler_or_spidercls.
        # When a spider class was given, the parent would otherwise build a
        # brand-new crawler, dropping every signal connection made above.
        d = super(ArachnadoCrawlerProcess, self).crawl(crawler,
                                                       *args, **kwargs)
        return d

    def _create_crawler(self, spidercls):
        if isinstance(spidercls, six.string_types):
            spidercls = self.spider_loader.load(spidercls)
        return ArachnadoCrawler(spidercls, self.settings)

    def stop_job(self, crawl_id):
        """ Stop a single crawl job """
        self.get_crawler(crawl_id).stop()

    def pause_job(self, crawl_id):
        """ Pause a crawling job """
        self._paused_jobs.add(crawl_id)
        self.get_crawler(crawl_id).engine.pause()

    def resume_job(self, crawl_id):
        """ Resume a crawling job """
        self._paused_jobs.remove(crawl_id)
        self.get_crawler(crawl_id).engine.unpause()

    def get_crawler(self, crawl_id):
        # Raises KeyError when no active crawler carries this crawl_id.
        for crawler in self.crawlers:
            if getattr(crawler.spider, "crawl_id") == crawl_id:
                return crawler
        raise KeyError("Job is not known: %s" % crawl_id)

    def _resend_signal(self, **kwargs):
        # FIXME: this is a mess. Signal handling should be unified somehow:
        # there shouldn't be two separate code paths
        # for CrawlerProcessSignals and STAT_SIGNALS.
        signal = kwargs["signal"]
        if signal in STAT_SIGNALS:
            signal = STAT_SIGNALS[signal]
            kwargs["crawler"] = kwargs.pop("sender").crawler
        else:
            signal = CrawlerProcessSignals.signal(signal)
            kwargs["crawler"] = kwargs.pop("sender")

        kwargs["signal"] = signal
        if signal.supports_defer:
            return self.signals.send_catch_log_deferred(**kwargs)
        else:
            return self.signals.send_catch_log(**kwargs)

    def stop(self):
        """ Terminate the process (exit from application). """
        self.procmon.stop()
        return super(ArachnadoCrawlerProcess, self).stop()

    def on_spider_closed(self, spider, reason):
        # spiders are closed not that often, insert(0,...) should be fine
        self._finished_jobs.insert(
            0,
            {
                "id": spider.crawl_id,
                "job_id": getattr(spider, "motor_job_id"),
                "seed": spider.domain,
                "status": reason,
                "stats": spider.crawler.stats.get_stats(spider),
                "downloads": self._downloader_stats(spider.crawler),
            },
        )

    # FIXME: methods below are ugly for two reasons:
    # 1. they assume spiders have certain attributes;
    # 2. they try to get crawling status based on auxilary information.

    def get_jobs(self):
        """ Return a list of active jobs """
        crawlers = [crawler for crawler in self.crawlers
                    if crawler.spider is not None]
        return [
            {
                "id": crawler.spider.crawl_id,
                "job_id": getattr(crawler.spider, "motor_job_id"),
                "seed": crawler.spider.domain,
                "status": self._get_crawler_status(crawler),
                "stats": crawler.spider.crawler.stats.get_stats(
                    crawler.spider),
                "downloads": self._downloader_stats(crawler)
                # 'engine_info': dict(get_engine_status(crawler.engine))
            }
            for crawler in crawlers
        ]

    @classmethod
    def _downloader_stats(cls, crawler):
        # Downloader snapshot: in-flight requests plus per-slot details,
        # sorted by slot key for stable ordering.
        downloader = crawler.engine.downloader
        return {
            "active": [cls._request_info(req) for req in downloader.active],
            "slots": sorted(
                [cls._slot_info(key, slot)
                 for key, slot in downloader.slots.items()],
                key=operator.itemgetter("key")
            ),
        }

    @classmethod
    def _request_info(cls, request):
        # Minimal JSON-friendly view of a request.
        return {"url": request.url, "method": request.method}

    @classmethod
    def _slot_info(cls, key, slot):
        # JSON-friendly view of one downloader slot.
        return {
            "key": key,
            "concurrency": slot.concurrency,
            "delay": slot.delay,
            "lastseen": slot.lastseen,
            "len(queue)": len(slot.queue),
            "transferring": [cls._request_info(req)
                             for req in slot.transferring],
            "active": [cls._request_info(req) for req in slot.active],
        }

    def _get_crawler_status(self, crawler):
        # Coarse status string derived from crawler state.
        if crawler.spider is None:
            return "unknown"
        if not crawler.crawling:
            return "stopping"
        if int(crawler.spider.crawl_id) in self._paused_jobs:
            return "suspended"
        return "crawling"

    @property
    def jobs(self):
        """ Current crawl state """
        # filter out active jobs which are in fact finished
        finished_ids = {job["id"] for job in self._finished_jobs}
        active_jobs = [job for job in self.get_jobs()
                       if job["id"] not in finished_ids]
        return active_jobs + self._finished_jobs
class Crawler(object):

    def __init__(self, spidercls, settings=None):
        ## A crawler must be instantiated with a scrapy.spiders.Spider
        ## subclass and a scrapy.settings.Settings object
        if isinstance(spidercls, Spider):
            raise ValueError(
                'The spidercls argument must be a class, not an object')

        if isinstance(settings, dict) or settings is None:
            settings = Settings(settings)

        ## the user-defined spider class
        self.spidercls = spidercls
        ## the crawler's settings manager, used by extensions and middleware
        ## as the entry point to this crawler's Scrapy configuration
        self.settings = settings.copy()
        ## merge the spider class's optional custom_settings into the copy,
        ## applied at 'spider' priority
        self.spidercls.update_settings(self.settings)

        ## only the settings that were overridden, converted to a dict
        d = dict(overridden_settings(self.settings))
        logger.info("Overridden settings: %(settings)r", {'settings': d})

        ## the crawler's signal manager, used by extensions and middleware to
        ## hook themselves into Scrapy functionality
        self.signals = SignalManager(self)

        ## the crawler's stats collector, used to record extension/middleware
        ## behaviour and to read data collected by other extensions
        self.stats = load_object(self.settings['STATS_CLASS'])(self)

        ## counts log records per level produced while the crawl runs
        handler = LogCounterHandler(self, level=self.settings.get('LOG_LEVEL'))
        logging.root.addHandler(handler)

        if get_scrapy_root_handler() is not None:
            # scrapy root handler already installed: update it with new settings
            install_scrapy_root_handler(self.settings)
        # lambda is assigned to Crawler attribute because this way it is not
        # garbage collected after leaving __init__ scope
        self.__remove_handler = lambda: logging.root.removeHandler(handler)
        ## register __remove_handler for the engine_stopped signal; it runs
        ## when the engine-stopped signal fires
        self.signals.connect(self.__remove_handler, signals.engine_stopped)

        lf_cls = load_object(self.settings['LOG_FORMATTER'])
        ## instantiate the log formatter
        self.logformatter = lf_cls.from_crawler(self)
        ## extension manager that tracks the enabled extensions
        self.extensions = ExtensionManager.from_crawler(self)

        self.settings.freeze()
        ## flag marking whether a crawl is running
        self.crawling = False
        ## the spider currently being crawled
        self.spider = None
        ## the execution engine, which coordinates the crawl logic between
        ## the scheduler, the downloader and the spiders
        self.engine = None

    @property
    def spiders(self):
        ## deprecated accessor kept for backwards compatibility; lazily
        ## builds a spider loader on first use
        if not hasattr(self, '_spiders'):
            warnings.warn(
                "Crawler.spiders is deprecated, use "
                "CrawlerRunner.spider_loader or instantiate "
                "scrapy.spiderloader.SpiderLoader with your "
                "settings.",
                category=ScrapyDeprecationWarning, stacklevel=2)
            self._spiders = _get_spider_loader(self.settings.frozencopy())
        return self._spiders

    @defer.inlineCallbacks
    def crawl(self, *args, **kwargs):
        ## Starts the crawler by instantiating its spider class with the given
        ## args and kwargs arguments, while setting the execution engine in motion.
        assert not self.crawling, "Crawling already taking place"
        ## mark the crawl as running
        self.crawling = True

        try:
            ## create the spider instance
            self.spider = self._create_spider(*args, **kwargs)
            ## create the execution engine
            self.engine = self._create_engine()
            ## call the spider's start_requests() to obtain the seed URLs
            ## (request objects)
            start_requests = iter(self.spider.start_requests())
            ## hand the spider instance and initial requests to the engine's
            ## open_spider(), which takes over scheduling
            yield self.engine.open_spider(self.spider, start_requests)
            yield defer.maybeDeferred(self.engine.start)
        except Exception:
            # In Python 2 reraising an exception after yield discards
            # the original traceback (see https://bugs.python.org/issue7563),
            # so sys.exc_info() workaround is used.
            # This workaround also works in Python 3, but it is not needed,
            # and it is slower, so in Python 3 we use native `raise`.
            if six.PY2:
                exc_info = sys.exc_info()

            self.crawling = False
            if self.engine is not None:
                yield self.engine.close()

            if six.PY2:
                six.reraise(*exc_info)
            raise

    def _create_spider(self, *args, **kwargs):
        ## all spiders are instantiated via the spider class's from_crawler
        ## classmethod (see scrapy/spiders/__init__.py)
        return self.spidercls.from_crawler(self, *args, **kwargs)

    def _create_engine(self):
        ## return an ExecutionEngine instance wired to stop this crawler
        return ExecutionEngine(self, lambda _: self.stop())

    @defer.inlineCallbacks
    def stop(self):
        ## gracefully stop the crawler; returns a deferred fired once the
        ## engine has shut down
        if self.crawling:
            self.crawling = False
            yield defer.maybeDeferred(self.engine.stop)
class Crawler:
    """Binds a spider class to its settings, signals, stats, log handling
    and extensions, and drives a single crawl through the execution engine."""

    def __init__(self, spidercls, settings=None):
        if isinstance(spidercls, Spider):
            raise ValueError(
                'The spidercls argument must be a class, not an object')

        if settings is None or isinstance(settings, dict):
            settings = Settings(settings)

        self.spidercls = spidercls
        self.settings = settings.copy()
        # Fold the spider class's custom_settings into our private copy.
        self.spidercls.update_settings(self.settings)

        self.signals = SignalManager(self)
        self.stats = load_object(self.settings['STATS_CLASS'])(self)

        # Tallies log records per level for the duration of the crawl.
        counter_handler = LogCounterHandler(
            self, level=self.settings.get('LOG_LEVEL'))
        logging.root.addHandler(counter_handler)

        overridden = dict(overridden_settings(self.settings))
        logger.info("Overridden settings:\n%(settings)s",
                    {'settings': pprint.pformat(overridden)})

        if get_scrapy_root_handler() is not None:
            # A scrapy root handler is already installed -- refresh it so it
            # reflects this crawler's merged settings.
            install_scrapy_root_handler(self.settings)
        # Stored on the instance so the closure (and the handler it captures)
        # survives past __init__ and can run when the engine stops.
        self.__remove_handler = (
            lambda: logging.root.removeHandler(counter_handler))
        self.signals.connect(self.__remove_handler, signals.engine_stopped)

        formatter_cls = load_object(self.settings['LOG_FORMATTER'])
        self.logformatter = formatter_cls.from_crawler(self)
        self.extensions = ExtensionManager.from_crawler(self)

        # No further settings changes from here on.
        self.settings.freeze()
        self.crawling = False
        self.spider = None
        self.engine = None

    @defer.inlineCallbacks
    def crawl(self, *args, **kwargs):
        """Instantiate the spider and set the execution engine in motion."""
        assert not self.crawling, "Crawling already taking place"
        self.crawling = True

        try:
            self.spider = self._create_spider(*args, **kwargs)
            self.engine = self._create_engine()
            seed_requests = iter(self.spider.start_requests())
            yield self.engine.open_spider(self.spider, seed_requests)
            yield defer.maybeDeferred(self.engine.start)
        except Exception:
            self.crawling = False
            if self.engine is not None:
                yield self.engine.close()
            raise

    def _create_spider(self, *args, **kwargs):
        # Spiders are always constructed through from_crawler().
        return self.spidercls.from_crawler(self, *args, **kwargs)

    def _create_engine(self):
        # The engine invokes stop() on this crawler when it finishes.
        return ExecutionEngine(self, lambda _: self.stop())

    @defer.inlineCallbacks
    def stop(self):
        """Starts a graceful stop of the crawler and returns a deferred that is
        fired when the crawler is stopped."""
        if self.crawling:
            self.crawling = False
            yield defer.maybeDeferred(self.engine.stop)
def __init__(self):
    # Wire the spider lifecycle signals to this object's setup/teardown
    # hooks before anything else runs.
    super(StoreToMongoDB, self).__init__()
    signal_manager = SignalManager()
    signal_manager.connect(self.initialize, scrapy.signals.spider_opened)
    signal_manager.connect(self.finalize, scrapy.signals.spider_idle)