def __init__(self, spidercls, settings):
    if isinstance(settings, dict):
        settings = Settings(settings)
    self.spidercls = spidercls
    self.settings = settings.copy()
    self.signals = SignalManager(self)
    self.stats = load_object(self.settings['STATS_CLASS'])(self)

    handler = LogCounterHandler(self, level=settings.get('LOG_LEVEL'))
    logging.root.addHandler(handler)
    self.signals.connect(lambda: logging.root.removeHandler(handler),
                         signals.engine_stopped)

    lf_cls = load_object(self.settings['LOG_FORMATTER'])
    self.logformatter = lf_cls.from_crawler(self)
    self.extensions = ExtensionManager.from_crawler(self)

    self.spidercls.update_settings(self.settings)
    self.settings.freeze()

    self.crawling = False
    self.spider = None
    self.engine = None
def __init__(self, spidercls, settings=None):
    if isinstance(settings, dict) or settings is None:
        settings = Settings(settings)
    self.spidercls = spidercls
    self.settings = settings.copy()
    self.spidercls.update_settings(self.settings)
    self.signals = SignalManager(self)
    self.stats = load_object(self.settings['STATS_CLASS'])(self)

    handler = LogCounterHandler(self, level=settings.get('LOG_LEVEL'))
    logging.root.addHandler(handler)
    # lambda is assigned to Crawler attribute because this way it is not
    # garbage collected after leaving __init__ scope
    self.__remove_handler = lambda: logging.root.removeHandler(handler)
    self.signals.connect(self.__remove_handler, signals.engine_stopped)

    lf_cls = load_object(self.settings['LOG_FORMATTER'])
    self.logformatter = lf_cls.from_crawler(self)
    self.extensions = ExtensionManager.from_crawler(self)

    self.settings.freeze()
    self.crawling = False
    self.spider = None
    self.engine = None
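The garbage-collection comment above can be illustrated outside Scrapy. SignalManager dispatches through pydispatch, which holds only weak references to receivers, so a lambda stored nowhere else would be collected before the signal ever fires. A minimal sketch using the standard library's weakref as a stand-in for that weak-referencing dispatcher (not Scrapy's actual code):

import gc
import weakref

def connect():
    handler = lambda: print("fired")  # local: no strong reference survives the call
    return weakref.ref(handler)       # stand-in for a dispatcher's weak reference

ref = connect()
gc.collect()  # ensure collection on non-refcounting interpreters
print(ref())  # None: the lambda is gone, so the dispatcher could never call it;
              # keeping it on an attribute (self.__remove_handler) prevents this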
def __init__(self, spidercls, settings=None):
    if isinstance(settings, dict) or settings is None:
        settings = Settings(settings)  # normalize to a Settings object
    self.spidercls = spidercls
    self.settings = settings.copy()
    self.spidercls.update_settings(self.settings)

    d = dict(overridden_settings(self.settings))
    logger.info("Overridden settings: %(settings)r", {'settings': d})

    self.signals = SignalManager(self)
    self.stats = load_object(self.settings['STATS_CLASS'])(self)

    handler = LogCounterHandler(self, level=self.settings.get('LOG_LEVEL'))
    logging.root.addHandler(handler)
    if get_scrapy_root_handler() is not None:
        # scrapy root handler already installed: update it with new settings
        install_scrapy_root_handler(self.settings)
    # lambda is assigned to Crawler attribute because this way it is not
    # garbage collected after leaving __init__ scope
    self.__remove_handler = lambda: logging.root.removeHandler(handler)
    self.signals.connect(self.__remove_handler, signals.engine_stopped)

    lf_cls = load_object(self.settings['LOG_FORMATTER'])
    self.logformatter = lf_cls.from_crawler(self)
    self.extensions = ExtensionManager.from_crawler(self)  # reset?

    self.settings.freeze()  # settings are immutable from here on
    self.crawling = False
    self.spider = None
    self.engine = None
def __init__(self, spidercls, settings=None, init_reactor: bool = False):
    if isinstance(spidercls, Spider):
        raise ValueError(
            'The spidercls argument must be a class, not an object')
    if isinstance(settings, dict) or settings is None:
        settings = Settings(settings)

    self.spidercls = spidercls
    self.settings = settings.copy()
    self.spidercls.update_settings(self.settings)

    self.signals = SignalManager(self)
    self.stats = load_object(self.settings['STATS_CLASS'])(self)

    handler = LogCounterHandler(self, level=self.settings.get('LOG_LEVEL'))
    logging.root.addHandler(handler)

    d = dict(overridden_settings(self.settings))
    logger.info("Overridden settings:\n%(settings)s",
                {'settings': pprint.pformat(d)})

    if get_scrapy_root_handler() is not None:
        # scrapy root handler already installed: update it with new settings
        install_scrapy_root_handler(self.settings)
    # lambda is assigned to Crawler attribute because this way it is not
    # garbage collected after leaving __init__ scope
    self.__remove_handler = lambda: logging.root.removeHandler(handler)
    self.signals.connect(self.__remove_handler, signals.engine_stopped)

    lf_cls = load_object(self.settings['LOG_FORMATTER'])
    self.logformatter = lf_cls.from_crawler(self)

    self.request_fingerprinter = create_instance(
        load_object(self.settings['REQUEST_FINGERPRINTER_CLASS']),
        settings=self.settings,
        crawler=self,
    )

    reactor_class = self.settings.get("TWISTED_REACTOR")
    if init_reactor:
        # this needs to be done after the spider settings are merged,
        # but before something imports twisted.internet.reactor
        if reactor_class:
            install_reactor(reactor_class, self.settings["ASYNCIO_EVENT_LOOP"])
        else:
            from twisted.internet import reactor  # noqa: F401
        log_reactor_info()
    if reactor_class:
        verify_installed_reactor(reactor_class)

    self.extensions = ExtensionManager.from_crawler(self)

    self.settings.freeze()
    self.crawling = False
    self.spider = None
    self.engine = None
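For context, a Crawler is usually built and driven through CrawlerProcess rather than instantiated by hand; the process passes the spider class and settings into the __init__ shown above and takes care of the reactor. A minimal usage sketch (MySpider and its contents are illustrative):

from scrapy import Spider
from scrapy.crawler import CrawlerProcess

class MySpider(Spider):  # illustrative spider class
    name = "my_spider"
    start_urls = ["https://example.com"]

    def parse(self, response):
        yield {"title": response.css("title::text").get()}

process = CrawlerProcess(settings={"LOG_LEVEL": "INFO"})
process.crawl(MySpider)  # constructs a Crawler from the class, as in __init__ above
process.start()          # starts the reactor and blocks until the crawl finishes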
def configure(self):
    if self.configured:
        return
    self.configured = True
    self.extensions = ExtensionManager.from_crawler(self)
    spman_cls = load_object(self.settings['SPIDER_MANAGER_CLASS'])
    self.spiders = spman_cls.from_crawler(self)
    self.engine = ExecutionEngine(self, self._spider_closed)
def configure(self):
    if self.configured:
        return
    self.configured = True
    lf_cls = load_object(self.settings['LOG_FORMATTER'])
    self.logformatter = lf_cls.from_crawler(self)
    self.extensions = ExtensionManager.from_crawler(self)
    self.engine = ExecutionEngine(self, self._spider_closed)
def __init__(self, spidercls, settings=None):
    ## a Crawler must be instantiated with a subclass of scrapy.spiders.Spider
    ## and a scrapy.settings.Settings object
    if isinstance(spidercls, Spider):
        raise ValueError(
            'The spidercls argument must be a class, not an object')
    if isinstance(settings, dict) or settings is None:
        settings = Settings(settings)

    ## the user-defined spider class
    self.spidercls = spidercls
    ## the crawler's settings manager: the entry point through which extensions
    ## and middlewares access this crawler's Scrapy configuration
    self.settings = settings.copy()
    ## merge the spider class's optional custom_settings attribute into the
    ## settings, at 'spider' priority
    self.spidercls.update_settings(self.settings)

    ## collect just the overridden settings, converted to a dict
    d = dict(overridden_settings(self.settings))
    logger.info("Overridden settings: %(settings)r", {'settings': d})

    ## the crawler's signal manager, used by extensions and middlewares to hook
    ## themselves into Scrapy functionality
    self.signals = SignalManager(self)
    ## the crawler's stats collector, used by extensions and middlewares to
    ## record their behaviour and to access stats gathered by other extensions
    self.stats = load_object(self.settings['STATS_CLASS'])(self)

    ## counts the log records emitted at each level while the crawl runs
    handler = LogCounterHandler(self, level=self.settings.get('LOG_LEVEL'))
    logging.root.addHandler(handler)
    if get_scrapy_root_handler() is not None:
        # scrapy root handler already installed: update it with new settings
        install_scrapy_root_handler(self.settings)
    # lambda is assigned to Crawler attribute because this way it is not
    # garbage collected after leaving __init__ scope
    self.__remove_handler = lambda: logging.root.removeHandler(handler)
    ## register __remove_handler for the engine_stopped signal: when the engine
    ## stops, this handler runs
    self.signals.connect(self.__remove_handler, signals.engine_stopped)

    lf_cls = load_object(self.settings['LOG_FORMATTER'])
    ## instantiate the log formatter
    self.logformatter = lf_cls.from_crawler(self)
    ## the extension manager that keeps track of the enabled extensions
    self.extensions = ExtensionManager.from_crawler(self)

    self.settings.freeze()
    ## flag: whether the crawler is currently crawling
    self.crawling = False
    ## the spider currently being crawled
    self.spider = None
    ## the execution engine, which coordinates crawling between the scheduler,
    ## the downloader and the spiders
    self.engine = None
def __init__(self, spidercls, settings):
    self.spidercls = spidercls
    self.settings = settings
    self.signals = SignalManager(self)
    self.stats = load_object(self.settings['STATS_CLASS'])(self)
    lf_cls = load_object(self.settings['LOG_FORMATTER'])
    self.logformatter = lf_cls.from_crawler(self)
    self.extensions = ExtensionManager.from_crawler(self)
    self.crawling = False
    self.spider = None
    self.engine = None
def configure(self):
    if self.configured:
        return
    self.configured = True

    d = dict(overridden_settings(self.settings))
    log.msg(format="Overridden settings: %(settings)r",
            settings=d, level=log.DEBUG)

    lf_cls = load_object(self.settings['LOG_FORMATTER'])
    self.logformatter = lf_cls.from_crawler(self)
    self.extensions = ExtensionManager.from_crawler(self)
    spman_cls = load_object(self.settings['SPIDER_MANAGER_CLASS'])
    self.spiders = spman_cls.from_crawler(self)
    self.engine = ExecutionEngine(self, self._spider_closed)
def configure(self):
    if self.configured:
        return
    self.configured = True
    self.extensions = ExtensionManager.from_settings(self.settings)
    spman_cls = load_object(self.settings['SPIDER_MANAGER_CLASS'])
    self.spiders = spman_cls.from_settings(self.settings)
    spq_cls = load_object(self.settings['SPIDER_QUEUE_CLASS'])
    spq = spq_cls.from_settings(self.settings)
    keepalive = self.settings.getbool('KEEP_ALIVE')
    pollint = self.settings.getfloat('QUEUE_POLL_INTERVAL')
    self.queue = ExecutionQueue(self.spiders, spq, poll_interval=pollint,
                                keep_alive=keepalive)
    self.engine = ExecutionEngine(self.settings, self._spider_closed)
def __init__(self, spidercls, settings=None):
    if isinstance(spidercls, Spider):
        # spidercls must be a class, not an instance
        raise ValueError(
            'The spidercls argument must be a class, not an object')
    if isinstance(settings, dict) or settings is None:
        settings = Settings(settings)

    self.spidercls = spidercls
    # take a copy of the settings
    self.settings = settings.copy()
    # merge the spider class's settings overrides into the copy
    self.spidercls.update_settings(self.settings)

    # initialize the signal manager
    self.signals = SignalManager(self)
    # initialize the stats collector
    self.stats = load_object(self.settings['STATS_CLASS'])(self)

    handler = LogCounterHandler(self, level=self.settings.get('LOG_LEVEL'))
    logging.root.addHandler(handler)

    d = dict(overridden_settings(self.settings))
    logger.info("Overridden settings:\n%(settings)s",
                {'settings': pprint.pformat(d)})

    # root-logger handling
    if get_scrapy_root_handler() is not None:
        # scrapy root handler already installed: update it with new settings
        install_scrapy_root_handler(self.settings)
    # lambda is assigned to Crawler attribute because this way it is not
    # garbage collected after leaving __init__ scope
    # (keeping a reference on self is what protects it from collection)
    self.__remove_handler = lambda: logging.root.removeHandler(handler)
    self.signals.connect(self.__remove_handler, signals.engine_stopped)

    # load the log formatter
    lf_cls = load_object(self.settings['LOG_FORMATTER'])
    self.logformatter = lf_cls.from_crawler(self)
    # the extension manager
    self.extensions = ExtensionManager.from_crawler(self)

    # settings are fully initialized; further modification is forbidden
    self.settings.freeze()
    # crawling has not started yet
    self.crawling = False
    # placeholders: crawl() assigns these later
    self.spider = None
    self.engine = None
def __init__(self, spidercls, settings=None):
    if isinstance(settings, dict) or settings is None:
        # False when a Settings object is passed in
        settings = Settings(settings)

    # store the spider class; it is not instantiated yet
    self.spidercls = spidercls
    self.settings = settings.copy()
    # this is where custom_settings takes effect: update_settings() merges the
    # spider class's overrides into the shared Settings object.
    # Note: because this runs on the class before any spider instance exists,
    # assigning custom_settings inside the spider's __init__ has no effect.
    self.spidercls.update_settings(self.settings)

    # pick out the settings whose values differ from the defaults;
    # this produces the "Overridden settings" log line
    d = dict(overridden_settings(self.settings))
    logger.info("Overridden settings: %(settings)r", {'settings': d})

    self.signals = SignalManager(self)
    # STATS_CLASS = 'scrapy.statscollectors.MemoryStatsCollector' -- the stats mechanism
    self.stats = load_object(self.settings['STATS_CLASS'])(self)

    handler = LogCounterHandler(self, level=self.settings.get('LOG_LEVEL'))  # LOG_LEVEL = 'DEBUG'
    logging.root.addHandler(handler)
    if get_scrapy_root_handler() is not None:
        # scrapy root handler already installed: update it with new settings
        install_scrapy_root_handler(self.settings)
    # lambda is assigned to Crawler attribute because this way it is not
    # garbage collected after leaving __init__ scope
    self.__remove_handler = lambda: logging.root.removeHandler(handler)
    self.signals.connect(self.__remove_handler, signals.engine_stopped)

    # LOG_FORMATTER = 'scrapy.logformatter.LogFormatter'
    lf_cls = load_object(self.settings['LOG_FORMATTER'])
    self.logformatter = lf_cls.from_crawler(self)
    # the extension manager; extensions are optional and may go unused
    self.extensions = ExtensionManager.from_crawler(self)

    self.settings.freeze()
    self.crawling = False
    self.spider = None
    self.engine = None
def __init__(self, spidercls, settings):
    if isinstance(settings, dict):
        settings = Settings(settings)
    self.spidercls = spidercls
    self.settings = settings.copy()
    self.signals = SignalManager(self)
    self.stats = load_object(self.settings['STATS_CLASS'])(self)
    lf_cls = load_object(self.settings['LOG_FORMATTER'])
    self.logformatter = lf_cls.from_crawler(self)
    self.extensions = ExtensionManager.from_crawler(self)
    self.spidercls.update_settings(self.settings)
    self.settings.freeze()
    self.crawling = False
    self.spider = None
    self.engine = None
def __init__(self, spidercls, settings):
    self.spidercls = spidercls
    self.settings = settings
    # signal manager
    self.signals = SignalManager(self)
    # STATS_CLASS = 'scrapy.statscol.MemoryStatsCollector'
    self.stats = load_object(self.settings['STATS_CLASS'])(self)
    # LOG_FORMATTER = 'scrapy.logformatter.LogFormatter'
    lf_cls = load_object(self.settings['LOG_FORMATTER'])
    self.logformatter = lf_cls.from_crawler(self)
    # extension manager
    self.extensions = ExtensionManager.from_crawler(self)
    # crawl state flag
    self.crawling = False
    self.spider = None
    self.engine = None
def __init__(self, spidercls, settings=None):
    if isinstance(settings, dict) or settings is None:
        settings = Settings(settings)
    self.spidercls = spidercls
    self.settings = settings.copy()
    self.spidercls.update_settings(self.settings)

    d = dict(overridden_settings(self.settings))
    logger.info("Overridden settings: %(settings)r", {'settings': d})

    """
    Messaging: SignalManager builds on the open-source pydispatch library to
    send and route messages. Scrapy uses it to deliver key events (crawl
    started, crawl finished, and so on) to interested parties: messages are
    sent with send_catch_log_deferred, and handlers for the signals a
    component cares about are registered with connect.
    """
    self.signals = SignalManager(self)
    self.stats = load_object(self.settings['STATS_CLASS'])(self)

    handler = LogCounterHandler(self, level=self.settings.get('LOG_LEVEL'))
    logging.root.addHandler(handler)
    if get_scrapy_root_handler() is not None:
        # scrapy root handler already installed: update it with new settings
        install_scrapy_root_handler(self.settings)
    # lambda is assigned to Crawler attribute because this way it is not
    # garbage collected after leaving __init__ scope
    self.__remove_handler = lambda: logging.root.removeHandler(handler)
    self.signals.connect(self.__remove_handler, signals.engine_stopped)

    lf_cls = load_object(self.settings['LOG_FORMATTER'])
    self.logformatter = lf_cls.from_crawler(self)
    self.extensions = ExtensionManager.from_crawler(self)

    self.settings.freeze()
    self.crawling = False
    self.spider = None
    self.engine = None
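The docstring above describes how components hook into the crawl through crawler.signals; the typical pattern is an extension whose from_crawler classmethod registers its handlers. A short sketch (SpiderLoggerExtension is an illustrative name, not a Scrapy built-in):

from scrapy import signals

class SpiderLoggerExtension:  # illustrative extension, not part of Scrapy
    @classmethod
    def from_crawler(cls, crawler):
        ext = cls()
        # register interest in the spider_closed signal via the SignalManager
        crawler.signals.connect(ext.spider_closed, signal=signals.spider_closed)
        return ext

    def spider_closed(self, spider, reason):
        print(f"{spider.name} closed: {reason}")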
def __init__(self, spidercls, settings=None): if isinstance(spidercls, Spider): raise ValueError( "The spidercls argument must be a class, not an object") if isinstance(settings, dict) or settings is None: settings = Settings(settings) self.spidercls = spidercls self.settings = settings.copy() self.spidercls.update_settings(self.settings) self.signals = SignalManager(self) self.stats = load_object(self.settings["STATS_CLASS"])(self) handler = LogCounterHandler(self, level=self.settings.get("LOG_LEVEL")) logging.root.addHandler(handler) d = dict(overridden_settings(self.settings)) logger.info("Overridden settings:\n%(settings)s", {"settings": pprint.pformat(d)}) if get_scrapy_root_handler() is not None: # scrapy root handler already installed: update it with new settings install_scrapy_root_handler(self.settings) # lambda is assigned to Crawler attribute because this way it is not # garbage collected after leaving __init__ scope self.__remove_handler = lambda: logging.root.removeHandler(handler) self.signals.connect(self.__remove_handler, signals.engine_stopped) lf_cls = load_object(self.settings["LOG_FORMATTER"]) self.logformatter = lf_cls.from_crawler(self) self.extensions = ExtensionManager.from_crawler(self) self.settings.freeze() self.crawling = False self.spider = None self.engine = None
def __init__(self, spidercls, settings=None):
    if isinstance(spidercls, Spider):
        raise ValueError(
            'The spidercls argument must be a class, not an object')
    if isinstance(settings, dict) or settings is None:
        settings = Settings(settings)

    self.spidercls = spidercls
    self.settings = settings.copy()
    self.spidercls.update_settings(self.settings)

    d = dict(overridden_settings(self.settings))
    logger.info("Overridden settings: %(settings)r", {'settings': d})

    self.signals = SignalManager(self)
    self.stats = load_object(self.settings['STATS_CLASS'])(self)

    handler = LogCounterHandler(self, level=self.settings.get('LOG_LEVEL'))
    logging.root.addHandler(handler)
    if get_scrapy_root_handler() is not None:
        # scrapy root handler already installed: update it with new settings
        install_scrapy_root_handler(self.settings)
    # lambda is assigned to Crawler attribute because this way it is not
    # garbage collected after leaving __init__ scope
    self.__remove_handler = lambda: logging.root.removeHandler(handler)
    self.signals.connect(self.__remove_handler, signals.engine_stopped)

    lf_cls = load_object(self.settings['LOG_FORMATTER'])
    self.logformatter = lf_cls.from_crawler(self)
    self.extensions = ExtensionManager.from_crawler(self)

    self.settings.freeze()
    self.crawling = False
    self.spider = None
    self.engine = None