def from_crawler(cls, crawler):
    settings = crawler.settings
    connection_url = settings.get("RABBITMQ_CONNECTION_PARAMETERS")
    queue_class = load_object(settings.get("SCHEDULER_QUEUE_CLASS"))
    dupefilter_cls = load_object(settings["DUPEFILTER_CLASS"])
    dupefilter = create_instance(dupefilter_cls, settings, crawler)
    pqclass = load_object(settings["SCHEDULER_PRIORITY_QUEUE"])
    if pqclass is PriorityQueue:
        warnings.warn(
            "SCHEDULER_PRIORITY_QUEUE='queuelib.PriorityQueue'"
            " is no longer supported because of API changes; "
            "please use 'scrapy.pqueues.ScrapyPriorityQueue'",
            ScrapyDeprecationWarning,
        )
        from scrapy.pqueues import ScrapyPriorityQueue
        pqclass = ScrapyPriorityQueue
    dqclass = load_object(settings["SCHEDULER_DISK_QUEUE"])
    mqclass = load_object(settings["SCHEDULER_MEMORY_QUEUE"])
    logunser = settings.getbool("SCHEDULER_DEBUG")
    return cls(
        dupefilter,
        connection_url,
        jobdir=job_dir(settings),
        logunser=logunser,
        stats=crawler.stats,
        pqclass=pqclass,
        dqclass=dqclass,
        mqclass=mqclass,
        crawler=crawler,
        queue_class=queue_class,
    )

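# For reference: the snippets in this section lean on scrapy.utils.misc.load_object,
# which imports an object from its dotted path. A minimal usage sketch (the queue
# path is an illustrative default; constructor signatures vary across Scrapy versions):
from scrapy.utils.misc import load_object

queue_cls = load_object('scrapy.squeues.PickleLifoDiskQueue')  # resolve dotted path to a class
queue = queue_cls('/tmp/requests.queue')  # disk queues are typically built with a path
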
def from_crawler(cls, crawler):
    settings = crawler.settings
    dupefilter_cls = load_object(settings['DUPEFILTER_CLASS'])
    dupefilter = create_instance(dupefilter_cls, settings, crawler)
    pqclass = load_object(settings['SCHEDULER_PRIORITY_QUEUE'])
    if pqclass is PriorityQueue:
        warnings.warn(
            "SCHEDULER_PRIORITY_QUEUE='queuelib.PriorityQueue'"
            " is no longer supported because of API changes; "
            "please use 'scrapy.pqueues.ScrapyPriorityQueue'",
            ScrapyDeprecationWarning)
        from scrapy.pqueues import ScrapyPriorityQueue
        pqclass = ScrapyPriorityQueue
    dqclass = load_object(settings['SCHEDULER_DISK_QUEUE'])
    mqclass = load_object(settings['SCHEDULER_MEMORY_QUEUE'])
    logunser = settings.getbool('SCHEDULER_DEBUG')
    return cls(dupefilter,
               jobdir=job_dir(settings),
               logunser=logunser,
               stats=crawler.stats,
               pqclass=pqclass,
               dqclass=dqclass,
               mqclass=mqclass,
               crawler=crawler)

def from_crawler(cls, crawler, **spider_kwargs):
    settings = crawler.settings
    kwargs = {
        'filter_storage_path': settings.get('FILTER_STORAGE_PATH', ''),
        'item_storage_path': settings.get('ITEM_STORAGE_PATH', ''),
    }
    kwargs.update(spider_kwargs)
    spider_kwargs = kwargs
    spider = super(EndpointSpider, cls).from_crawler(crawler, **spider_kwargs)
    spider.stats = crawler.stats
    jobdir = job_dir(settings)
    generated = False
    if jobdir:
        queuecls = load_object(settings['SCHEDULER_DISK_QUEUE'])
        queuedir = os.path.join(jobdir, 'startrequests.queue')
        # An existing queue directory means start requests were already generated
        if os.path.exists(queuedir):
            generated = True
        spider.requestqueue = queuecls(os.path.join(queuedir, '0'))
    else:
        queuecls = load_object(settings['SCHEDULER_MEMORY_QUEUE'])
        spider.requestqueue = queuecls()
    if not generated:
        for x in spider.generate_start_requests():
            spider.enqueue_start_request(x)
    crawler.signals.connect(spider.spider_closed, signal=signals.spider_closed)
    return spider

def from_crawler(cls, crawler):
    obj = cls(job_dir(crawler.settings))
    crawler.signals.connect(obj.spider_closed, signal=signals.spider_closed)
    crawler.signals.connect(obj.spider_opened, signal=signals.spider_opened)
    return obj

def from_crawler(cls, crawler):
    if not crawler.spider.islinkgenerator:
        settings = crawler.settings
        persist = settings.get('SCHEDULER_PERSIST', SCHEDULER_PERSIST)
        queue_key = "%s:requests" % crawler.spider.name
        queue_cls = queue.SpiderQueue
        idle_before_close = settings.get('SCHEDULER_IDLE_BEFORE_CLOSE', IDLE_BEFORE_CLOSE)
        server = connection.from_settings(settings, crawler.spider.name)
        stats = crawler.stats
        return cls(server, persist, queue_key, queue_cls, idle_before_close, stats)
    else:
        settings = crawler.settings
        dupefilter_cls = load_object(settings['DUPEFILTER_CLASS'])
        dupefilter = dupefilter_cls.from_settings(settings)
        pqclass = load_object(settings['SCHEDULER_PRIORITY_QUEUE'])
        dqclass = load_object(settings['SCHEDULER_DISK_QUEUE'])
        mqclass = load_object(settings['SCHEDULER_MEMORY_QUEUE'])
        logunser = settings.getbool('LOG_UNSERIALIZABLE_REQUESTS',
                                    settings.getbool('SCHEDULER_DEBUG'))
        core_scheduler = load_object('scrapy.core.scheduler.Scheduler')
        return core_scheduler(dupefilter, jobdir=job_dir(settings), logunser=logunser,
                              stats=crawler.stats, pqclass=pqclass, dqclass=dqclass,
                              mqclass=mqclass)

def from_crawler(cls, crawler):
    """
    Class method that builds a scheduler instance from the options in the
    settings configuration.
    :param crawler: the crawler object
    :return: a scheduler instance (created via __init__)
    """
    settings = crawler.settings
    # Request dupefilter, e.g. scrapy.dupefilters.RFPDupeFilter
    dupefilter_cls = load_object(settings['DUPEFILTER_CLASS'])
    dupefilter = dupefilter_cls.from_settings(settings)
    # Priority queue class, e.g. queuelib.PriorityQueue
    pqclass = load_object(settings['SCHEDULER_PRIORITY_QUEUE'])
    # Disk queue class, e.g. scrapy.squeues.PickleLifoDiskQueue
    dqclass = load_object(settings['SCHEDULER_DISK_QUEUE'])
    # Memory queue class, e.g. scrapy.squeues.LifoMemoryQueue
    mqclass = load_object(settings['SCHEDULER_MEMORY_QUEUE'])
    # Whether to log unserializable requests
    logunser = settings.getbool('LOG_UNSERIALIZABLE_REQUESTS',
                                settings.getbool('SCHEDULER_DEBUG'))
    return cls(dupefilter, jobdir=job_dir(settings), logunser=logunser,
               stats=crawler.stats, pqclass=pqclass, dqclass=dqclass,
               mqclass=mqclass)

def from_settings(cls, settings):
    dupefilter_cls = load_object(settings['DUPEFILTER_CLASS'])
    dupefilter = dupefilter_cls.from_settings(settings)
    dqclass = load_object(settings['SCHEDULER_DISK_QUEUE'])
    mqclass = load_object(settings['SCHEDULER_MEMORY_QUEUE'])
    logunser = settings.getbool('LOG_UNSERIALIZABLE_REQUESTS')
    return cls(dupefilter, job_dir(settings), dqclass, mqclass, logunser)

def from_crawler(cls, crawler):
    settings = crawler.settings
    dupefilter_cls = load_object(settings['DUPEFILTER_CLASS'])
    dupefilter = dupefilter_cls.from_settings(settings)
    rqclass = load_object(settings['SCHEDULER_RABBIT_QUEUE'])
    logunser = settings.getbool('LOG_UNSERIALIZABLE_REQUESTS')
    return cls(dupefilter, job_dir(settings), rqclass, logunser, crawler.stats)

def from_crawler(cls, crawler):
    settings = crawler.settings
    # Load the fingerprint dupefilter class from settings; see scrapy/dupefilters.py
    dupefilter_cls = load_object(settings['DUPEFILTER_CLASS'])
    dupefilter = create_instance(dupefilter_cls, settings, crawler)  # instantiated; later assigned to self.df
    # Task queues; see scrapy/squeues.py. The disk queue persists pending requests
    # across runs, while the memory queue is lost on restart. Setting JOBDIR
    # affects both the dupefilter and the disk queue configuration.
    pqclass = load_object(settings['SCHEDULER_PRIORITY_QUEUE'])
    if pqclass is PriorityQueue:
        warnings.warn(
            "SCHEDULER_PRIORITY_QUEUE='queuelib.PriorityQueue'"
            " is no longer supported because of API changes; "
            "please use 'scrapy.pqueues.ScrapyPriorityQueue'",
            ScrapyDeprecationWarning)
        from scrapy.pqueues import ScrapyPriorityQueue
        pqclass = ScrapyPriorityQueue
    dqclass = load_object(settings['SCHEDULER_DISK_QUEUE'])
    mqclass = load_object(settings['SCHEDULER_MEMORY_QUEUE'])
    # Whether to log unserializable requests
    logunser = settings.getbool('LOG_UNSERIALIZABLE_REQUESTS',
                                settings.getbool('SCHEDULER_DEBUG'))
    return cls(dupefilter, jobdir=job_dir(settings), logunser=logunser,
               stats=crawler.stats, pqclass=pqclass, dqclass=dqclass,
               mqclass=mqclass, crawler=crawler)  # instantiate the scheduler

def from_settings(cls: Type[RFPDupeFilterTV], settings: BaseSettings,
                  *, fingerprinter=None) -> RFPDupeFilterTV:
    debug = settings.getbool('DUPEFILTER_DEBUG')
    try:
        return cls(job_dir(settings), debug, fingerprinter=fingerprinter)
    except TypeError:
        warn(
            "RFPDupeFilter subclasses must either modify their '__init__' "
            "method to support a 'fingerprinter' parameter or reimplement "
            "the 'from_settings' class method.",
            ScrapyDeprecationWarning,
        )
        result = cls(job_dir(settings), debug)
        result.fingerprinter = fingerprinter
        return result

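# A hypothetical subclass illustrating why the TypeError fallback above exists:
# its __init__ has no 'fingerprinter' parameter, so the first cls(...) call raises
# TypeError, the deprecation warning fires, and the fingerprinter is attached
# afterwards as a plain attribute (a sketch, assuming RFPDupeFilter is imported).
class LegacyDupeFilter(RFPDupeFilter):
    def __init__(self, path=None, debug=False):  # note: no fingerprinter kwarg
        super().__init__(path, debug)
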
def from_settings(cls, settings):
    """
    Reads two options from the crawler settings: JOBDIR, the on-disk cache
    directory, and DUPEFILTER_DEBUG, whether to enable debug mode.
    :param settings: the crawler settings
    :return: an instance created via __init__
    """
    debug = settings.getbool('DUPEFILTER_DEBUG')
    return cls(job_dir(settings), debug)

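# For context: scrapy.utils.job.job_dir returns the JOBDIR setting (creating the
# directory on first use) or a falsy value when persistence is disabled. A rough
# sketch of its behavior, not necessarily the exact library code:
import os

def job_dir_sketch(settings):
    path = settings['JOBDIR']
    if path and not os.path.exists(path):
        os.makedirs(path)
    return path
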
def from_crawler(cls, crawler):
    jobdir = job_dir(crawler.settings)
    if not jobdir:
        raise NotConfigured
    obj = cls(jobdir)
    crawler.signals.connect(obj.spider_closed, signal=signals.spider_closed)
    crawler.signals.connect(obj.spider_opened, signal=signals.spider_opened)
    return obj

def from_crawler(cls, crawler):
    settings = crawler.settings
    run_as_daemon = settings.get('DAEMON')
    dupefilter_cls = load_object(settings['DUPEFILTER_CLASS'])
    dupefilter = dupefilter_cls.from_settings(settings)
    dqclass = load_object(settings['SCHEDULER_DISK_QUEUE'])
    mqclass = load_object(settings['SCHEDULER_MEMORY_QUEUE'])
    logunser = settings.getbool('LOG_UNSERIALIZABLE_REQUESTS')
    return cls(crawler, dupefilter, job_dir(settings), dqclass, mqclass,
               logunser, crawler.stats, run_as_daemon)

def from_crawler(cls, crawler):
    settings = crawler.settings
    dupefilter_cls = load_object(settings['DUPEFILTER_CLASS'])
    dupefilter = create_instance(dupefilter_cls, settings, crawler)
    pqclass = load_object(settings['SCHEDULER_PRIORITY_QUEUE'])
    dqclass = load_object(settings['SCHEDULER_DISK_QUEUE'])
    mqclass = load_object(settings['SCHEDULER_MEMORY_QUEUE'])
    logunser = settings.getbool('LOG_UNSERIALIZABLE_REQUESTS',
                                settings.getbool('SCHEDULER_DEBUG'))
    return cls(dupefilter, jobdir=job_dir(settings), logunser=logunser,
               stats=crawler.stats, pqclass=pqclass, dqclass=dqclass,
               mqclass=mqclass)

def from_crawler(cls, crawler):
    settings = crawler.settings
    dupefilter_cls = load_object(settings['DUPEFILTER_CLASS'])
    dupefilter = dupefilter_cls.from_settings(settings)
    pqclass = load_object(settings['SCHEDULER_PRIORITY_QUEUE'])
    dqclass = load_object(settings['SCHEDULER_DISK_QUEUE'])
    mqclass = load_object(settings['SCHEDULER_MEMORY_QUEUE'])
    logunser = settings.getbool('LOG_UNSERIALIZABLE_REQUESTS',
                                settings.getbool('SCHEDULER_DEBUG'))
    return cls(dupefilter, jobdir=job_dir(settings), logunser=logunser,
               stats=crawler.stats, pqclass=pqclass, dqclass=dqclass,
               mqclass=mqclass)

def from_settings(cls, settings):
    check_settings(settings)
    debug = settings.getbool('DUPEFILTER_DEBUG')
    config = settings.getdict('REQUEST_DUPEFILTER_CONFIG', {})
    mongo_uri = settings.get('MONGO_URI')
    mongo_db = settings.get('MONGO_DATABASE')
    return cls(mongo_uri, mongo_db, config, path=job_dir(settings), debug=debug)

def from_settings(cls, global_settings, global_stats):
    settings = global_settings
    dupefilter_cls = load_object(settings['DUPEFILTER_CLASS'])
    dupefilter = dupefilter_cls.from_settings(settings)
    dqclass = load_object(settings['SCHEDULER_DISK_QUEUE'])
    mqclass = load_object(settings['SCHEDULER_MEMORY_QUEUE'])
    logunser = settings.getbool('LOG_UNSERIALIZABLE_REQUESTS')
    # Use 'settings', not 'self.settings': there is no instance in a classmethod
    total_concurrency = settings.getint('CONCURRENT_REQUESTS')
    domain_concurrency = settings.getint('CONCURRENT_REQUESTS_PER_DOMAIN')
    ip_concurrency = settings.getint('CONCURRENT_REQUESTS_PER_IP')
    return cls(dupefilter, job_dir(settings), dqclass, mqclass, logunser,
               global_stats, total_concurrency, domain_concurrency, ip_concurrency)

def from_crawler(cls, crawler):
    settings = crawler.settings
    dupefilter_cls = load_object(settings['DUPEFILTER_CLASS'])
    if hasattr(dupefilter_cls, 'from_crawler'):
        dupefilter = dupefilter_cls.from_crawler(crawler)
    elif hasattr(dupefilter_cls, 'from_settings'):
        dupefilter = dupefilter_cls.from_settings(crawler.settings)
    else:
        dupefilter = dupefilter_cls()
    logunser = settings.getbool('LOG_UNSERIALIZABLE_REQUESTS',
                                settings.getbool('SCHEDULER_DEBUG'))
    return cls(dupefilter, jobdir=job_dir(settings), logunser=logunser,
               stats=crawler.stats)

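# The hasattr() fallback chain above mirrors what scrapy.utils.misc.create_instance
# does in the other snippets. Roughly (a simplified sketch; the real helper also
# validates its arguments):
def create_instance_sketch(objcls, settings, crawler, *args, **kwargs):
    if crawler and hasattr(objcls, 'from_crawler'):
        return objcls.from_crawler(crawler, *args, **kwargs)
    if hasattr(objcls, 'from_settings'):
        return objcls.from_settings(settings, *args, **kwargs)
    return objcls(*args, **kwargs)
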
def from_crawler(cls, crawler):
    # This is the real instantiation entry point
    settings = crawler.settings
    dupefilter_cls = load_object(settings['DUPEFILTER_CLASS'])  # 'scrapy.dupefilters.RFPDupeFilter', the dedup filter
    dupefilter = create_instance(dupefilter_cls, settings, crawler)  # runs objcls.from_crawler(crawler, ...), falling back to from_settings
    pqclass = load_object(settings['SCHEDULER_PRIORITY_QUEUE'])  # 'queuelib.PriorityQueue'
    dqclass = load_object(settings['SCHEDULER_DISK_QUEUE'])  # 'scrapy.squeues.PickleLifoDiskQueue': last in, first out, serialized with pickle
    mqclass = load_object(settings['SCHEDULER_MEMORY_QUEUE'])  # 'scrapy.squeues.LifoMemoryQueue'
    logunser = settings.getbool('LOG_UNSERIALIZABLE_REQUESTS',
                                settings.getbool('SCHEDULER_DEBUG'))  # ('LOG_UNSERIALIZABLE_REQUESTS', 'use SCHEDULER_DEBUG instead')
    return cls(dupefilter, jobdir=job_dir(settings), logunser=logunser,
               stats=crawler.stats, pqclass=pqclass, dqclass=dqclass,
               mqclass=mqclass)  # the queues start out empty; nothing is pushed yet

def from_settings(cls, settings):
    from elasticsearch import Elasticsearch
    check_settings(settings)
    debug = settings.getbool('DUPEFILTER_DEBUG')
    config = settings.getdict('REQUEST_DUPEFILTER_CONFIG', {})
    obj = cls(path=job_dir(settings), debug=debug)
    obj.settings = settings
    es_servers = obj.settings['ELASTICSEARCH_SERVERS']
    es_servers = es_servers if isinstance(es_servers, list) else [es_servers]
    obj.items = get_item_dict(config.get('items'), settings)
    obj.es = Elasticsearch(hosts=es_servers,
                           timeout=obj.settings.get('ELASTICSEARCH_TIMEOUT', 60))
    return obj

def from_crawler(cls, crawler):
    settings = crawler.settings
    pqcls = load_object(settings["SCHEDULER_PRIORITY_QUEUE"])
    dupefilter_cls = load_object(settings['DUPEFILTER_CLASS'])
    dupefilter = dupefilter_cls.from_settings(settings)
    dqclass = load_object(settings['SCHEDULER_DISK_QUEUE'])
    mqclass = load_object(settings['SCHEDULER_MEMORY_QUEUE'])
    logunser = settings.getbool('LOG_UNSERIALIZABLE_REQUESTS')
    return cls(dupefilter, job_dir(settings), dqclass, mqclass,
               logunser, crawler.stats, pqcls)

def from_crawler(cls, crawler):
    settings = crawler.settings
    dupefilter_cls = load_object(settings['DUPEFILTER_CLASS'])
    dupefilter = dupefilter_cls.from_settings(settings)
    pqclass = load_object(settings['SCHEDULER_PRIORITY_QUEUE'])
    dqclass = load_object(settings['SCHEDULER_DISK_QUEUE'])
    mqclass = load_object(settings['SCHEDULER_MEMORY_QUEUE'])
    logunser = settings.getbool('LOG_UNSERIALIZABLE_REQUESTS',
                                settings.getbool('SCHEDULER_DEBUG'))
    rabbitmq_queue_name = settings.get('RABBITMQ_INPUT_QUEUE_NAME')
    rabbitmq_url = settings.get('RABBITMQ_URL')
    return cls(dupefilter, jobdir=job_dir(settings), logunser=logunser,
               stats=crawler.stats, pqclass=pqclass, dqclass=dqclass,
               mqclass=mqclass, rabbitmq_queue_name=rabbitmq_queue_name,
               rabbitmq_url=rabbitmq_url)

def from_crawler(cls: Type[SchedulerTV], crawler) -> SchedulerTV:
    """
    Factory method: initializes the scheduler with arguments taken from the
    crawl settings.
    """
    dupefilter_cls = load_object(crawler.settings['DUPEFILTER_CLASS'])
    return cls(
        dupefilter=create_instance(dupefilter_cls, crawler.settings, crawler),
        jobdir=job_dir(crawler.settings),
        dqclass=load_object(crawler.settings['SCHEDULER_DISK_QUEUE']),
        mqclass=load_object(crawler.settings['SCHEDULER_MEMORY_QUEUE']),
        logunser=crawler.settings.getbool('SCHEDULER_DEBUG'),
        stats=crawler.stats,
        pqclass=load_object(crawler.settings['SCHEDULER_PRIORITY_QUEUE']),
        crawler=crawler,
    )

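# Typical call site (a simplified sketch, assuming a 'crawler' object is in scope):
# Scrapy's engine resolves the SCHEDULER setting and calls the factory rather than
# instantiating the class directly, which is why every variant above accepts the
# crawler and pulls its own settings.
scheduler_cls = load_object(crawler.settings['SCHEDULER'])
scheduler = scheduler_cls.from_crawler(crawler)
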
def from_crawler(cls, crawler):
    settings = crawler.settings
    # DUPEFILTER_CLASS = 'scrapy.dupefilter.RFPDupeFilter'
    dupefilter_cls = load_object(settings['DUPEFILTER_CLASS'])
    dupefilter = dupefilter_cls.from_settings(settings)
    # SCHEDULER_DISK_QUEUE = 'scrapy.squeue.PickleLifoDiskQueue'
    dqclass = load_object(settings['SCHEDULER_DISK_QUEUE'])
    # SCHEDULER_MEMORY_QUEUE = 'scrapy.squeue.LifoMemoryQueue'
    mqclass = load_object(settings['SCHEDULER_MEMORY_QUEUE'])
    # Whether to log requests that cannot be serialized
    logunser = settings.getbool('LOG_UNSERIALIZABLE_REQUESTS')
    return cls(dupefilter, job_dir(settings), dqclass, mqclass, logunser, crawler.stats)

def from_crawler(cls, crawler):
    # Instantiate the scheduler from the crawler
    settings = crawler.settings
    dupefilter_cls = load_object(settings['DUPEFILTER_CLASS'])
    dupefilter = create_instance(dupefilter_cls, settings, crawler)
    pqclass = load_object(settings['SCHEDULER_PRIORITY_QUEUE'])
    dqclass = load_object(settings['SCHEDULER_DISK_QUEUE'])
    mqclass = load_object(settings['SCHEDULER_MEMORY_QUEUE'])
    logunser = settings.getbool('SCHEDULER_DEBUG')
    return cls(dupefilter, jobdir=job_dir(settings), logunser=logunser,
               stats=crawler.stats, pqclass=pqclass, dqclass=dqclass,
               mqclass=mqclass, crawler=crawler)

def from_settings(cls, settings):
    debug = settings.getbool('DUPEFILTER_DEBUG')
    m_length = settings['MAX_LENGTH']
    error_rate = settings['ERROR_RATE']
    mongo_host = settings['DB_HOST']
    mongo_port = settings['DB_PORT']
    try:
        dbname = settings['DB_NAME']
        dbcollections = settings['DB_COLLECTIONS_NAME']
    except Exception:
        dbname = None
        dbcollections = None
    return cls(m_length, error_rate, mongo_host, mongo_port,
               dbname, dbcollections, job_dir(settings), debug)

def from_crawler(cls, crawler):
    """ init from crawler """
    jobdir = job_dir(crawler.settings)
    if not jobdir:
        raise NotConfigured
    state_file = crawler.settings.get("STATE_TAG_FILE") or ".state"
    pid_file = crawler.settings.get("PID_TAG_FILE") or ".pid"
    obj = cls(jobdir, state_file, pid_file)
    crawler.signals.connect(obj._spider_opened, signals.spider_opened)
    crawler.signals.connect(obj._spider_closed, signals.spider_closed)
    return obj

def from_crawler(cls, crawler):
    print("initialize scheduler from crawler <-- wangyf")  # print() call, valid on Python 3
    settings = crawler.settings
    dupefilter_cls = load_object(settings['DUPEFILTER_CLASS'])
    dupefilter = dupefilter_cls.from_settings(settings)
    db_host = settings['DB_HOST']
    db_port = settings['DB_PORT']
    try:
        db_name = settings['DB_NAME']
        db_collections = settings['DB_COLLECTIONS_NAME']
    except Exception:
        db_name = None
        db_collections = None
    logunser = settings.getbool('LOG_UNSERIALIZABLE_REQUESTS',
                                settings.getbool('SCHEDULER_DEBUG'))
    return cls(dupefilter, db_host, db_port, db_name, db_collections,
               jobdir=job_dir(settings), logunser=logunser, stats=crawler.stats)

def from_crawler(cls, crawler):
    settings = crawler.settings
    # DUPEFILTER_CLASS = 'scrapy.dupefilters.RFPDupeFilter'
    dupefilter_cls = load_object(settings['DUPEFILTER_CLASS'])
    dupefilter = dupefilter_cls.from_settings(settings)
    # SCHEDULER_DISK_QUEUE = 'scrapy.squeues.PickleLifoDiskQueue'
    dqclass = load_object(settings['SCHEDULER_DISK_QUEUE'])
    # SCHEDULER_MEMORY_QUEUE = 'scrapy.squeues.LifoMemoryQueue'
    mqclass = load_object(settings['SCHEDULER_MEMORY_QUEUE'])
    # LOG_UNSERIALIZABLE_REQUESTS defaults to False
    logunser = settings.getbool('LOG_UNSERIALIZABLE_REQUESTS')
    return cls(dupefilter, job_dir(settings), dqclass, mqclass, logunser, crawler.stats)

def from_crawler(cls, crawler):
    settings = crawler.settings
    host = settings.get('REDIS_HOST', 'localhost')
    port = settings.get('REDIS_PORT', 6379)
    server = redis.Redis(host, port)
    dupefilter_cls = load_object(settings['DUPEFILTER_CLASS'])
    dupefilter = dupefilter_cls.from_settings(settings)
    dqclass = load_object(settings['SCHEDULER_DISK_QUEUE'])
    mqclass = load_object(settings['SCHEDULER_MEMORY_QUEUE'])
    rqclass = load_object(settings['SCHEDULER_REDIS_QUEUE'])
    next_urls_queue_key = settings.get('NEXT_URLS_QUEUE_KEY', '%(spider)s:next_urls')
    crawled_urls_queue_key = settings.get('CRAWLED_URLS_QUEUE_KEY', '%(spider)s:crawled_urls')
    logunser = settings.getbool('LOG_UNSERIALIZABLE_REQUESTS')
    return cls(server, rqclass, next_urls_queue_key, crawled_urls_queue_key,
               dupefilter, job_dir(settings), dqclass, mqclass, logunser,
               crawler.stats)

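# For context: queue-key templates such as '%(spider)s:next_urls' above are normally
# expanded with old-style %-formatting once the spider is known (a sketch with a
# hypothetical spider name):
next_urls_key = '%(spider)s:next_urls' % {'spider': 'books'}  # -> 'books:next_urls'
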
def from_crawler(cls, crawler):
    settings = crawler.settings
    dupefilter_cls = load_object(settings['DUPEFILTER_CLASS'])
    dupefilter = create_instance(dupefilter_cls, settings, crawler)
    pqclass = load_object(settings['SCHEDULER_PRIORITY_QUEUE'])
    if pqclass is PriorityQueue:
        warnings.warn("SCHEDULER_PRIORITY_QUEUE='queuelib.PriorityQueue'"
                      " is no longer supported because of API changes; "
                      "please use 'scrapy.pqueues.ScrapyPriorityQueue'",
                      ScrapyDeprecationWarning)
        from scrapy.pqueues import ScrapyPriorityQueue
        pqclass = ScrapyPriorityQueue
    dqclass = load_object(settings['SCHEDULER_DISK_QUEUE'])
    mqclass = load_object(settings['SCHEDULER_MEMORY_QUEUE'])
    logunser = settings.getbool('LOG_UNSERIALIZABLE_REQUESTS',
                                settings.getbool('SCHEDULER_DEBUG'))
    return cls(dupefilter, jobdir=job_dir(settings), logunser=logunser,
               stats=crawler.stats, pqclass=pqclass, dqclass=dqclass,
               mqclass=mqclass, crawler=crawler)

def from_crawler(cls, crawler):
    """
    Scheduler initialization does two main things:
    - instantiate the request fingerprint dupefilter, used to drop duplicate
      requests (it can be overridden and replaced);
    - load the various task queue classes: the priority queue, the disk-backed
      queue and the in-memory queue.
    """
    settings = crawler.settings
    dupefilter_cls = load_object(settings['DUPEFILTER_CLASS'])  # dupefilter class from settings
    dupefilter = create_instance(dupefilter_cls, settings, crawler)  # instantiate the dupefilter
    pqclass = load_object(settings['SCHEDULER_PRIORITY_QUEUE'])  # priority queue class
    # 'is' compares identity, so this fires only when the setting points at
    # queuelib.PriorityQueue itself, not at a subclass.
    if pqclass is PriorityQueue:
        warnings.warn(
            "SCHEDULER_PRIORITY_QUEUE='queuelib.PriorityQueue'"
            " is no longer supported because of API changes; "
            "please use 'scrapy.pqueues.ScrapyPriorityQueue'",
            ScrapyDeprecationWarning)
        from scrapy.pqueues import ScrapyPriorityQueue
        pqclass = ScrapyPriorityQueue
    dqclass = load_object(settings['SCHEDULER_DISK_QUEUE'])  # disk-backed task queue class
    mqclass = load_object(settings['SCHEDULER_MEMORY_QUEUE'])  # in-memory task queue class
    logunser = settings.getbool('LOG_UNSERIALIZABLE_REQUESTS',
                                settings.getbool('SCHEDULER_DEBUG'))  # log unserializable requests?
    return cls(dupefilter, jobdir=job_dir(settings), logunser=logunser,
               stats=crawler.stats, pqclass=pqclass, dqclass=dqclass,
               mqclass=mqclass, crawler=crawler)

def from_crawler(cls, crawler):
    # Build a scheduler instance from a crawler object
    settings = crawler.settings
    # Load the fingerprint dupefilter class from the settings
    dupefilter_cls = load_object(settings['DUPEFILTER_CLASS'])
    # Create the dupefilter (used to drop duplicate requests) from the settings and crawler
    dupefilter = create_instance(dupefilter_cls, settings, crawler)
    # Load the priority, disk-backed and in-memory task queue classes from the settings
    pqclass = load_object(settings['SCHEDULER_PRIORITY_QUEUE'])
    dqclass = load_object(settings['SCHEDULER_DISK_QUEUE'])
    mqclass = load_object(settings['SCHEDULER_MEMORY_QUEUE'])
    # Whether to log unserializable requests
    logunser = settings.getbool('LOG_UNSERIALIZABLE_REQUESTS',
                                settings.getbool('SCHEDULER_DEBUG'))
    # Return a scheduler instance
    return cls(dupefilter, jobdir=job_dir(settings), logunser=logunser,
               stats=crawler.stats, pqclass=pqclass, dqclass=dqclass,
               mqclass=mqclass)

def from_crawler(cls, crawler):
    debug = crawler.settings.getbool('DUPEFILTER_DEBUG')
    df = cls(job_dir(crawler.settings), debug)
    df.method = 'from_crawler'
    return df

def from_settings(cls, settings):
    return cls(job_dir(settings))

def from_settings(cls, settings):
    debug = settings.getbool('DUPEFILTER_DEBUG')
    return cls(job_dir(settings), debug)

def from_settings(cls, settings):
    debug = settings.getbool('DUPEFILTER_DEBUG')
    use_anchors = settings.getbool('DUPEFILTER_USE_ANCHORS')
    return cls(job_dir(settings), debug, use_anchors)

def from_settings(cls, settings):
    debug = settings.getbool('DUPEFILTER_DEBUG')
    df = cls(job_dir(settings), debug)
    df.method = 'from_settings'
    return df

def from_settings(cls, settings):
    dupefilter_cls = load_object(settings['DUPEFILTER_CLASS'])
    dupefilter = dupefilter_cls.from_settings(settings)
    dqclass = load_object(settings['SCHEDULER_DISK_QUEUE'])
    return cls(dupefilter, job_dir(settings), dqclass)

def from_settings(cls, settings):
    verbose_log = settings.getbool('DUPEFILTER_DEBUG')
    return cls(job_dir(settings), verbose_log)