def from_settings(cls, settings):
    dupefilter_cls = load_object(settings['DUPEFILTER_CLASS'])
    dupefilter = dupefilter_cls.from_settings(settings)
    dqclass = load_object(settings['SCHEDULER_DISK_QUEUE'])
    mqclass = load_object(settings['SCHEDULER_MEMORY_QUEUE'])
    logunser = settings.getbool('LOG_UNSERIALIZABLE_REQUESTS')
    return cls(dupefilter, job_dir(settings), dqclass, mqclass, logunser)
def application(config):
    app = Application("Scrapyd")
    http_port = config.getint('http_port', 6800)
    bind_address = config.get('bind_address', '127.0.0.1')
    poll_interval = config.getfloat('poll_interval', 5)

    poller = QueuePoller(config)
    eggstorage = FilesystemEggStorage(config)
    scheduler = SpiderScheduler(config)
    environment = Environment(config)

    app.setComponent(IPoller, poller)
    app.setComponent(IEggStorage, eggstorage)
    app.setComponent(ISpiderScheduler, scheduler)
    app.setComponent(IEnvironment, environment)

    laupath = config.get('launcher', 'scrapyd.launcher.Launcher')
    laucls = load_object(laupath)
    launcher = laucls(config, app)

    webpath = config.get('webroot', 'scrapyd.website.Root')
    webcls = load_object(webpath)

    timer = TimerService(poll_interval, poller.poll)
    webservice = TCPServer(http_port, server.Site(webcls(config, app)),
                           interface=bind_address)
    log.msg(format="Scrapyd web console available at http://%(bind_address)s:%(http_port)s/",
            bind_address=bind_address, http_port=http_port)

    launcher.setServiceParent(app)
    timer.setServiceParent(app)
    webservice.setServiceParent(app)

    return app
def from_crawler(cls, crawler, **spider_kwargs):
    settings = crawler.settings
    kwargs = {
        'filter_storage_path': settings.get('FILTER_STORAGE_PATH', ''),
        'item_storage_path': settings.get('ITEM_STORAGE_PATH', ''),
    }
    kwargs.update(spider_kwargs)
    spider_kwargs = kwargs
    spider = super(EndpointSpider, cls).from_crawler(crawler, **spider_kwargs)
    spider.stats = crawler.stats
    jobdir = job_dir(settings)
    generated = False
    if jobdir:
        queuecls = load_object(settings['SCHEDULER_DISK_QUEUE'])
        queuedir = os.path.join(jobdir, 'startrequests.queue')
        if os.path.exists(queuedir):
            generated = True
        spider.requestqueue = queuecls(os.path.join(queuedir, '0'))
    else:
        queuecls = load_object(settings['SCHEDULER_MEMORY_QUEUE'])
        spider.requestqueue = queuecls()
    if not generated:
        for x in spider.generate_start_requests():
            spider.enqueue_start_request(x)
    crawler.signals.connect(spider.spider_closed, signal=signals.spider_closed)
    return spider
def open(self, spider):
    self.spider = spider
    try:
        self.queue = load_object(self.queue_cls)(
            server=self.server,
            spider=spider,
            key=self.queue_key % {'spider': spider.name},
            serializer=self.serializer,
        )
    except TypeError as e:
        # format the message eagerly; ValueError does not do %-interpolation
        raise ValueError("Failed to instantiate queue class '%s': %s"
                         % (self.queue_cls, e))
    try:
        self.df = load_object(self.dupefilter_cls)(
            server=self.server,
            key=self.dupefilter_key % {'spider': spider.name},
            debug=spider.settings.getbool('DUPEFILTER_DEBUG'),
        )
    except TypeError as e:
        raise ValueError("Failed to instantiate dupefilter class '%s': %s"
                         % (self.dupefilter_cls, e))
    if self.flush_on_start:
        self.flush()
    # notice if there are requests already in the queue to resume the crawl
    if len(self.queue):
        spider.log("Resuming crawl (%d requests scheduled)" % len(self.queue))
def __init__(self, spidercls, settings=None):
    if isinstance(settings, dict) or settings is None:
        settings = Settings(settings)
    self.spidercls = spidercls
    self.settings = settings.copy()
    self.spidercls.update_settings(self.settings)

    self.signals = SignalManager(self)
    self.stats = load_object(self.settings['STATS_CLASS'])(self)

    handler = LogCounterHandler(self, level=settings.get('LOG_LEVEL'))
    logging.root.addHandler(handler)
    # lambda is assigned to Crawler attribute because this way it is not
    # garbage collected after leaving __init__ scope
    self.__remove_handler = lambda: logging.root.removeHandler(handler)
    self.signals.connect(self.__remove_handler, signals.engine_stopped)

    lf_cls = load_object(self.settings['LOG_FORMATTER'])
    self.logformatter = lf_cls.from_crawler(self)
    self.extensions = ExtensionManager.from_crawler(self)

    self.settings.freeze()
    self.crawling = False
    self.spider = None
    self.engine = None
def run(self, args, opts):
    # load contracts (legacy Python 2 snippet: print statement, iteritems)
    contracts = build_component_list(self.settings["SPIDER_CONTRACTS_BASE"],
                                     self.settings["SPIDER_CONTRACTS"])
    self.conman = ContractsManager([load_object(c) for c in contracts])
    self.results = TextTestRunner(verbosity=opts.verbose)._makeResult()

    # contract requests
    contract_reqs = defaultdict(list)
    spman_cls = load_object(self.settings["SPIDER_MANAGER_CLASS"])
    spiders = spman_cls.from_settings(self.settings)

    for spider in args or spiders.list():
        spider = spiders.create(spider)
        requests = self.get_requests(spider)
        if opts.list:
            for req in requests:
                contract_reqs[spider.name].append(req.callback.__name__)
        elif requests:
            crawler = self.crawler_process.create_crawler(spider.name)
            crawler.crawl(spider, requests)

    # start checks
    if opts.list:
        for spider, methods in sorted(contract_reqs.iteritems()):
            print spider
            for method in sorted(methods):
                print " * %s" % method
    else:
        self.crawler_process.start()
        self.results.printErrors()
def from_settings(cls, settings):
    persist = settings.get('SCHEDULER_PERSIST', SCHEDULER_PERSIST)
    input_queue_key = settings.get('INPUT_QUEUE_KEY', INPUT_QUEUE_KEY)
    input_queue_cls = load_object(settings.get('INPUT_QUEUE_CLASS', INPUT_QUEUE_CLASS))
    input_queue_shard_dist = settings.get('INPUT_QUEUE_SHARD_DIST', INPUT_QUEUE_SHARD_DIST)
    output_queue_key = settings.get('OUTPUT_QUEUE_KEY', OUTPUT_QUEUE_KEY)
    output_queue_cls = load_object(settings.get('OUTPUT_QUEUE_CLASS', OUTPUT_QUEUE_CLASS))
    output_queue_shard_dist = settings.get('OUTPUT_QUEUE_SHARD_DIST', OUTPUT_QUEUE_SHARD_DIST)
    priority_queue_key = settings.get('PRIORITY_QUEUE_KEY', PRIORITY_QUEUE_KEY)
    priority_queue_cls = load_object(settings.get('PRIORITY_QUEUE_CLASS', PRIORITY_QUEUE_CLASS))
    priority_queue_shard_dist = settings.get('PRIORITY_QUEUE_SHARD_DIST', PRIORITY_QUEUE_SHARD_DIST)
    dupefilter_key = settings.get('DUPEFILTER_KEY', DUPEFILTER_KEY)
    idle_before_close = settings.get('SCHEDULER_IDLE_BEFORE_CLOSE', IDLE_BEFORE_CLOSE)
    servers = connection.from_settings(settings)
    dupefilter_ins = load_object(settings['DUPEFILTER_CLASS']).from_settings(settings)
    recrawl_key = settings.get('RECRAWL_LIST_KEY', RECRAWL_KEY)
    return cls(servers, persist,
               input_queue_key, input_queue_cls, input_queue_shard_dist,
               output_queue_key, output_queue_cls, output_queue_shard_dist,
               priority_queue_key, priority_queue_cls, priority_queue_shard_dist,
               recrawl_key, dupefilter_key, dupefilter_ins, idle_before_close)
def __init__(self, spidercls, settings):
    if isinstance(settings, dict):
        settings = Settings(settings)
    self.spidercls = spidercls
    self.settings = settings.copy()

    self.signals = SignalManager(self)
    self.stats = load_object(self.settings['STATS_CLASS'])(self)

    handler = LogCounterHandler(self, level=settings.get('LOG_LEVEL'))
    logging.root.addHandler(handler)
    self.signals.connect(lambda: logging.root.removeHandler(handler),
                         signals.engine_stopped)

    lf_cls = load_object(self.settings['LOG_FORMATTER'])
    self.logformatter = lf_cls.from_crawler(self)
    self.extensions = ExtensionManager.from_crawler(self)

    self.spidercls.update_settings(self.settings)
    self.settings.freeze()

    self.crawling = False
    self.spider = None
    self.engine = None
def application(config, components=interfaces):
    app = Application("Scrapyd")
    http_port = config.getint('http_port', 6800)
    bind_address = config.get('bind_address', '0.0.0.0')

    # iterate the `components` parameter (the original flattened code
    # iterated the module-level `interfaces`, making the parameter dead)
    for interface, key in components:
        path = config.get(key)
        cls = load_object(path)
        component = cls(config)
        app.setComponent(interface, component)
    # relies on the poller interface being the last entry in `components`
    poller = component

    laupath = config.get('launcher', 'scrapyd.launcher.Launcher')
    laucls = load_object(laupath)
    launcher = laucls(config, app)

    poll_every = config.getint("poll_every", 5)
    timer = TimerService(poll_every, poller.poll)

    webservice = TCPServer(http_port, server.Site(Root(config, app)),
                           interface=bind_address)
    log.msg(format="Scrapyd web console available at http://%(bind_address)s:%(http_port)s/",
            bind_address=bind_address, http_port=http_port)

    launcher.setServiceParent(app)
    timer.setServiceParent(app)
    webservice.setServiceParent(app)

    return app
def from_crawler(cls, crawler):
    settings = crawler.settings
    dupefilter_cls = load_object(settings['DUPEFILTER_CLASS'])
    dupefilter = dupefilter_cls.from_settings(settings)
    rqclass = load_object(settings['SCHEDULER_RABBIT_QUEUE'])
    logunser = settings.getbool('LOG_UNSERIALIZABLE_REQUESTS')
    return cls(dupefilter, job_dir(settings), rqclass, logunser, crawler.stats)
def from_crawler(cls, crawler):
    settings = crawler.settings
    dupefilter_cls = load_object(settings['DUPEFILTER_CLASS'])
    dupefilter = dupefilter_cls.from_settings(settings)
    domainclass = load_object(settings['SCHEDULER_DOMAIN_CLASS'])
    flowclass = load_object(settings['SCHEDULER_FLOW_CLASS'])
    return cls(dupefilter, domainclass, flowclass, crawler.stats, settings)
def __init__(self, settings, stats):
    if not settings.getbool('HTTPCACHE_ENABLED'):
        raise NotConfigured
    self.policy = load_object(settings['HTTPCACHE_POLICY'])(settings)
    self.storage = load_object(settings['HTTPCACHE_STORAGE'])(settings)
    self.ignore_missing = settings.getbool('HTTPCACHE_IGNORE_MISSING')
    self.stats = stats
def from_settings(cls, settings):
    persist = settings.get('SCHEDULER_PERSIST', SCHEDULER_PERSIST)
    queue_key = settings.get('SCHEDULER_QUEUE_KEY', QUEUE_KEY)
    queue_cls = load_object(settings.get('SCHEDULER_QUEUE_CLASS', QUEUE_CLASS))
    dupefilter_key = settings.get('DUPEFILTER_KEY', DUPEFILTER_KEY)
    dupefilter_cls = load_object(settings.get('DUPEFILTER_CLASS', DUPEFILTER_CLASS))
    idle_before_close = settings.get('SCHEDULER_IDLE_BEFORE_CLOSE', IDLE_BEFORE_CLOSE)
    server = connection.from_settings(settings)
    return cls(server, persist, queue_key, queue_cls,
               dupefilter_key, dupefilter_cls, idle_before_close)
def __init__(self, settings):
    self.configured = False
    self.settings = settings
    self.signals = SignalManager(self)
    self.stats = load_object(settings['STATS_CLASS'])(self)
    spman_cls = load_object(self.settings['SPIDER_MANAGER_CLASS'])
    self.spiders = spman_cls.from_crawler(self)
    self._scheduled = {}
def from_crawler(cls, crawler):
    settings = crawler.settings
    run_as_daemon = settings.get('DAEMON')
    dupefilter_cls = load_object(settings['DUPEFILTER_CLASS'])
    dupefilter = dupefilter_cls.from_settings(settings)
    dqclass = load_object(settings['SCHEDULER_DISK_QUEUE'])
    mqclass = load_object(settings['SCHEDULER_MEMORY_QUEUE'])
    logunser = settings.getbool('LOG_UNSERIALIZABLE_REQUESTS')
    return cls(crawler, dupefilter, job_dir(settings), dqclass, mqclass,
               logunser, crawler.stats, run_as_daemon)
def configure(self):
    """
    Configure execution engine with the given scheduling policy and downloader.
    """
    # `settings` here is assumed to be a module-level import
    # (e.g. `from scrapy.conf import settings` in old Scrapy versions)
    self.scheduler = load_object(settings['SCHEDULER'])()
    self.spider_scheduler = load_object(settings['SPIDER_SCHEDULER'])()
    self.downloader = Downloader()
    self.scraper = Scraper(self)
    self.configured = True
def from_crawler(cls, crawler):
    settings = crawler.settings
    dupefilter_cls = load_object(settings['DUPEFILTER_CLASS'])
    dupefilter = create_instance(dupefilter_cls, settings, crawler)
    pqclass = load_object(settings['SCHEDULER_PRIORITY_QUEUE'])
    dqclass = load_object(settings['SCHEDULER_DISK_QUEUE'])
    mqclass = load_object(settings['SCHEDULER_MEMORY_QUEUE'])
    logunser = settings.getbool('LOG_UNSERIALIZABLE_REQUESTS',
                                settings.getbool('SCHEDULER_DEBUG'))
    return cls(dupefilter, jobdir=job_dir(settings), logunser=logunser,
               stats=crawler.stats, pqclass=pqclass, dqclass=dqclass,
               mqclass=mqclass)
def __init__(self, settings):
    self.configured = False
    self.settings = settings
    self.signals = SignalManager(self)
    self.stats = load_object(settings['STATS_CLASS'])(self)
    self._start_requests = lambda: ()
    self._spider = None
    # TODO: move SpiderManager to CrawlerProcess
    spman_cls = load_object(self.settings['SPIDER_MANAGER_CLASS'])
    self.spiders = spman_cls.from_crawler(self)
def configure(self):
    if self.configured:
        return
    self.configured = True
    lf_cls = load_object(self.settings['LOG_FORMATTER'])
    self.logformatter = lf_cls.from_crawler(self)
    self.extensions = ExtensionManager.from_crawler(self)
    spman_cls = load_object(self.settings['SPIDER_MANAGER_CLASS'])
    self.spiders = spman_cls.from_crawler(self)
    self.engine = ExecutionEngine(self, self._spider_closed)
def __init__(self, spidercls, settings):
    self.spidercls = spidercls
    self.settings = settings
    self.signals = SignalManager(self)
    self.stats = load_object(self.settings['STATS_CLASS'])(self)
    lf_cls = load_object(self.settings['LOG_FORMATTER'])
    self.logformatter = lf_cls.from_crawler(self)
    self.extensions = ExtensionManager.from_crawler(self)
    self.crawling = False
    self.spider = None
    self.engine = None
def configure(self):
    if self.configured:
        return
    self.configured = True
    d = dict(overridden_settings(self.settings))
    log.msg(format="Overridden settings: %(settings)r",
            settings=d, level=log.DEBUG)
    lf_cls = load_object(self.settings['LOG_FORMATTER'])
    self.logformatter = lf_cls.from_crawler(self)
    self.extensions = ExtensionManager.from_crawler(self)
    spman_cls = load_object(self.settings['SPIDER_MANAGER_CLASS'])
    self.spiders = spman_cls.from_crawler(self)
    self.engine = ExecutionEngine(self, self._spider_closed)
def from_settings(cls, global_settings, global_stats):
    settings = global_settings
    dupefilter_cls = load_object(settings['DUPEFILTER_CLASS'])
    dupefilter = dupefilter_cls.from_settings(settings)
    dqclass = load_object(settings['SCHEDULER_DISK_QUEUE'])
    mqclass = load_object(settings['SCHEDULER_MEMORY_QUEUE'])
    logunser = settings.getbool('LOG_UNSERIALIZABLE_REQUESTS')
    # use the local settings object; there is no `self` in a classmethod
    total_concurrency = settings.getint('CONCURRENT_REQUESTS')
    domain_concurrency = settings.getint('CONCURRENT_REQUESTS_PER_DOMAIN')
    ip_concurrency = settings.getint('CONCURRENT_REQUESTS_PER_IP')
    return cls(dupefilter, job_dir(settings), dqclass, mqclass, logunser,
               global_stats, total_concurrency, domain_concurrency,
               ip_concurrency)
def configure(self):
    if self.configured:
        return
    self.configured = True
    self.extensions = ExtensionManager.from_settings(self.settings)
    spman_cls = load_object(self.settings['SPIDER_MANAGER_CLASS'])
    self.spiders = spman_cls.from_settings(self.settings)
    spq_cls = load_object(self.settings['SPIDER_QUEUE_CLASS'])
    spq = spq_cls.from_settings(self.settings)
    keepalive = self.settings.getbool('KEEP_ALIVE')
    pollint = self.settings.getfloat('QUEUE_POLL_INTERVAL')
    self.queue = ExecutionQueue(self.spiders, spq, poll_interval=pollint,
                                keep_alive=keepalive)
    self.engine = ExecutionEngine(self.settings, self._spider_closed)
def __init__(self, settings):
    super(Scheduler, self).__init__(settings)
    dupefilter_cls = load_object(self.dupfilter_class.to_value())
    dupefilter = dupefilter_cls(self.metas)
    dqclass = load_object(self.schedule_disk_queue.to_value())
    mqclass = load_object(self.schedule_memory_queue.to_value())
    logunser = self.log_unserailizable_requests.to_value()
    self.df = dupefilter
    self.jobpath = self.__job_dir(self.jobdir.to_value())
    self.dqdir = self._dqdir(self.jobpath)
    self.dqclass = dqclass
    self.mqclass = mqclass
    self.logunser = logunser
def __init__(self, crawler, spider_closed_callback):
    self.crawler = crawler
    self.settings = crawler.settings
    self.signals = crawler.signals
    self.logformatter = crawler.logformatter
    self.slot = None
    self.spider = None
    self.running = False
    self.paused = False
    self.scheduler_cls = load_object(self.settings['SCHEDULER'])
    downloader_cls = load_object(self.settings['DOWNLOADER'])
    self.downloader = downloader_cls(crawler)
    self.scraper = Scraper(crawler)
    self._spider_closed_callback = spider_closed_callback
def from_crawler(cls, crawler):
    settings = crawler.settings
    pqcls = load_object(settings["SCHEDULER_PRIORITY_QUEUE"])
    dupefilter_cls = load_object(settings['DUPEFILTER_CLASS'])
    dupefilter = dupefilter_cls.from_settings(settings)
    dqclass = load_object(settings['SCHEDULER_DISK_QUEUE'])
    mqclass = load_object(settings['SCHEDULER_MEMORY_QUEUE'])
    logunser = settings.getbool('LOG_UNSERIALIZABLE_REQUESTS')
    return cls(dupefilter, job_dir(settings), dqclass, mqclass,
               logunser, crawler.stats, pqcls)
def application(config): app = Application("Scrapyd") http_port = config.getint('http_port', 6800) bind_address = config.get('bind_address', '127.0.0.1') poll_interval = config.getfloat('poll_interval', 5) poller = QueuePoller(config) eggstorage = FilesystemEggStorage(config) scheduler = SpiderScheduler(config) environment = Environment(config) app.setComponent(IPoller, poller) app.setComponent(IEggStorage, eggstorage) app.setComponent(ISpiderScheduler, scheduler) app.setComponent(IEnvironment, environment) laupath = config.get('launcher', 'scrapyd.launcher.Launcher') laucls = load_object(laupath) launcher = laucls(config, app) timer = TimerService(poll_interval, poller.poll) webpath = config.get('webroot', 'scrapyd.website.Root') webcls = load_object(webpath) username = config.get('username', '') password = config.get('password', '') if username and password: if ':' in username: sys.exit("The `username` option contains illegal character ':', " "check and update the configuration file of Scrapyd") portal = Portal(PublicHTMLRealm(webcls(config, app)), [StringCredentialsChecker(username, password)]) credential_factory = BasicCredentialFactory("Auth") resource = HTTPAuthSessionWrapper(portal, [credential_factory]) log.msg("Basic authentication enabled") else: resource = webcls(config, app) log.msg("Basic authentication disabled as either `username` or `password` is unset") webservice = TCPServer(http_port, server.Site(resource), interface=bind_address) log.msg(format="Scrapyd web console available at http://%(bind_address)s:%(http_port)s/", bind_address=bind_address, http_port=http_port) launcher.setServiceParent(app) timer.setServiceParent(app) webservice.setServiceParent(app) return app
def __init__(self, datadir, spider_cls=None, settings=None, **kwargs):
    self.spider_cls = load_object(spider_cls) if spider_cls else IblSpider
    self._specs = open_project_from_dir(datadir)
    settings = settings.copy()
    settings.frozen = False
    settings.set('PLUGINS', load_plugins(settings))
    self.settings = settings
def __init__(self, engine, settings):
    self.sites = {}
    self.spidermw = SpiderMiddlewareManager.from_settings(settings)
    itemproc_cls = load_object(settings['ITEM_PROCESSOR'])
    self.itemproc = itemproc_cls.from_settings(settings)
    self.concurrent_items = settings.getint('CONCURRENT_ITEMS')
    self.engine = engine
def __init__(self, settings):
    self._pool = HTTPConnectionPool(reactor, persistent=True)
    self._pool.maxPersistentPerHost = settings.getint('CONCURRENT_REQUESTS_PER_DOMAIN')
    self._pool._factory.noisy = False
    self._contextFactoryClass = load_object(settings['DOWNLOADER_CLIENTCONTEXTFACTORY'])
    self._contextFactory = self._contextFactoryClass()
    self._disconnect_timeout = 1
def run(self, args, opts):
    # load contracts
    contracts = build_component_list(
        self.settings.getwithbase('SPIDER_CONTRACTS'))
    conman = ContractsManager(load_object(c) for c in contracts)
    runner = TextTestRunner(verbosity=2 if opts.verbose else 1)
    result = TextTestResult(runner.stream, runner.descriptions, runner.verbosity)

    # contract requests
    contract_reqs = defaultdict(list)

    spider_loader = self.crawler_process.spider_loader

    with set_environ(SCRAPY_CHECK='true'):
        for spidername in args or spider_loader.list():
            spidercls = spider_loader.load(spidername)
            spidercls.start_requests = lambda s: conman.from_spider(s, result)

            tested_methods = conman.tested_methods_from_spidercls(spidercls)
            if opts.list:
                for method in tested_methods:
                    contract_reqs[spidercls.name].append(method)
            elif tested_methods:
                self.crawler_process.crawl(spidercls)

        # start checks
        if opts.list:
            for spider, methods in sorted(contract_reqs.items()):
                if not methods and not opts.verbose:
                    continue
                print(spider)
                for method in sorted(methods):
                    print(' * %s' % method)
        else:
            start = time.time()
            self.crawler_process.start()
            stop = time.time()

            result.printErrors()
            result.printSummary(start, stop)
            self.exitcode = int(not result.wasSuccessful())
def __init__(self, crawler: Crawler) -> None: super().__init__(settings=crawler.settings, crawler=crawler) verify_installed_reactor( "twisted.internet.asyncioreactor.AsyncioSelectorReactor") crawler.signals.connect(self._engine_started, signals.engine_started) self.stats = crawler.stats self.browser_type: str = crawler.settings.get( "PLAYWRIGHT_BROWSER_TYPE") or "chromium" self.launch_options: dict = crawler.settings.getdict( "PLAYWRIGHT_LAUNCH_OPTIONS") or {} self.default_navigation_timeout: Optional[float] = None if "PLAYWRIGHT_DEFAULT_NAVIGATION_TIMEOUT" in crawler.settings: with suppress(TypeError, ValueError): self.default_navigation_timeout = float( crawler.settings.get( "PLAYWRIGHT_DEFAULT_NAVIGATION_TIMEOUT")) if crawler.settings.get("PLAYWRIGHT_PROCESS_REQUEST_HEADERS"): self.process_request_headers = load_object( crawler.settings["PLAYWRIGHT_PROCESS_REQUEST_HEADERS"]) else: self.process_request_headers = use_scrapy_headers default_context_kwargs: dict = {} if "PLAYWRIGHT_CONTEXT_ARGS" in crawler.settings: default_context_kwargs = crawler.settings.getdict( "PLAYWRIGHT_CONTEXT_ARGS") warnings.warn( "The PLAYWRIGHT_CONTEXT_ARGS setting is deprecated, please use" " PLAYWRIGHT_CONTEXTS instead. Keyword arguments defined in" " PLAYWRIGHT_CONTEXT_ARGS will be used when creating the 'default' context", category=DeprecationWarning, stacklevel=2, ) self.context_kwargs: defaultdict = defaultdict(dict) for name, kwargs in (crawler.settings.getdict("PLAYWRIGHT_CONTEXTS") or {}).items(): if name == "default": self.context_kwargs[name] = default_context_kwargs self.context_kwargs[name].update(kwargs) if "default" not in self.context_kwargs and default_context_kwargs: self.context_kwargs["default"] = default_context_kwargs
def from_settings(cls, settings):
    # kwargs = {
    #     'persist': settings.getbool('SCHEDULER_PERSIST'),
    #     'flush_on_start': settings.getbool('SCHEDULER_FLUSH_ON_START'),
    #     'idle_before_close': settings.getint('SCHEDULER_IDLE_BEFORE_CLOSE'),
    # }
    # # If these values are missing, it means we want to use the defaults.
    # optional = {
    #     # TODO: Use custom prefixes for this settings to note that are
    #     # specific to scrapy-redis.
    #     'queue_key': 'SCHEDULER_QUEUE_KEY',
    #     'queue_cls': 'SCHEDULER_QUEUE_CLASS',
    #     'dupefilter_key': 'SCHEDULER_DUPEFILTER_KEY',
    #     # We use the default setting name to keep compatibility.
    #     'dupefilter_cls': 'DUPEFILTER_CLASS',
    #     'serializer': 'SCHEDULER_SERIALIZER',
    # }
    # for name, setting_name in optional.items():
    #     val = settings.get(setting_name)
    #     if val:
    #         kwargs[name] = val
    # # Support serializer as a path to a module.
    # if isinstance(kwargs.get('serializer'), six.string_types):
    #     kwargs['serializer'] = importlib.import_module(kwargs['serializer'])
    # server = connection.from_settings(settings)
    # # Ensure the connection is working.
    # server.ping()
    # return cls(server=server, **kwargs)
    persist = settings.get('SCHEDULER_PERSIST', SCHEDULER_PERSIST)
    queue_key = settings.get('SCHEDULER_QUEUE_KEY', QUEUE_KEY)
    queue_cls = load_object(settings.get('SCHEDULER_QUEUE_CLASS', QUEUE_CLASS))
    queue_name = settings.get('REDIS_QUEUE_NAME', None)
    dupefilter_key = settings.get('DUPEFILTER_KEY', DUPEFILTER_KEY)
    idle_before_close = settings.get('SCHEDULER_IDLE_BEFORE_CLOSE', IDLE_BEFORE_CLOSE)
    server = connection.from_settings(settings)
    server_filter = connection.from_settings_filter(settings)
    return cls(server, server_filter, persist, queue_key, queue_cls,
               dupefilter_key, idle_before_close, queue_name)
def get_redis_from_settings(settings):
    """Returns a redis client instance from given Scrapy settings object.

    This function uses ``get_client`` to instantiate the client and uses
    ``defaults.REDIS_PARAMS`` global as defaults values for the parameters.
    You can override them using the ``REDIS_PARAMS`` setting.

    Parameters
    ----------
    settings : Settings
        A scrapy settings object. See the supported settings below.

    Returns
    -------
    server
        Redis client instance.

    Other Parameters
    ----------------
    REDIS_URL : str, optional
        Server connection URL.
    REDIS_HOST : str, optional
        Server host.
    REDIS_PORT : str, optional
        Server port.
    REDIS_ENCODING : str, optional
        Data encoding.
    REDIS_PARAMS : dict, optional
        Additional client parameters.

    """
    params = defaults.REDIS_PARAMS.copy()
    params.update(settings.getdict("REDIS_PARAMS"))
    # XXX: Deprecate REDIS_* settings.
    for source, dest in SETTINGS_PARAMS_MAP.items():
        val = settings.get(source)
        if val:
            params[dest] = val

    # Allow ``redis_cls`` to be a path to a class.
    if isinstance(params.get("redis_cls"), six.string_types):
        params["redis_cls"] = load_object(params["redis_cls"])

    return get_redis(**params)
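A minimal usage sketch for get_redis_from_settings, assuming a scrapy-redis style setup; the connection URL and client parameters below are illustrative, not defaults of the library:

from scrapy.settings import Settings

settings = Settings({
    'REDIS_URL': 'redis://localhost:6379/0',   # hypothetical server
    'REDIS_PARAMS': {'socket_timeout': 30},
})
server = get_redis_from_settings(settings)
server.ping()  # verify the connection is usable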
def load_providers(self, default_providers: Optional[Mapping] = None):
    providers_dict = {
        **(default_providers or {}),
        **self.spider.settings.getdict("SCRAPY_POET_PROVIDERS"),
    }
    provider_classes = build_component_list(providers_dict)
    logger.info(f"Loading providers:\n {pprint.pformat(provider_classes)}")
    self.providers = [load_object(cls)(self.crawler) for cls in provider_classes]
    check_all_providers_are_callable(self.providers)
    # Caching whether each provider requires the scrapy response
    self.is_provider_requiring_scrapy_response = {
        provider: is_provider_requiring_scrapy_response(provider)
        for provider in self.providers
    }
    # Caching the function for faster execution
    self.is_class_provided_by_any_provider = \
        is_class_provided_by_any_provider_fn(self.providers)
def __init__(self, config, app):
    resource.Resource.__init__(self)
    self.debug = config.getboolean('debug', False)
    self.runner = config.get('runner')
    logsdir = config.get('logs_dir')
    itemsdir = config.get('items_dir')
    self.app = app
    self.putChild('', Home(self))
    if logsdir:
        self.putChild('logs', static.File(logsdir, 'text/plain'))
    if itemsdir:
        self.putChild('items', static.File(itemsdir, 'text/plain'))
    self.putChild('jobs', Jobs(self))
    self.putChild('static', static.File('static'))  # adding UI support
    services = config.items('services', ())
    for servName, servClsName in services:
        servCls = load_object(servClsName)
        self.putChild(servName, servCls(self))
    self.update_projects()
def __init__(self, crawler):
    self._handlers = {}
    self._notconfigured = {}
    handlers = crawler.settings.get('DOWNLOAD_HANDLERS_BASE')
    handlers.update(crawler.settings.get('DOWNLOAD_HANDLERS', {}))
    for scheme, clspath in handlers.iteritems():
        # Allow to disable a handler just like any other
        # component (extension, middleware, etc).
        if clspath is None:
            continue
        cls = load_object(clspath)
        try:
            dh = cls(crawler.settings)
        except NotConfigured as ex:
            self._notconfigured[scheme] = str(ex)
        else:
            self._handlers[scheme] = dh
    crawler.signals.connect(self._close, signals.engine_stopped)
def test(self):
    fixture_objects = data['result']
    request = request_from_dict(data['request'], spider)
    response = HtmlResponse(request=request, **data['response'])

    middlewares = []
    middleware_paths = data['middlewares']
    for mw_path in middleware_paths:
        try:
            mw_cls = load_object(mw_path)
            mw = create_instance(mw_cls, settings, crawler)
        except NotConfigured:
            continue
        # append once; the flattened original appended each middleware twice
        middlewares.append(mw)

    crawler.signals.send_catch_log(signal=signals.spider_opened, spider=spider)

    for mw in middlewares:
        if hasattr(mw, 'process_spider_input'):
            mw.process_spider_input(response, spider)

    result = request.callback(response) or []

    middlewares.reverse()
    for mw in middlewares:
        if hasattr(mw, 'process_spider_output'):
            result = mw.process_spider_output(response, result, spider)

    if isinstance(result, (Item, Request, dict)):
        result = [result]

    for index, _object in enumerate(result):
        fixture_data = fixture_objects[index]['data']
        if fixture_objects[index].get('type') == 'request':
            clean_request(fixture_data, settings)
        else:
            clean_item(fixture_data, settings)
        _object = parse_object(_object, spider)
        self.assertEqual(fixture_data, _object, 'Not equal!')
def __init__(self, config, app):
    resource.Resource.__init__(self)
    self.debug = config.getboolean('debug', False)
    self.runner = config.get('runner')
    logsdir = config.get('logs_dir')
    itemsdir = config.get('items_dir')
    local_items = itemsdir and (urlparse(itemsdir).scheme.lower() in ['', 'file'])
    self.app = app
    self.nodename = config.get('node_name', socket.gethostname())
    self.putChild(b'', Home(self, local_items))
    if logsdir:
        self.putChild(b'logs', static.File(logsdir.encode('ascii', 'ignore'), 'text/plain'))
    if local_items:
        self.putChild(b'items', static.File(itemsdir, 'text/plain'))
    self.putChild(b'jobs', Jobs(self, local_items))
    services = config.items('services', ())
    for servName, servClsName in services:
        servCls = load_object(servClsName)
        self.putChild(servName.encode('utf-8'), servCls(self))
    self.update_projects()
def from_settings(cls, settings):
    """
    Get the Redis connection pool.
    """
    pool = None
    pipeline = None
    try:
        if settings.get('REDIS_PIPELINE'):
            pipeline = load_object(settings['REDIS_PIPELINE'])
        else:
            raise NotConfigured
        if settings.getdict("REDIS_BACKEND_PARAMS", REDIS_PARAMS_DEFAULT):
            param = settings.getdict("REDIS_BACKEND_PARAMS")
            pool = redis.ConnectionPool(**param)
    except Exception as ex:
        print('%s \nException:%s' % (__file__, ex))
    return cls(pool, pipeline)
def from_settings(cls, settings):
    if os.environ.get('spider_set_persist'):
        persist = (os.environ.get('spider_set_persist') != 'False')
    else:
        persist = settings.get('SCHEDULER_PERSIST', SCHEDULER_PERSIST)
    if os.environ.get('need_dupefilter'):
        need_dupefilter = (os.environ.get('need_dupefilter') != 'False')
    else:
        need_dupefilter = True
    queue_key = settings.get('SCHEDULER_QUEUE_KEY', QUEUE_KEY)
    queue_cls = load_object(settings.get('SCHEDULER_QUEUE_CLASS', QUEUE_CLASS))
    dupefilter_key = settings.get('DUPEFILTER_KEY', DUPEFILTER_KEY)
    idle_before_close = settings.get('SCHEDULER_IDLE_BEFORE_CLOSE', IDLE_BEFORE_CLOSE)
    server = connection.from_settings(settings)
    return cls(server, persist, queue_key, queue_cls, dupefilter_key,
               idle_before_close, need_dupefilter)
def from_settings(cls, settings, crawler=None):
    # legacy Python 2 snippet (note the `except NotConfigured, e` syntax)
    mwlist = cls._get_mwlist_from_settings(settings)
    middlewares = []
    for clspath in mwlist:
        try:
            mwcls = load_object(clspath)
            if crawler and hasattr(mwcls, 'from_crawler'):
                mw = mwcls.from_crawler(crawler)
            elif hasattr(mwcls, 'from_settings'):
                mw = mwcls.from_settings(settings)
            else:
                mw = mwcls()
            middlewares.append(mw)
        except NotConfigured, e:
            if e.args:
                clsname = clspath.split('.')[-1]
                log.msg(format="Disabled %(clsname)s: %(eargs)s",
                        level=log.WARNING, clsname=clsname, eargs=e.args[0])
    # the flattened original was truncated here; the collected middlewares
    # need to be returned, as in the modern variant of this method
    return cls(*middlewares)
def __init__(self, settings, stats):
    # should not call parent's __init__ as that configures from "HTTPCACHE_ENABLED" variable
    # while this class should configure from "HTTPCACHE_PER_SPIDER_ENABLED"
    if not settings.get('HTTPCACHE_PER_SPIDER_ENABLED'):
        raise NotConfigured("Disabled")
    if not settings.get('HTTPCACHE_STORAGE'):
        raise NotConfigured("Missing param 'HTTPCACHE_STORAGE'. Storage not configured")
    self.stats = stats
    self.policy = load_object(settings['HTTPCACHE_POLICY'])(settings)
    self.storage_class = settings['HTTPCACHE_STORAGE']
    self.ignore_missing = settings.get('HTTPCACHE_IGNORE_MISSING')
    # dispatcher.connect(self.spider_opened, signal=signals.spider_opened)
    # dispatcher.connect(self.spider_closed, signal=signals.spider_closed)
    self.use_cache = set()
    self.expiration_time = {}
    self.settings = settings
def get_schema_from(source):
    if is_schema_url(source):
        schema = get_contents(source)
        try:
            return json.loads(schema)
        except Exception as e:
            logger.exception(
                str(e) + "\nCould not parse schema from '{}'".format(source))
    elif source.endswith(".json"):
        with open(source, "r") as f:
            try:
                return json.load(f)
            except Exception as e:
                logger.exception(
                    str(e) + "\nCould not parse schema in '{}'".format(source))
    else:
        schema = load_object(source)
        if isinstance(schema, string_types):
            return json.loads(schema)
        return schema
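A short usage sketch for get_schema_from; the URL, file path and dotted path below are placeholders for the three kinds of source the function accepts:

schema_a = get_schema_from('https://example.com/schema.json')  # fetched, then json.loads
schema_b = get_schema_from('schemas/item.json')                # local file, json.load
schema_c = get_schema_from('myproject.schemas.ITEM_SCHEMA')    # resolved via load_object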
def open_spider(self, spider: Spider):
    try:
        if self.settings.get(SEEDS_MONGODB_SEEDS_PREPARE):
            self.prepare = load_object(
                self.settings.get(SEEDS_MONGODB_SEEDS_PREPARE))
        else:
            self.prepare = lambda x: map(lambda y: (y, {'seed': x}),
                                         x['websites'])
    except Exception:
        raise NotConfigured
    self.cnx = MongoClient(self.uri)
    self.db = self.cnx.get_database(
        self.settings.get(SEEDS_MONGODB_DATABASE, 'seeds'))
    self.coll = self.db.get_collection(
        self.settings.get(SEEDS_MONGODB_COLLECTION, 'seeds'))
    logger.info('Spider opened: Open the connection to MongoDB: %s', self.uri)
def from_crawler(cls, crawler) -> "Scheduler": settings = crawler.settings rq_cls = load_object( settings.get( "SCHEDULER_REQUEST_QUEUE", "os_scrapy_rq_crawler.MemoryRequestQueue", )) rq = create_instance(rq_cls, settings, crawler) logger.debug(f"Using request queue: {class_fullname(rq_cls)}") concurrency = settings.getint("CONCURRENT_REQUESTS", 16) delay = settings.getint("DOWNLOAD_DELAY") max_slots = settings.getint("SCHEDULER_MAX_SLOTS", concurrency * (delay if delay > 0 else 3)) assert max_slots > 1, f"SCHEDULER_MAX_SLOTS({max_slots}) must > 1" standby_slots = settings.getint("SCHEDULER_STANDBY_SLOTS", int(concurrency / 4)) logger.debug( f"max_slots:{max_slots} standby_slots:{standby_slots} concurrency:{concurrency}" ) return cls(crawler, rq, max_slots, standby_slots, crawler.stats)
def __init__(self, crawler, spider_closed_callback: Callable) -> None:
    self.crawler = crawler
    self.settings = crawler.settings
    self.signals = crawler.signals
    self.logformatter = crawler.logformatter
    self.slot: Optional[Slot] = None
    self.spider: Optional[Spider] = None
    self.running = False
    # whether the engine is paused
    self.paused = False
    self.scheduler_cls = self._get_scheduler_class(crawler.settings)
    # load the downloader
    downloader_cls = load_object(self.settings['DOWNLOADER'])
    self.downloader = downloader_cls(crawler)
    # instantiate the scraper, which drives the actual scraping work
    self.scraper = Scraper(crawler)
    # close callback passed in by the caller
    self._spider_closed_callback = spider_closed_callback
def _get_next_page_request(self, response, rules):
    extractors = rules["extractors"]
    for rule in extractors:
        xpath = rule["xpath"]
        regex = rule['regex']
        # get url by xpath and re configs
        url = (response.xpath(xpath).re_first(regex) if regex
               else response.xpath(xpath).extract_first())
        # multiple rules may be configured to find the next page url
        if not url or not isinstance(url, str) or not url.strip():
            continue
        request_factory_cls = load_object(rule["request_factory_class"])
        req_factory_obj = request_factory_cls()
        # generate next page request and return
        return req_factory_obj.make_request(
            spider=self,
            depth=rule["depth"],
            link_or_url=response.urljoin(url).strip(),
            meta=response.meta)
    # if no next page request, return None
    return None
def __init__(self, redis_server, key, queue_cls, queue_serializer=None,
             dont_serial=False):
    """scheduler

    :param redis_server: `redis.Redis` object
    :param key: str, queue key
    :param queue_cls: str, full path of queue class
    :param queue_serializer: None or str; if None, `pickle` is used
    :param dont_serial: bool; if True, queue_serializer will not be used
    """
    def import_object(obj):
        try:
            obj = __import__(obj)
        except ImportError:
            obj = load_object(obj)
        return obj

    queue_cls = load_object(queue_cls) if isinstance(queue_cls, str) else queue_cls
    queue_serializer = (import_object(queue_serializer)
                        if isinstance(queue_serializer, str) else queue_serializer)
    self.redis_server = redis_server
    self.queue_cls = queue_cls
    self.queue = queue_cls(redis_server, key, queue_serializer, dont_serial)
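A construction sketch for the scheduler above; the class name Scheduler, the Redis connection details and the queue path are assumptions for illustration:

import redis

server = redis.Redis(host='localhost', port=6379)  # placeholder connection
scheduler = Scheduler(                             # assumed class name
    redis_server=server,
    key='myspider:requests',
    queue_cls='myproject.queues.PriorityQueue',    # hypothetical dotted path
    queue_serializer='pickle',                     # resolved by import_object
)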
def __init__(self, crawler, *args, **kwargs):
    super().__init__(**kwargs)
    splist = self._build_component_list(self.spiders)
    spcache = {}
    spiders = []
    for clspath in splist:
        if self.initialize_once and clspath in spcache:
            spiders.append(spcache[clspath])
            continue
        spcls = load_object(clspath)
        subsp = create_instance(spcls, crawler.settings, crawler,
                                *args, **kwargs)
        spcache[clspath] = subsp
        spiders.append(subsp)
    if not spiders:
        self.logger.error('Spider Composer cannot be initialized with no '
                          'active spiders')
    self._spiders = spiders
def __init__(self, settings):
    self.settings = settings
    self.urifmt = settings['FEED_URI']
    if not self.urifmt:
        raise NotConfigured
    self.format = settings['FEED_FORMAT'].lower()
    self.export_encoding = settings['FEED_EXPORT_ENCODING']
    self.storages = self._load_components('FEED_STORAGES')
    self.exporters = self._load_components('FEED_EXPORTERS')
    if not self._storage_supported(self.urifmt):
        raise NotConfigured
    if not self._exporter_supported(self.format):
        raise NotConfigured
    self.store_empty = settings.getbool('FEED_STORE_EMPTY')
    self._exporting = False
    self.export_fields = settings.getlist('FEED_EXPORT_FIELDS') or None
    self.indent = None
    if settings.get('FEED_EXPORT_INDENT') is not None:
        self.indent = settings.getint('FEED_EXPORT_INDENT')
    uripar = settings['FEED_URI_PARAMS']
    self._uripar = load_object(uripar) if uripar else lambda x, y: None
def _get_agent(self, request, timeout):
    proxy = request.meta['proxy']
    if proxy:
        proxy_scheme, _, proxy_host, proxy_port, _ = _parse(proxy)
        proxy_scheme = str(proxy_scheme, 'utf-8')
        if proxy_scheme == 'socks5':
            endpoint = TCP4ClientEndpoint(reactor, proxy_host, proxy_port)
            # DOWNLOADER_CLIENT_TLS_METHOD, DOWNLOADER_CLIENTCONTEXTFACTORY
            # and `settings` are assumed to be module-level names here
            self._sslMethod = openssl_methods[DOWNLOADER_CLIENT_TLS_METHOD]
            self._contextFactoryClass = load_object(DOWNLOADER_CLIENTCONTEXTFACTORY)
            self._contextFactory = create_instance(
                objcls=self._contextFactoryClass,
                settings=settings,
                crawler=None,
                method=self._sslMethod,
            )
            return self._Agent(reactor,
                               proxyEndpoint=endpoint,
                               contextFactory=self._contextFactory)
    return super(TorScrapyAgent, self)._get_agent(request, timeout)
def _get_spider_loader(settings):
    """ Get SpiderLoader instance from settings """
    if settings.get('SPIDER_MANAGER_CLASS'):
        warnings.warn(
            'SPIDER_MANAGER_CLASS option is deprecated. '
            'Please use SPIDER_LOADER_CLASS.',
            category=ScrapyDeprecationWarning, stacklevel=2
        )
    cls_path = settings.get('SPIDER_MANAGER_CLASS',
                            settings.get('SPIDER_LOADER_CLASS'))
    loader_cls = load_object(cls_path)
    try:
        verifyClass(ISpiderLoader, loader_cls)
    except DoesNotImplement:
        warnings.warn(
            'SPIDER_LOADER_CLASS (previously named SPIDER_MANAGER_CLASS) does '
            'not fully implement scrapy.interfaces.ISpiderLoader interface. '
            'Please add all missing methods to avoid unexpected runtime errors.',
            category=ScrapyDeprecationWarning, stacklevel=2
        )
    return loader_cls.from_settings(settings.frozencopy())
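A sketch of pointing this helper at a custom loader; the class path is hypothetical, and the class would need to implement scrapy.interfaces.ISpiderLoader to avoid the deprecation warning above:

from scrapy.settings import Settings

settings = Settings({
    'SPIDER_LOADER_CLASS': 'myproject.loaders.CustomSpiderLoader',  # hypothetical
})
spider_loader = _get_spider_loader(settings)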
def __init__(self, crawler: Crawler, auth_encoding: str, mw):
    super().__init__(crawler, auth_encoding, mw)
    self.mongodb_settings: Dict = self._get_mongodb_settings()
    self.not_mongoclient_parameters: Dict = self.mongodb_settings.get(
        'not_mongoclient_parameters')
    self.uri: str = None
    self.conn: MongoClient = None
    self.db: DatabaseSync = None
    self.coll: CollectionSync = None
    self._proxy_retriever: methodcaller = methodcaller(
        self.mongodb_settings['proxy_retriever'].pop('name'),
        **self.mongodb_settings['proxy_retriever'])
    self._get_proxy_from_doc: Callable = partial(
        load_object(self.mongodb_settings['get_proxy_from_doc']),
        auth_encoding=self.auth_encoding)
    self.proxies_invalidated: Set[Tuple[str, bytes, str]] = set()
def from_settings(cls, settings):
    server = redis.Redis(host=settings.get('REDIS_HOST'),
                         port=settings.get('REDIS_PORT'))
    persist = settings.get('SCHEDULER_PERSIST', True)
    up_int = settings.get('SCHEDULER_QUEUE_REFRESH', 10)
    hits = settings.get('QUEUE_HITS', 10)
    window = settings.get('QUEUE_WINDOW', 60)
    mod = settings.get('QUEUE_MODERATED', False)
    timeout = settings.get('DUPEFILTER_TIMEOUT', 600)
    ip_refresh = settings.get('SCHEDULER_IP_REFRESH', 60)
    add_type = settings.get('SCHEDULER_TYPE_ENABLED', False)
    add_ip = settings.get('SCHEDULER_IP_ENABLED', False)
    retries = settings.get('SCHEUDLER_ITEM_RETRIES', 3)
    ip_regex = settings.get('IP_ADDR_REGEX', '.*')
    throt = settings.get('QUEUE_THROTTLED', False)

    dupefilter_cls = load_object(settings['DUPEFILTER_CLASS'])
    dupefilter = dupefilter_cls.from_settings(settings)

    my_level = settings.get('SC_LOG_LEVEL', 'INFO')
    my_name = settings.get('SC_LOGGER_NAME', 'sc-logger')
    my_output = settings.get('SC_LOG_STDOUT', True)
    my_json = settings.get('SC_LOG_JSON', False)
    my_dir = settings.get('SC_LOG_DIR', 'logs')
    my_bytes = settings.get('SC_LOG_MAX_BYTES', '10MB')
    my_file = settings.get('SC_LOG_FILE', 'main.log')
    my_backups = settings.get('SC_LOG_BACKUPS', 5)

    logger = LogFactory.get_instance(json=my_json, name=my_name,
                                     stdout=my_output, level=my_level,
                                     dir=my_dir, file=my_file,
                                     bytes=my_bytes, backups=my_backups)

    return cls(dupefilter, server, persist, up_int, timeout, retries,
               logger, hits, window, mod, ip_refresh, add_type, add_ip,
               ip_regex, throt)
def _load_handler(self, scheme, skip_lazy=False):
    path = self._schemes[scheme]
    try:
        dhcls = load_object(path)
        if skip_lazy and getattr(dhcls, 'lazy', True):
            return None
        dh = dhcls(self._crawler.settings)
    except NotConfigured as ex:
        self._notconfigured[scheme] = str(ex)
        return None
    except Exception as ex:
        logger.error('Loading "%(clspath)s" for scheme "%(scheme)s"',
                     {"clspath": path, "scheme": scheme},
                     exc_info=True, extra={'crawler': self._crawler})
        self._notconfigured[scheme] = str(ex)
        return None
    else:
        self._handlers[scheme] = dh
        return dh
def _load_policy_class(policy, warning_only=False):
    """
    Expect a string for the path to the policy class,
    otherwise try to interpret the string as a standard value
    from https://www.w3.org/TR/referrer-policy/#referrer-policies
    """
    try:
        # try to load the policy as a dotted object path
        return load_object(policy)
    except ValueError:
        try:
            return _policy_classes[policy.lower()]
        except KeyError:
            msg = "Could not load referrer policy %r" % policy
            if not warning_only:
                # unknown policy: raise
                raise RuntimeError(msg)
            else:
                # warn only
                warnings.warn(msg, RuntimeWarning)
                return None
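Two illustrative calls for the helper above: a standard W3C value falls through to the _policy_classes lookup (load_object raises ValueError on a dotless string), while a dotted path is imported directly; the custom path is hypothetical:

cls_a = _load_policy_class('same-origin')
cls_b = _load_policy_class('myproject.policies.CustomReferrerPolicy')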
def __init__(self, settings):
    self._pool = HTTPConnectionPool(reactor, persistent=True)
    self._pool.maxPersistentPerHost = settings.getint('CONCURRENT_REQUESTS_PER_DOMAIN')
    self._pool._factory.noisy = False

    self._sslMethod = openssl_methods[settings.get('DOWNLOADER_CLIENT_TLS_METHOD')]
    self._contextFactoryClass = load_object(settings['DOWNLOADER_CLIENTCONTEXTFACTORY'])
    # try method-aware context factory
    try:
        self._contextFactory = self._contextFactoryClass(method=self._sslMethod)
    except TypeError:
        # use context factory defaults
        self._contextFactory = self._contextFactoryClass()
        msg = """
 '%s' does not accept `method` argument (type OpenSSL.SSL method,\
 e.g. OpenSSL.SSL.SSLv23_METHOD).\
 Please upgrade your context factory class to handle it or ignore it.""" % (
            settings['DOWNLOADER_CLIENTCONTEXTFACTORY'],)
        warnings.warn(msg)

    self._default_maxsize = settings.getint('DOWNLOAD_MAXSIZE')
    self._default_warnsize = settings.getint('DOWNLOAD_WARNSIZE')
    self._disconnect_timeout = 1
def from_crawler(cls, crawler):
    # first check if the extension should be enabled and raise
    # NotConfigured otherwise
    if not crawler.settings.getbool('MYEXT_ENABLED'):
        raise NotConfigured

    # get the number of items from settings
    request_count = crawler.settings.getint('MYEXT_ITEMCOUNT', 1000)

    # instantiate the extension object
    scheduler_cls = load_object(crawler.settings['SCHEDULER'])
    scheduler = scheduler_cls.from_crawler(crawler)
    ext = cls(request_count, scheduler)

    # connect the extension object to signals
    crawler.signals.connect(ext.spider_opened, signal=signals.spider_opened)
    crawler.signals.connect(ext.spider_closed, signal=signals.spider_closed)
    crawler.signals.connect(ext.request_scheduled, signal=signals.request_scheduled)

    # return the extension object
    return ext
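A settings sketch that would wire up the extension above; the extension path is hypothetical, and MYEXT_ENABLED / MYEXT_ITEMCOUNT are the custom settings it reads:

# in the project's settings.py
EXTENSIONS = {
    'myproject.extensions.MyExt': 500,  # hypothetical path and order value
}
MYEXT_ENABLED = True
MYEXT_ITEMCOUNT = 2000  # overrides the default of 1000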
def from_settings(cls, settings, crawler=None):
    mwlist = cls._get_mwlist_from_settings(settings)
    middlewares = []
    enabled = []
    for clspath in mwlist:
        try:
            mwcls = load_object(clspath)
            mw = create_instance(mwcls, settings, crawler)
            middlewares.append(mw)
            enabled.append(clspath)
        except NotConfigured as e:
            if e.args:
                clsname = clspath.split('.')[-1]
                logger.warning("Disabled %(clsname)s: %(eargs)s",
                               {'clsname': clsname, 'eargs': e.args[0]},
                               extra={'crawler': crawler})

    logger.info("Enabled %(componentname)ss:\n%(enabledlist)s",
                {'componentname': cls.component_name,
                 'enabledlist': pprint.pformat(enabled)},
                extra={'crawler': crawler})
    return cls(*middlewares)