Example #1
 def from_settings(cls, settings):
     dupefilter_cls = load_object(settings['DUPEFILTER_CLASS'])
     dupefilter = dupefilter_cls.from_settings(settings)
     dqclass = load_object(settings['SCHEDULER_DISK_QUEUE'])
     mqclass = load_object(settings['SCHEDULER_MEMORY_QUEUE'])
     logunser = settings.getbool('LOG_UNSERIALIZABLE_REQUESTS')
     return cls(dupefilter, job_dir(settings), dqclass, mqclass, logunser)
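All of these examples revolve around Scrapy's `load_object`, which resolves a dotted path string to the object it names. A minimal, self-contained sketch of that mechanism (assuming a stock Scrapy install):

from scrapy.utils.misc import load_object

# Resolve a dotted path string to the class it names.
dupefilter_cls = load_object("scrapy.dupefilters.RFPDupeFilter")

# The result is the class object itself, ready to be instantiated,
# e.g. via from_settings() as in Example #1 above.
print(dupefilter_cls.__name__)  # -> "RFPDupeFilter"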
Example #2
def application(config):
    app = Application("Scrapyd")
    http_port = config.getint('http_port', 6800)
    bind_address = config.get('bind_address', '127.0.0.1')
    poll_interval = config.getfloat('poll_interval', 5)

    poller = QueuePoller(config)
    eggstorage = FilesystemEggStorage(config)
    scheduler = SpiderScheduler(config)
    environment = Environment(config)

    app.setComponent(IPoller, poller)
    app.setComponent(IEggStorage, eggstorage)
    app.setComponent(ISpiderScheduler, scheduler)
    app.setComponent(IEnvironment, environment)

    laupath = config.get('launcher', 'scrapyd.launcher.Launcher')
    laucls = load_object(laupath)
    launcher = laucls(config, app)

    webpath = config.get('webroot', 'scrapyd.website.Root')
    webcls = load_object(webpath)

    timer = TimerService(poll_interval, poller.poll)
    webservice = TCPServer(http_port, server.Site(webcls(config, app)), interface=bind_address)
    log.msg(format="Scrapyd web console available at http://%(bind_address)s:%(http_port)s/",
            bind_address=bind_address, http_port=http_port)

    launcher.setServiceParent(app)
    timer.setServiceParent(app)
    webservice.setServiceParent(app)

    return app
Example #3
    def from_crawler(cls, crawler, **spider_kwargs):
        settings = crawler.settings
        kwargs = {
            'filter_storage_path': settings.get('FILTER_STORAGE_PATH', ''),
            'item_storage_path': settings.get('ITEM_STORAGE_PATH', ''),
        }
        kwargs.update(spider_kwargs)
        spider_kwargs = kwargs
        spider = super(EndpointSpider, cls).from_crawler(crawler, **spider_kwargs)
        spider.stats = crawler.stats
        
        jobdir = job_dir(settings)
        generated = False
        if jobdir:
            queuecls = load_object(settings['SCHEDULER_DISK_QUEUE'])
            queuedir = os.path.join(jobdir, 'startrequests.queue')
            if os.path.exists(queuedir):
                generated = True
            spider.requestqueue = queuecls(os.path.join(queuedir, '0'))
        else:
            queuecls = load_object(settings['SCHEDULER_MEMORY_QUEUE'])
            spider.requestqueue = queuecls()
        if not generated:
            for x in spider.generate_start_requests():
                spider.enqueue_start_request(x)

        crawler.signals.connect(spider.spider_closed, signal=signals.spider_closed)
        return spider
Example #4
    def open(self, spider):
        self.spider = spider

        try:
            self.queue = load_object(self.queue_cls)(
                server=self.server,
                spider=spider,
                key=self.queue_key % {'spider': spider.name},
                serializer=self.serializer,
            )
        except TypeError as e:
            raise ValueError("Failed to instantiate queue class '%s': %s",
                             self.queue_cls, e)

        try:
            self.df = load_object(self.dupefilter_cls)(
                server=self.server,
                key=self.dupefilter_key % {'spider': spider.name},
                debug=spider.settings.getbool('DUPEFILTER_DEBUG'),
            )
        except TypeError as e:
            raise ValueError("Failed to instantiate dupefilter class '%s': %s",
                             self.dupefilter_cls, e)

        if self.flush_on_start:
            self.flush()
        # notice if there are requests already in the queue to resume the crawl
        if len(self.queue):
            spider.log("Resuming crawl (%d requests scheduled)" % len(self.queue))
Example #5
    def __init__(self, spidercls, settings=None):
        if isinstance(settings, dict) or settings is None:
            settings = Settings(settings)

        self.spidercls = spidercls
        self.settings = settings.copy()
        self.spidercls.update_settings(self.settings)

        self.signals = SignalManager(self)
        self.stats = load_object(self.settings['STATS_CLASS'])(self)

        handler = LogCounterHandler(self, level=settings.get('LOG_LEVEL'))
        logging.root.addHandler(handler)
        # lambda is assigned to Crawler attribute because this way it is not
        # garbage collected after leaving __init__ scope
        self.__remove_handler = lambda: logging.root.removeHandler(handler)
        self.signals.connect(self.__remove_handler, signals.engine_stopped)

        lf_cls = load_object(self.settings['LOG_FORMATTER'])
        self.logformatter = lf_cls.from_crawler(self)
        self.extensions = ExtensionManager.from_crawler(self)

        self.settings.freeze()
        self.crawling = False
        self.spider = None
        self.engine = None
Example #6
    def run(self, args, opts):
        # load contracts
        contracts = build_component_list(self.settings["SPIDER_CONTRACTS_BASE"], self.settings["SPIDER_CONTRACTS"])
        self.conman = ContractsManager([load_object(c) for c in contracts])
        self.results = TextTestRunner(verbosity=opts.verbose)._makeResult()

        # contract requests
        contract_reqs = defaultdict(list)

        spman_cls = load_object(self.settings["SPIDER_MANAGER_CLASS"])
        spiders = spman_cls.from_settings(self.settings)

        for spider in args or spiders.list():
            spider = spiders.create(spider)
            requests = self.get_requests(spider)

            if opts.list:
                for req in requests:
                    contract_reqs[spider.name].append(req.callback.__name__)
            elif requests:
                crawler = self.crawler_process.create_crawler(spider.name)
                crawler.crawl(spider, requests)

        # start checks
        if opts.list:
            for spider, methods in sorted(contract_reqs.items()):
                print(spider)
                for method in sorted(methods):
                    print("  * %s" % method)
        else:
            self.crawler_process.start()
            self.results.printErrors()
Example #7
 def from_settings(cls, settings):
   persist = settings.get('SCHEDULER_PERSIST', SCHEDULER_PERSIST)
   input_queue_key = settings.get(
     'INPUT_QUEUE_KEY', INPUT_QUEUE_KEY)
   input_queue_cls = load_object(settings.get(
     'INPUT_QUEUE_CLASS', INPUT_QUEUE_CLASS))
   input_queue_shard_dist = settings.get(
     'INPUT_QUEUE_SHARD_DIST', INPUT_QUEUE_SHARD_DIST)
   output_queue_key = settings.get(
     'OUTPUT_QUEUE_KEY', OUTPUT_QUEUE_KEY)
   output_queue_cls = load_object(settings.get(
     'OUTPUT_QUEUE_CLASS', OUTPUT_QUEUE_CLASS))
   output_queue_shard_dist = settings.get(
     'OUTPUT_QUEUE_SHARD_DIST', OUTPUT_QUEUE_SHARD_DIST)
   priority_queue_key = settings.get(
     'PRIORITY_QUEUE_KEY', PRIORITY_QUEUE_KEY)
   priority_queue_cls = load_object(settings.get(
     'PRIORITY_QUEUE_CLASS', PRIORITY_QUEUE_CLASS))
   priority_queue_shard_dist = settings.get(
     'PRIORITY_QUEUE_SHARD_DIST', PRIORITY_QUEUE_SHARD_DIST)
   dupefilter_key = settings.get('DUPEFILTER_KEY', DUPEFILTER_KEY)
   idle_before_close = settings.get('SCHEDULER_IDLE_BEFORE_CLOSE', IDLE_BEFORE_CLOSE)
   servers = connection.from_settings(settings)
   dupefilter_ins = load_object(
     settings['DUPEFILTER_CLASS']).from_settings(settings)
   recrawl_key = settings.get('RECRAWL_LIST_KEY', RECRAWL_KEY)
   return cls(servers, persist, input_queue_key, input_queue_cls,
              input_queue_shard_dist, output_queue_key, output_queue_cls,
              output_queue_shard_dist, priority_queue_key,
              priority_queue_cls, priority_queue_shard_dist, recrawl_key,
              dupefilter_key, dupefilter_ins, idle_before_close)
Example #8
    def __init__(self, spidercls, settings):
        if isinstance(settings, dict):
            settings = Settings(settings)

        self.spidercls = spidercls
        self.settings = settings.copy()

        self.signals = SignalManager(self)
        self.stats = load_object(self.settings['STATS_CLASS'])(self)

        handler = LogCounterHandler(self, level=settings.get('LOG_LEVEL'))
        logging.root.addHandler(handler)
        self.signals.connect(lambda: logging.root.removeHandler(handler),
                             signals.engine_stopped)

        lf_cls = load_object(self.settings['LOG_FORMATTER'])
        self.logformatter = lf_cls.from_crawler(self)
        self.extensions = ExtensionManager.from_crawler(self)

        self.spidercls.update_settings(self.settings)
        self.settings.freeze()

        self.crawling = False
        self.spider = None
        self.engine = None
Example #9
def application(config, components=interfaces):
    app = Application("Scrapyd")
    http_port = config.getint('http_port', 6800)
    bind_address = config.get('bind_address', '0.0.0.0')

    for interface, key in components:
        path = config.get(key)
        cls = load_object(path)
        component = cls(config)
        app.setComponent(interface, component)
    poller = component
        
    laupath = config.get('launcher', 'scrapyd.launcher.Launcher')
    laucls = load_object(laupath) 
    launcher = laucls(config, app)

    poll_every = config.getint("poll_every", 5)
    timer = TimerService(poll_every, poller.poll)
    
    webservice = TCPServer(http_port, server.Site(Root(config, app)), interface=bind_address)
    log.msg(format="Scrapyd web console available at http://%(bind_address)s:%(http_port)s/",
            bind_address=bind_address, http_port=http_port)

    launcher.setServiceParent(app)
    timer.setServiceParent(app)
    webservice.setServiceParent(app)

    return app
Example #10
 def from_crawler(cls, crawler):
     settings = crawler.settings
     dupefilter_cls = load_object(settings['DUPEFILTER_CLASS'])
     dupefilter = dupefilter_cls.from_settings(settings)
     rqclass = load_object(settings['SCHEDULER_RABBIT_QUEUE'])
     logunser = settings.getbool('LOG_UNSERIALIZABLE_REQUESTS')
     return cls(dupefilter, job_dir(settings), rqclass, logunser, crawler.stats)
Example #11
 def from_crawler(cls, crawler):
     settings = crawler.settings
     dupefilter_cls = load_object(settings['DUPEFILTER_CLASS'])
     dupefilter = dupefilter_cls.from_settings(settings)
     domainclass = load_object(settings['SCHEDULER_DOMAIN_CLASS'])
     flowclass = load_object(settings['SCHEDULER_FLOW_CLASS'])
     return cls(dupefilter, domainclass, flowclass, crawler.stats, settings)
Example #12
 def __init__(self, settings, stats):
     if not settings.getbool('HTTPCACHE_ENABLED'):
         raise NotConfigured
     self.policy = load_object(settings['HTTPCACHE_POLICY'])(settings)
     self.storage = load_object(settings['HTTPCACHE_STORAGE'])(settings)
     self.ignore_missing = settings.getbool('HTTPCACHE_IGNORE_MISSING')
     self.stats = stats
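Example #12 only reads the standard Scrapy HTTP cache settings. A hedged sketch of the settings-module values it expects (the names are stock Scrapy settings; the values are illustrative):

# settings.py (illustrative values)
HTTPCACHE_ENABLED = True          # the middleware raises NotConfigured when False
HTTPCACHE_POLICY = "scrapy.extensions.httpcache.RFC2616Policy"
HTTPCACHE_STORAGE = "scrapy.extensions.httpcache.FilesystemCacheStorage"
HTTPCACHE_IGNORE_MISSING = False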
Example #13
 def from_settings(cls, settings):
     persist = settings.get('SCHEDULER_PERSIST', SCHEDULER_PERSIST)
     queue_key = settings.get('SCHEDULER_QUEUE_KEY', QUEUE_KEY)
     queue_cls = load_object(settings.get('SCHEDULER_QUEUE_CLASS', QUEUE_CLASS))
     dupefilter_key = settings.get('DUPEFILTER_KEY', DUPEFILTER_KEY)
     dupefilter_cls = load_object(settings.get('DUPEFILTER_CLASS', DUPEFILTER_CLASS))
     idle_before_close = settings.get('SCHEDULER_IDLE_BEFORE_CLOSE', IDLE_BEFORE_CLOSE)
     server = connection.from_settings(settings)
     return cls(server, persist, queue_key, queue_cls, dupefilter_key, dupefilter_cls, idle_before_close)
Example #14
    def __init__(self, settings):
        self.configured = False
        self.settings = settings
        self.signals = SignalManager(self)
        self.stats = load_object(settings['STATS_CLASS'])(self)

        spman_cls = load_object(self.settings['SPIDER_MANAGER_CLASS'])
        self.spiders = spman_cls.from_crawler(self)
        self._scheduled = {}
Example #15
 def from_crawler(cls, crawler):
     settings = crawler.settings
     run_as_daemon = settings.get('DAEMON')
     dupefilter_cls = load_object(settings['DUPEFILTER_CLASS'])
     dupefilter = dupefilter_cls.from_settings(settings)
     dqclass = load_object(settings['SCHEDULER_DISK_QUEUE'])
     mqclass = load_object(settings['SCHEDULER_MEMORY_QUEUE'])
     logunser = settings.getbool('LOG_UNSERIALIZABLE_REQUESTS')
     return cls(crawler, dupefilter, job_dir(settings), dqclass, mqclass, logunser, crawler.stats, run_as_daemon)
Example #16
 def configure(self):
     """
     Configure execution engine with the given scheduling policy and downloader.
     """
     self.scheduler = load_object(settings['SCHEDULER'])()
     self.spider_scheduler = load_object(settings['SPIDER_SCHEDULER'])()
     self.downloader = Downloader()
     self.scraper = Scraper(self)
     self.configured = True
Example #17
 def from_crawler(cls, crawler):
     settings = crawler.settings
     dupefilter_cls = load_object(settings['DUPEFILTER_CLASS'])
     dupefilter = create_instance(dupefilter_cls, settings, crawler)
     pqclass = load_object(settings['SCHEDULER_PRIORITY_QUEUE'])
     dqclass = load_object(settings['SCHEDULER_DISK_QUEUE'])
     mqclass = load_object(settings['SCHEDULER_MEMORY_QUEUE'])
     logunser = settings.getbool('LOG_UNSERIALIZABLE_REQUESTS', settings.getbool('SCHEDULER_DEBUG'))
     return cls(dupefilter, jobdir=job_dir(settings), logunser=logunser,
                stats=crawler.stats, pqclass=pqclass, dqclass=dqclass, mqclass=mqclass)
Example #18
 def __init__(self, settings):
     self.configured = False
     self.settings = settings
     self.signals = SignalManager(self)
     self.stats = load_object(settings['STATS_CLASS'])(self)
     self._start_requests = lambda: ()
     self._spider = None
     # TODO: move SpiderManager to CrawlerProcess
     spman_cls = load_object(self.settings['SPIDER_MANAGER_CLASS'])
     self.spiders = spman_cls.from_crawler(self)
Example #19
 def configure(self):
     if self.configured:
         return
     self.configured = True
     lf_cls = load_object(self.settings['LOG_FORMATTER'])
     self.logformatter = lf_cls.from_crawler(self)
     self.extensions = ExtensionManager.from_crawler(self)
     spman_cls = load_object(self.settings['SPIDER_MANAGER_CLASS'])
     self.spiders = spman_cls.from_crawler(self)
     self.engine = ExecutionEngine(self, self._spider_closed)
Example #20
    def __init__(self, spidercls, settings):
        self.spidercls = spidercls
        self.settings = settings
        self.signals = SignalManager(self)
        self.stats = load_object(self.settings['STATS_CLASS'])(self)
        lf_cls = load_object(self.settings['LOG_FORMATTER'])
        self.logformatter = lf_cls.from_crawler(self)
        self.extensions = ExtensionManager.from_crawler(self)

        self.crawling = False
        self.spider = None
        self.engine = None
Example #21
 def configure(self):
     if self.configured:
         return
     self.configured = True
     d = dict(overridden_settings(self.settings))
     log.msg(format="Overridden settings: %(settings)r", settings=d, level=log.DEBUG)
     lf_cls = load_object(self.settings['LOG_FORMATTER'])
     self.logformatter = lf_cls.from_crawler(self)
     self.extensions = ExtensionManager.from_crawler(self)
     spman_cls = load_object(self.settings['SPIDER_MANAGER_CLASS'])
     self.spiders = spman_cls.from_crawler(self)
     self.engine = ExecutionEngine(self, self._spider_closed)
Example #22
    def from_settings(cls, global_settings, global_stats):
        settings = global_settings
        dupefilter_cls = load_object(settings['DUPEFILTER_CLASS'])
        dupefilter = dupefilter_cls.from_settings(settings)
        dqclass = load_object(settings['SCHEDULER_DISK_QUEUE'])
        mqclass = load_object(settings['SCHEDULER_MEMORY_QUEUE'])
        logunser = settings.getbool('LOG_UNSERIALIZABLE_REQUESTS')
        total_concurrency = settings.getint('CONCURRENT_REQUESTS')
        domain_concurrency = settings.getint('CONCURRENT_REQUESTS_PER_DOMAIN')
        ip_concurrency = settings.getint('CONCURRENT_REQUESTS_PER_IP')

        return cls(dupefilter, job_dir(settings), dqclass, mqclass, logunser, global_stats, total_concurrency, domain_concurrency, ip_concurrency)
Example #23
 def configure(self):
     if self.configured:
         return
     self.configured = True
     self.extensions = ExtensionManager.from_settings(self.settings)
     spman_cls = load_object(self.settings['SPIDER_MANAGER_CLASS'])
     self.spiders = spman_cls.from_settings(self.settings)
     spq_cls = load_object(self.settings['SPIDER_QUEUE_CLASS'])
     spq = spq_cls.from_settings(self.settings)
     keepalive = self.settings.getbool('KEEP_ALIVE')
     pollint = self.settings.getfloat('QUEUE_POLL_INTERVAL')
     self.queue = ExecutionQueue(self.spiders, spq, poll_interval=pollint,
         keep_alive=keepalive)
     self.engine = ExecutionEngine(self.settings, self._spider_closed)
Example #24
 def __init__(self, settings):
     super(Scheduler, self).__init__(settings)
     dupefilter_cls = load_object(self.dupfilter_class.to_value())
     dupefilter = dupefilter_cls(self.metas)
     dqclass = load_object(self.schedule_disk_queue.to_value())
     mqclass = load_object(self.schedule_memory_queue.to_value())
     logunser = self.log_unserailizable_requests.to_value()
     
     self.df = dupefilter
     self.jobpath = self.__job_dir(self.jobdir.to_value()) 
     self.dqdir = self._dqdir(self.jobpath)
     self.dqclass = dqclass
     self.mqclass = mqclass
     self.logunser = logunser
Example #25
File: engine.py Project: 01-/scrapy
 def __init__(self, crawler, spider_closed_callback):
     self.crawler = crawler
     self.settings = crawler.settings
     self.signals = crawler.signals
     self.logformatter = crawler.logformatter
     self.slot = None
     self.spider = None
     self.running = False
     self.paused = False
     self.scheduler_cls = load_object(self.settings['SCHEDULER'])
     downloader_cls = load_object(self.settings['DOWNLOADER'])
     self.downloader = downloader_cls(crawler)
     self.scraper = Scraper(crawler)
     self._spider_closed_callback = spider_closed_callback
Example #26
 def from_crawler(cls, crawler):
     settings = crawler.settings
     pqcls = load_object(settings["SCHEDULER_PRIORITY_QUEUE"])
     dupefilter_cls = load_object(settings['DUPEFILTER_CLASS'])
     dupefilter = dupefilter_cls.from_settings(settings)
     dqclass = load_object(settings['SCHEDULER_DISK_QUEUE'])
     mqclass = load_object(settings['SCHEDULER_MEMORY_QUEUE'])
     logunser = settings.getbool('LOG_UNSERIALIZABLE_REQUESTS')
     return cls(dupefilter,
                job_dir(settings),
                dqclass,
                mqclass,
                logunser,
                crawler.stats,
                pqcls)
Example #27
def application(config):
    app = Application("Scrapyd")
    http_port = config.getint('http_port', 6800)
    bind_address = config.get('bind_address', '127.0.0.1')
    poll_interval = config.getfloat('poll_interval', 5)

    poller = QueuePoller(config)
    eggstorage = FilesystemEggStorage(config)
    scheduler = SpiderScheduler(config)
    environment = Environment(config)

    app.setComponent(IPoller, poller)
    app.setComponent(IEggStorage, eggstorage)
    app.setComponent(ISpiderScheduler, scheduler)
    app.setComponent(IEnvironment, environment)

    laupath = config.get('launcher', 'scrapyd.launcher.Launcher')
    laucls = load_object(laupath)
    launcher = laucls(config, app)

    timer = TimerService(poll_interval, poller.poll)

    webpath = config.get('webroot', 'scrapyd.website.Root')
    webcls = load_object(webpath)

    username = config.get('username', '')
    password = config.get('password', '')
    if username and password:
        if ':' in username:
            sys.exit("The `username` option contains illegal character ':', "
                     "check and update the configuration file of Scrapyd")
        portal = Portal(PublicHTMLRealm(webcls(config, app)),
                        [StringCredentialsChecker(username, password)])
        credential_factory = BasicCredentialFactory("Auth")
        resource = HTTPAuthSessionWrapper(portal, [credential_factory])
        log.msg("Basic authentication enabled")
    else:
        resource = webcls(config, app)
        log.msg("Basic authentication disabled as either `username` or `password` is unset")
    webservice = TCPServer(http_port, server.Site(resource), interface=bind_address)
    log.msg(format="Scrapyd web console available at http://%(bind_address)s:%(http_port)s/",
            bind_address=bind_address, http_port=http_port)

    launcher.setServiceParent(app)
    timer.setServiceParent(app)
    webservice.setServiceParent(app)

    return app
Example #28
 def __init__(self, datadir, spider_cls=None, settings=None, **kwargs):
     self.spider_cls = load_object(spider_cls) if spider_cls else IblSpider
     self._specs = open_project_from_dir(datadir)
     settings = settings.copy()
     settings.frozen = False
     settings.set('PLUGINS', load_plugins(settings))
     self.settings = settings
Example #29
 def __init__(self, engine, settings):
     self.sites = {}
     self.spidermw = SpiderMiddlewareManager.from_settings(settings)
     itemproc_cls = load_object(settings['ITEM_PROCESSOR'])
     self.itemproc = itemproc_cls.from_settings(settings)
     self.concurrent_items = settings.getint('CONCURRENT_ITEMS')
     self.engine = engine
Example #30
 def __init__(self, settings):
     self._pool = HTTPConnectionPool(reactor, persistent=True)
     self._pool.maxPersistentPerHost = settings.getint('CONCURRENT_REQUESTS_PER_DOMAIN')
     self._pool._factory.noisy = False
     self._contextFactoryClass = load_object(settings['DOWNLOADER_CLIENTCONTEXTFACTORY'])
     self._contextFactory = self._contextFactoryClass()
     self._disconnect_timeout = 1
Example #31
    def run(self, args, opts):
        # load contracts
        contracts = build_component_list(
            self.settings.getwithbase('SPIDER_CONTRACTS'))
        conman = ContractsManager(load_object(c) for c in contracts)
        runner = TextTestRunner(verbosity=2 if opts.verbose else 1)
        result = TextTestResult(runner.stream, runner.descriptions,
                                runner.verbosity)

        # contract requests
        contract_reqs = defaultdict(list)

        spider_loader = self.crawler_process.spider_loader

        with set_environ(SCRAPY_CHECK='true'):
            for spidername in args or spider_loader.list():
                spidercls = spider_loader.load(spidername)
                spidercls.start_requests = lambda s: conman.from_spider(
                    s, result)

                tested_methods = conman.tested_methods_from_spidercls(
                    spidercls)
                if opts.list:
                    for method in tested_methods:
                        contract_reqs[spidercls.name].append(method)
                elif tested_methods:
                    self.crawler_process.crawl(spidercls)

        # start checks
        if opts.list:
            for spider, methods in sorted(contract_reqs.items()):
                if not methods and not opts.verbose:
                    continue
                print(spider)
                for method in sorted(methods):
                    print('  * %s' % method)
        else:
            start = time.time()
            self.crawler_process.start()
            stop = time.time()

            result.printErrors()
            result.printSummary(start, stop)
            self.exitcode = int(not result.wasSuccessful())
Example #32
    def __init__(self, crawler: Crawler) -> None:
        super().__init__(settings=crawler.settings, crawler=crawler)
        verify_installed_reactor(
            "twisted.internet.asyncioreactor.AsyncioSelectorReactor")
        crawler.signals.connect(self._engine_started, signals.engine_started)
        self.stats = crawler.stats

        self.browser_type: str = crawler.settings.get(
            "PLAYWRIGHT_BROWSER_TYPE") or "chromium"
        self.launch_options: dict = crawler.settings.getdict(
            "PLAYWRIGHT_LAUNCH_OPTIONS") or {}

        self.default_navigation_timeout: Optional[float] = None
        if "PLAYWRIGHT_DEFAULT_NAVIGATION_TIMEOUT" in crawler.settings:
            with suppress(TypeError, ValueError):
                self.default_navigation_timeout = float(
                    crawler.settings.get(
                        "PLAYWRIGHT_DEFAULT_NAVIGATION_TIMEOUT"))

        if crawler.settings.get("PLAYWRIGHT_PROCESS_REQUEST_HEADERS"):
            self.process_request_headers = load_object(
                crawler.settings["PLAYWRIGHT_PROCESS_REQUEST_HEADERS"])
        else:
            self.process_request_headers = use_scrapy_headers

        default_context_kwargs: dict = {}
        if "PLAYWRIGHT_CONTEXT_ARGS" in crawler.settings:
            default_context_kwargs = crawler.settings.getdict(
                "PLAYWRIGHT_CONTEXT_ARGS")
            warnings.warn(
                "The PLAYWRIGHT_CONTEXT_ARGS setting is deprecated, please use"
                " PLAYWRIGHT_CONTEXTS instead. Keyword arguments defined in"
                " PLAYWRIGHT_CONTEXT_ARGS will be used when creating the 'default' context",
                category=DeprecationWarning,
                stacklevel=2,
            )
        self.context_kwargs: defaultdict = defaultdict(dict)
        for name, kwargs in (crawler.settings.getdict("PLAYWRIGHT_CONTEXTS")
                             or {}).items():
            if name == "default":
                self.context_kwargs[name] = default_context_kwargs
            self.context_kwargs[name].update(kwargs)
        if "default" not in self.context_kwargs and default_context_kwargs:
            self.context_kwargs["default"] = default_context_kwargs
Example #33
    def from_settings(cls, settings):
        #        kwargs = {
        #            'persist': settings.getbool('SCHEDULER_PERSIST'),
        #            'flush_on_start': settings.getbool('SCHEDULER_FLUSH_ON_START'),
        #            'idle_before_close': settings.getint('SCHEDULER_IDLE_BEFORE_CLOSE'),
        #        }

        # If these values are missing, it means we want to use the defaults.
        #        optional = {
        # TODO: Use custom prefixes for this settings to note that are
        #            # specific to scrapy-redis.
        #            'queue_key': 'SCHEDULER_QUEUE_KEY',
        #            'queue_cls': 'SCHEDULER_QUEUE_CLASS',
        #            'dupefilter_key': 'SCHEDULER_DUPEFILTER_KEY',
        # We use the default setting name to keep compatibility.
        #            'dupefilter_cls': 'DUPEFILTER_CLASS',
        #            'serializer': 'SCHEDULER_SERIALIZER',
        #        }
        #        for name, setting_name in optional.items():
        #            val = settings.get(setting_name)
        #            if val:
        #               kwargs[name] = val

        # Support serializer as a path to a module.
        #        if isinstance(kwargs.get('serializer'), six.string_types):
        #            kwargs['serializer'] = importlib.import_module(kwargs['serializer'])

        #        server = connection.from_settings(settings)
        # Ensure the connection is working.
        #        server.ping()

        #        return cls(server=server, **kwargs)
        persist = settings.get('SCHEDULER_PERSIST', SCHEDULER_PERSIST)
        queue_key = settings.get('SCHEDULER_QUEUE_KEY', QUEUE_KEY)
        queue_cls = load_object(
            settings.get('SCHEDULER_QUEUE_CLASS', QUEUE_CLASS))
        queue_name = settings.get('REDIS_QUEUE_NAME', None)
        dupefilter_key = settings.get('DUPEFILTER_KEY', DUPEFILTER_KEY)
        idle_before_close = settings.get('SCHEDULER_IDLE_BEFORE_CLOSE',
                                         IDLE_BEFORE_CLOSE)
        server = connection.from_settings(settings)
        server_filter = connection.from_settings_filter(settings)
        return cls(server, server_filter, persist, queue_key, queue_cls,
                   dupefilter_key, idle_before_close, queue_name)
Example #34
def get_redis_from_settings(settings):
    """Returns a redis client instance from given Scrapy settings object.

    This function uses ``get_client`` to instantiate the client and uses the
    ``defaults.REDIS_PARAMS`` global as default values for the parameters. You
    can override them using the ``REDIS_PARAMS`` setting.

    Parameters
    ----------
    settings : Settings
        A scrapy settings object. See the supported settings below.

    Returns
    -------
    server
        Redis client instance.

    Other Parameters
    ----------------
    REDIS_URL : str, optional
        Server connection URL.
    REDIS_HOST : str, optional
        Server host.
    REDIS_PORT : str, optional
        Server port.
    REDIS_ENCODING : str, optional
        Data encoding.
    REDIS_PARAMS : dict, optional
        Additional client parameters.

    """
    params = defaults.REDIS_PARAMS.copy()
    params.update(settings.getdict("REDIS_PARAMS"))
    # XXX: Deprecate REDIS_* settings.
    for source, dest in SETTINGS_PARAMS_MAP.items():
        val = settings.get(source)
        if val:
            params[dest] = val

    # Allow ``redis_cls`` to be a path to a class.
    if isinstance(params.get("redis_cls"), six.string_types):
        params["redis_cls"] = load_object(params["redis_cls"])

    return get_redis(**params)
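A hedged usage sketch for the helper above, assuming a Redis server reachable at localhost:6379; the setting names are the ones documented in its docstring:

from scrapy.settings import Settings

settings = Settings({
    "REDIS_URL": "redis://localhost:6379/0",   # takes precedence over host/port
    "REDIS_PARAMS": {"socket_timeout": 30},    # merged over defaults.REDIS_PARAMS
})
server = get_redis_from_settings(settings)
server.ping()  # raises if the server is unreachable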
Example #35
 def load_providers(self, default_providers: Optional[Mapping] = None):
     providers_dict = {
         **(default_providers or {}),
         **self.spider.settings.getdict("SCRAPY_POET_PROVIDERS")
     }
     provider_classes = build_component_list(providers_dict)
     logger.info(f"Loading providers:\n {pprint.pformat(provider_classes)}")
     self.providers = [
         load_object(cls)(self.crawler) for cls in provider_classes
     ]
     check_all_providers_are_callable(self.providers)
     # Caching whether each provider requires the scrapy response
     self.is_provider_requiring_scrapy_response = {
         provider: is_provider_requiring_scrapy_response(provider)
         for provider in self.providers
     }
     # Caching the function for faster execution
     self.is_class_provided_by_any_provider = \
         is_class_provided_by_any_provider_fn(self.providers)
Example #36
 def __init__(self, config, app):
     resource.Resource.__init__(self)
     self.debug = config.getboolean('debug', False)
     self.runner = config.get('runner')
     logsdir = config.get('logs_dir')
     itemsdir = config.get('items_dir')
     self.app = app
     self.putChild('', Home(self))
     if logsdir:
         self.putChild('logs', static.File(logsdir, 'text/plain'))
     if itemsdir:
         self.putChild('items', static.File(itemsdir, 'text/plain'))
     self.putChild('jobs', Jobs(self))
     self.putChild('static', static.File('static'))  # adding UI support
     services = config.items('services', ())
     for servName, servClsName in services:
         servCls = load_object(servClsName)
         self.putChild(servName, servCls(self))
     self.update_projects()
Example #37
    def __init__(self, crawler):
        self._handlers = {}
        self._notconfigured = {}
        handlers = crawler.settings.get('DOWNLOAD_HANDLERS_BASE')
        handlers.update(crawler.settings.get('DOWNLOAD_HANDLERS', {}))
        for scheme, clspath in handlers.items():
            # Allow to disable a handler just like any other
            # component (extension, middleware, etc).
            if clspath is None:
                continue
            cls = load_object(clspath)
            try:
                dh = cls(crawler.settings)
            except NotConfigured as ex:
                self._notconfigured[scheme] = str(ex)
            else:
                self._handlers[scheme] = dh

        crawler.signals.connect(self._close, signals.engine_stopped)
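Example #37 consumes a scheme-to-class-path mapping from the settings it reads. A sketch of that mapping, using handler paths shipped with Scrapy (illustrative subset):

DOWNLOAD_HANDLERS_BASE = {
    "file": "scrapy.core.downloader.handlers.file.FileDownloadHandler",
    "http": "scrapy.core.downloader.handlers.http.HTTPDownloadHandler",
    "https": "scrapy.core.downloader.handlers.http.HTTPDownloadHandler",
    "ftp": "scrapy.core.downloader.handlers.ftp.FTPDownloadHandler",
}
# A project can disable a scheme by mapping it to None in DOWNLOAD_HANDLERS,
# which the loop above skips explicitly:
DOWNLOAD_HANDLERS = {"ftp": None}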
Example #38
    def test(self):
        fixture_objects = data['result']

        request = request_from_dict(data['request'], spider)
        response = HtmlResponse(request=request, **data['response'])

        middlewares = []
        middleware_paths = data['middlewares']
        for mw_path in middleware_paths:
            try:
                mw_cls = load_object(mw_path)
                mw = create_instance(mw_cls, settings, crawler)
            except NotConfigured:
                continue
            middlewares.append(mw)

        crawler.signals.send_catch_log(signal=signals.spider_opened,
                                       spider=spider)

        for mw in middlewares:
            if hasattr(mw, 'process_spider_input'):
                mw.process_spider_input(response, spider)

        result = request.callback(response) or []
        middlewares.reverse()

        for mw in middlewares:
            if hasattr(mw, 'process_spider_output'):
                result = mw.process_spider_output(response, result, spider)

        if isinstance(result, (Item, Request, dict)):
            result = [result]

        for index, _object in enumerate(result):
            fixture_data = fixture_objects[index]['data']
            if fixture_objects[index].get('type') == 'request':
                clean_request(fixture_data, settings)
            else:
                clean_item(fixture_data, settings)

            _object = parse_object(_object, spider)
            self.assertEqual(fixture_data, _object, 'Not equal!')
Example #39
 def __init__(self, config, app):
     resource.Resource.__init__(self)
     self.debug = config.getboolean('debug', False)
     self.runner = config.get('runner')
     logsdir = config.get('logs_dir')
     itemsdir = config.get('items_dir')
     local_items = itemsdir and (urlparse(itemsdir).scheme.lower() in ['', 'file'])
     self.app = app
     self.nodename = config.get('node_name', socket.gethostname())
     self.putChild(b'', Home(self, local_items))
     if logsdir:
         self.putChild(b'logs', static.File(logsdir.encode('ascii', 'ignore'), 'text/plain'))
     if local_items:
         self.putChild(b'items', static.File(itemsdir, 'text/plain'))
     self.putChild(b'jobs', Jobs(self, local_items))
     services = config.items('services', ())
     for servName, servClsName in services:
          servCls = load_object(servClsName)
          self.putChild(servName.encode('utf-8'), servCls(self))
     self.update_projects()
Example #40
    def from_settings(cls, settings):
        """
        Get the redis connection pool.
        """
        pool = None
        pipeline = None

        try:
            if settings.get('REDIS_PIPELINE'):
                pipeline = load_object(settings['REDIS_PIPELINE'])
            else:
                raise NotConfigured
            if settings.getdict("REDIS_BACKEND_PARAMS", REDIS_PARAMS_DEFAULT):
                param = settings.getdict("REDIS_BACKEND_PARAMS")
                pool = redis.ConnectionPool(**param)

        except Exception as ex:
            print('%s \nException:%s' % (__file__, ex))

        return cls(pool, pipeline)
Example #41
    def from_settings(cls, settings):
        if os.environ.get('spider_set_persist'):
            persist = (os.environ.get('spider_set_persist') != 'False')
        else:
            persist = settings.get('SCHEDULER_PERSIST', SCHEDULER_PERSIST)

        if os.environ.get('need_dupefilter'):
            need_dupefilter = (os.environ.get('need_dupefilter') != 'False')
        else:
            need_dupefilter = True

        queue_key = settings.get('SCHEDULER_QUEUE_KEY', QUEUE_KEY)
        queue_cls = load_object(
            settings.get('SCHEDULER_QUEUE_CLASS', QUEUE_CLASS))
        dupefilter_key = settings.get('DUPEFILTER_KEY', DUPEFILTER_KEY)
        idle_before_close = settings.get('SCHEDULER_IDLE_BEFORE_CLOSE',
                                         IDLE_BEFORE_CLOSE)
        server = connection.from_settings(settings)
        return cls(server, persist, queue_key, queue_cls, dupefilter_key,
                   idle_before_close, need_dupefilter)
Example #42
 def from_settings(cls, settings, crawler=None):
     mwlist = cls._get_mwlist_from_settings(settings)
     middlewares = []
     for clspath in mwlist:
         try:
             mwcls = load_object(clspath)
             if crawler and hasattr(mwcls, 'from_crawler'):
                 mw = mwcls.from_crawler(crawler)
             elif hasattr(mwcls, 'from_settings'):
                 mw = mwcls.from_settings(settings)
             else:
                 mw = mwcls()
             middlewares.append(mw)
         except NotConfigured as e:
             if e.args:
                 clsname = clspath.split('.')[-1]
                 log.msg(format="Disabled %(clsname)s: %(eargs)s",
                         level=log.WARNING,
                         clsname=clsname,
                         eargs=e.args[0])
     return cls(*middlewares)
Example #43
    def __init__(self, settings, stats):
        # should not call parent's __init__ as that configures from "HTTPCACHE_ENABLED" variable
        # while this class should configure from "HTTPCACHE_PER_SPIDER_ENABLED"
        if not settings.get('HTTPCACHE_PER_SPIDER_ENABLED'):
            raise NotConfigured("Disabled")

        if not settings.get('HTTPCACHE_STORAGE'):
            raise NotConfigured(
                "Missing param 'HTTPCACHE_STORAGE'. Storage not configured")

        self.stats = stats
        self.policy = load_object(settings['HTTPCACHE_POLICY'])(settings)
        self.storage_class = settings['HTTPCACHE_STORAGE']
        self.ignore_missing = settings.get('HTTPCACHE_IGNORE_MISSING')
        # dispatcher.connect(self.spider_opened, signal=signals.spider_opened)
        # dispatcher.connect(self.spider_closed, signal=signals.spider_closed)
        self.use_cache = set()
        self.expiration_time = {}

        self.settings = settings
Example #44
def get_schema_from(source):
    if is_schema_url(source):
        schema = get_contents(source)
        try:
            return json.loads(schema)
        except Exception as e:
            logger.exception(
                str(e) + "\nCould not parse schema from '{}'".format(source))
    elif source.endswith(".json"):
        with open(source, "r") as f:
            try:
                return json.load(f)
            except Exception as e:
                logger.exception(
                    str(e) + "\nCould not parse schema in '{}'".format(source))
    else:
        schema = load_object(source)
        if isinstance(schema, string_types):
            return json.loads(schema)
        return schema
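A hedged usage sketch for `get_schema_from` above; the three sources shown (URL, local JSON file, dotted import path) are hypothetical:

# Fetched over HTTP and parsed as JSON (hypothetical URL).
schema = get_schema_from("https://example.com/schemas/item.json")

# Read from a local JSON file (hypothetical path).
schema = get_schema_from("schemas/item.json")

# Imported as a Python object via load_object (hypothetical dotted path);
# if the object is a string it is parsed as JSON, otherwise returned as-is.
schema = get_schema_from("myproject.schemas.ITEM_SCHEMA")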
Example #45
    def open_spider(self, spider: Spider):
        try:
            if self.settings.get(SEEDS_MONGODB_SEEDS_PREPARE):
                self.prepare = load_object(
                    self.settings.get(SEEDS_MONGODB_SEEDS_PREPARE))
            else:
                self.prepare = lambda x: map(lambda y: (y, {
                    'seed': x
                }), x['websites'])
        except:
            raise NotConfigured

        self.cnx = MongoClient(self.uri)
        self.db = self.cnx.get_database(
            self.settings.get(SEEDS_MONGODB_DATABASE, 'seeds'))
        self.coll = self.db.get_collection(
            self.settings.get(SEEDS_MONGODB_COLLECTION, 'seeds'))

        logger.info('Spider opened: Open the connection to MongoDB: %s',
                    self.uri)
Example #46
 def from_crawler(cls, crawler) -> "Scheduler":
     settings = crawler.settings
     rq_cls = load_object(
         settings.get(
             "SCHEDULER_REQUEST_QUEUE",
             "os_scrapy_rq_crawler.MemoryRequestQueue",
         ))
     rq = create_instance(rq_cls, settings, crawler)
     logger.debug(f"Using request queue: {class_fullname(rq_cls)}")
     concurrency = settings.getint("CONCURRENT_REQUESTS", 16)
     delay = settings.getint("DOWNLOAD_DELAY")
     max_slots = settings.getint("SCHEDULER_MAX_SLOTS",
                                 concurrency * (delay if delay > 0 else 3))
     assert max_slots > 1, f"SCHEDULER_MAX_SLOTS({max_slots}) must > 1"
     standby_slots = settings.getint("SCHEDULER_STANDBY_SLOTS",
                                     int(concurrency / 4))
     logger.debug(
         f"max_slots:{max_slots} standby_slots:{standby_slots} concurrency:{concurrency}"
     )
     return cls(crawler, rq, max_slots, standby_slots, crawler.stats)
Example #47
    def __init__(self, crawler, spider_closed_callback: Callable) -> None:
        self.crawler = crawler
        self.settings = crawler.settings
        self.signals = crawler.signals
        self.logformatter = crawler.logformatter
        self.slot: Optional[Slot] = None
        self.spider: Optional[Spider] = None
        self.running = False
        # whether the engine is paused
        self.paused = False
        self.scheduler_cls = self._get_scheduler_class(crawler.settings)

        # load the downloader class
        downloader_cls = load_object(self.settings['DOWNLOADER'])
        self.downloader = downloader_cls(crawler)

        # instantiate the scraper
        self.scraper = Scraper(crawler)

        # externally supplied spider-closed callback
        self._spider_closed_callback = spider_closed_callback
Example #48
 def _get_next_page_request(self, response, rules):
     extractors = rules["extractors"]
     for rule in extractors:
         xpath = rule["xpath"]
         regex = rule['regex']
         # get url by xpath and re configs
         url = response.xpath(xpath).re_first(
             regex) if regex else response.xpath(xpath).extract_first()
         # maybe multi rules be created to parse next page url
         if not url or not isinstance(url, str) or not url.strip():
             continue
         request_factory_cls = load_object(rule["request_factory_class"])
         req_factory_obj = request_factory_cls()
         # generate next page request and return
         return req_factory_obj.make_request(
             spider=self,
             depth=rule["depth"],
             link_or_url=response.urljoin(url).strip(),
             meta=response.meta)
     # if no next page request, return None
     return None
Example #49
    def __init__(self, redis_server, key, queue_cls, queue_serializer=None, dont_serial=False):
        """scheduler 
        :param redis_server: `redis.Redis` object 
        :param key: str. queue key
        :param queue_cls: str, full path of queue class 
        :param queue_serializer: None or str, if is None, then it will use `pickle`
        :param dont_serial: bool. if True, queue_serializer will not be used
        """

        def import_object(obj):
            try:
                obj = __import__(obj)
            except ImportError:
                obj = load_object(obj)
            return obj

        queue_cls = load_object(queue_cls) if isinstance(queue_cls, str) else queue_cls
        queue_serializer = import_object(queue_serializer) if isinstance(queue_serializer, str) else queue_serializer
        self.redis_server = redis_server
        self.queue_cls = queue_cls
        self.queue = queue_cls(redis_server, key, queue_serializer, dont_serial)
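A hedged construction sketch for the scheduler in Example #49; the enclosing class name, queue key, and queue class path below are assumptions for illustration only:

import redis

server = redis.Redis(host="localhost", port=6379)
scheduler = Scheduler(                        # assumed name of the enclosing class
    redis_server=server,
    key="myspider:requests",                  # hypothetical queue key
    queue_cls="myproject.queues.RedisQueue",  # hypothetical class taking (server, key, serializer, dont_serial)
    queue_serializer="pickle",                # imported with __import__ inside __init__
)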
Example #50
    def __init__(self, crawler, *args,  **kwargs):
        super().__init__(**kwargs)

        splist = self._build_component_list(self.spiders)
        spcache = {}
        spiders = []

        for clspath in splist:
            if self.initialize_once and clspath in spcache:
                spiders.append(spcache[clspath])
                continue
            spcls = load_object(clspath)
            subsp = create_instance(spcls, crawler.settings, crawler,
                                    *args, **kwargs)
            spcache[clspath] = subsp
            spiders.append(subsp)

        if not spiders:
            self.logger.error('Spider Composer cannot be initialized with no '
                              'active spiders')
        self._spiders = spiders
Example #51
 def __init__(self, settings):
     self.settings = settings
     self.urifmt = settings['FEED_URI']
     if not self.urifmt:
         raise NotConfigured
     self.format = settings['FEED_FORMAT'].lower()
     self.export_encoding = settings['FEED_EXPORT_ENCODING']
     self.storages = self._load_components('FEED_STORAGES')
     self.exporters = self._load_components('FEED_EXPORTERS')
     if not self._storage_supported(self.urifmt):
         raise NotConfigured
     if not self._exporter_supported(self.format):
         raise NotConfigured
     self.store_empty = settings.getbool('FEED_STORE_EMPTY')
     self._exporting = False
     self.export_fields = settings.getlist('FEED_EXPORT_FIELDS') or None
     self.indent = None
     if settings.get('FEED_EXPORT_INDENT') is not None:
         self.indent = settings.getint('FEED_EXPORT_INDENT')
     uripar = settings['FEED_URI_PARAMS']
     self._uripar = load_object(uripar) if uripar else lambda x, y: None
Example #52
    def _get_agent(self, request, timeout):
        proxy = request.meta['proxy']
        if proxy:
            proxy_scheme, _, proxy_host, proxy_port, _ = _parse(proxy)
            proxy_scheme = str(proxy_scheme, 'utf-8')
            if proxy_scheme == 'socks5':
                endpoint = TCP4ClientEndpoint(reactor, proxy_host, proxy_port)
                self._sslMethod = openssl_methods[DOWNLOADER_CLIENT_TLS_METHOD]
                self._contextFactoryClass = load_object(
                    DOWNLOADER_CLIENTCONTEXTFACTORY)
                self._contextFactory = create_instance(
                    objcls=self._contextFactoryClass,
                    settings=settings,
                    crawler=None,
                    method=self._sslMethod,
                )
                return self._Agent(reactor,
                                   proxyEndpoint=endpoint,
                                   contextFactory=self._contextFactory)

        return super(TorScrapyAgent, self)._get_agent(request, timeout)
Example #53
def _get_spider_loader(settings):
    """ Get SpiderLoader instance from settings """
    if settings.get('SPIDER_MANAGER_CLASS'):
        warnings.warn(
            'SPIDER_MANAGER_CLASS option is deprecated. '
            'Please use SPIDER_LOADER_CLASS.',
            category=ScrapyDeprecationWarning,
            stacklevel=2)
    cls_path = settings.get('SPIDER_MANAGER_CLASS',
                            settings.get('SPIDER_LOADER_CLASS'))
    loader_cls = load_object(cls_path)
    try:
        verifyClass(ISpiderLoader, loader_cls)
    except DoesNotImplement:
        warnings.warn(
            'SPIDER_LOADER_CLASS (previously named SPIDER_MANAGER_CLASS) does '
            'not fully implement scrapy.interfaces.ISpiderLoader interface. '
            'Please add all missing methods to avoid unexpected runtime errors.',
            category=ScrapyDeprecationWarning,
            stacklevel=2)
    return loader_cls.from_settings(settings.frozencopy())
Example #54
    def __init__(self, crawler: Crawler, auth_encoding: str, mw):
        super().__init__(crawler, auth_encoding, mw)

        self.mongodb_settings: Dict = self._get_mongodb_settings()

        self.not_mongoclient_parameters: Dict = self.mongodb_settings.get(
            'not_mongoclient_parameters')

        self.uri: str = None
        self.conn: MongoClient = None
        self.db: DatabaseSync = None
        self.coll: CollectionSync = None

        self._proxy_retriever: methodcaller = methodcaller(
            self.mongodb_settings['proxy_retriever'].pop('name'),
            **self.mongodb_settings['proxy_retriever'])
        self._get_proxy_from_doc: Callable = partial(
            load_object(self.mongodb_settings['get_proxy_from_doc']),
            auth_encoding=self.auth_encoding)

        self.proxies_invalidated: Set[Tuple[str, bytes, str]] = set()
Example #55
    def from_settings(cls, settings):
        server = redis.Redis(host=settings.get('REDIS_HOST'),
                             port=settings.get('REDIS_PORT'))
        persist = settings.get('SCHEDULER_PERSIST', True)
        up_int = settings.get('SCHEDULER_QUEUE_REFRESH', 10)
        hits = settings.get('QUEUE_HITS', 10)
        window = settings.get('QUEUE_WINDOW', 60)
        mod = settings.get('QUEUE_MODERATED', False)
        timeout = settings.get('DUPEFILTER_TIMEOUT', 600)
        ip_refresh = settings.get('SCHEDULER_IP_REFRESH', 60)
        add_type = settings.get('SCHEDULER_TYPE_ENABLED', False)
        add_ip = settings.get('SCHEDULER_IP_ENABLED', False)
        retries = settings.get('SCHEUDLER_ITEM_RETRIES', 3)
        ip_regex = settings.get('IP_ADDR_REGEX', '.*')
        throt = settings.get('QUEUE_THROTTLED', False)

        dupefilter_cls = load_object(settings['DUPEFILTER_CLASS'])
        dupefilter = dupefilter_cls.from_settings(settings)

        my_level = settings.get('SC_LOG_LEVEL', 'INFO')
        my_name = settings.get('SC_LOGGER_NAME', 'sc-logger')
        my_output = settings.get('SC_LOG_STDOUT', True)
        my_json = settings.get('SC_LOG_JSON', False)
        my_dir = settings.get('SC_LOG_DIR', 'logs')
        my_bytes = settings.get('SC_LOG_MAX_BYTES', '10MB')
        my_file = settings.get('SC_LOG_FILE', 'main.log')
        my_backups = settings.get('SC_LOG_BACKUPS', 5)

        logger = LogFactory.get_instance(json=my_json,
                                         name=my_name,
                                         stdout=my_output,
                                         level=my_level,
                                         dir=my_dir,
                                         file=my_file,
                                         bytes=my_bytes,
                                         backups=my_backups)

        return cls(dupefilter, server, persist, up_int, timeout, retries,
                   logger, hits, window, mod, ip_refresh, add_type, add_ip,
                   ip_regex, throt)
Example #56
 def _load_handler(self, scheme, skip_lazy=False):
     path = self._schemes[scheme]
     try:
         dhcls = load_object(path)
         if skip_lazy and getattr(dhcls, 'lazy', True):
             return None
         dh = dhcls(self._crawler.settings)
     except NotConfigured as ex:
         self._notconfigured[scheme] = str(ex)
         return None
     except Exception as ex:
         logger.error('Loading "%(clspath)s" for scheme "%(scheme)s"', {
             "clspath": path,
             "scheme": scheme
         },
                      exc_info=True,
                      extra={'crawler': self._crawler})
         self._notconfigured[scheme] = str(ex)
         return None
     else:
         self._handlers[scheme] = dh
         return dh
Example #57
def _load_policy_class(policy, warning_only=False):
    """
    Expect a string for the path to the policy class,
    otherwise try to interpret the string as a standard value
    from https://www.w3.org/TR/referrer-policy/#referrer-policies
    """

    try:
        # try to load the object by its dotted path
        return load_object(policy)
    except ValueError:
        try:
            return _policy_classes[policy.lower()]
        except KeyError:
            msg = "Could not load referrer policy %r" % policy
            if not warning_only:
                # raise an exception
                raise RuntimeError(msg)
            else:
                # only emit a warning
                warnings.warn(msg, RuntimeWarning)
                return None
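A hedged usage sketch for `_load_policy_class` above: a dotted path loads directly, a standard policy name falls back to the `_policy_classes` registry, and an unknown value either raises or warns:

# Loaded directly via load_object.
cls = _load_policy_class("scrapy.spidermiddlewares.referer.NoReferrerPolicy")

# Falls through to the registry lookup by standard policy name.
cls = _load_policy_class("same-origin")

# Unknown value: RuntimeError by default, or a warning and None with warning_only=True.
cls = _load_policy_class("not-a-policy", warning_only=True)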
Example #58
    def __init__(self, settings):
        self._pool = HTTPConnectionPool(reactor, persistent=True)
        self._pool.maxPersistentPerHost = settings.getint('CONCURRENT_REQUESTS_PER_DOMAIN')
        self._pool._factory.noisy = False

        self._sslMethod = openssl_methods[settings.get('DOWNLOADER_CLIENT_TLS_METHOD')]
        self._contextFactoryClass = load_object(settings['DOWNLOADER_CLIENTCONTEXTFACTORY'])
        # try method-aware context factory
        try:
            self._contextFactory = self._contextFactoryClass(method=self._sslMethod)
        except TypeError:
            # use context factory defaults
            self._contextFactory = self._contextFactoryClass()
            msg = """
 '%s' does not accept `method` argument (type OpenSSL.SSL method,\
 e.g. OpenSSL.SSL.SSLv23_METHOD).\
 Please upgrade your context factory class to handle it or ignore it.""" % (
                settings['DOWNLOADER_CLIENTCONTEXTFACTORY'],)
            warnings.warn(msg)
        self._default_maxsize = settings.getint('DOWNLOAD_MAXSIZE')
        self._default_warnsize = settings.getint('DOWNLOAD_WARNSIZE')
        self._disconnect_timeout = 1
Example #59
    def from_crawler(cls, crawler,):
        # first check if the extension should be enabled and raise
        # NotConfigured otherwise
        if not crawler.settings.getbool('MYEXT_ENABLED'):
            raise NotConfigured
        # get the number of items from settings

        request_count = crawler.settings.getint('MYEXT_ITEMCOUNT', 1000)

        # instantiate the extension object
        scheduler_cls = load_object(crawler.settings['SCHEDULER'])
        scheduler = scheduler_cls.from_crawler(crawler)
        ext = cls(request_count, scheduler)

        # connect the extension object to signals
        crawler.signals.connect(ext.spider_opened, signal=signals.spider_opened)
        crawler.signals.connect(ext.spider_closed, signal=signals.spider_closed)
        crawler.signals.connect(ext.request_scheduled, signal=signals.request_scheduled)
        # return the extension object

        return ext
Example #60
    def from_settings(cls, settings, crawler=None):
        mwlist = cls._get_mwlist_from_settings(settings)
        middlewares = []
        enabled = []
        for clspath in mwlist:
            try:
                mwcls = load_object(clspath)
                mw = create_instance(mwcls, settings, crawler)
                middlewares.append(mw)
                enabled.append(clspath)
            except NotConfigured as e:
                if e.args:
                    clsname = clspath.split('.')[-1]
                    logger.warning("Disabled %(clsname)s: %(eargs)s",
                                   {'clsname': clsname, 'eargs': e.args[0]},
                                   extra={'crawler': crawler})

        logger.info("Enabled %(componentname)ss:\n%(enabledlist)s",
                    {'componentname': cls.component_name,
                     'enabledlist': pprint.pformat(enabled)},
                    extra={'crawler': crawler})
        return cls(*middlewares)