Example #1
def _mq(self):
    """Create a new priority queue instance, with in-memory storage."""
    return create_instance(self.pqclass,
                           settings=None,
                           crawler=self.crawler,
                           downstream_queue_cls=self.mqclass,
                           key='')
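Every example on this page funnels through the same helper. A minimal sketch of its resolution order (simplified from scrapy.utils.misc.create_instance; error handling trimmed):

def create_instance(objcls, settings, crawler, *args, **kwargs):
    # Simplified sketch of scrapy.utils.misc.create_instance.
    if settings is None:
        if crawler is None:
            raise ValueError("Specify at least one of settings and crawler.")
        settings = crawler.settings
    if crawler and hasattr(objcls, 'from_crawler'):
        # Preferred hook: the crawler-aware factory classmethod.
        return objcls.from_crawler(crawler, *args, **kwargs)
    if hasattr(objcls, 'from_settings'):
        # Fallback hook: the settings-only factory classmethod.
        return objcls.from_settings(settings, *args, **kwargs)
    # Last resort: plain construction with the extra arguments.
    return objcls(*args, **kwargs)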
Example #2
def start(self, stop_after_crawl=True):
    """
    This method starts a :mod:`~twisted.internet.reactor`, adjusts its pool
    size to :setting:`REACTOR_THREADPOOL_MAXSIZE`, and installs a DNS cache
    based on :setting:`DNSCACHE_ENABLED` and :setting:`DNSCACHE_SIZE`.

    If ``stop_after_crawl`` is True, the reactor will be stopped after all
    crawlers have finished, using :meth:`join`.

    :param bool stop_after_crawl: whether to stop the reactor once all
        crawlers have finished
    """
    if stop_after_crawl:
        d = self.join()
        # Don't start the reactor if the join deferred has already fired.
        if d.called:
            return
        d.addBoth(self._stop_reactor)

    resolver_class = load_object(self.settings["DNS_RESOLVER"])
    resolver = create_instance(resolver_class,
                               self.settings,
                               self,
                               reactor=reactor)
    resolver.install_on_reactor()
    tp = reactor.getThreadPool()
    tp.adjustPoolsize(
        maxthreads=self.settings.getint('REACTOR_THREADPOOL_MAXSIZE'))
    reactor.addSystemEventTrigger('before', 'shutdown', self.stop)
    reactor.run()  # blocking call
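For context, a typical call site looks like the following (an illustrative sketch; MySpider stands in for any spider class):

from scrapy.crawler import CrawlerProcess

process = CrawlerProcess(settings={"REACTOR_THREADPOOL_MAXSIZE": 20})
process.crawl(MySpider)  # MySpider is a placeholder spider class
process.start()          # blocks until all crawlers have finished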
Example #3
@classmethod
def from_crawler(cls, crawler):
    settings = crawler.settings
    dupefilter_cls = load_object(settings['DUPEFILTER_CLASS'])
    dupefilter = create_instance(dupefilter_cls, settings, crawler)
    pqclass = load_object(settings['SCHEDULER_PRIORITY_QUEUE'])
    mqclass = load_object(settings['SCHEDULER_MEMORY_QUEUE'])
    return cls(dupefilter, jobdir=None, pqclass=pqclass,
               mqclass=mqclass, crawler=crawler)
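create_instance resolves the dupefilter through its factory hooks, so a custom DUPEFILTER_CLASS only needs to expose one of them. A hypothetical subclass (the class and module names are placeholders):

from scrapy.dupefilters import RFPDupeFilter

class DebugDupeFilter(RFPDupeFilter):
    # Hypothetical filter, enabled via
    # DUPEFILTER_CLASS = 'myproject.filters.DebugDupeFilter'
    @classmethod
    def from_settings(cls, settings):
        # create_instance falls back to this hook when the class
        # defines no from_crawler classmethod.
        return cls(debug=settings.getbool('DUPEFILTER_DEBUG'))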
Example #4
@classmethod
def from_settings(cls, crawler):
    # Collect the enabled middleware paths; components disabled with a
    # None value in the settings are already filtered out.
    mwlist = cls._get_mwlist_from_settings(crawler.settings)
    middlewares = []
    for clspath in mwlist:
        mwcls = load_object(clspath)
        mw = create_instance(mwcls, crawler.settings, crawler)
        middlewares.append(mw)
    return cls(*middlewares)
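The middleware list itself comes from a priority dict in the settings, where setting a component's value to None disables it, e.g.:

# settings.py (illustrative values)
DOWNLOADER_MIDDLEWARES = {
    'myproject.middlewares.CustomHeadersMiddleware': 543,  # hypothetical
    'scrapy.downloadermiddlewares.retry.RetryMiddleware': None,  # disabled
}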
Example #5
def _load_handler(self, scheme, skip_lazy=False):
    """Instantiate, cache and return the download handler for ``scheme``."""
    path = self._schemes[scheme]
    dhcls = load_object(path)
    if skip_lazy and getattr(dhcls, 'lazy', True):
        # Handlers are lazy by default: defer instantiation until the
        # scheme is actually requested.
        return None

    dh = create_instance(
        objcls=dhcls,
        settings=self._crawler.settings,
        crawler=self._crawler,
    )
    self._handlers[scheme] = dh
    return dh
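The scheme-to-path mapping is built from the DOWNLOAD_HANDLERS setting (merged with Scrapy's defaults), and a handler can opt out of lazy loading via a class attribute. An illustrative sketch with a hypothetical handler:

# settings.py (the handler path is hypothetical)
DOWNLOAD_HANDLERS = {
    'ftp': 'myproject.handlers.CustomFTPHandler',
}

# myproject/handlers.py
class CustomFTPHandler:
    lazy = False  # instantiated at startup instead of on first use

    def __init__(self, settings):
        self.settings = settings

    def download_request(self, request, spider):
        ...  # return a Deferred that fires with a Response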
Example #6
def __init__(self, settings, crawler=None):
    self._crawler = crawler
    self._pool = HTTPConnectionPool(reactor, persistent=True)
    self._pool.maxPersistentPerHost = settings.getint('CONCURRENT_REQUESTS_PER_DOMAIN')
    self._pool._factory.noisy = False

    self._sslMethod = openssl_methods[settings.get('DOWNLOADER_CLIENT_TLS_METHOD')]
    self._contextFactoryClass = load_object(settings['DOWNLOADER_CLIENTCONTEXTFACTORY'])
    # Try a method-aware context factory first.
    try:
        self._contextFactory = create_instance(
            objcls=self._contextFactoryClass,
            settings=settings,
            crawler=crawler,
            method=self._sslMethod,
        )
    except TypeError:
        # The configured factory does not accept a `method` argument;
        # fall back to instantiating it with its own defaults so that
        # self._contextFactory is always set.
        self._contextFactory = create_instance(
            objcls=self._contextFactoryClass,
            settings=settings,
            crawler=crawler,
        )
        print("'%s' does not accept a `method` argument."
              % settings['DOWNLOADER_CLIENTCONTEXTFACTORY'])
    self._default_maxsize = settings.getint('DOWNLOAD_MAXSIZE')
    self._default_warnsize = settings.getint('DOWNLOAD_WARNSIZE')
    self._fail_on_dataloss = settings.getbool('DOWNLOAD_FAIL_ON_DATALOSS')
    self._disconnect_timeout = 1
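A context factory takes the first branch simply by accepting the keyword in its constructor. A hypothetical subclass of Scrapy's default factory:

from scrapy.core.downloader.contextfactory import ScrapyClientContextFactory

class MethodAwareContextFactory(ScrapyClientContextFactory):
    # Hypothetical subclass: the base constructor already accepts a
    # `method` keyword, so the method-aware create_instance() call
    # above succeeds and the TypeError fallback is never reached.
    def __init__(self, method, **kwargs):
        super().__init__(method=method, **kwargs)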
Example #7
def qfactory(self, key):
    """Build the downstream (in-memory) queue for ``key``."""
    return create_instance(self.downstream_queue_cls, None, self.crawler,
                           self.key + '/' + str(key))
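Within the priority queue, this factory is typically invoked once per distinct priority value; a simplified sketch of the call pattern (attribute names follow the example above):

def push(self, request):
    # Simplified sketch: keep one downstream queue per priority.
    priority = -request.priority
    if priority not in self.queues:
        self.queues[priority] = self.qfactory(priority)
    self.queues[priority].push(request)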