Example #1
@classmethod
def from_crawler(cls, crawler):
    settings = crawler.settings
    # Resolve the dupe filter and queue classes from their dotted paths in the settings.
    dupefilter_cls = load_object(settings['DUPEFILTER_CLASS'])
    dupefilter = create_instance(dupefilter_cls, settings, crawler)
    pqclass = load_object(settings['SCHEDULER_PRIORITY_QUEUE'])
    mqclass = load_object(settings['SCHEDULER_MEMORY_QUEUE'])
    return cls(dupefilter, jobdir=None, pqclass=pqclass,
               mqclass=mqclass, crawler=crawler)
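All of these examples lean on the same two helpers: load_object turns a dotted path taken from the settings into the object it names, and create_instance then builds the instance, preferring a from_crawler or from_settings factory when the class defines one. A minimal sketch of that behaviour (an approximation for orientation, not the actual Scrapy/Jcrapy source):

from importlib import import_module

def load_object(path):
    # Resolve a dotted path such as 'scrapy.dupefilters.RFPDupeFilter'
    # into the module attribute it names.
    module_path, name = path.rsplit('.', 1)
    return getattr(import_module(module_path), name)

def create_instance(objcls, settings, crawler, *args, **kwargs):
    # Prefer crawler-aware construction, then settings-aware, then a plain call.
    if crawler is not None and hasattr(objcls, 'from_crawler'):
        return objcls.from_crawler(crawler, *args, **kwargs)
    if settings is not None and hasattr(objcls, 'from_settings'):
        return objcls.from_settings(settings, *args, **kwargs)
    return objcls(*args, **kwargs)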
Example #2
def __init__(self, crawler):
    self.slot = None
    self.spidermw = SpiderMiddlewareManager.from_settings(crawler)
    # Resolve the item processor (the item pipeline manager in stock Scrapy) from its dotted path.
    itemproc_cls = load_object(crawler.settings['ITEM_PROCESSOR'])
    self.itemproc = itemproc_cls.from_settings(crawler)
    self.concurrent_items = crawler.settings.getint('CONCURRENT_ITEMS')
    self.crawler = crawler
Example #3
    def start(self):
        """
        This method starts a :mod:`~twisted.internet.reactor`, adjusts its pool
        size to :setting:`REACTOR_THREADPOOL_MAXSIZE`, and installs a DNS cache
        based on :setting:`DNSCACHE_ENABLED` and :setting:`DNSCACHE_SIZE`.

        The reactor is stopped once all crawlers have finished, using
        :meth:`join`.
        """
        d = self.join()
        if d.called:
            return
        d.addBoth(self._stop_reactor)
        resolver_class = load_object(self.settings["DNS_RESOLVER"])
        resolver = create_instance(resolver_class,
                                   self.settings,
                                   self,
                                   reactor=reactor)
        resolver.install_on_reactor()
        tp = reactor.getThreadPool()
        tp.adjustPoolsize(
            maxthreads=self.settings.getint('REACTOR_THREADPOOL_MAXSIZE'))
        reactor.addSystemEventTrigger('before', 'shutdown', self.stop)
        reactor.run()  # blocking call
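start() is the blocking entry point of the crawl; in a driving script it is typically the last call. A hypothetical usage sketch using stock Scrapy's CrawlerProcess, which this method mirrors (MySpider is an assumed, illustrative spider class):

from scrapy.crawler import CrawlerProcess
from myproject.spiders import MySpider  # assumed, illustrative import

process = CrawlerProcess(settings={'REACTOR_THREADPOOL_MAXSIZE': 20})
process.crawl(MySpider)
process.start()  # blocks here until the reactor stops after the crawl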
Example #4
def __init__(self):
    self.classes = {}
    self.mimetypes = MimeTypes()
    # Load the bundled mime.types data and map each MIME type to its response class.
    mimedata = get_data('Jcrapy', 'mime.types').decode('utf8')
    self.mimetypes.readfp(StringIO(mimedata))
    for mimetype, cls in self.CLASSES.items():
        self.classes[mimetype] = load_object(cls)
Example #5
def __init__(self, crawler, spider_closed_callback):
    self.crawler = crawler
    self.settings = crawler.settings
    ###
    # TODO: Log related
    # self.signals = crawler.signals
    # self.logformatter = crawler.logformatter
    ###
    self.slot = None
    self.spider = None
    self.running = False
    self.paused = False
    # Resolve the scheduler and downloader classes from their dotted paths in the settings.
    self.scheduler_cls = load_object(self.settings['SCHEDULER'])
    downloader_cls = load_object(self.settings['DOWNLOADER'])
    self.downloader = downloader_cls(crawler)
    self.scraper = Scraper(crawler)
    self._spider_closed_callback = spider_closed_callback
Example #6
@classmethod
def from_settings(cls, crawler):
    # Get all configured middleware class paths, dropping entries disabled with None.
    mwlist = cls._get_mwlist_from_settings(crawler.settings)
    middlewares = []
    for clspath in mwlist:
        mwcls = load_object(clspath)
        mw = create_instance(mwcls, crawler.settings, crawler)
        middlewares.append(mw)
    return cls(*middlewares)
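The _get_mwlist_from_settings call above reads one of the middleware dictionaries from the settings (SPIDER_MIDDLEWARES, DOWNLOADER_MIDDLEWARES, and so on), where each key is a dotted path and each value an ordering number; a None value disables the entry, which is what the comment about dropping None values refers to. A hypothetical configuration:

SPIDER_MIDDLEWARES = {
    'myproject.middlewares.StatsMiddleware': 543,  # assumed, illustrative path
    'scrapy.spidermiddlewares.offsite.OffsiteMiddleware': None,  # disable a built-in
}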
Example #7
    def _load_handler(self, scheme, skip_lazy=False):
        path = self._schemes[scheme]
        dhcls = load_object(path)
        if skip_lazy and getattr(dhcls, 'lazy', True):
            return None

        dh = create_instance(
            objcls=dhcls,
            settings=self._crawler.settings,
            crawler=self._crawler,
        )
        self._handlers[scheme] = dh
        return dh
Example #8
    def __init__(self, settings, crawler=None):
        self._crawler = crawler
        self._pool = HTTPConnectionPool(reactor, persistent=True)
        self._pool.maxPersistentPerHost = settings.getint('CONCURRENT_REQUESTS_PER_DOMAIN')
        self._pool._factory.noisy = False

        self._sslMethod = openssl_methods[settings.get('DOWNLOADER_CLIENT_TLS_METHOD')]
        self._contextFactoryClass = load_object(settings['DOWNLOADER_CLIENTCONTEXTFACTORY'])
        # try method-aware context factory
        try:
            self._contextFactory = create_instance(
                objcls=self._contextFactoryClass,
                settings=settings,
                crawler=crawler,
                method=self._sslMethod,
                )
        except TypeError:
            # The configured factory does not take a `method` argument;
            # fall back to building it without one.
            self._contextFactory = create_instance(
                objcls=self._contextFactoryClass,
                settings=settings,
                crawler=crawler,
            )
            msg = ("'%s' does not accept a `method` argument."
                   % settings['DOWNLOADER_CLIENTCONTEXTFACTORY'])
            print(msg)
        self._default_maxsize = settings.getint('DOWNLOAD_MAXSIZE')
        self._default_warnsize = settings.getint('DOWNLOAD_WARNSIZE')
        self._fail_on_dataloss = settings.getbool('DOWNLOAD_FAIL_ON_DATALOSS')
        self._disconnect_timeout = 1
Example #9
def _get_spider_loader(self, settings):
    # Resolve the spider loader class named by SPIDER_LOADER_CLASS and build it from the settings.
    cls_path = settings.get('SPIDER_LOADER_CLASS')
    loader_cls = load_object(cls_path)
    return loader_cls.from_settings(settings)
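The common thread across all nine examples: every pluggable component is named by a dotted path in the settings, so swapping an implementation is a configuration change rather than a code change. A hypothetical override (the custom module path is illustrative):

custom_settings = {
    'SPIDER_LOADER_CLASS': 'myproject.loaders.MySpiderLoader',  # assumed path
    'DUPEFILTER_CLASS': 'scrapy.dupefilters.RFPDupeFilter',     # stock Scrapy default
}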