def from_crawler(cls, crawler):
    """Build a scheduler instance from the crawler's settings.

    Resolves the dupefilter, priority-queue and memory-queue classes
    named in the settings, instantiates the dupefilter, and returns a
    new scheduler bound to *crawler* (no job directory).
    """
    conf = crawler.settings
    df_class = load_object(conf['DUPEFILTER_CLASS'])
    df = create_instance(df_class, conf, crawler)
    priority_queue_cls = load_object(conf['SCHEDULER_PRIORITY_QUEUE'])
    memory_queue_cls = load_object(conf['SCHEDULER_MEMORY_QUEUE'])
    return cls(
        df,
        jobdir=None,
        pqclass=priority_queue_cls,
        mqclass=memory_queue_cls,
        crawler=crawler,
    )
def __init__(self, crawler):
    """Wire the scraper to *crawler*.

    Builds the spider middleware manager and the item processor named
    by the ``ITEM_PROCESSOR`` setting, and records the configured
    ``CONCURRENT_ITEMS`` limit.
    """
    self.crawler = crawler
    self.slot = None
    self.spidermw = SpiderMiddlewareManager.from_settings(crawler)
    processor_class = load_object(crawler.settings['ITEM_PROCESSOR'])
    self.itemproc = processor_class.from_settings(crawler)
    self.concurrent_items = crawler.settings.getint('CONCURRENT_ITEMS')
def start(self):
    """Start the Twisted reactor and block until the crawl finishes.

    Installs the DNS resolver class named by :setting:`DNS_RESOLVER`,
    resizes the reactor thread pool to
    :setting:`REACTOR_THREADPOOL_MAXSIZE`, registers :meth:`stop` as a
    pre-shutdown trigger, and runs the reactor (a blocking call).  The
    reactor is stopped once all crawlers have finished, via the
    deferred returned by :meth:`join`.
    """
    d = self.join()
    # join() may already have fired if every crawler is done; in that
    # case there is nothing left to run, so skip starting the reactor.
    if d.called:
        return
    d.addBoth(self._stop_reactor)
    resolver_class = load_object(self.settings["DNS_RESOLVER"])
    resolver = create_instance(resolver_class, self.settings, self, reactor=reactor)
    resolver.install_on_reactor()
    tp = reactor.getThreadPool()
    tp.adjustPoolsize(
        maxthreads=self.settings.getint('REACTOR_THREADPOOL_MAXSIZE'))
    # Make sure in-flight crawls are shut down cleanly on reactor stop.
    reactor.addSystemEventTrigger('before', 'shutdown', self.stop)
    reactor.run()  # blocking call
def __init__(self):
    """Load the bundled ``mime.types`` table and resolve CLASSES.

    Reads the ``mime.types`` data shipped inside the ``Jcrapy``
    package into a :class:`mimetypes.MimeTypes` instance, then maps
    each mimetype in ``CLASSES`` to its loaded response class.
    """
    self.mimetypes = MimeTypes()
    raw = get_data('Jcrapy', 'mime.types').decode('utf8')
    self.mimetypes.readfp(StringIO(raw))
    # Resolve every dotted class path once, up front.
    self.classes = {
        mimetype: load_object(path)
        for mimetype, path in self.CLASSES.items()
    }
def __init__(self, crawler, spider_closed_callback):
    """Wire the execution engine to *crawler*.

    Resolves the scheduler and downloader classes from the settings,
    instantiates the downloader and scraper, and stores the callback
    invoked when a spider closes.
    """
    self.crawler = crawler
    self.settings = crawler.settings
    # TODO: log-related wiring is not hooked up yet:
    #   self.signals = crawler.signals
    #   self.logformatter = crawler.logformatter
    self.slot = None
    self.spider = None
    self.running = False
    self.paused = False
    self.scheduler_cls = load_object(self.settings['SCHEDULER'])
    downloader_class = load_object(self.settings['DOWNLOADER'])
    self.downloader = downloader_class(crawler)
    self.scraper = Scraper(crawler)
    self._spider_closed_callback = spider_closed_callback
def from_settings(cls, crawler):
    """Build the middleware manager from the crawler's settings.

    Loads every class path returned by
    :meth:`_get_mwlist_from_settings` (``None``-valued entries are
    already filtered out there), instantiates each middleware, and
    returns a manager wrapping them in order.

    Any failure to load or instantiate a middleware propagates to the
    caller.  (The original ``try: ... except: raise`` wrapper was a
    no-op — a bare except that unconditionally re-raises — and has
    been removed; behavior is unchanged.)
    """
    # Get all the component class paths, None-valued ones removed.
    mwlist = cls._get_mwlist_from_settings(crawler.settings)
    middlewares = []
    for clspath in mwlist:
        mwcls = load_object(clspath)
        mw = create_instance(mwcls, crawler.settings, crawler)
        middlewares.append(mw)
    return cls(*middlewares)
def _load_handler(self, scheme, skip_lazy=False):
    """Instantiate, cache and return the download handler for *scheme*.

    When *skip_lazy* is true, handler classes that declare themselves
    lazy (a ``lazy`` attribute, defaulting to True when absent) are
    not instantiated and ``None`` is returned instead.
    """
    handler_cls = load_object(self._schemes[scheme])
    if skip_lazy and getattr(handler_cls, 'lazy', True):
        # Instantiation is deferred until the scheme is actually used.
        return None
    handler = create_instance(
        objcls=handler_cls,
        settings=self._crawler.settings,
        crawler=self._crawler,
    )
    self._handlers[scheme] = handler
    return handler
def __init__(self, settings, crawler=None):
    """Configure the HTTP download handler's connection pool and TLS.

    Sets up a persistent :class:`HTTPConnectionPool`, selects the TLS
    method from ``DOWNLOADER_CLIENT_TLS_METHOD``, and builds the
    client context factory named by
    ``DOWNLOADER_CLIENTCONTEXTFACTORY``.  Download size limits and the
    data-loss policy are read from the corresponding settings.
    """
    self._crawler = crawler
    self._pool = HTTPConnectionPool(reactor, persistent=True)
    self._pool.maxPersistentPerHost = settings.getint('CONCURRENT_REQUESTS_PER_DOMAIN')
    self._pool._factory.noisy = False
    self._sslMethod = openssl_methods[settings.get('DOWNLOADER_CLIENT_TLS_METHOD')]
    self._contextFactoryClass = load_object(settings['DOWNLOADER_CLIENTCONTEXTFACTORY'])
    # Try a method-aware context factory first.
    try:
        self._contextFactory = create_instance(
            objcls=self._contextFactoryClass,
            settings=settings,
            crawler=crawler,
            method=self._sslMethod,
        )
    except TypeError:
        # BUG FIX: the configured factory does not accept a `method`
        # argument.  Previously this branch only printed a message and
        # left self._contextFactory unset, causing an AttributeError
        # on first use; fall back to instantiating without `method`.
        self._contextFactory = create_instance(
            objcls=self._contextFactoryClass,
            settings=settings,
            crawler=crawler,
        )
        msg = """ '%s' does not accept `method` argument.""" % (settings['DOWNLOADER_CLIENTCONTEXTFACTORY'])
        print(msg)
    self._default_maxsize = settings.getint('DOWNLOAD_MAXSIZE')
    self._default_warnsize = settings.getint('DOWNLOAD_WARNSIZE')
    self._fail_on_dataloss = settings.getbool('DOWNLOAD_FAIL_ON_DATALOSS')
    self._disconnect_timeout = 1
def _get_spider_loader(self, settings):
    """Resolve ``SPIDER_LOADER_CLASS`` and build a loader from *settings*."""
    loader_class = load_object(settings.get('SPIDER_LOADER_CLASS'))
    return loader_class.from_settings(settings)