def __init__(self, settings, stats):
    """Set up the HTTP cache policy and storage from ``settings``.

    Raises ``NotConfigured`` when the ``HTTPCACHE_ENABLED`` setting is
    false. Otherwise the classes named by ``HTTPCACHE_POLICY`` and
    ``HTTPCACHE_STORAGE`` are loaded and each is instantiated with
    ``settings``; ``stats`` is stored as given.
    """
    cache_enabled = settings.getbool('HTTPCACHE_ENABLED')
    if not cache_enabled:
        raise NotConfigured

    policy_cls = load_object(settings['HTTPCACHE_POLICY'])
    self.policy = policy_cls(settings)

    storage_cls = load_object(settings['HTTPCACHE_STORAGE'])
    self.storage = storage_cls(settings)

    self.ignore_missing = settings.getbool('HTTPCACHE_IGNORE_MISSING')
    self.stats = stats
def __init__(self):
    """Load the bundled MIME table and resolve the class for each MIME type.

    ``mime.types`` is read from the ``dscrapy`` package data into
    ``self.mimetypes``; every path in ``self.CLASSES`` is resolved with
    ``load_object`` and stored in ``self.classes`` under its MIME type.
    """
    self.classes = {}

    self.mimetypes = MimeTypes()
    raw_table = get_data('dscrapy', 'mime.types')
    self.mimetypes.readfp(StringIO(raw_table))

    # Resolve lazily, one pair at a time, in CLASSES iteration order.
    self.classes.update(
        (mimetype, load_object(clspath))
        for mimetype, clspath in self.CLASSES.iteritems()
    )
def __init__(self, global_settings, global_signals):
    """Instantiate one download handler per configured URL scheme.

    The ``DOWNLOAD_HANDLERS_BASE`` mapping is overlaid with
    ``DOWNLOAD_HANDLERS``. Every scheme whose class path is not ``None``
    gets its class loaded and instantiated with ``global_settings``.

    Handlers that raise ``NotConfigured`` are not registered; the exception
    text is kept in ``self._notconfigured`` under the scheme. Successfully
    built handlers are stored in ``self._handlers``. Finally ``self._close``
    is connected to the ``engine_stopped`` signal.
    """
    self._handlers = {}
    self._notconfigured = {}

    # Merge into a private copy. Updating the object returned by the
    # settings in place could write the DOWNLOAD_HANDLERS overrides back
    # into the shared DOWNLOAD_HANDLERS_BASE mapping.
    handlers = dict(global_settings.get('DOWNLOAD_HANDLERS_BASE') or {})
    handlers.update(global_settings.get('DOWNLOAD_HANDLERS', {}) or {})

    for scheme, clspath in handlers.items():
        # Allow to disable a handler just like any other
        # component (extension, middleware, etc).
        if clspath is None:
            continue
        cls = load_object(clspath)
        try:
            dh = cls(global_settings)
        except NotConfigured as ex:
            self._notconfigured[scheme] = str(ex)
        else:
            self._handlers[scheme] = dh

    global_signals.connect(self._close, signals.engine_stopped)
def from_settings(cls, settings, signals=None, stats=None):
    """Build a manager from the middlewares listed in ``settings``.

    Each path returned by ``cls._get_mwlist_from_settings`` is loaded. A
    class exposing ``from_settings`` is built through
    ``from_settings(settings, signals, stats)``; any other class is called
    with no arguments. Components raising ``NotConfigured`` are skipped,
    and a warning is logged when the exception carries arguments. The
    names of the enabled components are logged at INFO level.

    Returns ``cls(*middlewares)`` with the successfully built instances.
    """
    paths = cls._get_mwlist_from_settings(settings)
    built = []
    for path in paths:
        try:
            component = load_object(path)
            if not hasattr(component, 'from_settings'):
                instance = component()
            else:
                instance = component.from_settings(settings, signals, stats)
        except NotConfigured as exc:
            # A NotConfigured without arguments disables silently.
            if not exc.args:
                continue
            clsname = path.split('.')[-1]
            log.msg(format="Disabled %(clsname)s: %(eargs)s",
                    level=log.WARNING, clsname=clsname, eargs=exc.args[0])
        else:
            built.append(instance)

    enabled_names = ', '.join(mw.__class__.__name__ for mw in built)
    log.msg(format="Enabled %(componentname)ss: %(enabledlist)s",
            level=log.INFO, componentname=cls.component_name,
            enabledlist=enabled_names)
    return cls(*built)
def __init__(self, settings):
    """Create the persistent connection pool and the TLS context factory.

    The pool keeps at most ``CONCURRENT_REQUESTS_PER_DOMAIN`` persistent
    connections per host and has its factory's ``noisy`` flag turned off.
    The context factory class is loaded from
    ``DOWNLOADER_CLIENTCONTEXTFACTORY`` and instantiated with no arguments.
    """
    self._pool = pool = HTTPConnectionPool(reactor, persistent=True)
    per_host_limit = settings.getint('CONCURRENT_REQUESTS_PER_DOMAIN')
    pool.maxPersistentPerHost = per_host_limit
    pool._factory.noisy = False

    factory_cls = load_object(settings['DOWNLOADER_CLIENTCONTEXTFACTORY'])
    self._contextFactoryClass = factory_cls
    self._contextFactory = factory_cls()
from dscrapy.downloader import Downloader
from dscrapy.http import Request


class DownloaderHandler:
    """Handler declared to implement ``DownloaderService.Iface``.

    It delegates every ``download`` call to the wrapped downloader.
    """

    implements(DownloaderService.Iface)

    def __init__(self, downloader):
        """Store ``downloader`` and start with an empty ``log`` dict."""
        self.log = {}
        self.downloader = downloader

    def download(self, r):
        """Print the URL of ``r`` and ask the downloader to fetch it."""
        # A single parenthesised expression prints the same text whether
        # ``print`` is a statement or a function.
        print('download( %s )' % (r.url))
        outgoing = Request(r.url)
        self.downloader.fetch(outgoing, None)


if __name__ == '__main__':
    # Shared components handed to the downloader.
    global_settings = DScrapySettings()
    global_signals = SignalManager()
    stats_cls = load_object(global_settings['STATS_CLASS'])
    global_stats = stats_cls(global_settings)

    downloader = Downloader(global_settings, global_signals, global_stats)
    handler = DownloaderHandler(downloader)

    # Expose the handler over the binary protocol on localhost:9090.
    processor = DownloaderService.Processor(handler)
    pfactory = TBinaryProtocol.TBinaryProtocolFactory()
    server_factory = TTwisted.ThriftServerFactory(processor, pfactory)
    server = reactor.listenTCP(9090, server_factory, interface="127.0.0.1")
    reactor.run()
def __init__(self, settings):
    """Resolve the HTTP client and client-context factory classes.

    The dotted paths come from the ``DOWNLOADER_HTTPCLIENTFACTORY`` and
    ``DOWNLOADER_CLIENTCONTEXTFACTORY`` settings; the loaded objects are
    stored without being instantiated.
    """
    client_factory_path = settings['DOWNLOADER_HTTPCLIENTFACTORY']
    self.HTTPClientFactory = load_object(client_factory_path)

    context_factory_path = settings['DOWNLOADER_CLIENTCONTEXTFACTORY']
    self.ClientContextFactory = load_object(context_factory_path)