def __init__(self, settings):
    """Build the shared connection pool and TLS context factory from settings."""
    # Persistent pool so keep-alive connections are reused across requests.
    self._pool = HTTPConnectionPool(reactor, persistent=True)
    # Cap cached connections per host at the configured per-domain concurrency.
    self._pool.maxPersistentPerHost = settings.getint(
        'CONCURRENT_REQUESTS_PER_DOMAIN')
    # noisy=False silences Twisted's factory start/stop log lines
    # (NOTE(review): reaches into a private pool attribute).
    self._pool._factory.noisy = False
    # Resolve the configured TLS method name to an OpenSSL method constant.
    self._sslMethod = openssl_methods[settings.get(
        'DOWNLOADER_CLIENT_TLS_METHOD')]
    self._contextFactoryClass = load_object(
        settings['DOWNLOADER_CLIENTCONTEXTFACTORY'])
    # try method-aware context factory
    try:
        self._contextFactory = self._contextFactoryClass(
            method=self._sslMethod)
    except TypeError:
        # use context factory defaults: the configured class predates the
        # `method` keyword, so fall back and warn the user to upgrade it.
        self._contextFactory = self._contextFactoryClass()
        msg = """ '%s' does not accept `method` argument (type OpenSSL.SSL method,\
 e.g. OpenSSL.SSL.SSLv23_METHOD).\
 Please upgrade your context factory class to handle it or ignore it.""" % (
            settings['DOWNLOADER_CLIENTCONTEXTFACTORY'],)
        warnings.warn(msg)
    # Defaults for per-spider overridable download size limits.
    self._default_maxsize = settings.getint('DOWNLOAD_MAXSIZE')
    self._default_warnsize = settings.getint('DOWNLOAD_WARNSIZE')
    # Seconds before a close() deferred is forced to fire.
    self._disconnect_timeout = 1
def __init__(self, settings):
    """Prepare a persistent connection pool and the client TLS context factory."""
    pool = HTTPConnectionPool(reactor, persistent=True)
    pool.maxPersistentPerHost = settings.getint('CONCURRENT_REQUESTS_PER_DOMAIN')
    # Quiet Twisted's per-connection factory logging (private attribute).
    pool._factory.noisy = False
    self._pool = pool
    self._contextFactoryClass = load_object(settings['DOWNLOADER_CLIENTCONTEXTFACTORY'])
    self._contextFactory = self._contextFactoryClass()
class HTTP11DownloadHandler(object):
    """Download handler performing HTTP/1.1 requests over a persistent
    Twisted connection pool."""

    def __init__(self, settings):
        # Persistent pool so keep-alive connections are reused across requests.
        self._pool = HTTPConnectionPool(reactor, persistent=True)
        self._pool.maxPersistentPerHost = settings.getint('CONCURRENT_REQUESTS_PER_DOMAIN')
        # Silence Twisted's factory start/stop log lines (private attribute).
        self._pool._factory.noisy = False
        self._contextFactoryClass = load_object(settings['DOWNLOADER_CLIENTCONTEXTFACTORY'])
        self._contextFactory = self._contextFactoryClass()
        # Seconds before close() forces its deferred to fire.
        self._disconnect_timeout = 1

    def download_request(self, request, spider):
        """Return a deferred for the HTTP download"""
        agent = ScrapyAgent(contextFactory=self._contextFactory, pool=self._pool)
        return agent.download_request(request)

    def close(self):
        # Returns a deferred that fires once cached connections are closed,
        # or after `_disconnect_timeout` seconds, whichever comes first.
        d = self._pool.closeCachedConnections()
        # closeCachedConnections will hang on network or server issues, so
        # we'll manually timeout the deferred.
        #
        # Twisted issue addressing this problem can be found here:
        # https://twistedmatrix.com/trac/ticket/7738.
        #
        # closeCachedConnections doesn't handle external errbacks, so we'll
        # issue a callback after `_disconnect_timeout` seconds.
        delayed_call = reactor.callLater(self._disconnect_timeout, d.callback, [])

        def cancel_delayed_call(result):
            # If the pool finished on its own, drop the pending forced fire.
            if delayed_call.active():
                delayed_call.cancel()
            return result

        d.addBoth(cancel_delayed_call)
        return d
def __init__(self, settings):
    """Create the shared keep-alive pool, TLS factory, and shutdown timeout."""
    pool = HTTPConnectionPool(reactor, persistent=True)
    pool.maxPersistentPerHost = settings.getint('CONCURRENT_REQUESTS_PER_DOMAIN')
    pool._factory.noisy = False  # suppress Twisted factory log chatter
    self._pool = pool
    factory_cls = load_object(settings['DOWNLOADER_CLIENTCONTEXTFACTORY'])
    self._contextFactoryClass = factory_cls
    self._contextFactory = factory_cls()
    # Seconds to wait for pool shutdown before forcing the close deferred.
    self._disconnect_timeout = 1
class HTTP11DownloadHandler(object):
    """HTTP/1.1 download handler: persistent connection pool, configurable
    TLS method, per-spider download size limits, and bounded shutdown."""

    def __init__(self, settings):
        self._pool = HTTPConnectionPool(reactor, persistent=True)
        # Cap cached connections per host at the per-domain concurrency.
        self._pool.maxPersistentPerHost = settings.getint(
            'CONCURRENT_REQUESTS_PER_DOMAIN')
        # Silence Twisted's factory start/stop log lines (private attribute).
        self._pool._factory.noisy = False
        # Resolve the configured TLS method name to an OpenSSL method constant.
        self._sslMethod = openssl_methods[settings.get(
            'DOWNLOADER_CLIENT_TLS_METHOD')]
        self._contextFactoryClass = load_object(
            settings['DOWNLOADER_CLIENTCONTEXTFACTORY'])
        # try method-aware context factory
        try:
            self._contextFactory = self._contextFactoryClass(
                method=self._sslMethod)
        except TypeError:
            # use context factory defaults: the configured class predates the
            # `method` keyword, so fall back and warn the user to upgrade it.
            self._contextFactory = self._contextFactoryClass()
            msg = """ '%s' does not accept `method` argument (type OpenSSL.SSL method,\
 e.g. OpenSSL.SSL.SSLv23_METHOD).\
 Please upgrade your context factory class to handle it or ignore it.""" % (
                settings['DOWNLOADER_CLIENTCONTEXTFACTORY'],)
            warnings.warn(msg)
        # Defaults for per-spider overridable download size limits.
        self._default_maxsize = settings.getint('DOWNLOAD_MAXSIZE')
        self._default_warnsize = settings.getint('DOWNLOAD_WARNSIZE')
        # Seconds before close() forces its deferred to fire.
        self._disconnect_timeout = 1

    def download_request(self, request, spider):
        """Return a deferred for the HTTP download"""
        # Spider attributes override the settings-derived size limits.
        agent = ScrapyAgent(contextFactory=self._contextFactory, pool=self._pool,
            maxsize=getattr(spider, 'download_maxsize', self._default_maxsize),
            warnsize=getattr(spider, 'download_warnsize', self._default_warnsize))
        return agent.download_request(request)

    def close(self):
        d = self._pool.closeCachedConnections()
        # closeCachedConnections will hang on network or server issues, so
        # we'll manually timeout the deferred.
        #
        # Twisted issue addressing this problem can be found here:
        # https://twistedmatrix.com/trac/ticket/7738.
        #
        # closeCachedConnections doesn't handle external errbacks, so we'll
        # issue a callback after `_disconnect_timeout` seconds.
        delayed_call = reactor.callLater(self._disconnect_timeout, d.callback, [])

        def cancel_delayed_call(result):
            # If the pool finished on its own, drop the pending forced fire.
            if delayed_call.active():
                delayed_call.cancel()
            return result

        d.addBoth(cancel_delayed_call)
        return d
class HTTP11DownloadHandler(object):
    """HTTP/1.1 download handler with configurable TLS method, per-spider
    size limits, and a timeout-bounded shutdown of the connection pool."""

    def __init__(self, settings):
        # Persistent pool so keep-alive connections are reused across requests.
        self._pool = HTTPConnectionPool(reactor, persistent=True)
        self._pool.maxPersistentPerHost = settings.getint('CONCURRENT_REQUESTS_PER_DOMAIN')
        # Silence Twisted's factory start/stop log lines (private attribute).
        self._pool._factory.noisy = False
        # Resolve the configured TLS method name to an OpenSSL method constant.
        self._sslMethod = openssl_methods[settings.get('DOWNLOADER_CLIENT_TLS_METHOD')]
        self._contextFactoryClass = load_object(settings['DOWNLOADER_CLIENTCONTEXTFACTORY'])
        # try method-aware context factory
        try:
            self._contextFactory = self._contextFactoryClass(method=self._sslMethod)
        except TypeError:
            # use context factory defaults: the configured class predates the
            # `method` keyword, so fall back and warn the user to upgrade it.
            self._contextFactory = self._contextFactoryClass()
            msg = """ '%s' does not accept `method` argument (type OpenSSL.SSL method,\
 e.g. OpenSSL.SSL.SSLv23_METHOD).\
 Please upgrade your context factory class to handle it or ignore it.""" % (
                settings['DOWNLOADER_CLIENTCONTEXTFACTORY'],)
            warnings.warn(msg)
        # Defaults for per-spider overridable download size limits.
        self._default_maxsize = settings.getint('DOWNLOAD_MAXSIZE')
        self._default_warnsize = settings.getint('DOWNLOAD_WARNSIZE')
        # Seconds before close() forces its deferred to fire.
        self._disconnect_timeout = 1

    def download_request(self, request, spider):
        """Return a deferred for the HTTP download"""
        # Spider attributes override the settings-derived size limits.
        agent = ScrapyAgent(contextFactory=self._contextFactory, pool=self._pool,
            maxsize=getattr(spider, 'download_maxsize', self._default_maxsize),
            warnsize=getattr(spider, 'download_warnsize', self._default_warnsize))
        return agent.download_request(request)

    def close(self):
        d = self._pool.closeCachedConnections()
        # closeCachedConnections will hang on network or server issues, so
        # we'll manually timeout the deferred.
        #
        # Twisted issue addressing this problem can be found here:
        # https://twistedmatrix.com/trac/ticket/7738.
        #
        # closeCachedConnections doesn't handle external errbacks, so we'll
        # issue a callback after `_disconnect_timeout` seconds.
        delayed_call = reactor.callLater(self._disconnect_timeout, d.callback, [])

        def cancel_delayed_call(result):
            # If the pool finished on its own, drop the pending forced fire.
            if delayed_call.active():
                delayed_call.cancel()
            return result

        d.addBoth(cancel_delayed_call)
        return d
class HTTP11DownloadHandler(object):
    """Minimal HTTP/1.1 download handler backed by a persistent connection pool."""

    def __init__(self, settings):
        self._pool = HTTPConnectionPool(reactor, persistent=True)
        factory_cls = load_object(settings['DOWNLOADER_CLIENTCONTEXTFACTORY'])
        self._contextFactoryClass = factory_cls
        self._contextFactory = factory_cls()

    def download_request(self, request, spider):
        """Return a deferred that fires with the downloaded response."""
        return ScrapyAgent(
            contextFactory=self._contextFactory, pool=self._pool
        ).download_request(request)

    def close(self):
        """Tear down all cached keep-alive connections."""
        return self._pool.closeCachedConnections()
class HTTP11DownloadHandler(object):
    """Downloads HTTP/1.1 requests through a shared persistent pool."""

    def __init__(self, settings):
        self._pool = HTTPConnectionPool(reactor, persistent=True)
        self._contextFactoryClass = load_object(
            settings['DOWNLOADER_CLIENTCONTEXTFACTORY'])
        self._contextFactory = self._contextFactoryClass()

    def download_request(self, request, spider):
        """Return a deferred for the HTTP download"""
        agent = ScrapyAgent(contextFactory=self._contextFactory,
                            pool=self._pool)
        return agent.download_request(request)

    def close(self):
        """Close every cached keep-alive connection in the pool."""
        return self._pool.closeCachedConnections()
class HTTP11DownloadHandler(object):
    """HTTP/1.1 download handler; reuses keep-alive connections via a pool."""

    def __init__(self, settings):
        pool = HTTPConnectionPool(reactor, persistent=True)
        pool.maxPersistentPerHost = settings.getint("CONCURRENT_REQUESTS_PER_DOMAIN")
        pool._factory.noisy = False  # suppress Twisted factory log chatter
        self._pool = pool
        self._contextFactoryClass = load_object(settings["DOWNLOADER_CLIENTCONTEXTFACTORY"])
        self._contextFactory = self._contextFactoryClass()

    def download_request(self, request, spider):
        """Return a deferred that fires with the downloaded response."""
        agent = ScrapyAgent(contextFactory=self._contextFactory, pool=self._pool)
        return agent.download_request(request)

    def close(self):
        """Drop all cached connections held by the pool."""
        return self._pool.closeCachedConnections()
class MyDownloadHandler(object):
    """Download interface invoked by the upper (framework) layer."""

    def __init__(self, settings):
        pool = HTTPConnectionPool(reactor, persistent=True)
        pool.maxPersistentPerHost = settings.getint('CONCURRENT_REQUESTS_PER_DOMAIN')
        pool._factory.noisy = False  # suppress Twisted factory log chatter
        self._pool = pool
        self._contextFactoryClass = load_object(settings['DOWNLOADER_CLIENTCONTEXTFACTORY'])
        self._contextFactory = self._contextFactoryClass()

    def download_request(self, request, spider):
        """Main (asynchronous) download entry point; returns a Twisted deferred."""
        downloader = MyLogicDownloader()
        return downloader.download(request)

    def close(self):
        """Close all cached keep-alive connections."""
        return self._pool.closeCachedConnections()
class HTTP11DownloadHandler(object):
    """HTTP/1.1 download handler with per-spider size limits and a
    timeout-bounded pool shutdown."""

    def __init__(self, settings):
        pool = HTTPConnectionPool(reactor, persistent=True)
        pool.maxPersistentPerHost = settings.getint(
            'CONCURRENT_REQUESTS_PER_DOMAIN')
        pool._factory.noisy = False  # suppress Twisted factory log chatter
        self._pool = pool
        self._contextFactoryClass = load_object(
            settings['DOWNLOADER_CLIENTCONTEXTFACTORY'])
        self._contextFactory = self._contextFactoryClass()
        # Defaults; spiders may override via download_maxsize/download_warnsize.
        self._default_maxsize = settings.getint('DOWNLOAD_MAXSIZE')
        self._default_warnsize = settings.getint('DOWNLOAD_WARNSIZE')
        # Seconds to wait for pool shutdown before forcing the close deferred.
        self._disconnect_timeout = 1

    def download_request(self, request, spider):
        """Return a deferred for the HTTP download"""
        maxsize = getattr(spider, 'download_maxsize', self._default_maxsize)
        warnsize = getattr(spider, 'download_warnsize', self._default_warnsize)
        agent = ScrapyAgent(contextFactory=self._contextFactory,
                            pool=self._pool,
                            maxsize=maxsize,
                            warnsize=warnsize)
        return agent.download_request(request)

    def close(self):
        """Close cached connections, forcing completion after a timeout.

        closeCachedConnections can hang on network/server issues
        (https://twistedmatrix.com/trac/ticket/7738) and does not honour
        external errbacks, so we fire its deferred ourselves after
        `_disconnect_timeout` seconds if it has not completed.
        """
        d = self._pool.closeCachedConnections()
        timeout_call = reactor.callLater(self._disconnect_timeout, d.callback, [])

        def _cancel_timeout(result):
            # Pool finished on its own: drop the pending forced callback.
            if timeout_call.active():
                timeout_call.cancel()
            return result

        d.addBoth(_cancel_timeout)
        return d
class SeleniumDownloadHandler(object):
    """Download interface that delegates fetching to a Selenium-based downloader."""

    def __init__(self, settings):
        pool = HTTPConnectionPool(reactor, persistent=True)
        pool.maxPersistentPerHost = settings.getint('CONCURRENT_REQUESTS_PER_DOMAIN')
        pool._factory.noisy = False  # suppress Twisted factory log chatter
        self._pool = pool
        self._contextFactoryClass = load_object(settings['DOWNLOADER_CLIENTCONTEXTFACTORY'])
        self._contextFactory = self._contextFactoryClass()
        # Seconds reserved for bounding disconnects (kept for parity with
        # other handlers; not consumed by close() here).
        self._disconnect_timeout = 1

    def download_request(self, request, spider):
        """Hand the request to the Selenium logic downloader."""
        downloader = SeleniumLogicDownloader()
        return downloader.download(request)

    def close(self):
        """Close all cached keep-alive connections."""
        return self._pool.closeCachedConnections()
def __init__(self, settings):
    """Configure pool, TLS method, context factory, and size limits."""
    # Persistent pool so keep-alive connections are reused across requests.
    self._pool = HTTPConnectionPool(reactor, persistent=True)
    self._pool.maxPersistentPerHost = settings.getint('CONCURRENT_REQUESTS_PER_DOMAIN')
    # Silence Twisted's factory start/stop log lines (private attribute).
    self._pool._factory.noisy = False
    # Resolve the configured TLS method name to an OpenSSL method constant.
    self._sslMethod = openssl_methods[settings.get('DOWNLOADER_CLIENT_TLS_METHOD')]
    self._contextFactoryClass = load_object(settings['DOWNLOADER_CLIENTCONTEXTFACTORY'])
    # try method-aware context factory
    try:
        self._contextFactory = self._contextFactoryClass(method=self._sslMethod)
    except TypeError:
        # use context factory defaults: the configured class predates the
        # `method` keyword, so fall back and warn the user to upgrade it.
        self._contextFactory = self._contextFactoryClass()
        msg = """ '%s' does not accept `method` argument (type OpenSSL.SSL method,\
 e.g. OpenSSL.SSL.SSLv23_METHOD).\
 Please upgrade your context factory class to handle it or ignore it.""" % (
            settings['DOWNLOADER_CLIENTCONTEXTFACTORY'],)
        warnings.warn(msg)
    # Defaults for per-spider overridable download size limits.
    self._default_maxsize = settings.getint('DOWNLOAD_MAXSIZE')
    self._default_warnsize = settings.getint('DOWNLOAD_WARNSIZE')
    # Seconds before close() forces its deferred to fire.
    self._disconnect_timeout = 1
class AntDownloadHandler(object):
    """Download handler that reuses one agent and adds keep-alive headers
    when the pool is persistent."""

    def __init__(self, settings):
        pool = HTTPConnectionPool(reactor, persistent=True)
        pool.maxPersistentPerHost = settings.getint(
            'CONCURRENT_REQUESTS_PER_DOMAIN')
        pool._factory.noisy = False  # suppress Twisted factory log chatter
        self._pool = pool
        self._contextFactoryClass = load_object(
            settings['DOWNLOADER_CLIENTCONTEXTFACTORY'])
        self._contextFactory = self._contextFactoryClass()
        # Single agent shared by every request on this handler.
        self._agent = AntAgent(contextFactory=self._contextFactory,
                               pool=self._pool)

    def download_request(self, request, spider):
        """Download via the shared agent, hinting keep-alive where missing."""
        headers = request.headers
        if self._pool.persistent:
            if not headers.get('Connection'):
                headers.appendlist('Connection', 'keep-alive')
            # Proxied requests also get a Proxy-Connection keep-alive hint.
            if request.meta.get('proxy') and not headers.get('Proxy-Connection'):
                headers.appendlist('Proxy-Connection', 'keep-alive')
        return self._agent.download_request(request)

    def close(self):
        """Close all cached keep-alive connections."""
        return self._pool.closeCachedConnections()
class HTTP11DownloadHandler(object):
    """HTTP/1.1 download handler with per-spider download size limits."""

    def __init__(self, settings):
        pool = HTTPConnectionPool(reactor, persistent=True)
        pool.maxPersistentPerHost = settings.getint(
            'CONCURRENT_REQUESTS_PER_DOMAIN')
        pool._factory.noisy = False  # suppress Twisted factory log chatter
        self._pool = pool
        self._contextFactoryClass = load_object(
            settings['DOWNLOADER_CLIENTCONTEXTFACTORY'])
        self._contextFactory = self._contextFactoryClass()
        # Defaults; spiders may override via download_maxsize/download_warnsize.
        self._default_maxsize = settings.getint('DOWNLOAD_MAXSIZE')
        self._default_warnsize = settings.getint('DOWNLOAD_WARNSIZE')

    def download_request(self, request, spider):
        """Return a deferred for the HTTP download"""
        maxsize = getattr(spider, 'download_maxsize', self._default_maxsize)
        warnsize = getattr(spider, 'download_warnsize', self._default_warnsize)
        agent = ScrapyAgent(contextFactory=self._contextFactory,
                            pool=self._pool,
                            maxsize=maxsize,
                            warnsize=warnsize)
        return agent.download_request(request)

    def close(self):
        """Close all cached keep-alive connections."""
        return self._pool.closeCachedConnections()
def __init__(self, settings):
    """Create the persistent pool and instantiate the TLS context factory."""
    pool = HTTPConnectionPool(reactor, persistent=True)
    pool._factory.noisy = False  # suppress Twisted factory log chatter
    self._pool = pool
    factory_cls = load_object(settings['DOWNLOADER_CLIENTCONTEXTFACTORY'])
    self._contextFactoryClass = factory_cls
    self._contextFactory = factory_cls()
def __init__(self, settings):
    """Set up the connection pool and the configured client context factory."""
    self._pool = HTTPConnectionPool(reactor, persistent=True)
    factory_cls = load_object(settings['DOWNLOADER_CLIENTCONTEXTFACTORY'])
    self._contextFactoryClass = factory_cls
    self._contextFactory = factory_cls()