Beispiel #1
0
    def __init__(self, settings):
        self._pool = HTTPConnectionPool(reactor, persistent=True)
        self._pool.maxPersistentPerHost = settings.getint(
            'CONCURRENT_REQUESTS_PER_DOMAIN')
        self._pool._factory.noisy = False

        self._sslMethod = openssl_methods[settings.get(
            'DOWNLOADER_CLIENT_TLS_METHOD')]
        self._contextFactoryClass = load_object(
            settings['DOWNLOADER_CLIENTCONTEXTFACTORY'])
        # try method-aware context factory
        try:
            self._contextFactory = self._contextFactoryClass(
                method=self._sslMethod)
        except TypeError:
            # use context factory defaults
            self._contextFactory = self._contextFactoryClass()
            msg = """
 '%s' does not accept `method` argument (type OpenSSL.SSL method,\
 e.g. OpenSSL.SSL.SSLv23_METHOD).\
 Please upgrade your context factory class to handle it or ignore it.""" % (
                settings['DOWNLOADER_CLIENTCONTEXTFACTORY'], )
            warnings.warn(msg)
        self._default_maxsize = settings.getint('DOWNLOAD_MAXSIZE')
        self._default_warnsize = settings.getint('DOWNLOAD_WARNSIZE')
        self._disconnect_timeout = 1
Beispiel #2
0
 def __init__(self, settings):
     self._pool = HTTPConnectionPool(reactor, persistent=True)
     self._pool.maxPersistentPerHost = settings.getint(
         'CONCURRENT_REQUESTS_PER_DOMAIN')
     self._pool._factory.noisy = False
     self._contextFactoryClass = load_object(
         settings['DOWNLOADER_CLIENTCONTEXTFACTORY'])
     self._contextFactory = self._contextFactoryClass()
Beispiel #3
0
class HTTP11DownloadHandler(object):

    def __init__(self, settings):
        self._pool = HTTPConnectionPool(reactor, persistent=True)
        self._pool.maxPersistentPerHost = settings.getint('CONCURRENT_REQUESTS_PER_DOMAIN')
        self._pool._factory.noisy = False
        self._contextFactoryClass = load_object(settings['DOWNLOADER_CLIENTCONTEXTFACTORY'])
        self._contextFactory = self._contextFactoryClass()
        self._disconnect_timeout = 1

    def download_request(self, request, spider):
        """Return a deferred for the HTTP download"""
        agent = ScrapyAgent(contextFactory=self._contextFactory, pool=self._pool)
        return agent.download_request(request)

    def close(self):
        d = self._pool.closeCachedConnections()
        # closeCachedConnections will hang on network or server issues, so
        # we'll manually timeout the deferred.
        #
        # Twisted issue addressing this problem can be found here:
        # https://twistedmatrix.com/trac/ticket/7738.
        #
        # closeCachedConnections doesn't handle external errbacks, so we'll
        # issue a callback after `_disconnect_timeout` seconds.
        delayed_call = reactor.callLater(self._disconnect_timeout, d.callback, [])

        def cancel_delayed_call(result):
            if delayed_call.active():
                delayed_call.cancel()
            return result

        d.addBoth(cancel_delayed_call)
        return d
Beispiel #4
0
 def __init__(self, settings):
     self._pool = HTTPConnectionPool(reactor, persistent=True)
     self._pool.maxPersistentPerHost = settings.getint('CONCURRENT_REQUESTS_PER_DOMAIN')
     self._pool._factory.noisy = False
     self._contextFactoryClass = load_object(settings['DOWNLOADER_CLIENTCONTEXTFACTORY'])
     self._contextFactory = self._contextFactoryClass()
     self._disconnect_timeout = 1
Beispiel #5
0
class HTTP11DownloadHandler(object):
    def __init__(self, settings):
        self._pool = HTTPConnectionPool(reactor, persistent=True)
        self._pool.maxPersistentPerHost = settings.getint(
            'CONCURRENT_REQUESTS_PER_DOMAIN')
        self._pool._factory.noisy = False

        self._sslMethod = openssl_methods[settings.get(
            'DOWNLOADER_CLIENT_TLS_METHOD')]
        self._contextFactoryClass = load_object(
            settings['DOWNLOADER_CLIENTCONTEXTFACTORY'])
        # try method-aware context factory
        try:
            self._contextFactory = self._contextFactoryClass(
                method=self._sslMethod)
        except TypeError:
            # use context factory defaults
            self._contextFactory = self._contextFactoryClass()
            msg = """
 '%s' does not accept `method` argument (type OpenSSL.SSL method,\
 e.g. OpenSSL.SSL.SSLv23_METHOD).\
 Please upgrade your context factory class to handle it or ignore it.""" % (
                settings['DOWNLOADER_CLIENTCONTEXTFACTORY'], )
            warnings.warn(msg)
        self._default_maxsize = settings.getint('DOWNLOAD_MAXSIZE')
        self._default_warnsize = settings.getint('DOWNLOAD_WARNSIZE')
        self._disconnect_timeout = 1

    def download_request(self, request, spider):
        """Return a deferred for the HTTP download"""
        agent = ScrapyAgent(contextFactory=self._contextFactory,
                            pool=self._pool,
                            maxsize=getattr(spider, 'download_maxsize',
                                            self._default_maxsize),
                            warnsize=getattr(spider, 'download_warnsize',
                                             self._default_warnsize))
        return agent.download_request(request)

    def close(self):
        d = self._pool.closeCachedConnections()
        # closeCachedConnections will hang on network or server issues, so
        # we'll manually timeout the deferred.
        #
        # Twisted issue addressing this problem can be found here:
        # https://twistedmatrix.com/trac/ticket/7738.
        #
        # closeCachedConnections doesn't handle external errbacks, so we'll
        # issue a callback after `_disconnect_timeout` seconds.
        delayed_call = reactor.callLater(self._disconnect_timeout, d.callback,
                                         [])

        def cancel_delayed_call(result):
            if delayed_call.active():
                delayed_call.cancel()
            return result

        d.addBoth(cancel_delayed_call)
        return d
Beispiel #6
0
class HTTP11DownloadHandler(object):

    def __init__(self, settings):
        self._pool = HTTPConnectionPool(reactor, persistent=True)
        self._pool.maxPersistentPerHost = settings.getint('CONCURRENT_REQUESTS_PER_DOMAIN')
        self._pool._factory.noisy = False

        self._sslMethod = openssl_methods[settings.get('DOWNLOADER_CLIENT_TLS_METHOD')]
        self._contextFactoryClass = load_object(settings['DOWNLOADER_CLIENTCONTEXTFACTORY'])
        # try method-aware context factory
        try:
            self._contextFactory = self._contextFactoryClass(method=self._sslMethod)
        except TypeError:
            # use context factory defaults
            self._contextFactory = self._contextFactoryClass()
            msg = """
 '%s' does not accept `method` argument (type OpenSSL.SSL method,\
 e.g. OpenSSL.SSL.SSLv23_METHOD).\
 Please upgrade your context factory class to handle it or ignore it.""" % (
                settings['DOWNLOADER_CLIENTCONTEXTFACTORY'],)
            warnings.warn(msg)
        self._default_maxsize = settings.getint('DOWNLOAD_MAXSIZE')
        self._default_warnsize = settings.getint('DOWNLOAD_WARNSIZE')
        self._disconnect_timeout = 1

    def download_request(self, request, spider):
        """Return a deferred for the HTTP download"""
        agent = ScrapyAgent(contextFactory=self._contextFactory, pool=self._pool,
            maxsize=getattr(spider, 'download_maxsize', self._default_maxsize),
            warnsize=getattr(spider, 'download_warnsize', self._default_warnsize))
        return agent.download_request(request)

    def close(self):
        d = self._pool.closeCachedConnections()
        # closeCachedConnections will hang on network or server issues, so
        # we'll manually timeout the deferred.
        #
        # Twisted issue addressing this problem can be found here:
        # https://twistedmatrix.com/trac/ticket/7738.
        #
        # closeCachedConnections doesn't handle external errbacks, so we'll
        # issue a callback after `_disconnect_timeout` seconds.
        delayed_call = reactor.callLater(self._disconnect_timeout, d.callback, [])

        def cancel_delayed_call(result):
            if delayed_call.active():
                delayed_call.cancel()
            return result

        d.addBoth(cancel_delayed_call)
        return d
Beispiel #7
0
class HTTP11DownloadHandler(object):

    def __init__(self, settings):
        self._pool = HTTPConnectionPool(reactor, persistent=True)
        self._contextFactoryClass = load_object(settings['DOWNLOADER_CLIENTCONTEXTFACTORY'])
        self._contextFactory = self._contextFactoryClass()

    def download_request(self, request, spider):
        """Return a deferred for the HTTP download"""
        agent = ScrapyAgent(contextFactory=self._contextFactory, pool=self._pool)
        return agent.download_request(request)

    def close(self):
        return self._pool.closeCachedConnections()
Beispiel #8
0
class HTTP11DownloadHandler(object):
    def __init__(self, settings):
        self._pool = HTTPConnectionPool(reactor, persistent=True)
        self._contextFactoryClass = load_object(
            settings['DOWNLOADER_CLIENTCONTEXTFACTORY'])
        self._contextFactory = self._contextFactoryClass()

    def download_request(self, request, spider):
        """Return a deferred for the HTTP download"""
        agent = ScrapyAgent(contextFactory=self._contextFactory,
                            pool=self._pool)
        return agent.download_request(request)

    def close(self):
        return self._pool.closeCachedConnections()
Beispiel #9
0
class HTTP11DownloadHandler(object):
    def __init__(self, settings):
        self._pool = HTTPConnectionPool(reactor, persistent=True)
        self._pool.maxPersistentPerHost = settings.getint("CONCURRENT_REQUESTS_PER_DOMAIN")
        self._pool._factory.noisy = False
        self._contextFactoryClass = load_object(settings["DOWNLOADER_CLIENTCONTEXTFACTORY"])
        self._contextFactory = self._contextFactoryClass()

    def download_request(self, request, spider):
        """Return a deferred for the HTTP download"""
        agent = ScrapyAgent(contextFactory=self._contextFactory, pool=self._pool)
        return agent.download_request(request)

    def close(self):
        return self._pool.closeCachedConnections()
Beispiel #10
0
class MyDownloadHandler(object):
    '''下载接口, 被上层所调用'''
    def __init__(self, settings):
        self._pool = HTTPConnectionPool(reactor, persistent=True)
        self._pool.maxPersistentPerHost = settings.getint('CONCURRENT_REQUESTS_PER_DOMAIN')
        self._pool._factory.noisy = False
        self._contextFactoryClass = load_object(settings['DOWNLOADER_CLIENTCONTEXTFACTORY'])
        self._contextFactory = self._contextFactoryClass()

    def download_request(self, request, spider):
        '''下载的主要被调用接口(异步),返回 deferred (twisted 的延迟回调对象)'''
        myDownloader = MyLogicDownloader()
        return myDownloader.download(request)

    def close(self):
        return self._pool.closeCachedConnections()
Beispiel #11
0
class HTTP11DownloadHandler(object):
    def __init__(self, settings):
        self._pool = HTTPConnectionPool(reactor, persistent=True)
        self._pool.maxPersistentPerHost = settings.getint(
            'CONCURRENT_REQUESTS_PER_DOMAIN')
        self._pool._factory.noisy = False
        self._contextFactoryClass = load_object(
            settings['DOWNLOADER_CLIENTCONTEXTFACTORY'])
        self._contextFactory = self._contextFactoryClass()
        self._default_maxsize = settings.getint('DOWNLOAD_MAXSIZE')
        self._default_warnsize = settings.getint('DOWNLOAD_WARNSIZE')
        self._disconnect_timeout = 1

    def download_request(self, request, spider):
        """Return a deferred for the HTTP download"""
        agent = ScrapyAgent(contextFactory=self._contextFactory,
                            pool=self._pool,
                            maxsize=getattr(spider, 'download_maxsize',
                                            self._default_maxsize),
                            warnsize=getattr(spider, 'download_warnsize',
                                             self._default_warnsize))
        return agent.download_request(request)

    def close(self):
        d = self._pool.closeCachedConnections()
        # closeCachedConnections will hang on network or server issues, so
        # we'll manually timeout the deferred.
        #
        # Twisted issue addressing this problem can be found here:
        # https://twistedmatrix.com/trac/ticket/7738.
        #
        # closeCachedConnections doesn't handle external errbacks, so we'll
        # issue a callback after `_disconnect_timeout` seconds.
        delayed_call = reactor.callLater(self._disconnect_timeout, d.callback,
                                         [])

        def cancel_delayed_call(result):
            if delayed_call.active():
                delayed_call.cancel()
            return result

        d.addBoth(cancel_delayed_call)
        return d
Beispiel #12
0
class SeleniumDownloadHandler(object):
    """
    download interface
    """

    def __init__(self, settings):
        self._pool = HTTPConnectionPool(reactor, persistent=True)
        self._pool.maxPersistentPerHost = settings.getint('CONCURRENT_REQUESTS_PER_DOMAIN')
        self._pool._factory.noisy = False
        self._contextFactoryClass = load_object(settings['DOWNLOADER_CLIENTCONTEXTFACTORY'])
        self._contextFactory = self._contextFactoryClass()
        self._disconnect_timeout = 1

    def download_request(self, request, spider):
        myDownloader = SeleniumLogicDownloader()
        return myDownloader.download(request)

    def close(self):
        return self._pool.closeCachedConnections()
Beispiel #13
0
    def __init__(self, settings):
        self._pool = HTTPConnectionPool(reactor, persistent=True)
        self._pool.maxPersistentPerHost = settings.getint('CONCURRENT_REQUESTS_PER_DOMAIN')
        self._pool._factory.noisy = False

        self._sslMethod = openssl_methods[settings.get('DOWNLOADER_CLIENT_TLS_METHOD')]
        self._contextFactoryClass = load_object(settings['DOWNLOADER_CLIENTCONTEXTFACTORY'])
        # try method-aware context factory
        try:
            self._contextFactory = self._contextFactoryClass(method=self._sslMethod)
        except TypeError:
            # use context factory defaults
            self._contextFactory = self._contextFactoryClass()
            msg = """
 '%s' does not accept `method` argument (type OpenSSL.SSL method,\
 e.g. OpenSSL.SSL.SSLv23_METHOD).\
 Please upgrade your context factory class to handle it or ignore it.""" % (
                settings['DOWNLOADER_CLIENTCONTEXTFACTORY'],)
            warnings.warn(msg)
        self._default_maxsize = settings.getint('DOWNLOAD_MAXSIZE')
        self._default_warnsize = settings.getint('DOWNLOAD_WARNSIZE')
        self._disconnect_timeout = 1
Beispiel #14
0
class AntDownloadHandler(object):
    def __init__(self, settings):
        self._pool = HTTPConnectionPool(reactor, persistent=True)
        self._pool.maxPersistentPerHost = settings.getint(
            'CONCURRENT_REQUESTS_PER_DOMAIN')
        self._pool._factory.noisy = False
        self._contextFactoryClass = load_object(
            settings['DOWNLOADER_CLIENTCONTEXTFACTORY'])
        self._contextFactory = self._contextFactoryClass()
        self._agent = AntAgent(contextFactory=self._contextFactory,
                               pool=self._pool)

    def download_request(self, request, spider):
        if self._pool.persistent:
            if not request.headers.get('Connection'):
                request.headers.appendlist('Connection', 'keep-alive')
            proxy = request.meta.get('proxy')
            if proxy and not request.headers.get('Proxy-Connection'):
                request.headers.appendlist('Proxy-Connection', 'keep-alive')
        return self._agent.download_request(request)

    def close(self):
        return self._pool.closeCachedConnections()
Beispiel #15
0
class HTTP11DownloadHandler(object):
    def __init__(self, settings):
        self._pool = HTTPConnectionPool(reactor, persistent=True)
        self._pool.maxPersistentPerHost = settings.getint(
            'CONCURRENT_REQUESTS_PER_DOMAIN')
        self._pool._factory.noisy = False
        self._contextFactoryClass = load_object(
            settings['DOWNLOADER_CLIENTCONTEXTFACTORY'])
        self._contextFactory = self._contextFactoryClass()
        self._default_maxsize = settings.getint('DOWNLOAD_MAXSIZE')
        self._default_warnsize = settings.getint('DOWNLOAD_WARNSIZE')

    def download_request(self, request, spider):
        """Return a deferred for the HTTP download"""
        agent = ScrapyAgent(contextFactory=self._contextFactory,
                            pool=self._pool,
                            maxsize=getattr(spider, 'download_maxsize',
                                            self._default_maxsize),
                            warnsize=getattr(spider, 'download_warnsize',
                                             self._default_warnsize))
        return agent.download_request(request)

    def close(self):
        return self._pool.closeCachedConnections()
Beispiel #16
0
 def __init__(self, settings):
     self._pool = HTTPConnectionPool(reactor, persistent=True)
     self._pool._factory.noisy = False
     self._contextFactoryClass = load_object(
         settings['DOWNLOADER_CLIENTCONTEXTFACTORY'])
     self._contextFactory = self._contextFactoryClass()
Beispiel #17
0
 def __init__(self, settings):
     self._pool = HTTPConnectionPool(reactor, persistent=True)
     self._contextFactoryClass = load_object(settings['DOWNLOADER_CLIENTCONTEXTFACTORY'])
     self._contextFactory = self._contextFactoryClass()