Example #1
0
    def __init__(self, request, timeout=180, download_size=0):
        """Capture everything needed to download ``request``.

        The url (without its fragment), method, body and a ``Headers``
        object built from the request headers are taken from ``request``;
        ``timeout`` and ``download_size`` are stored unchanged.
        """
        url_parts = urldefrag(request.url)
        self.url = url_parts[0]
        self.method = request.method
        payload = request.body
        # an empty body is normalised to None
        self.body = payload if payload else None
        self.headers = Headers(request.headers)
        self.response_headers = None
        self.start_time = time()

        # whatever is passed to this deferred goes through _build_response
        pending = defer.Deferred()
        pending.addCallback(self._build_response, request)
        self.deferred = pending

        self.invalid_headers = []
        self.timeout, self.download_size = timeout, download_size

        # Twisted 11.1.0+ (see Twisted r32329) expects HTTPClientFactory
        # instances to carry _disconnectedDeferred. Redirects are handled by
        # Crawlmi itself, so no _waitForDisconnect callback is attached; the
        # attribute only has to exist so that clientConnectionFailed does
        # not raise AttributeError.
        self._disconnectedDeferred = defer.Deferred()

        self._set_connection_attributes(request)

        # default the Host header to the netloc parsed from the url
        self.headers.setdefault('Host', self.netloc)

        if self.body is None:
            return

        # Content-Length always reflects the length of the body being sent
        self.headers['Content-Length'] = len(self.body)
        # just in case a broken http/1.1 decides to keep connection alive
        self.headers.setdefault('Connection', 'close')
 def test_from_args(self):
     """from_args should pick the response class from url and/or headers."""
     html_case = {
         'headers': Headers({'Content-Type': ['text/html; charset=utf-8']}),
         'url': 'http://www.example.com/item/'
     }
     attachment_case = {
         'headers': Headers({
             'Content-Disposition': ['attachment; filename="data.xml.gz"']
         }),
         'url': 'http://www.example.com/page/'
     }
     cases = [
         # (keyword arguments, expected response class)
         ({'url': 'http://www.example.com/data.csv'}, TextResponse),
         # headers takes precedence over url
         (html_case, HtmlResponse),
         (attachment_case, Response),
     ]
     for kwargs, expected_cls in cases:
         actual_cls = from_args(**kwargs)
         message = 'Expected: %s  Received: %s' % (expected_cls.__name__,
                                                   actual_cls.__name__)
         self.assertIs(actual_cls, expected_cls, message)
Example #3
0
    def test_multivalue(self):
        """Assigning a list keeps all items; scalar access gives the last."""
        name = 'X-Forwarded-For'
        hlist = ['ip1', 'ip2']
        h = Headers()
        h[name] = hlist

        # item access and get() both return the final value
        self.assertEqual(h[name], 'ip2')
        self.assertEqual(h.get(name), 'ip2')

        # getlist() returns an equal list that is not the assigned object
        stored = h.getlist(name)
        self.assertEqual(stored, hlist)
        self.assertIsNot(h.getlist(name), hlist)
    def test_multivalue(self):
        """A multi-valued header exposes its last item and its full list."""
        field = 'X-Forwarded-For'
        addresses = ['ip1', 'ip2']
        headers = Headers()
        headers[field] = addresses

        # scalar style access yields the last item
        self.assertEqual(headers[field], 'ip2')
        self.assertEqual(headers.get(field), 'ip2')

        # list access yields an equal but distinct list object
        self.assertEqual(headers.getlist(field), addresses)
        self.assertIsNot(headers.getlist(field), addresses)
Example #5
0
 def test_copy(self):
     """copy.copy() gives an equal Headers whose value lists are distinct."""
     source = Headers({'header1': ['value1', 'value2']}, encoding='ascii')
     clone = copy.copy(source)

     self.assertEqual(source, clone)
     self.assertEqual(source.encoding, clone.encoding)

     # same content, but not the same list object
     source_values = source.getlist('header1')
     clone_values = clone.getlist('header1')
     self.assertEqual(source_values, clone_values)
     self.assertIsNot(source.getlist('header1'), clone.getlist('header1'))

     self.assertIsInstance(clone, Headers)
 def test_copy(self):
     """A shallow copy compares equal, keeps the encoding and type."""
     name = 'header1'
     original = Headers({'header1': ['value1', 'value2']}, encoding='ascii')
     duplicate = copy.copy(original)

     self.assertEqual(original, duplicate)
     self.assertEqual(original.encoding, duplicate.encoding)
     # the value lists are equal without being shared
     self.assertEqual(original.getlist(name), duplicate.getlist(name))
     self.assertIsNot(original.getlist(name), duplicate.getlist(name))
     self.assertIsInstance(duplicate, Headers)
Example #7
0
 def test_iterables(self):
     """Dict-style views hold value lists; values() gives the last items."""
     h = Headers({'Content-Type': 'text/html', 'X-Forwarded-For': ['ip1', 'ip2']})
     expected_pairs = [('X-Forwarded-For', ['ip1', 'ip2']),
                       ('Content-Type', ['text/html'])]

     self.assertEqual(dict(h), {'Content-Type': ['text/html'],
                                'X-Forwarded-For': ['ip1', 'ip2']})
     self.assertEqual(h.keys(), ['X-Forwarded-For', 'Content-Type'])
     # items() and iteritems() agree on the same (name, list) pairs
     self.assertEqual(h.items(), expected_pairs)
     self.assertEqual(list(h.iteritems()), expected_pairs)
     self.assertEqual(h.values(), ['ip2', 'text/html'])
    def test_appendlist(self):
        """appendlist() extends an existing header or creates a new one."""
        # header already present from the constructor
        prefilled = Headers({'header1': 'value1'})
        prefilled.appendlist('header1', 'value3')
        self.assertListEqual(prefilled.getlist('header1'),
                             ['value1', 'value3'])

        # header built up from nothing
        empty = Headers()
        for item in ('value1', 'value3'):
            empty.appendlist('header1', item)
        self.assertListEqual(empty.getlist('header1'), ['value1', 'value3'])
    def test_setdefault(self):
        """setdefault() returns the very list stored for the header."""
        name = 'X-Forwarded-For'

        # list default: the stored list is not the caller's list object
        given = ['ip1', 'ip2']
        list_headers = Headers()
        returned = list_headers.setdefault(name, given)
        self.assertIsNot(list_headers.getlist(name), given)
        self.assertIs(list_headers.getlist(name), returned)

        # scalar default: ends up as a one-item list
        scalar_headers = Headers()
        returned = scalar_headers.setdefault(name, 'ip1')
        self.assertEqual(scalar_headers.getlist(name), ['ip1'])
        self.assertIs(scalar_headers.getlist(name), returned)
Example #10
0
    def connectionMade(self):
        """Send the request: command line, headers, then the body if any."""
        self.headers = Headers()
        factory = self.factory

        # request line
        self.sendCommand(factory.method, factory.path)

        # one header line per stored value
        for name, value_list in factory.headers.iteritems():
            for single in value_list:
                self.sendHeader(name, single)
        self.endHeaders()

        # optional body
        payload = factory.body
        if payload is not None:
            self.transport.write(payload)
Example #11
0
 def test_http_encoding_header(self):
     """Only a charset given in Content-Type yields an encoding."""
     with_charset = Headers({'Content-Type': 'text/html; charset=ISO-8859-4'})
     self.assertEqual(get_encoding_from_headers(with_charset), 'iso8859-4')

     no_encoding_cases = (
         # charset carried by an unrelated header
         {'Something-else': 'text/html; charset=ISO-8859-4'},
         # no charset at all: None is expected, not a default encoding
         {'Content-Type': 'text/html'},
         {'Content-Type': 'application/xhtml+xml'},
         # invalid encoding
         {'Content-Type': 'text/html; charset='},
     )
     for raw in no_encoding_cases:
         self.assertIsNone(get_encoding_from_headers(Headers(raw)))
Example #12
0
 def test_non_standard_line_endings(self):
     """Header lines terminated by a bare newline are still parsed."""
     protocol = CrawlmiHTTPClient()
     protocol.factory = CrawlmiHTPPClientFactory(Request(url='http://foo/bar'))
     protocol.headers = Headers()

     # feed the response head line by line, using '\n' instead of '\r\n'
     incoming = (
         'HTTP/1.0 200 OK\n',
         'Hello: World\n',
         'Foo: Bar\n',
         '\n',
     )
     for chunk in incoming:
         protocol.dataReceived(chunk)

     expected = Headers({'Hello': ['World'], 'Foo': ['Bar']})
     self.assertEqual(protocol.headers, expected)
Example #13
0
 def _cb_body_done(self, result, request, url):
     """Build the response object for a finished download.

     ``result`` is unpacked as ``(txresponse, body, flags)``; the flags
     item is not used here. The response class is chosen by
     ``resp_factory.from_args`` from the headers and url.
     """
     txresponse, body, _flags = result
     status_code = int(txresponse.code)
     headers = Headers(txresponse.headers.getAllRawHeaders())
     response_cls = resp_factory.from_args(headers=headers, url=url)
     response = response_cls(
         url=url,
         status=status_code,
         headers=headers,
         body=body,
         request=request)
     return response
Example #14
0
    def __init__(self,
                 url,
                 callback=None,
                 method='GET',
                 headers=None,
                 params=None,
                 body='',
                 cookies=None,
                 meta=None,
                 errback=None,
                 proxy=None,
                 priority=0,
                 history=None,
                 encoding='utf-8'):
        """Create a request.

        url, params -- handed to ``_prepare_url`` to build the final url.
        callback, errback -- stored as given.
        method -- handed to ``_prepare_method``.
        headers -- source for a new ``Headers`` object (built with
            ``encoding``); None means no headers.
        body -- handed to ``_prepare_body``.
        cookies -- stored as given; a falsy value becomes an empty dict.
        meta -- copied into a new dict; None means empty.
        proxy, priority -- stored as given.
        history -- copied into a new list; None means empty.
        encoding -- stored and used when building the headers.
        """
        # The container parameters default to None rather than to shared
        # mutable {} / [] objects; None is replaced by a fresh empty
        # container below, so calls relying on the defaults behave as before.
        if headers is None:
            headers = {}
        if params is None:
            params = {}

        self.callback = callback
        self.errback = errback

        self.headers = Headers(headers, encoding)
        self.meta = {} if meta is None else dict(meta)
        self.history = [] if history is None else list(history)
        self.proxy = proxy
        self.priority = priority
        self.cookies = cookies or {}

        # following attributes are immutable
        self._encoding = encoding
        self._method = self._prepare_method(method)
        self._url = self._prepare_url(url, params)
        self._body = self._prepare_body(body)
Example #15
0
    def __init__(self, request, timeout=180, download_size=0):
        """Prepare the factory state for downloading ``request``.

        Url (fragment stripped), method, body and headers come from
        ``request``; ``timeout`` and ``download_size`` are kept as passed.
        """
        defragged = urldefrag(request.url)
        self.url = defragged[0]
        self.method = request.method
        raw_body = request.body
        # a falsy body (e.g. an empty string) is stored as None
        self.body = raw_body if raw_body else None
        self.headers = Headers(request.headers)
        self.response_headers = None
        self.start_time = time()

        # results delivered to this deferred pass through _build_response
        result_deferred = defer.Deferred()
        result_deferred.addCallback(self._build_response, request)
        self.deferred = result_deferred

        self.invalid_headers = []
        self.timeout = timeout
        self.download_size = download_size

        # HTTPClientFactory is expected to own a _disconnectedDeferred since
        # Twisted 11.1.0 (Twisted r32329). Crawlmi handles redirects on its
        # own, so the _waitForDisconnect callback is not added; having the
        # attribute is enough to avoid an AttributeError when
        # clientConnectionFailed is called.
        self._disconnectedDeferred = defer.Deferred()

        self._set_connection_attributes(request)

        # Host falls back to the netloc derived from the url
        self.headers.setdefault('Host', self.netloc)

        has_body = self.body is not None
        if has_body:
            # Content-Length is always overwritten with the real body length
            self.headers['Content-Length'] = len(self.body)
            # just in case a broken http/1.1 decides to keep connection alive
            self.headers.setdefault('Connection', 'close')
Example #16
0
    def test_add(self):
        """add() appends to an existing header or starts a new one."""
        # header already present from the constructor
        prefilled = Headers({'header1': 'value1'})
        prefilled.add('header1', 'value3')
        self.assertEqual(prefilled.getlist('header1'), ['value1', 'value3'])

        # header built up from nothing
        empty = Headers()
        for item in ('value1', 'value3'):
            empty.add('header1', item)
        self.assertEqual(empty.getlist('header1'), ['value1', 'value3'])
Example #17
0
    def test_appendlist(self):
        """appendlist() keeps earlier values and adds the new one last."""
        name = 'header1'

        # starting from a header given to the constructor
        seeded = Headers({'header1': 'value1'})
        seeded.appendlist(name, 'value3')
        self.assertListEqual(seeded.getlist(name), ['value1', 'value3'])

        # starting from an empty Headers
        fresh = Headers()
        fresh.appendlist(name, 'value1')
        fresh.appendlist(name, 'value3')
        self.assertListEqual(fresh.getlist(name), ['value1', 'value3'])
Example #18
0
    def test_setdefault(self):
        """setdefault() hands back the list object kept by the Headers."""
        field = 'X-Forwarded-For'

        # with a list default the caller's list itself is not stored
        supplied = ['ip1', 'ip2']
        headers = Headers()
        kept = headers.setdefault(field, supplied)
        self.assertIsNot(headers.getlist(field), supplied)
        self.assertIs(headers.getlist(field), kept)

        # with a scalar default a one-item list is stored
        headers = Headers()
        kept = headers.setdefault(field, 'ip1')
        self.assertEqual(headers.getlist(field), ['ip1'])
        self.assertIs(headers.getlist(field), kept)
Example #19
0
 def retrieve_response(self, request):
     """Return the stored response for ``request``, or None if not cached.

     The response class is picked by ``from_args`` from the stored headers
     and url.
     """
     cached = self._read_data(request)
     if cached is None:
         return None  # not cached

     url, status = cached['url'], cached['status']
     headers = Headers(cached['headers'])
     body = cached['body']
     response_cls = from_args(headers=headers, url=url)
     return response_cls(url=url, headers=headers, status=status, body=body)
 def test_to_string(self):
     """to_string() renders scalar and list values identically."""
     rendered = 'Content-Type: text/html\r\nAccept: gzip'
     sources = (
         {'Content-type': 'text/html', 'Accept': 'gzip'},
         {'Content-type': ['text/html'], 'Accept': ['gzip']},
     )
     for raw in sources:
         h = Headers(raw)
         self.assertEqual(h.to_string(), rendered)
Example #21
0
    def connectionMade(self):
        """Emit the request line, every header value and the optional body."""
        self.headers = Headers()
        request_factory = self.factory

        # method command
        self.sendCommand(request_factory.method, request_factory.path)

        # headers: multi-valued headers produce one line per value
        for header_name, header_values in request_factory.headers.iteritems():
            for header_value in header_values:
                self.sendHeader(header_name, header_value)
        self.endHeaders()

        # body
        body = request_factory.body
        if body is not None:
            self.transport.write(body)
Example #22
0
 def test_invalid_status(self):
     """A non-numeric status code makes the factory deferred fail."""
     wire = StringTransport()
     factory = CrawlmiHTPPClientFactory(Request(url='http://foo/bar'))
     protocol = CrawlmiHTTPClient()
     protocol.factory = factory
     protocol.makeConnection(wire)
     protocol.headers = Headers()

     # response head whose status is not a number
     incoming = (
         'HTTP/1.0 BUG OK\r\n',
         'Hello: World\r\n',
         'Foo: Bar\r\n',
         '\r\n',
     )
     for chunk in incoming:
         protocol.dataReceived(chunk)
     protocol.handleResponse('')

     return self.assertFailure(factory.deferred, BadHttpHeaderError)
 def test_update(self):
     """update() accepts scalar and list values alike."""
     incoming = {
         'Content-Type': 'text/html',
         'X-Forwarded-For': ['ip1', 'ip2']
     }
     h = Headers()
     h.update(incoming)

     expectations = (
         ('Content-Type', ['text/html']),
         ('X-Forwarded-For', ['ip1', 'ip2']),
     )
     for name, expected in expectations:
         self.assertEqual(h.getlist(name), expected)
    def test_basics(self):
        """Membership, missing-key behaviour and defaults of get/getlist."""
        h = Headers({'Content-Type': 'text/html', 'Content-Length': 1234})
        for present in ('Content-Type', 'Content-Length'):
            self.assertIn(present, h)

        # a header that was never set
        missing = 'Accept'
        self.assertRaises(KeyError, h.__getitem__, missing)
        self.assertEqual(h.get(missing), None)
        self.assertEqual(h.getlist(missing), [])

        # explicit defaults; getlist returns a scalar default as a list
        self.assertEqual(h.get(missing, '*/*'), '*/*')
        self.assertEqual(h.getlist(missing, '*/*'), ['*/*'])
        mime_types = ['text/html', 'images/jpeg']
        self.assertEqual(h.getlist(missing, mime_types),
                         ['text/html', 'images/jpeg'])
Example #25
0
    def __init__(self,
                 url,
                 status=200,
                 headers=None,
                 body=None,
                 request=None,
                 flags=None):
        """Create a response.

        url -- stored as given.
        status -- converted with ``int()``; defaults to 200.
        headers -- source for a new ``Headers`` object; None means no
            headers.
        body -- response body; a falsy value is stored as ''.
        request -- stored as given.
        flags -- optional iterable copied into a new list; None means empty.
        """
        # ``headers`` defaults to None rather than to a shared mutable {};
        # None is replaced by a fresh empty dict so that calls relying on
        # the default behave as before.
        if headers is None:
            headers = {}

        self.url = url
        self.status = int(status)
        self.headers = Headers(headers)
        self.request = request
        self.flags = [] if flags is None else list(flags)

        # following attributes are immutable
        self._body = body or ''
Example #26
0
    def test_basics(self):
        """Check containment, lookups of an absent header and defaults."""
        headers = Headers({'Content-Type': 'text/html', 'Content-Length': 1234})
        self.assertIn('Content-Type', headers)
        self.assertIn('Content-Length', headers)

        absent = 'Accept'
        # item access raises, get() gives None, getlist() gives []
        self.assertRaises(KeyError, headers.__getitem__, absent)
        self.assertEqual(headers.get(absent), None)
        self.assertEqual(headers.getlist(absent), [])

        # caller supplied defaults are returned for the absent header
        wildcard = '*/*'
        self.assertEqual(headers.get(absent, wildcard), '*/*')
        self.assertEqual(headers.getlist(absent, wildcard), ['*/*'])
        self.assertEqual(
            headers.getlist(absent, ['text/html', 'images/jpeg']),
            ['text/html', 'images/jpeg'])
 def test_from_headers(self):
     """from_headers should pick the response class from the headers."""
     cases = [
         # (raw headers, expected response class)
         ({'Content-Type': ['text/html; charset=utf-8']}, HtmlResponse),
         ({
             'Content-Type': ['application/octet-stream'],
             'Content-Disposition': ['attachment; filename=data.txt']
         }, TextResponse),
         ({
             'Content-Type': ['text/html; charset=utf-8'],
             'Content-Encoding': ['gzip']
         }, Response),
     ]
     for raw_headers, expected_cls in cases:
         actual_cls = from_headers(Headers(raw_headers))
         message = 'Expected: %s  Received: %s' % (expected_cls.__name__,
                                                   actual_cls.__name__)
         self.assertIs(actual_cls, expected_cls, message)
 def test_iterables(self):
     """keys/items/iteritems/values and dict() all reflect the headers."""
     headers = Headers({
         'Content-Type': 'text/html',
         'X-Forwarded-For': ['ip1', 'ip2']
     })
     as_pairs = [
         ('X-Forwarded-For', ['ip1', 'ip2']),
         ('Content-Type', ['text/html']),
     ]

     # every value is exposed as a list
     self.assertEqual(dict(headers), {
         'Content-Type': ['text/html'],
         'X-Forwarded-For': ['ip1', 'ip2']
     })
     self.assertEqual(headers.keys(), ['X-Forwarded-For', 'Content-Type'])
     self.assertEqual(headers.items(), as_pairs)
     self.assertEqual(list(headers.iteritems()), as_pairs)
     # values() gives the last item of each header
     self.assertEqual(headers.values(), ['ip2', 'text/html'])
 def test_encode_latin1(self):
     """A unicode value is encoded with the configured latin1 encoding."""
     h = Headers({u'key': u'\xa3'}, encoding='latin1')
     pairs = dict(h).items()
     _key, values = pairs[0]
     self.assertEqual(values[0], '\xa3')
Example #30
0
class CrawlmiHTPPClientFactory(HTTPClientFactory):
    """Client factory that turns one request into one response object.

    The protocol reports status and headers through ``gotStatus`` and
    ``gotHeaders``; the response itself is built in ``_build_response``.
    """

    protocol = CrawlmiHTTPClient
    # HTTPClientFactory settings; redirects are handled by Crawlmi itself
    followRedirect = False
    afterFoundGet = False
    noisy = False
    waiting = 1

    def __init__(self, request, timeout=180, download_size=0):
        """Capture everything needed to download ``request``.

        The url (without its fragment), method, body and a ``Headers``
        object built from the request headers are taken from ``request``;
        ``timeout`` and ``download_size`` are stored unchanged.
        """
        url_parts = urldefrag(request.url)
        self.url = url_parts[0]
        self.method = request.method
        payload = request.body
        # an empty body is normalised to None
        self.body = payload if payload else None
        self.headers = Headers(request.headers)
        self.response_headers = None
        self.start_time = time()

        # whatever is passed to this deferred goes through _build_response
        pending = defer.Deferred()
        pending.addCallback(self._build_response, request)
        self.deferred = pending

        self.invalid_headers = []
        self.timeout, self.download_size = timeout, download_size

        # Twisted 11.1.0+ (see Twisted r32329) expects HTTPClientFactory
        # instances to carry _disconnectedDeferred. Redirects are handled by
        # Crawlmi itself, so no _waitForDisconnect callback is attached; the
        # attribute only has to exist so that clientConnectionFailed does
        # not raise AttributeError.
        self._disconnectedDeferred = defer.Deferred()

        self._set_connection_attributes(request)

        # default the Host header to the netloc parsed from the url
        self.headers.setdefault('Host', self.netloc)

        if self.body is None:
            return

        # Content-Length always reflects the length of the body being sent
        self.headers['Content-Length'] = len(self.body)
        # just in case a broken http/1.1 decides to keep connection alive
        self.headers.setdefault('Connection', 'close')

    def _build_response(self, body, request):
        """Create the response for ``body``.

        Raises BadHttpHeaderError if any invalid header line was recorded.
        The response also gets ``download_latency``: the time between
        creating the factory and receiving the headers.
        """
        bad_lines = self.invalid_headers
        if bad_lines:
            raise BadHttpHeaderError('Invalid headers received: %s' %
                                     bad_lines)

        received = self.response_headers
        response_cls = resp_factory.from_args(headers=received, url=self.url)
        response = response_cls(url=self.url,
                                status=self.status,
                                headers=received,
                                body=body,
                                request=request)
        response.download_latency = self.headers_time - self.start_time
        return response

    def _set_connection_attributes(self, request):
        """Derive scheme, netloc, host, port and path for the connection.

        With a proxy, scheme/host/port come from the proxy url and the
        path becomes the full target url.
        """
        target = _parse_url_args(request.url)
        self.scheme, self.netloc, self.host, self.port, self.path = target
        if not request.proxy:
            return

        proxy_scheme, _, proxy_host, proxy_port, _ = \
            _parse_url_args(request.proxy)
        self.scheme = proxy_scheme
        self.host = proxy_host
        self.port = proxy_port
        self.path = self.url

    def gotStatus(self, version, status, message):
        """Remember the status line; ``status`` is converted to int."""
        status_code = int(status)
        self.version = version
        self.status = status_code
        self.message = message

    def gotHeaders(self, headers):
        """Remember the response headers and when they arrived."""
        arrived_at = time()
        self.headers_time = arrived_at
        self.response_headers = headers
Example #31
0
 def __init__(self, body, encoding):
     """Store ``body``; announce ``encoding`` via Content-Type if given."""
     self.body = body
     self.headers = Headers()
     if not encoding:
         return
     content_type = 'text/html; charset=' + encoding
     self.headers['Content-Type'] = content_type
Example #32
0
class CrawlmiHTTPClient(HTTPClient):
    """HTTP client protocol reporting status, headers and body to its factory.

    Response headers are collected in a ``Headers`` object. Lines that fail
    to parse are recorded in ``factory.invalid_headers`` and the connection
    is dropped.
    """

    # Incoming data is split on a bare '\n'; lineReceived strips the
    # remaining trailing whitespace before parsing.
    delimiter = '\n'

    def __init__(self):
        # running total of body bytes seen by handleResponsePart
        self.body_size = 0

    def connectionMade(self):
        """Send the request: command line, headers, then the body if any."""
        self.headers = Headers()

        # method command
        self.sendCommand(self.factory.method, self.factory.path)
        # headers
        for key, values in self.factory.headers.iteritems():
            for value in values:
                self.sendHeader(key, value)
        self.endHeaders()
        # body
        if self.factory.body is not None:
            self.transport.write(self.factory.body)

    def extractHeader(self, header):
        """Split a ``key: value`` line, record it and track Content-Length.

        Raises ValueError when the line has no ':' or when a Content-Length
        value is not an integer.
        """
        key, val = header.split(':', 1)
        val = val.lstrip()
        self.handleHeader(key, val)
        if key.lower() == 'content-length':
            self.length = int(val)

    def lineReceived(self, line):
        """Parse one line; on failure record it and drop the connection."""
        try:
            HTTPClient.lineReceived(self, line.rstrip())
        except Exception:
            # Only Exception subclasses are treated as a bad header line, so
            # KeyboardInterrupt and SystemExit keep propagating instead of
            # being silently swallowed.
            self.factory.invalid_headers.append(line)
            # prefer an immediate abort when the transport offers one
            if hasattr(self.transport, 'abortConnection'):
                self.transport.abortConnection()
            else:
                self.transport.loseConnection()

    def handleHeader(self, key, value):
        """Add one received header value to the collected headers."""
        self.headers.add(key, value)

    def handleStatus(self, version, status, message):
        """Forward the status line to the factory."""
        self.factory.gotStatus(version, status, message)

    def handleEndHeaders(self):
        """Hand the collected headers to the factory."""
        self.factory.gotHeaders(self.headers)

    def connectionLost(self, reason):
        """Remember why the connection ended and notify the factory."""
        # stored first: handleResponse reads it to report a failed download
        self._connection_lost_reason = reason
        HTTPClient.connectionLost(self, reason)
        self.factory.noPage(reason)

    def handleResponse(self, response):
        """Deliver the body to the factory, or report a failure.

        HEAD requests always deliver an empty body.
        """
        if self.factory.method.upper() == 'HEAD':
            self.factory.page('')
        elif self.length is not None and self.length > 0:
            # presumably self.length counts body bytes still expected, i.e.
            # the body was cut short -- TODO confirm against HTTPClient
            self.factory.noPage(self._connection_lost_reason)
        else:
            self.factory.page(response)
        self.transport.loseConnection()

    def timeout(self):
        """Close the connection and fail the download with a TimeoutError."""
        self.transport.loseConnection()
        self.factory.noPage(
            defer.TimeoutError('Getting %s took longer than %s seconds.' %
                               (self.factory.url, self.factory.timeout)))

    def handleResponsePart(self, data):
        """Account for a body chunk and enforce ``factory.download_size``.

        A ``download_size`` of 0 (or any falsy value) disables the limit.
        """
        HTTPClient.handleResponsePart(self, data)
        self.body_size += len(data)
        if (self.factory.download_size
                and self.body_size > self.factory.download_size):
            self.transport.loseConnection()
            self.factory.noPage(
                DownloadSizeError('Response exceeded %s bytes.' %
                                  self.factory.download_size))
Example #33
0
    def test_early_headers(self):
        """Check the exact bytes sent for several kinds of requests."""
        # basic test stolen from twisted HTTPageGetter
        full_request = Request(
            url='http://foo/bar',
            body='some data',
            headers={
                'Host': 'example.net',
                'User-Agent': 'fooble',
                'Cookie': 'blah blah',
                'Content-Length': '12981',
                'Useful': 'value'
            })
        full_expected = ('GET /bar HTTP/1.0\r\n'
                         'Content-Length: 9\r\n'
                         'Useful: value\r\n'
                         'Connection: close\r\n'
                         'User-Agent: fooble\r\n'
                         'Host: example.net\r\n'
                         'Cookie: blah blah\r\n'
                         '\r\n'
                         'some data')
        self._test(CrawlmiHTPPClientFactory(full_request), full_expected)

        # test minimal sent headers
        minimal_factory = CrawlmiHTPPClientFactory(Request('http://foo/bar'))
        minimal_expected = ('GET /bar HTTP/1.0\r\n'
                            'Host: foo\r\n'
                            '\r\n')
        self._test(minimal_factory, minimal_expected)

        # test a simple POST with body and content-type
        post_request = Request(
            method='POST',
            url='http://foo/bar',
            body='name=value',
            headers={'Content-Type': 'application/x-www-form-urlencoded'})
        post_expected = ('POST /bar HTTP/1.0\r\n'
                         'Host: foo\r\n'
                         'Connection: close\r\n'
                         'Content-Type: application/x-www-form-urlencoded\r\n'
                         'Content-Length: 10\r\n'
                         '\r\n'
                         'name=value')
        self._test(CrawlmiHTPPClientFactory(post_request), post_expected)

        # both multivalue variants below must produce the same output
        multivalue_expected = ('GET /bar HTTP/1.0\r\n'
                               'Host: foo\r\n'
                               'X-Meta-Multivalued: value1\r\n'
                               'X-Meta-Multivalued: value2\r\n'
                               'X-Meta-Single: single\r\n'
                               '\r\n')

        # test with single and multivalued headers
        dict_request = Request(
            url='http://foo/bar',
            headers={
                'X-Meta-Single': 'single',
                'X-Meta-Multivalued': ['value1', 'value2']
            })
        self._test(CrawlmiHTPPClientFactory(dict_request),
                   multivalue_expected)

        # same test with single and multivalued headers but using Headers class
        headers_request = Request(
            url='http://foo/bar',
            headers=Headers({
                'X-Meta-Single': 'single',
                'X-Meta-Multivalued': ['value1', 'value2']
            }))
        self._test(CrawlmiHTPPClientFactory(headers_request),
                   multivalue_expected)
Example #34
0
 def test_single_value(self):
     """A scalar value reads back as itself and as a one-item list."""
     name = 'Content-Type'
     h = Headers()
     h[name] = 'text/html'

     self.assertEqual(h[name], 'text/html')
     self.assertEqual(h.get(name), 'text/html')
     self.assertEqual(h.getlist(name), ['text/html'])
 def test_clear(self):
     """clear() removes previously set headers."""
     name = 'a'
     headers = Headers({'a': 'b'})
     self.assertIn(name, headers)

     headers.clear()
     self.assertNotIn(name, headers)
Example #36
0
class CrawlmiHTPPClientFactory(HTTPClientFactory):
    """Factory holding the state of a single download.

    Status and headers are reported by the protocol via ``gotStatus`` and
    ``gotHeaders``; ``_build_response`` creates the final response object.
    """

    protocol = CrawlmiHTTPClient
    # HTTPClientFactory settings; redirects are handled by Crawlmi itself
    followRedirect = False
    afterFoundGet = False
    noisy = False
    waiting = 1

    def __init__(self, request, timeout=180, download_size=0):
        """Prepare the factory state for downloading ``request``.

        Url (fragment stripped), method, body and headers come from
        ``request``; ``timeout`` and ``download_size`` are kept as passed.
        """
        defragged = urldefrag(request.url)
        self.url = defragged[0]
        self.method = request.method
        raw_body = request.body
        # a falsy body (e.g. an empty string) is stored as None
        self.body = raw_body if raw_body else None
        self.headers = Headers(request.headers)
        self.response_headers = None
        self.start_time = time()

        # results delivered to this deferred pass through _build_response
        result_deferred = defer.Deferred()
        result_deferred.addCallback(self._build_response, request)
        self.deferred = result_deferred

        self.invalid_headers = []
        self.timeout = timeout
        self.download_size = download_size

        # HTTPClientFactory is expected to own a _disconnectedDeferred since
        # Twisted 11.1.0 (Twisted r32329). Crawlmi handles redirects on its
        # own, so the _waitForDisconnect callback is not added; having the
        # attribute is enough to avoid an AttributeError when
        # clientConnectionFailed is called.
        self._disconnectedDeferred = defer.Deferred()

        self._set_connection_attributes(request)

        # Host falls back to the netloc derived from the url
        self.headers.setdefault('Host', self.netloc)

        has_body = self.body is not None
        if has_body:
            # Content-Length is always overwritten with the real body length
            self.headers['Content-Length'] = len(self.body)
            # just in case a broken http/1.1 decides to keep connection alive
            self.headers.setdefault('Connection', 'close')

    def _build_response(self, body, request):
        """Turn the downloaded ``body`` into a response object.

        Raises BadHttpHeaderError when invalid header lines were recorded.
        ``download_latency`` on the response is the time from factory
        creation until the headers were received.
        """
        rejected = self.invalid_headers
        if rejected:
            raise BadHttpHeaderError('Invalid headers received: %s' %
                                     rejected)

        headers = self.response_headers
        response_cls = resp_factory.from_args(headers=headers, url=self.url)
        response = response_cls(
            url=self.url, status=self.status, headers=headers,
            body=body, request=request)
        latency = self.headers_time - self.start_time
        response.download_latency = latency
        return response

    def _set_connection_attributes(self, request):
        """Set scheme, netloc, host, port and path for the connection.

        When the request uses a proxy, scheme/host/port are taken from the
        proxy url and the path is replaced by the full target url.
        """
        parsed_url = _parse_url_args(request.url)
        self.scheme, self.netloc, self.host, self.port, self.path = \
            parsed_url
        if not request.proxy:
            return

        parsed_proxy = _parse_url_args(request.proxy)
        self.scheme, _, self.host, self.port, _ = parsed_proxy
        self.path = self.url

    def gotStatus(self, version, status, message):
        """Store the pieces of the status line; ``status`` becomes an int."""
        numeric_status = int(status)
        self.version = version
        self.status = numeric_status
        self.message = message

    def gotHeaders(self, headers):
        """Store the response headers together with their arrival time."""
        received_at = time()
        self.headers_time = received_at
        self.response_headers = headers
Example #37
0
class CrawlmiHTTPClient(HTTPClient):
    """HTTP client protocol reporting status, headers and body to its factory.

    Response headers are collected in a ``Headers`` object. Lines that fail
    to parse are recorded in ``factory.invalid_headers`` and the connection
    is dropped.
    """

    # Incoming data is split on a bare '\n'; lineReceived strips the
    # remaining trailing whitespace before parsing.
    delimiter = '\n'

    def __init__(self):
        # running total of body bytes seen by handleResponsePart
        self.body_size = 0

    def connectionMade(self):
        """Send the request: command line, headers, then the body if any."""
        self.headers = Headers()

        # method command
        self.sendCommand(self.factory.method, self.factory.path)
        # headers
        for key, values in self.factory.headers.iteritems():
            for value in values:
                self.sendHeader(key, value)
        self.endHeaders()
        # body
        if self.factory.body is not None:
            self.transport.write(self.factory.body)

    def extractHeader(self, header):
        """Split a ``key: value`` line, record it and track Content-Length.

        Raises ValueError when the line has no ':' or when a Content-Length
        value is not an integer.
        """
        key, val = header.split(':', 1)
        val = val.lstrip()
        self.handleHeader(key, val)
        if key.lower() == 'content-length':
            self.length = int(val)

    def lineReceived(self, line):
        """Parse one line; on failure record it and drop the connection."""
        try:
            HTTPClient.lineReceived(self, line.rstrip())
        except Exception:
            # Only Exception subclasses are treated as a bad header line, so
            # KeyboardInterrupt and SystemExit keep propagating instead of
            # being silently swallowed.
            self.factory.invalid_headers.append(line)
            # prefer an immediate abort when the transport offers one
            if hasattr(self.transport, 'abortConnection'):
                self.transport.abortConnection()
            else:
                self.transport.loseConnection()

    def handleHeader(self, key, value):
        """Add one received header value to the collected headers."""
        self.headers.add(key, value)

    def handleStatus(self, version, status, message):
        """Forward the status line to the factory."""
        self.factory.gotStatus(version, status, message)

    def handleEndHeaders(self):
        """Hand the collected headers to the factory."""
        self.factory.gotHeaders(self.headers)

    def connectionLost(self, reason):
        """Remember why the connection ended and notify the factory."""
        # stored first: handleResponse reads it to report a failed download
        self._connection_lost_reason = reason
        HTTPClient.connectionLost(self, reason)
        self.factory.noPage(reason)

    def handleResponse(self, response):
        """Deliver the body to the factory, or report a failure.

        HEAD requests always deliver an empty body.
        """
        if self.factory.method.upper() == 'HEAD':
            self.factory.page('')
        elif self.length is not None and self.length > 0:
            # presumably self.length counts body bytes still expected, i.e.
            # the body was cut short -- TODO confirm against HTTPClient
            self.factory.noPage(self._connection_lost_reason)
        else:
            self.factory.page(response)
        self.transport.loseConnection()

    def timeout(self):
        """Close the connection and fail the download with a TimeoutError."""
        self.transport.loseConnection()
        self.factory.noPage(
            defer.TimeoutError('Getting %s took longer than %s seconds.' %
                               (self.factory.url, self.factory.timeout)))

    def handleResponsePart(self, data):
        """Account for a body chunk and enforce ``factory.download_size``.

        A ``download_size`` of 0 (or any falsy value) disables the limit.
        """
        HTTPClient.handleResponsePart(self, data)
        self.body_size += len(data)
        if (self.factory.download_size and
                self.body_size > self.factory.download_size):
            self.transport.loseConnection()
            self.factory.noPage(
                DownloadSizeError('Response exceeded %s bytes.' %
                                  self.factory.download_size))
 def test_delete_and_contains(self):
     """A header is contained after being set and gone after ``del``."""
     name = 'Content-Type'
     headers = Headers()
     headers[name] = 'text/html'
     self.assertIn(name, headers)

     del headers[name]
     self.assertNotIn(name, headers)
 def test_single_value(self):
     """Item access, get() and getlist() agree on a single value."""
     field, value = 'Content-Type', 'text/html'
     headers = Headers()
     headers[field] = value

     # scalar accessors return the value itself
     self.assertEqual(headers[field], 'text/html')
     self.assertEqual(headers.get(field), 'text/html')
     # getlist() returns it inside a list
     self.assertEqual(headers.getlist(field), ['text/html'])
 def test_encode_utf8(self):
     """Unicode key and value come back as utf-8 encoded str objects."""
     h = Headers({u'key': u'\xa3'}, encoding='utf-8')
     first_item = dict(h).items()[0]
     key, val = first_item

     self.assertIsInstance(key, str)
     encoded = val[0]
     self.assertIsInstance(encoded, str)
     self.assertEqual(val[0], '\xc2\xa3')
Example #41
0
 def test_update(self):
     """update() stores scalar values as one-item lists and keeps lists."""
     headers = Headers()
     new_values = {'Content-Type': 'text/html', 'X-Forwarded-For': ['ip1', 'ip2']}
     headers.update(new_values)

     content_type = headers.getlist('Content-Type')
     self.assertEqual(content_type, ['text/html'])
     forwarded_for = headers.getlist('X-Forwarded-For')
     self.assertEqual(forwarded_for, ['ip1', 'ip2'])
Example #42
0
 def test_clear(self):
     """After clear() a header given to the constructor is no longer there."""
     populated = Headers({'a': 'b'})
     self.assertIn('a', populated)

     # emptying the container must drop the header
     populated.clear()
     self.assertNotIn('a', populated)
Example #43
0
 def test_to_string(self):
     """to_string() gives the same text for scalar and for list values."""
     expected_text = 'Content-Type: text/html\r\nAccept: gzip'

     from_scalars = Headers({'Content-type': 'text/html', 'Accept': 'gzip'})
     self.assertEqual(from_scalars.to_string(), expected_text)

     from_lists = Headers({'Content-type': ['text/html'], 'Accept': ['gzip']})
     self.assertEqual(from_lists.to_string(), expected_text)
 def test_encode_multiple(self):
     """A unicode value inside a list is encoded with utf-8 as well."""
     h = Headers({u'key': [u'\xa3']}, encoding='utf-8')
     pairs = dict(h).items()
     _key, values = pairs[0]
     self.assertEqual(values[0], '\xc2\xa3')