Example #1
from time import time
from urlparse import urldefrag  # Python 3: from urllib.parse import urldefrag

from twisted.internet import defer
from twisted.web.client import HTTPClientFactory

from scrapy.http import Headers
from scrapy.responsetypes import responsetypes
from scrapy.utils.httpobj import urlparse_cached


class ScrapyHTTPClientFactory(HTTPClientFactory):
    """Scrapy implementation of the HTTPClientFactory, overriding the
    setURL method to make use of our URL object, which caches the parse
    result.
    """

    protocol = ScrapyHTTPPageGetter
    waiting = 1
    noisy = False
    followRedirect = False
    afterFoundGet = False

    def __init__(self, request, timeout=180):
        self.url = urldefrag(request.url)[0]
        self.method = request.method
        self.body = request.body or None
        self.headers = Headers(request.headers)
        self.response_headers = None
        self.timeout = request.meta.get('download_timeout') or timeout
        self.start_time = time()
        self.deferred = defer.Deferred().addCallback(self._build_response, request)

        # Fixes Twisted 11.1.0+ support, as HTTPClientFactory is expected
        # to have _disconnectedDeferred. See Twisted r32329.
        # Since Scrapy implements its own redirect handling logic, there is
        # no need to add the _waitForDisconnect callback.
        # Specifically, this avoids the AttributeError exception raised when
        # the clientConnectionFailed method is called.
        self._disconnectedDeferred = defer.Deferred()

        self._set_connection_attributes(request)

        # set Host header based on url
        self.headers.setdefault('Host', self.netloc)

        # set Content-Length based on the length of the body
        if self.body is not None:
            self.headers['Content-Length'] = len(self.body)
            # just in case a broken HTTP/1.1 server decides to keep the connection alive
            self.headers.setdefault("Connection", "close")

    def _build_response(self, body, request):
        # download latency: time from when the request started until the
        # response headers arrived (see gotHeaders below)
        request.meta['download_latency'] = self.headers_time - self.start_time
        status = int(self.status)
        headers = Headers(self.response_headers)
        respcls = responsetypes.from_args(headers=headers, url=self.url)
        return respcls(url=self.url, status=status, headers=headers, body=body)

    def _set_connection_attributes(self, request):
        # _parsed_url_args and _parse are module-level helpers in
        # scrapy.core.downloader.webclient
        parsed = urlparse_cached(request)
        self.scheme, self.netloc, self.host, self.port, self.path = _parsed_url_args(parsed)
        proxy = request.meta.get('proxy')
        if proxy:
            # when going through a proxy, connect to the proxy host and
            # request the full URL as the path
            self.scheme, _, self.host, self.port, _ = _parse(proxy)
            self.path = self.url

    def gotHeaders(self, headers):
        self.headers_time = time()
        self.response_headers = headers
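
For context, roughly how such a factory is driven: the download handler connects it to the target host, and the factory's deferred eventually fires with the built Response. A minimal sketch, assuming a scrapy Request named request and a running Twisted reactor; handle_response and handle_failure are placeholder callbacks:

from twisted.internet import reactor

factory = ScrapyHTTPClientFactory(request)
# connect to the host/port computed in _set_connection_attributes
# (HTTPS would go through reactor.connectSSL with a context factory)
reactor.connectTCP(factory.host, factory.port, factory, timeout=factory.timeout)
# _build_response is already chained, so callbacks receive a Response object
factory.deferred.addCallbacks(handle_response, handle_failure)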
Example #2
from twisted.internet import defer
from twisted.web.http import HTTPClient

from scrapy.http import Headers


class ScrapyHTTPPageGetter(HTTPClient):
    delimiter = '\n'

    def connectionMade(self):
        self.headers = Headers()  # bucket for response headers

        # Method command
        self.sendCommand(self.factory.method, self.factory.path)
        # Headers
        for key, values in self.factory.headers.items():
            for value in values:
                self.sendHeader(key, value)
        self.endHeaders()
        # Body
        if self.factory.body is not None:
            self.transport.write(self.factory.body)

    def lineReceived(self, line):
        # lines are split on '\n' only, so strip the trailing '\r'
        # before the base class parses the line
        return HTTPClient.lineReceived(self, line.rstrip())

    def handleHeader(self, key, value):
        # append rather than overwrite, so repeated headers are all preserved
        self.headers.appendlist(key, value)

    def handleStatus(self, version, status, message):
        self.factory.gotStatus(version, status, message)

    def handleEndHeaders(self):
        self.factory.gotHeaders(self.headers)

    def connectionLost(self, reason):
        self._connection_lost_reason = reason
        HTTPClient.connectionLost(self, reason)
        self.factory.noPage(reason)

    def handleResponse(self, response):
        if self.factory.method.upper() == 'HEAD':
            # responses to HEAD carry no body
            self.factory.page('')
        elif self.length is not None and self.length > 0:
            # fewer bytes arrived than Content-Length announced, so the
            # body is truncated; report the download as failed
            self.factory.noPage(self._connection_lost_reason)
        else:
            self.factory.page(response)
        self.transport.loseConnection()

    def timeout(self):
        self.transport.loseConnection()
        self.factory.noPage(
            defer.TimeoutError("Getting %s took longer than %s seconds."
                               % (self.factory.url, self.factory.timeout)))
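
handleHeader above relies on Headers.appendlist, which accumulates repeated header values (e.g. multiple Set-Cookie lines) instead of overwriting them. A quick illustration; note that recent Scrapy versions store the values as bytes:

from scrapy.http import Headers

h = Headers()
h.appendlist('Set-Cookie', 'a=1')
h.appendlist('Set-Cookie', 'b=2')
h.getlist('Set-Cookie')  # both values kept: ['a=1', 'b=2']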
Example #3
    def _cb_bodydone(self, result, request, url):
        # result is the (txresponse, body, flags) tuple produced by the
        # HTTP/1.1 download handler's body-reading callback
        txresponse, body, flags = result
        status = int(txresponse.code)
        headers = Headers(txresponse.headers.getAllRawHeaders())
        respcls = responsetypes.from_args(headers=headers, url=url)
        return respcls(url=url,
                       status=status,
                       headers=headers,
                       body=body,
                       flags=flags)
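
responsetypes.from_args, used here and in the cache-storage examples below, picks the most specific Response subclass for the given headers and URL. For instance:

from scrapy.http import Headers
from scrapy.responsetypes import responsetypes

headers = Headers({'Content-Type': ['text/html; charset=utf-8']})
responsetypes.from_args(headers=headers, url='http://example.com/')
# -> scrapy.http.HtmlResponse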
Example #4
    def retrieve_response(self, spider, request):
        data = self._read_data(spider, request)
        if data is None:
            return  # not cached
        url = data['url']
        status = data['status']
        headers = Headers(data['headers'])
        body = data['body']
        respcls = responsetypes.from_args(headers=headers, url=url)
        response = respcls(url=url, headers=headers, status=status, body=body)
        return response
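
The matching store_response must persist exactly the fields read back here. A sketch, assuming a pickle-backed key/value store; self.db and the _request_key helper are assumed backend details, not part of the example above:

import pickle
from time import time

def store_response(self, spider, request, response):
    # sketch: persist the four fields retrieve_response reads back;
    # self.db and _request_key are assumed backend details
    key = self._request_key(request)
    data = {
        'status': response.status,
        'url': response.url,
        'headers': dict(response.headers),
        'body': response.body,
    }
    self.db['%s_data' % key] = pickle.dumps(data, protocol=2)
    self.db['%s_time' % key] = str(time())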
Example #5
    def retrieve_response(self, spider, request):
        """Return response if present in cache, or None otherwise."""
        metadata = self._read_meta(spider, request)
        if metadata is None:
            return  # not cached
        rpath = self._get_request_path(spider, request)
        with open(os.path.join(rpath, 'response_body'), 'rb') as f:
            body = f.read()
        with open(os.path.join(rpath, 'response_headers'), 'rb') as f:
            rawheaders = f.read()
        url = metadata.get('response_url')
        status = metadata['status']
        headers = Headers(headers_raw_to_dict(rawheaders))
        respcls = responsetypes.from_args(headers=headers, url=url)
        response = respcls(url=url, headers=headers, status=status, body=body)
        return response
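
headers_raw_to_dict turns the raw header bytes read from disk back into a name-to-values mapping that Headers can wrap; assuming the w3lib.http implementation that Scrapy uses:

from w3lib.http import headers_raw_to_dict

raw = b'Content-Type: text/html\r\nSet-Cookie: a=1\r\nSet-Cookie: b=2\r\n'
headers_raw_to_dict(raw)
# {b'Content-Type': [b'text/html'], b'Set-Cookie': [b'a=1', b'b=2']}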