class ScrapyHTTPClientFactory(HTTPClientFactory):
    """Scrapy implementation of the HTTPClientFactory, overriding the
    setURL method to make use of our URL object, which caches the parse
    result.
    """

    protocol = ScrapyHTTPPageGetter
    waiting = 1
    noisy = False
    followRedirect = False
    afterFoundGet = False

    def __init__(self, request, timeout=180):
        self.url = urldefrag(request.url)[0]
        self.method = request.method
        self.body = request.body or None
        self.headers = Headers(request.headers)
        self.response_headers = None
        self.timeout = request.meta.get('download_timeout') or timeout
        self.start_time = time()
        self.deferred = defer.Deferred().addCallback(self._build_response, request)

        # Fixes Twisted 11.1.0+ support, as HTTPClientFactory is expected
        # to have _disconnectedDeferred. See Twisted r32329. Because Scrapy
        # implements its own redirect handling, there is no need to add the
        # _waitForDisconnect callback. Specifically, this avoids the
        # AttributeError raised when clientConnectionFailed is called.
        self._disconnectedDeferred = defer.Deferred()

        self._set_connection_attributes(request)

        # set Host header based on url
        self.headers.setdefault('Host', self.netloc)

        # set Content-Length based on the length of the body
        if self.body is not None:
            self.headers['Content-Length'] = len(self.body)
            # just in case a broken http/1.1 decides to keep connection alive
            self.headers.setdefault("Connection", "close")

    def _build_response(self, body, request):
        request.meta['download_latency'] = self.headers_time - self.start_time
        status = int(self.status)
        headers = Headers(self.response_headers)
        respcls = responsetypes.from_args(headers=headers, url=self.url)
        return respcls(url=self.url, status=status, headers=headers, body=body)

    def _set_connection_attributes(self, request):
        parsed = urlparse_cached(request)
        self.scheme, self.netloc, self.host, self.port, self.path = _parsed_url_args(parsed)
        proxy = request.meta.get('proxy')
        if proxy:
            self.scheme, _, self.host, self.port, _ = _parse(proxy)
            self.path = self.url

    def gotHeaders(self, headers):
        self.headers_time = time()
        self.response_headers = headers
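# A minimal sketch, not part of the original source, of how this factory is
# typically driven: connect it through Twisted's reactor and consume
# factory.deferred, which fires with the Response built by _build_response.
# The helper name `download_request` is hypothetical.
from twisted.internet import reactor
from twisted.internet.ssl import ClientContextFactory

def download_request(request):
    factory = ScrapyHTTPClientFactory(request)
    if factory.scheme == 'https':
        reactor.connectSSL(factory.host, factory.port, factory,
                           ClientContextFactory())
    else:
        reactor.connectTCP(factory.host, factory.port, factory)
    return factory.deferred  # fires with a Response, or a Failure on error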
class ScrapyHTTPPageGetter(HTTPClient):

    # accept bare '\n' line endings; lineReceived strips the trailing '\r'
    delimiter = '\n'

    def connectionMade(self):
        self.headers = Headers()  # bucket for response headers

        # Method command
        self.sendCommand(self.factory.method, self.factory.path)
        # Headers
        for key, values in self.factory.headers.items():
            for value in values:
                self.sendHeader(key, value)
        self.endHeaders()
        # Body
        if self.factory.body is not None:
            self.transport.write(self.factory.body)

    def lineReceived(self, line):
        return HTTPClient.lineReceived(self, line.rstrip())

    def handleHeader(self, key, value):
        self.headers.appendlist(key, value)

    def handleStatus(self, version, status, message):
        self.factory.gotStatus(version, status, message)

    def handleEndHeaders(self):
        self.factory.gotHeaders(self.headers)

    def connectionLost(self, reason):
        self._connection_lost_reason = reason
        HTTPClient.connectionLost(self, reason)
        self.factory.noPage(reason)

    def handleResponse(self, response):
        if self.factory.method.upper() == 'HEAD':
            self.factory.page('')
        elif self.length is not None and self.length > 0:
            # fewer bytes arrived than Content-Length announced, so report
            # the failure instead of passing a truncated body along
            self.factory.noPage(self._connection_lost_reason)
        else:
            self.factory.page(response)
        self.transport.loseConnection()

    def timeout(self):
        self.transport.loseConnection()
        self.factory.noPage(
            defer.TimeoutError("Getting %s took longer than %s seconds."
                               % (self.factory.url, self.factory.timeout)))
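# The timeout() handler above is never scheduled by the protocol itself; the
# factory is expected to arm it when the protocol is built. A minimal sketch
# of that wiring, assuming it lives on ScrapyHTTPClientFactory and following
# the usual HTTPClientFactory.buildProtocol pattern (not a verbatim copy of
# the original source):
from twisted.internet import reactor

def buildProtocol(self, addr):
    p = HTTPClientFactory.buildProtocol(self, addr)
    if self.timeout:
        # fire p.timeout() after self.timeout seconds unless the
        # download finishes first
        timeoutCall = reactor.callLater(self.timeout, p.timeout)
        self.deferred.addBoth(self._cancelTimeout, timeoutCall)
    return p

def _cancelTimeout(self, result, timeoutCall):
    if timeoutCall.active():
        timeoutCall.cancel()
    return result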
def _cb_bodydone(self, result, request, url):
    txresponse, body, flags = result
    status = int(txresponse.code)
    headers = Headers(txresponse.headers.getAllRawHeaders())
    respcls = responsetypes.from_args(headers=headers, url=url)
    return respcls(url=url, status=status, headers=headers, body=body, flags=flags)
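# Usage sketch for responsetypes.from_args as called above: it picks the most
# specific Response subclass from the headers and URL. The values here are
# illustrative only.
from scrapy.http import Headers
from scrapy.responsetypes import responsetypes

headers = Headers({'Content-Type': ['text/html; charset=utf-8']})
respcls = responsetypes.from_args(headers=headers, url='http://example.com/')
# respcls is scrapy.http.HtmlResponse here; an unrecognized binary
# Content-Type would yield the plain scrapy.http.Response instead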
def retrieve_response(self, spider, request):
    data = self._read_data(spider, request)
    if data is None:
        return  # not cached
    url = data['url']
    status = data['status']
    headers = Headers(data['headers'])
    body = data['body']
    respcls = responsetypes.from_args(headers=headers, url=url)
    response = respcls(url=url, headers=headers, status=status, body=body)
    return response
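# A hedged sketch of the storage counterpart this retrieve_response implies:
# whatever _read_data returns must have been written with matching keys.
# _request_key, self.db, and the key layout are assumptions inferred from the
# snippet above, not a verbatim copy of the original storage backend.
import pickle
from time import time

def store_response(self, spider, request, response):
    key = self._request_key(request)
    data = {
        'status': response.status,
        'url': response.url,
        'headers': dict(response.headers),
        'body': response.body,
    }
    self.db['%s_data' % key] = pickle.dumps(data, protocol=2)
    self.db['%s_time' % key] = str(time())  # freshness timestamp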
def retrieve_response(self, spider, request): """Return response if present in cache, or None otherwise.""" metadata = self._read_meta(spider, request) if metadata is None: return # not cached rpath = self._get_request_path(spider, request) with open(os.path.join(rpath, 'response_body'), 'rb') as f: body = f.read() with open(os.path.join(rpath, 'response_headers'), 'rb') as f: rawheaders = f.read() url = metadata.get('response_url') status = metadata['status'] headers = Headers(headers_raw_to_dict(rawheaders)) respcls = responsetypes.from_args(headers=headers, url=url) response = respcls(url=url, headers=headers, status=status, body=body) return response