def test_none_value(self):
    """Check that a header overwritten with ``None`` reads back as absent.

    The test expects ``get`` to return ``None`` and ``getlist`` to return an
    empty list, even after a later ``setdefault`` call for the same key.
    """
    headers = Headers()

    # Populate the key first, then overwrite it with None.
    headers['foo'] = 'bar'
    headers['foo'] = None

    # The expectations below require setdefault to leave the None in place.
    headers.setdefault('foo', 'bar')

    self.assertEqual(headers.get('foo'), None)
    self.assertEqual(headers.getlist('foo'), [])
def test_none_value(self):
    """Check that a header overwritten with ``None`` reads back as absent.

    The test expects ``get`` to return ``None`` and ``getlist`` to return an
    empty list, even after a later ``setdefault`` call for the same key.
    """
    headers = Headers()

    # Populate the key first, then overwrite it with None.
    headers['foo'] = 'bar'
    headers['foo'] = None

    # The expectations below require setdefault to leave the None in place.
    headers.setdefault('foo', 'bar')

    self.assertEqual(headers.get('foo'), None)
    self.assertEqual(headers.getlist('foo'), [])
class ScrapyHTTPClientFactory(HTTPClientFactory):
    """Scrapy implementation of the HTTPClientFactory.

    Built from a Scrapy request: copies its URL (without fragment), method,
    body and headers, derives the connection attributes (honouring
    ``request.meta['proxy']`` and ``request.meta['bind_address']``) and
    exposes ``self.deferred``, whose result is converted into a response
    object by ``_build_response``.

    NOTE(review): the original docstring said this class overrides the
    "serUrl" (presumably ``setURL``) method to use a Url object that caches
    the parse result; no such override is visible in this block -- confirm.
    """

    # Class-level settings; their consumers are not visible in this block.
    protocol = ScrapyHTTPPageGetter
    waiting = 1
    noisy = False
    followRedirect = False
    afterFoundGet = False

    def __init__(self, request, timeout=180):
        """Initialise the factory from ``request``.

        ``timeout`` is only used when ``request.meta['download_timeout']``
        is missing or falsy.
        """
        # Drop the fragment part of the URL.
        self.url = urldefrag(request.url)[0]
        self.method = request.method
        # An empty body is normalised to None.
        self.body = request.body or None
        self.headers = Headers(request.headers)
        self.response_headers = None
        self.timeout = request.meta.get('download_timeout') or timeout
        # Reference point for the latency computed in _build_response.
        self.start_time = time()
        self.deferred = defer.Deferred().addCallback(self._build_response, request)

        # Fixes Twisted 11.1.0+ support as HTTPClientFactory is expected
        # to have _disconnectedDeferred. See Twisted r32329.
        # As Scrapy implements its own logic to handle redirects, it is not
        # necessary to add the callback _waitForDisconnect.
        # Specifically this avoids the AttributeError exception when
        # clientConnectionFailed method is called.
        self._disconnectedDeferred = defer.Deferred()

        self._set_connection_attributes(request)

        # set Host header based on url
        self.headers.setdefault('Host', self.netloc)

        # set Content-Length based len of body
        if self.body is not None:
            self.headers['Content-Length'] = len(self.body)
            # just in case a broken http/1.1 decides to keep connection alive
            # NOTE(review): nesting under the ``if`` was reconstructed from
            # whitespace-collapsed source -- confirm against the original.
            self.headers.setdefault("Connection", "close")

    def _build_response(self, body, request):
        """Callback of ``self.deferred``: turn ``body`` into a response.

        Stores the time between construction and header arrival in
        ``request.meta['download_latency']``. ``self.status`` is read here
        but assigned outside this block.
        """
        request.meta['download_latency'] = self.headers_time-self.start_time
        status = int(self.status)
        headers = Headers(self.response_headers)
        # Pick the response class from the headers and the URL.
        respcls = responsetypes.from_args(headers=headers, url=self.url)
        return respcls(url=self.url, status=status, headers=headers, body=body)

    def _set_connection_attributes(self, request):
        """Set scheme, netloc, host, port, path and bind address for ``request``."""
        parsed = urlparse_cached(request)
        self.scheme, self.netloc, self.host, self.port, self.path = _parsed_url_args(parsed)
        self.bind_address = request.meta.get("bind_address")
        proxy = request.meta.get('proxy')
        if proxy:
            # Connect to the proxy instead; netloc keeps the target's value
            # and the path becomes the full URL.
            self.scheme, _, self.host, self.port, _ = _parse(proxy)
            self.path = self.url

    def gotHeaders(self, headers):
        """Record when the response headers arrived and keep them."""
        self.headers_time = time()
        self.response_headers = headers
def test_none_value(self):
    """Check that a header overwritten with ``None`` reads back as absent.

    The test expects ``get`` to return ``None`` and ``getlist`` to return an
    empty list, even after a later ``setdefault`` call for the same key.
    """
    headers = Headers()

    # Populate the key first, then overwrite it with None.
    headers["foo"] = "bar"
    headers["foo"] = None

    # The expectations below require setdefault to leave the None in place.
    headers.setdefault("foo", "bar")

    self.assertEqual(headers.get("foo"), None)
    self.assertEqual(headers.getlist("foo"), [])
class ScrapyHTTPClientFactory(HTTPClientFactory):
    """Scrapy implementation of the HTTPClientFactory.

    Built from a Scrapy request: copies its URL (without fragment), method,
    body and headers, derives the connection attributes (honouring
    ``request.meta['proxy']``) and exposes ``self.deferred``, whose result
    is converted into a response object by ``_build_response``.

    NOTE(review): the original docstring said this class overrides the
    "serUrl" (presumably ``setURL``) method to use a Url object that caches
    the parse result; no such override is visible in this block -- confirm.
    """

    # Class-level settings; their consumers are not visible in this block.
    protocol = ScrapyHTTPPageGetter
    waiting = 1
    noisy = False
    followRedirect = False
    afterFoundGet = False

    def __init__(self, request, timeout=180):
        """Initialise the factory from ``request``.

        ``timeout`` is only used when ``request.meta['download_timeout']``
        is missing or falsy.
        """
        # Drop the fragment part of the URL.
        self.url = urldefrag(request.url)[0]
        self.method = request.method
        # An empty body is normalised to None.
        self.body = request.body or None
        self.headers = Headers(request.headers)
        self.response_headers = None
        self.timeout = request.meta.get('download_timeout') or timeout
        # Reference point for the latency computed in _build_response.
        self.start_time = time()
        self.deferred = defer.Deferred().addCallback(self._build_response, request)

        # Fixes Twisted 11.1.0+ support as HTTPClientFactory is expected
        # to have _disconnectedDeferred. See Twisted r32329.
        # As Scrapy implements its own logic to handle redirects, it is not
        # necessary to add the callback _waitForDisconnect.
        # Specifically this avoids the AttributeError exception when
        # clientConnectionFailed method is called.
        self._disconnectedDeferred = defer.Deferred()

        self._set_connection_attributes(request)

        # set Host header based on url
        self.headers.setdefault('Host', self.netloc)

        # set Content-Length based len of body
        if self.body is not None:
            self.headers['Content-Length'] = len(self.body)
            # just in case a broken http/1.1 decides to keep connection alive
            # NOTE(review): nesting under the ``if`` was reconstructed from
            # whitespace-collapsed source -- confirm against the original.
            self.headers.setdefault("Connection", "close")

    def _build_response(self, body, request):
        """Callback of ``self.deferred``: turn ``body`` into a response.

        Stores the time between construction and header arrival in
        ``request.meta['download_latency']``. ``self.status`` is read here
        but assigned outside this block.
        """
        request.meta['download_latency'] = self.headers_time-self.start_time
        status = int(self.status)
        headers = Headers(self.response_headers)
        # Pick the response class from the headers and the URL.
        respcls = responsetypes.from_args(headers=headers, url=self.url)
        return respcls(url=self.url, status=status, headers=headers, body=body)

    def _set_connection_attributes(self, request):
        """Set scheme, netloc, host, port and path for ``request``."""
        parsed = urlparse_cached(request)
        self.scheme, self.netloc, self.host, self.port, self.path = _parsed_url_args(parsed)
        proxy = request.meta.get('proxy')
        if proxy:
            # Connect to the proxy instead; netloc keeps the target's value
            # and the path becomes the full URL.
            self.scheme, _, self.host, self.port, _ = _parse(proxy)
            self.path = self.url

    def gotHeaders(self, headers):
        """Record when the response headers arrived and keep them."""
        self.headers_time = time()
        self.response_headers = headers
class ScrapyHTTPClientFactory(HTTPClientFactory):
    """Scrapy implementation of the HTTPClientFactory.

    Built from a Scrapy request: copies its URL (without fragment), method,
    body and headers, derives the connection attributes and exposes
    ``self.deferred``, whose result is converted into a response object by
    ``_build_response``. When ``request.meta['proxy']`` is set and the
    target URL is https, the attributes describing a tunnel through the
    proxy are filled in as well.

    NOTE(review): the original docstring said this class overrides the
    "serUrl" (presumably ``setURL``) method to use a Url object that caches
    the parse result; no such override is visible in this block -- confirm.
    """

    # Class-level settings; their consumers are not visible in this block.
    protocol = ScrapyHTTPPageGetter
    waiting = 1
    noisy = False
    followRedirect = False
    afterFoundGet = False

    def __init__(self, request, timeout=180):
        """Initialise the factory from ``request``.

        ``timeout`` is only used when ``request.meta['download_timeout']``
        is missing or falsy.
        """
        # Drop the fragment part of the URL.
        self.url = urldefrag(request.url)[0]
        self.method = request.method
        # An empty body is normalised to None.
        self.body = request.body or None
        # Must exist before _set_connection_attributes, which may add a
        # Proxy-Connection header.
        self.headers = Headers(request.headers)
        self.response_headers = None
        self.timeout = request.meta.get('download_timeout') or timeout
        # Reference point for the latency computed in _build_response.
        self.start_time = time()
        self.deferred = defer.Deferred().addCallback(self._build_response, request)

        self._set_connection_attributes(request)

        # set Host header based on url
        self.headers.setdefault('Host', self.netloc)

        # set Content-Length based len of body
        if self.body is not None:
            self.headers['Content-Length'] = len(self.body)
            # just in case a broken http/1.1 decides to keep connection alive
            # NOTE(review): nesting under the ``if`` was reconstructed from
            # whitespace-collapsed source -- confirm against the original.
            self.headers.setdefault("Connection", "close")

    def _build_response(self, body, request):
        """Callback of ``self.deferred``: turn ``body`` into a response.

        Stores the time between construction and header arrival in
        ``request.meta['download_latency']``. ``self.status`` is read here
        but assigned outside this block.
        """
        request.meta['download_latency'] = self.headers_time-self.start_time
        status = int(self.status)
        headers = Headers(self.response_headers)
        # Pick the response class from the headers and the URL.
        respcls = responsetypes.from_args(headers=headers, url=self.url)
        return respcls(url=self.url, status=status, headers=headers, body=body)

    def _set_connection_attributes(self, request):
        """Set the connection and tunnel attributes for ``request``.

        Always sets scheme, netloc, host, port, path and ``use_tunnel``.
        ``tunnel_to_host`` and ``tunnel_to_port`` are only assigned when a
        proxy is configured and the target scheme is https.
        """
        parsed = urlparse_cached(request)
        self.scheme, self.netloc, self.host, self.port, self.path = _parsed_url_args(parsed)
        self.use_tunnel = False
        proxy = request.meta.get('proxy')
        if proxy:
            # Remember the target before the proxy's values overwrite it.
            old_scheme, old_host, old_port = self.scheme, self.host, self.port
            self.scheme, _, self.host, self.port, _ = _parse(proxy)
            self.path = self.url
            if old_scheme=="https":
                # https through a proxy: flag the tunnel and record the
                # original host and port as its destination.
                self.headers['Proxy-Connection'] = 'keep-alive'
                self.use_tunnel = True
                self.tunnel_to_host = old_host
                self.tunnel_to_port = old_port

    def gotHeaders(self, headers):
        """Record when the response headers arrived and keep them."""
        self.headers_time = time()
        self.response_headers = headers
def test_int_value(self):
    """Check that integer header values read back as byte strings.

    Integers are supplied through the constructor, item assignment,
    ``setdefault`` and ``setlist``; ``getlist`` is expected to return bytes
    in every case.
    """
    headers = Headers({'hey': 5})
    headers['foo'] = 1
    headers.setdefault('bar', 2)
    headers.setlist('buz', [1, 'dos', 3])

    # Header name -> expected getlist() result, checked in this order.
    expected = {
        'foo': [b'1'],
        'bar': [b'2'],
        'buz': [b'1', b'dos', b'3'],
        'hey': [b'5'],
    }
    for name, values in expected.items():
        self.assertEqual(headers.getlist(name), values)
def test_int_value(self):
    """Check that integer header values read back as byte strings.

    Integers are supplied through the constructor, item assignment,
    ``setdefault`` and ``setlist``; ``getlist`` is expected to return bytes
    in every case.
    """
    headers = Headers({'hey': 5})
    headers['foo'] = 1
    headers.setdefault('bar', 2)
    headers.setlist('buz', [1, 'dos', 3])

    # Header name -> expected getlist() result, checked in this order.
    expected = {
        'foo': [b'1'],
        'bar': [b'2'],
        'buz': [b'1', b'dos', b'3'],
        'hey': [b'5'],
    }
    for name, values in expected.items():
        self.assertEqual(headers.getlist(name), values)
def test_int_value(self):
    """Check that integer header values read back as byte strings.

    Integers are supplied through the constructor, item assignment,
    ``setdefault`` and ``setlist``; ``getlist`` is expected to return bytes
    in every case.
    """
    headers = Headers({"hey": 5})
    headers["foo"] = 1
    headers.setdefault("bar", 2)
    headers.setlist("buz", [1, "dos", 3])

    # Header name -> expected getlist() result, checked in this order.
    expected = {
        "foo": [b"1"],
        "bar": [b"2"],
        "buz": [b"1", b"dos", b"3"],
        "hey": [b"5"],
    }
    for name, values in expected.items():
        self.assertEqual(headers.getlist(name), values)
def test_setdefault(self):
    """Check what ``setdefault`` stores and returns.

    For a list default, the stored list must be a different object from the
    list passed in, and must be the object ``setdefault`` returned. For a
    scalar default, ``getlist`` must equal ``['ip1']`` and again be the
    returned object.
    """
    # Case 1: list default.
    headers = Headers()
    supplied = ['ip1', 'ip2']
    returned = headers.setdefault('X-Forwarded-For', supplied)
    assert headers.getlist('X-Forwarded-For') is not supplied
    assert headers.getlist('X-Forwarded-For') is returned

    # Case 2: scalar default on a fresh Headers instance.
    headers = Headers()
    returned = headers.setdefault('X-Forwarded-For', 'ip1')
    self.assertEqual(headers.getlist('X-Forwarded-For'), ['ip1'])
    assert headers.getlist('X-Forwarded-For') is returned
def test_setdefault(self):
    """Check what ``setdefault`` stores and returns.

    For a list default, the stored list must be a different object from the
    list passed in, and must be the object ``setdefault`` returned. For a
    scalar default, ``getlist`` must equal ``['ip1']`` and again be the
    returned object.
    """
    # Case 1: list default.
    headers = Headers()
    supplied = ['ip1', 'ip2']
    returned = headers.setdefault('X-Forwarded-For', supplied)
    assert headers.getlist('X-Forwarded-For') is not supplied
    assert headers.getlist('X-Forwarded-For') is returned

    # Case 2: scalar default on a fresh Headers instance.
    headers = Headers()
    returned = headers.setdefault('X-Forwarded-For', 'ip1')
    self.assertEqual(headers.getlist('X-Forwarded-For'), ['ip1'])
    assert headers.getlist('X-Forwarded-For') is returned
def test_setdefault(self):
    """Check what ``setdefault`` stores and returns.

    For a list default, the stored list must be a different object from the
    list passed in, and must be the object ``setdefault`` returned. For a
    scalar default, ``getlist`` must equal ``[b"ip1"]`` and again be the
    returned object.
    """
    # Case 1: list default.
    headers = Headers()
    supplied = ["ip1", "ip2"]
    returned = headers.setdefault("X-Forwarded-For", supplied)
    assert headers.getlist("X-Forwarded-For") is not supplied
    assert headers.getlist("X-Forwarded-For") is returned

    # Case 2: scalar default on a fresh Headers instance.
    headers = Headers()
    returned = headers.setdefault("X-Forwarded-For", "ip1")
    self.assertEqual(headers.getlist("X-Forwarded-For"), [b"ip1"])
    assert headers.getlist("X-Forwarded-For") is returned
class ScrapyHTTPClientFactory(HTTPClientFactory):
    """Scrapy implementation of the HTTPClientFactory.

    Built from a Scrapy request: copies its URL (without fragment), method,
    body and headers, derives the connection attributes (honouring
    ``request.meta['proxy']``) and exposes ``self.deferred``, whose result
    is converted into a response object by ``_build_response``.

    NOTE(review): the original docstring said this class overrides the
    "serUrl" (presumably ``setURL``) method to use a Url object that caches
    the parse result; no such override is visible in this block -- confirm.
    """

    # Class-level settings; their consumers are not visible in this block.
    protocol = ScrapyHTTPPageGetter
    waiting = 1
    noisy = False
    followRedirect = False
    afterFoundGet = False

    def __init__(self, request, timeout=180):
        """Initialise the factory from ``request``.

        ``timeout`` is only used when ``request.meta['download_timeout']``
        is missing or falsy.
        """
        # Drop the fragment part of the URL.
        self.url = urldefrag(request.url)[0]
        self.method = request.method
        # An empty body is normalised to None.
        self.body = request.body or None
        self.headers = Headers(request.headers)
        self.response_headers = None
        self.timeout = request.meta.get('download_timeout') or timeout
        self.deferred = defer.Deferred().addCallback(self._build_response)

        self._set_connection_attributes(request)

        # set Host header based on url
        self.headers.setdefault('Host', self.netloc)

        # set Content-Length based len of body
        if self.body is not None:
            self.headers['Content-Length'] = len(self.body)
            # just in case a broken http/1.1 decides to keep connection alive
            # NOTE(review): nesting under the ``if`` was reconstructed from
            # whitespace-collapsed source -- confirm against the original.
            self.headers.setdefault("Connection", "close")

    def _build_response(self, body):
        """Callback of ``self.deferred``: turn ``body`` into a response.

        ``self.status`` is read here but assigned outside this block.
        """
        status = int(self.status)
        headers = Headers(self.response_headers)
        # Pick the response class from the headers and the URL.
        respcls = responsetypes.from_args(headers=headers, url=self.url)
        return respcls(url=self.url, status=status, headers=headers, body=body)

    def _set_connection_attributes(self, request):
        """Set scheme, netloc, host, port and path for ``request``."""
        parsed = urlparse_cached(request)
        self.scheme, self.netloc, self.host, self.port, self.path = _parsed_url_args(parsed)
        proxy = request.meta.get('proxy')
        if proxy:
            # Connect to the proxy instead; netloc keeps the target's value
            # and the path becomes the full URL.
            self.scheme, _, self.host, self.port, _ = _parse(proxy)
            self.path = self.url

    def gotHeaders(self, headers):
        """Keep the response headers for ``_build_response``."""
        self.response_headers = headers
class ScrapyHTTPClientFactory(ClientFactory):
    """Client factory that performs one HTTP request for a Scrapy request.

    The factory copies URL, method, body and headers from the request,
    derives the connection attributes (honouring ``request.meta['proxy']``)
    and exposes ``self.deferred``. That deferred fires with the response
    built by ``_build_response`` when ``page`` is called, or with the
    failure passed to ``noPage`` / ``clientConnectionFailed``.
    """

    protocol = ScrapyHTTPPageGetter

    # Truthy until a result or failure has been delivered to self.deferred;
    # guards against firing the deferred twice.
    waiting = 1
    noisy = False
    # Copied onto every protocol instance in buildProtocol.
    followRedirect = False
    afterFoundGet = False

    def _build_response(self, body, request):
        """Callback of ``self.deferred``: turn ``body`` into a response.

        Stores the time between construction and header arrival in
        ``request.meta['download_latency']``.
        """
        request.meta['download_latency'] = self.headers_time - self.start_time
        status = int(self.status)
        headers = Headers(self.response_headers)
        # Pick the response class from the headers and the (text) URL.
        respcls = responsetypes.from_args(headers=headers, url=self._url)
        return respcls(
            url=self._url,
            status=status,
            headers=headers,
            body=body,
            protocol=to_unicode(self.version),
        )

    def _set_connection_attributes(self, request):
        """Set scheme, netloc, host, port and path for ``request``."""
        parsed = urlparse_cached(request)
        self.scheme, self.netloc, self.host, self.port, self.path = _parsed_url_args(
            parsed)
        proxy = request.meta.get('proxy')
        if proxy:
            # Connect to the proxy instead; netloc keeps the target's value
            # and the path becomes the full URL.
            self.scheme, _, self.host, self.port, _ = _parse(proxy)
            self.path = self.url

    def __init__(self, request, timeout=180):
        """Initialise the factory from ``request``.

        ``timeout`` is only used when ``request.meta['download_timeout']``
        is missing or falsy.
        """
        # Text URL without the fragment; used for responses and repr.
        self._url = urldefrag(request.url)[0]
        # converting to bytes to comply to Twisted interface
        self.url = to_bytes(self._url, encoding='ascii')
        self.method = to_bytes(request.method, encoding='ascii')
        # An empty body is normalised to None.
        self.body = request.body or None
        self.headers = Headers(request.headers)
        self.response_headers = None
        self.timeout = request.meta.get('download_timeout') or timeout
        # Reference point for the latency computed in _build_response.
        self.start_time = time()
        self.deferred = defer.Deferred().addCallback(self._build_response, request)

        # Fixes Twisted 11.1.0+ support as HTTPClientFactory is expected
        # to have _disconnectedDeferred. See Twisted r32329.
        # As Scrapy implements its own logic to handle redirects, it is not
        # necessary to add the callback _waitForDisconnect.
        # Specifically this avoids the AttributeError exception when
        # clientConnectionFailed method is called.
        self._disconnectedDeferred = defer.Deferred()

        self._set_connection_attributes(request)

        # set Host header based on url
        self.headers.setdefault('Host', self.netloc)

        # set Content-Length based len of body
        if self.body is not None:
            self.headers['Content-Length'] = len(self.body)
            # just in case a broken http/1.1 decides to keep connection alive
            self.headers.setdefault("Connection", "close")
        # Content-Length must be specified in POST method even with no body
        elif self.method == b'POST':
            self.headers['Content-Length'] = 0

    def __repr__(self):
        """Return ``<ClassName: url>`` using the text form of the URL."""
        # self.url is bytes: interpolating it would render as b'...' and
        # emit BytesWarning when Python runs with -b, so use self._url.
        return f"<{self.__class__.__name__}: {self._url}>"

    def _cancelTimeout(self, result, timeoutCall):
        """Cancel ``timeoutCall`` if still pending; pass ``result`` through."""
        if timeoutCall.active():
            timeoutCall.cancel()
        return result

    def buildProtocol(self, addr):
        """Build the protocol, copy redirect settings and arm the timeout."""
        p = ClientFactory.buildProtocol(self, addr)
        p.followRedirect = self.followRedirect
        p.afterFoundGet = self.afterFoundGet
        if self.timeout:
            # Schedule the protocol's timeout and make sure the delayed call
            # is cancelled once the deferred fires either way.
            timeoutCall = reactor.callLater(self.timeout, p.timeout)
            self.deferred.addBoth(self._cancelTimeout, timeoutCall)
        return p

    def gotHeaders(self, headers):
        """Record when the response headers arrived and keep them."""
        self.headers_time = time()
        self.response_headers = headers

    def gotStatus(self, version, status, message):
        """
        Set the status of the request on us.
        @param version: The HTTP version.
        @type version: L{bytes}
        @param status: The HTTP status code, an integer represented as a
            bytestring.
        @type status: L{bytes}
        @param message: The HTTP status message.
        @type message: L{bytes}
        """
        self.version, self.status, self.message = version, status, message

    def page(self, page):
        """Deliver the response body ``page`` to the deferred, once."""
        if self.waiting:
            self.waiting = 0
            self.deferred.callback(page)

    def noPage(self, reason):
        """Deliver the failure ``reason`` to the deferred, once."""
        if self.waiting:
            self.waiting = 0
            self.deferred.errback(reason)

    def clientConnectionFailed(self, _, reason):
        """
        When a connection attempt fails, the request cannot be issued.  If no
        result has yet been provided to the result Deferred, provide the
        connection failure reason as an error result.
        """
        if self.waiting:
            self.waiting = 0
            # If the connection attempt failed, there is nothing more to
            # disconnect, so just fire that Deferred now.
            self._disconnectedDeferred.callback(None)
            self.deferred.errback(reason)