class ScrapyHTTPClientFactory(HTTPClientFactory):
    """Scrapy implementation of the HTTPClientFactory overwriting the
    setUrl method to make use of our Url object that caches the parse
    result.
    """

    protocol = ScrapyHTTPPageGetter
    waiting = 1
    noisy = False
    followRedirect = False
    afterFoundGet = False

    def __init__(self, request, timeout=180):
        self.url = urldefrag(request.url)[0]
        self.method = request.method
        self.body = request.body or None
        self.headers = Headers(request.headers)
        self.response_headers = None
        self.timeout = request.meta.get('download_timeout') or timeout
        self.start_time = time()
        self.deferred = defer.Deferred().addCallback(self._build_response, request)

        # Fixes Twisted 11.1.0+ support as HTTPClientFactory is expected
        # to have _disconnectedDeferred. See Twisted r32329.
        # Since Scrapy implements its own redirect logic, there is no need
        # to add the _waitForDisconnect callback.
        # Specifically, this avoids the AttributeError exception when the
        # clientConnectionFailed method is called.
        self._disconnectedDeferred = defer.Deferred()

        self._set_connection_attributes(request)

        # set Host header based on url
        self.headers.setdefault('Host', self.netloc)

        # set Content-Length based on the length of the body
        if self.body is not None:
            self.headers['Content-Length'] = len(self.body)
            # just in case a broken http/1.1 decides to keep connection alive
            self.headers.setdefault("Connection", "close")

    def _build_response(self, body, request):
        request.meta['download_latency'] = self.headers_time - self.start_time
        status = int(self.status)
        headers = Headers(self.response_headers)
        respcls = responsetypes.from_args(headers=headers, url=self.url)
        return respcls(url=self.url, status=status, headers=headers, body=body)

    def _set_connection_attributes(self, request):
        parsed = urlparse_cached(request)
        self.scheme, self.netloc, self.host, self.port, self.path = _parsed_url_args(parsed)
        self.bind_address = request.meta.get("bind_address")
        proxy = request.meta.get('proxy')
        if proxy:
            self.scheme, _, self.host, self.port, _ = _parse(proxy)
            self.path = self.url

    def gotHeaders(self, headers):
        self.headers_time = time()
        self.response_headers = headers
def retrieve_response(self, spider, request):
    key = self._request_key(spider, request)
    try:
        gf = self.fs[spider].fetchDocument(key, rawResults=True)
        print("found\t" + str(request.url), request.priority)
    except pyArango.theExceptions.DocumentNotFoundError:
        print("not found\t" + str(request.url), request.priority)
        return
    url = str(gf["url"])
    status = str(gf["status"])
    bod = gf["body"].encode("utf-8")
    body = zlib.decompress(base64.urlsafe_b64decode(bod))
    try:
        if gf["headers"].get("content-encoding") == "gzip" and not gzip_magic_number(body):
            # stored header claims gzip but the body is already decompressed
            del gf["headers"]["content-encoding"]
        elif not gf["headers"].get("content-encoding") and gzip_magic_number(body):
            gf["headers"]["content-encoding"] = "gzip"
        headers = Headers([(x, str(y)) for x, y in gf["headers"].items()])
    except AttributeError:
        # stored headers are not a mapping; fall back to sniffing the body
        if gzip_magic_number(body):
            headers = Headers((("Content-Encoding", "gzip"),))
        else:
            headers = None
    respcls = responsetypes.from_args(headers=headers, url=url, body=body)
    response = respcls(url=url, headers=headers, status=status, body=body)
    return response
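The gzip_magic_number helper used above only needs to sniff the first bytes of the body. A minimal, assumption-flagged sketch of such a helper is below; the real one in this codebase may differ (Scrapy's own scrapy.utils.gz.gzip_magic_number, for instance, inspects a response object rather than raw bytes):

def gzip_magic_number(data):
    # hypothetical sketch: gzip streams start with the magic bytes 0x1f 0x8b
    return data[:2] == b'\x1f\x8b'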
def test_none_value(self):
    h1 = Headers()
    h1['foo'] = 'bar'
    h1['foo'] = None
    h1.setdefault('foo', 'bar')
    self.assertEqual(h1.get('foo'), None)
    self.assertEqual(h1.getlist('foo'), [])
def test_copy(self):
    h1 = Headers({"header1": ["value1", "value2"]})
    h2 = copy.copy(h1)
    self.assertEqual(h1, h2)
    self.assertEqual(h1.getlist("header1"), h2.getlist("header1"))
    assert h1.getlist("header1") is not h2.getlist("header1")
    assert isinstance(h2, Headers)
def test_from_args(self):
    # TODO: add more tests that check precedence between the different arguments
    mappings = [
        ({'url': 'http://www.example.com/data.csv'}, TextResponse),
        # headers take precedence over url
        ({'headers': Headers({'Content-Type': ['text/html; charset=utf-8']}),
          'url': 'http://www.example.com/item/'}, HtmlResponse),
        ({'headers': Headers({'Content-Disposition': ['attachment; filename="data.xml.gz"']}),
          'url': 'http://www.example.com/page/'}, Response),
    ]
    for source, cls in mappings:
        retcls = responsetypes.from_args(**source)
        assert retcls is cls, "%s ==> %s != %s" % (source, retcls, cls)
def test_copy(self):
    h1 = Headers({'header1': ['value1', 'value2']})
    h2 = copy.copy(h1)
    self.assertEqual(h1, h2)
    self.assertEqual(h1.getlist('header1'), h2.getlist('header1'))
    assert h1.getlist('header1') is not h2.getlist('header1')
    assert isinstance(h2, Headers)
def test_multivalue(self):
    h = Headers()
    h["X-Forwarded-For"] = hlist = ["ip1", "ip2"]
    self.assertEqual(h["X-Forwarded-For"], b"ip2")
    self.assertEqual(h.get("X-Forwarded-For"), b"ip2")
    self.assertEqual(h.getlist("X-Forwarded-For"), [b"ip1", b"ip2"])
    assert h.getlist("X-Forwarded-For") is not hlist
def __init__(self, request, timeout=180):
    self._url = urldefrag(request.url)[0]
    # converting to bytes to comply with Twisted's interface
    self.url = to_bytes(self._url, encoding='ascii')
    self.method = to_bytes(request.method, encoding='ascii')
    self.body = request.body or None
    self.headers = Headers(request.headers)
    self.response_headers = None
    self.timeout = request.meta.get('download_timeout') or timeout
    self.start_time = time()
    self.deferred = defer.Deferred().addCallback(self._build_response, request)

    # Fixes Twisted 11.1.0+ support as HTTPClientFactory is expected
    # to have _disconnectedDeferred. See Twisted r32329.
    # Since Scrapy implements its own redirect logic, there is no need
    # to add the _waitForDisconnect callback.
    # Specifically, this avoids the AttributeError exception when the
    # clientConnectionFailed method is called.
    self._disconnectedDeferred = defer.Deferred()

    self._set_connection_attributes(request)

    # set Host header based on url
    self.headers.setdefault('Host', self.netloc)

    # set Content-Length based on the length of the body
    if self.body is not None:
        self.headers['Content-Length'] = len(self.body)
        # just in case a broken http/1.1 decides to keep connection alive
        self.headers.setdefault("Connection", "close")
    # Content-Length must be specified in POST method even with no body
    elif self.method == b'POST':
        self.headers['Content-Length'] = 0
def test_multivalue(self):
    h = Headers()
    h['X-Forwarded-For'] = hlist = ['ip1', 'ip2']
    self.assertEqual(h['X-Forwarded-For'], b'ip2')
    self.assertEqual(h.get('X-Forwarded-For'), b'ip2')
    self.assertEqual(h.getlist('X-Forwarded-For'), [b'ip1', b'ip2'])
    assert h.getlist('X-Forwarded-For') is not hlist
class ScrapyHTTPClientFactory(HTTPClientFactory):
    """Scrapy implementation of the HTTPClientFactory overwriting the
    setUrl method to make use of our Url object that caches the parse
    result.
    """

    protocol = ScrapyHTTPPageGetter
    waiting = 1
    noisy = False
    followRedirect = False
    afterFoundGet = False

    def __init__(self, request, timeout=180):
        self.url = urldefrag(request.url)[0]
        self.method = request.method
        self.body = request.body or None
        self.headers = Headers(request.headers)
        self.response_headers = None
        self.timeout = request.meta.get('download_timeout') or timeout
        self.start_time = time()
        self.deferred = defer.Deferred().addCallback(self._build_response, request)

        # Fixes Twisted 11.1.0+ support as HTTPClientFactory is expected
        # to have _disconnectedDeferred. See Twisted r32329.
        # Since Scrapy implements its own redirect logic, there is no need
        # to add the _waitForDisconnect callback.
        # Specifically, this avoids the AttributeError exception when the
        # clientConnectionFailed method is called.
        self._disconnectedDeferred = defer.Deferred()

        self._set_connection_attributes(request)

        # set Host header based on url
        self.headers.setdefault('Host', self.netloc)

        # set Content-Length based on the length of the body
        if self.body is not None:
            self.headers['Content-Length'] = len(self.body)
            # just in case a broken http/1.1 decides to keep connection alive
            self.headers.setdefault("Connection", "close")

    def _build_response(self, body, request):
        request.meta['download_latency'] = self.headers_time - self.start_time
        status = int(self.status)
        headers = Headers(self.response_headers)
        respcls = responsetypes.from_args(headers=headers, url=self.url)
        return respcls(url=self.url, status=status, headers=headers, body=body)

    def _set_connection_attributes(self, request):
        parsed = urlparse_cached(request)
        self.scheme, self.netloc, self.host, self.port, self.path = _parsed_url_args(parsed)
        proxy = request.meta.get('proxy')
        if proxy:
            self.scheme, _, self.host, self.port, _ = _parse(proxy)
            self.path = self.url

    def gotHeaders(self, headers):
        self.headers_time = time()
        self.response_headers = headers
def test_is_gzipped_case_insensitive(self):
    hdrs = Headers({"Content-Type": "Application/X-Gzip"})
    r1 = Response("http://www.example.com", headers=hdrs)
    self.assertTrue(is_gzipped(r1))

    hdrs = Headers({"Content-Type": "application/X-GZIP ; charset=utf-8"})
    r1 = Response("http://www.example.com", headers=hdrs)
    self.assertTrue(is_gzipped(r1))
class ScrapyHTTPClientFactory(HTTPClientFactory):
    """Scrapy implementation of the HTTPClientFactory overwriting the
    setUrl method to make use of our Url object that caches the parse
    result.
    """

    protocol = ScrapyHTTPPageGetter
    waiting = 1
    noisy = False
    followRedirect = False
    afterFoundGet = False

    def __init__(self, request, timeout=180):
        self.url = urldefrag(request.url)[0]
        self.method = request.method
        self.body = request.body or None
        self.headers = Headers(request.headers)
        self.response_headers = None
        self.timeout = request.meta.get('download_timeout') or timeout
        self.start_time = time()
        self.deferred = defer.Deferred().addCallback(self._build_response, request)
        self._set_connection_attributes(request)

        # set Host header based on url
        self.headers.setdefault('Host', self.netloc)

        # set Content-Length based on the length of the body
        if self.body is not None:
            self.headers['Content-Length'] = len(self.body)
            # just in case a broken http/1.1 decides to keep connection alive
            self.headers.setdefault("Connection", "close")

    def _build_response(self, body, request):
        request.meta['download_latency'] = self.headers_time - self.start_time
        status = int(self.status)
        headers = Headers(self.response_headers)
        respcls = responsetypes.from_args(headers=headers, url=self.url)
        return respcls(url=self.url, status=status, headers=headers, body=body)

    def _set_connection_attributes(self, request):
        parsed = urlparse_cached(request)
        self.scheme, self.netloc, self.host, self.port, self.path = _parsed_url_args(parsed)
        self.use_tunnel = False
        proxy = request.meta.get('proxy')
        if proxy:
            old_scheme, old_host, old_port = self.scheme, self.host, self.port
            self.scheme, _, self.host, self.port, _ = _parse(proxy)
            self.path = self.url
            if old_scheme == "https":
                # HTTPS target behind a proxy: tunnel through it with CONNECT
                self.headers['Proxy-Connection'] = 'keep-alive'
                self.use_tunnel = True
                self.tunnel_to_host = old_host
                self.tunnel_to_port = old_port

    def gotHeaders(self, headers):
        self.headers_time = time()
        self.response_headers = headers
def test_invalid_value(self):
    self.assertRaisesRegex(TypeError, 'Unsupported value type',
                           Headers, {'foo': object()})
    self.assertRaisesRegex(TypeError, 'Unsupported value type',
                           Headers().__setitem__, 'foo', object())
    self.assertRaisesRegex(TypeError, 'Unsupported value type',
                           Headers().setdefault, 'foo', object())
    self.assertRaisesRegex(TypeError, 'Unsupported value type',
                           Headers().setlist, 'foo', [object()])
def test_appendlist(self):
    h1 = Headers({"header1": "value1"})
    h1.appendlist("header1", "value3")
    self.assertEqual(h1.getlist("header1"), [b"value1", b"value3"])

    h1 = Headers()
    h1.appendlist("header1", "value1")
    h1.appendlist("header1", "value3")
    self.assertEqual(h1.getlist("header1"), [b"value1", b"value3"])
def test_invalid_value(self):
    self.assertRaisesRegex(TypeError, "Unsupported value type",
                           Headers, {"foo": object()})
    self.assertRaisesRegex(TypeError, "Unsupported value type",
                           Headers().__setitem__, "foo", object())
    self.assertRaisesRegex(TypeError, "Unsupported value type",
                           Headers().setdefault, "foo", object())
    self.assertRaisesRegex(TypeError, "Unsupported value type",
                           Headers().setlist, "foo", [object()])
def test_appendlist(self):
    h1 = Headers({'header1': 'value1'})
    h1.appendlist('header1', 'value3')
    self.assertEqual(h1.getlist('header1'), ['value1', 'value3'])

    h1 = Headers()
    h1.appendlist('header1', 'value1')
    h1.appendlist('header1', 'value3')
    self.assertEqual(h1.getlist('header1'), ['value1', 'value3'])
class ScrapyHTTPPageGetter(HTTPClient):

    delimiter = '\n'

    def connectionMade(self):
        self.headers = Headers()  # bucket for response headers

        # Method command
        self.sendCommand(self.factory.method, self.factory.path)
        # Headers
        for key, values in self.factory.headers.items():
            for value in values:
                self.sendHeader(key, value)
        self.endHeaders()
        # Body
        if self.factory.body is not None:
            self.transport.write(self.factory.body)

    def extractHeader(self, header):
        key, val = header.split(':', 1)
        val = val.lstrip()
        self.handleHeader(key, val)
        if key.lower() == 'content-length':
            self.length = int(val)

    def lineReceived(self, line):
        try:
            HTTPClient.lineReceived(self, line.rstrip())
        except Exception:
            self.factory.add_invalid_header(line)

    def handleHeader(self, key, value):
        self.headers.appendlist(key, value)

    def handleStatus(self, version, status, message):
        self.factory.gotStatus(version, status, message)

    def handleEndHeaders(self):
        self.factory.gotHeaders(self.headers)

    def connectionLost(self, reason):
        HTTPClient.connectionLost(self, reason)
        self.factory.noPage(reason)

    def handleResponse(self, response):
        if self.factory.method.upper() == 'HEAD':
            self.factory.page('')
        else:
            self.factory.page(response)
        self.transport.loseConnection()

    def timeout(self):
        self.transport.loseConnection()
        self.factory.noPage(
            defer.TimeoutError("Getting %s took longer than %s seconds." %
                               (self.factory.url, self.factory.timeout)))
class ScrapyHTTPPageGetter(HTTPClient):

    delimiter = b'\n'

    def connectionMade(self):
        self.headers = Headers()  # bucket for response headers

        # Method command
        self.sendCommand(self.factory.method, self.factory.path)
        # Headers
        for key, values in self.factory.headers.items():
            for value in values:
                self.sendHeader(key, value)
        self.endHeaders()
        # Body
        if self.factory.body is not None:
            self.transport.write(self.factory.body)

    def lineReceived(self, line):
        return HTTPClient.lineReceived(self, line.rstrip())

    def handleHeader(self, key, value):
        self.headers.appendlist(key, value)

    def handleStatus(self, version, status, message):
        self.factory.gotStatus(version, status, message)

    def handleEndHeaders(self):
        self.factory.gotHeaders(self.headers)

    def connectionLost(self, reason):
        self._connection_lost_reason = reason
        HTTPClient.connectionLost(self, reason)
        self.factory.noPage(reason)

    def handleResponse(self, response):
        if self.factory.method.upper() == b'HEAD':
            self.factory.page(b'')
        elif self.length is not None and self.length > 0:
            self.factory.noPage(self._connection_lost_reason)
        else:
            self.factory.page(response)
        self.transport.loseConnection()

    def timeout(self):
        self.transport.loseConnection()
        # transport cleanup is needed for HTTPS connections
        if self.factory.url.startswith(b'https'):
            self.transport.stopProducing()
        self.factory.noPage(
            defer.TimeoutError("Getting %s took longer than %s seconds." %
                               (self.factory.url, self.factory.timeout)))
def test_iterables(self):
    idict = {'Content-Type': 'text/html', 'X-Forwarded-For': ['ip1', 'ip2']}

    h = Headers(idict)
    self.assertEqual(dict(h),
                     {'Content-Type': ['text/html'],
                      'X-Forwarded-For': ['ip1', 'ip2']})
    self.assertEqual(h.keys(), ['X-Forwarded-For', 'Content-Type'])
    self.assertEqual(h.items(),
                     [('X-Forwarded-For', ['ip1', 'ip2']),
                      ('Content-Type', ['text/html'])])
    self.assertEqual(list(h.iteritems()),
                     [('X-Forwarded-For', ['ip1', 'ip2']),
                      ('Content-Type', ['text/html'])])
    self.assertEqual(h.values(), ['ip2', 'text/html'])
def test_non_standard_line_endings(self):
    # regression test for: http://dev.scrapy.org/ticket/258
    factory = client.ScrapyHTTPClientFactory(Request(url='http://foo/bar'))
    protocol = client.ScrapyHTTPPageGetter()
    protocol.factory = factory
    protocol.headers = Headers()
    protocol.dataReceived(b"HTTP/1.0 200 OK\n")
    protocol.dataReceived(b"Hello: World\n")
    protocol.dataReceived(b"Foo: Bar\n")
    protocol.dataReceived(b"\n")
    self.assertEqual(protocol.headers,
                     Headers({'Hello': ['World'], 'Foo': ['Bar']}))
def test_headers_to_scrapy():
    assert headers_to_scrapy(None) == Headers()
    assert headers_to_scrapy({}) == Headers()
    assert headers_to_scrapy([]) == Headers()

    html_headers = Headers({'Content-Type': 'text/html'})

    assert headers_to_scrapy({'Content-Type': 'text/html'}) == html_headers
    assert headers_to_scrapy([('Content-Type', 'text/html')]) == html_headers
    assert headers_to_scrapy([{'name': 'Content-Type', 'value': 'text/html'}]) == html_headers
def test_iterables(self):
    idict = {'Content-Type': 'text/html', 'X-Forwarded-For': ['ip1', 'ip2']}

    h = Headers(idict)
    self.assertDictEqual(dict(h),
                         {b'Content-Type': [b'text/html'],
                          b'X-Forwarded-For': [b'ip1', b'ip2']})
    self.assertSortedEqual(h.keys(), [b'X-Forwarded-For', b'Content-Type'])
    self.assertSortedEqual(h.items(),
                           [(b'X-Forwarded-For', [b'ip1', b'ip2']),
                            (b'Content-Type', [b'text/html'])])
    self.assertSortedEqual(h.values(), [b'ip2', b'text/html'])
def connectionMade(self):
    self.headers = Headers()  # bucket for response headers

    # Method command
    self.sendCommand(self.factory.method, self.factory.path)
    # Headers
    for key, values in self.factory.headers.items():
        for value in values:
            self.sendHeader(key, value)
    self.endHeaders()
    # Body
    if self.factory.body is not None:
        self.transport.write(self.factory.body)
def test_setdefault(self):
    h = Headers()
    hlist = ["ip1", "ip2"]
    olist = h.setdefault("X-Forwarded-For", hlist)
    assert h.getlist("X-Forwarded-For") is not hlist
    assert h.getlist("X-Forwarded-For") is olist

    h = Headers()
    olist = h.setdefault("X-Forwarded-For", "ip1")
    self.assertEqual(h.getlist("X-Forwarded-For"), [b"ip1"])
    assert h.getlist("X-Forwarded-For") is olist
def retrieve_response(self, spider, request):
    """
    Return response if present in cache, or None otherwise.
    """
    key = self._get_key(spider, request)
    epoch = request.meta.get('epoch')  # guaranteed to be True or a datetime
    s3_key = self._get_s3_key(key, epoch)
    logger.debug('S3Storage retrieving response for key %s.' % s3_key)
    if not s3_key:
        return
    logger.info('S3Storage (epoch => %s): retrieving response for %s.' %
                (epoch, request.url))
    try:
        data_string = s3_key.get_contents_as_string()
    except boto.exception.S3ResponseError as e:
        # See store_response for error descriptions
        raise e
    finally:
        s3_key.close()
    data = json.loads(data_string)
    metadata = data['metadata']
    request_headers = Headers(data['request_headers'])
    request_body = data['request_body']
    response_headers = Headers(data['response_headers'])
    response_body = data['response_body']
    if data.get('binary') is True:
        logger.debug('S3Storage: retrieved binary body')
        # binary bodies are stored base64-encoded
        response_body = base64.b64decode(response_body)
    url = metadata['response_url']
    status = metadata.get('status')
    logger.debug('S3Storage: response headers %s' % response_headers)
    respcls = responsetypes.from_args(headers=response_headers, url=url,
                                      body=response_body)
    logger.debug('S3Storage: response type %s' % respcls)
    return respcls(url=url, headers=response_headers, status=status,
                   body=response_body)
def test_setdefault(self):
    h = Headers()
    hlist = ['ip1', 'ip2']
    olist = h.setdefault('X-Forwarded-For', hlist)
    assert h.getlist('X-Forwarded-For') is not hlist
    assert h.getlist('X-Forwarded-For') is olist

    h = Headers()
    olist = h.setdefault('X-Forwarded-For', 'ip1')
    self.assertEqual(h.getlist('X-Forwarded-For'), ['ip1'])
    assert h.getlist('X-Forwarded-For') is olist
class ScrapyHTTPPageGetter(HTTPClient):

    delimiter = '\n'

    def connectionMade(self):
        self.headers = Headers()  # bucket for response headers

        # Method command
        self.sendCommand(self.factory.method, self.factory.path)
        # Headers
        for key, values in self.factory.headers.items():
            for value in values:
                self.sendHeader(key, value)
        self.endHeaders()
        # Body
        if self.factory.body is not None:
            self.transport.write(self.factory.body)

    def lineReceived(self, line):
        return HTTPClient.lineReceived(self, line.rstrip())

    def handleHeader(self, key, value):
        self.headers.appendlist(key, value)

    def handleStatus(self, version, status, message):
        self.factory.gotStatus(version, status, message)

    def handleEndHeaders(self):
        self.factory.gotHeaders(self.headers)

    def connectionLost(self, reason):
        HTTPClient.connectionLost(self, reason)
        self.factory.noPage(reason)

    def handleResponse(self, response):
        if self.factory.method.upper() == 'HEAD':
            self.factory.page('')
        elif self.length is not None and self.length != 0:
            self.factory.noPage(failure.Failure(
                PartialDownloadError(self.factory.status, None, response)))
        else:
            self.factory.page(response)
        self.transport.loseConnection()

    def timeout(self):
        self.transport.loseConnection()
        self.factory.noPage(
            defer.TimeoutError("Getting %s took longer than %s seconds." %
                               (self.factory.url, self.factory.timeout)))
class ScrapyHTTPClientFactory(HTTPClientFactory):
    """Scrapy implementation of the HTTPClientFactory overwriting the
    setUrl method to make use of our Url object that caches the parse
    result.
    """

    protocol = ScrapyHTTPPageGetter
    waiting = 1
    noisy = False
    followRedirect = False
    afterFoundGet = False

    def __init__(self, request, timeout=180):
        self.url = urldefrag(request.url)[0]
        self.method = request.method
        self.body = request.body or None
        self.headers = Headers(request.headers)
        self.response_headers = None
        self.timeout = request.meta.get('download_timeout') or timeout
        self.deferred = defer.Deferred().addCallback(self._build_response)
        self._set_connection_attributes(request)

        # set Host header based on url
        self.headers.setdefault('Host', self.netloc)

        # set Content-Length based on the length of the body
        if self.body is not None:
            self.headers['Content-Length'] = len(self.body)
            # just in case a broken http/1.1 decides to keep connection alive
            self.headers.setdefault("Connection", "close")

    def _build_response(self, body):
        status = int(self.status)
        headers = Headers(self.response_headers)
        respcls = responsetypes.from_args(headers=headers, url=self.url)
        return respcls(url=self.url, status=status, headers=headers, body=body)

    def _set_connection_attributes(self, request):
        parsed = urlparse_cached(request)
        self.scheme, self.netloc, self.host, self.port, self.path = _parsed_url_args(parsed)
        proxy = request.meta.get('proxy')
        if proxy:
            self.scheme, _, self.host, self.port, _ = _parse(proxy)
            self.path = self.url

    def gotHeaders(self, headers):
        self.response_headers = headers
def __init__(self, url, callback=None, method='GET', headers=None, body=None,
             cookies=None, meta=None, encoding='utf-8', priority=0,
             dont_filter=False, errback=None, flags=None):
    # The encoding is set first, since it is used to encode the URL and to
    # convert the body to bytes (if the body is a unicode string)
    self._encoding = encoding
    self.method = str(method).upper()
    self._set_url(url)
    self._set_body(body)
    assert isinstance(priority, int), f"priority must be an int, got {type(priority).__name__}"
    self.priority = priority

    if callback is not None and not callable(callback):
        raise TypeError(f"callback must be a callable, got {type(callback).__name__}")
    if errback is not None and not callable(errback):
        raise TypeError(f"errback must be a callable, got {type(errback).__name__}")
    assert callback or not errback, "cannot use errback without a callback"
    self.callback = callback
    self.errback = errback

    self.cookies = cookies or {}
    self.headers = Headers(headers or {}, encoding=encoding)
    self.dont_filter = dont_filter

    self._meta = dict(meta) if meta else None
    self.flags = [] if flags is None else list(flags)
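A minimal usage sketch for this constructor; the URL and header values are illustrative, and the byte-valued lookup assumes a Headers class that normalizes keys and encodes values to bytes, as in stock Scrapy:

req = Request(
    'http://www.example.com/',
    method='post',                     # normalized to 'POST' above
    headers={'Accept': 'text/html'},   # wrapped in Headers(...) internally
    meta={'download_timeout': 30},
)
assert req.method == 'POST'
assert req.headers['Accept'] == b'text/html'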
def _headers_from_twisted_response(response):
    headers = Headers()
    if response.length is not None:
        headers[b'Content-Length'] = str(response.length).encode()
    for key, value in response.headers.getAllRawHeaders():
        headers[key] = value
    return headers
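To see what this helper produces, one can feed it a stand-in for a Twisted response. TxResponseStub below is a hypothetical stub, not part of Twisted; it only mimics the two attributes the function reads (length and headers):

from twisted.web.http_headers import Headers as TxHeaders

class TxResponseStub:
    length = 11
    headers = TxHeaders({b'Content-Type': [b'text/html']})

h = _headers_from_twisted_response(TxResponseStub())
# h now holds Content-Length b'11' plus the raw Content-Type header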
def retrieve_response(self, spider, request):
    """Return response if present in cache, or None otherwise."""
    key = self._get_request_key(spider, request)
    expiration_time = self._get_expiration_time(spider)
    metadata = self._read_meta(key, expiration_time)
    if metadata is None:
        return  # not cached
    spider.log("%s: found cache for %s" % (self.__class__.__name__, request.url))
    res = self._get_cached_data(key, spider)
    if res is None:
        return None
    spider.log("%s: got response from cache for %s" % (self.__class__.__name__, request.url))
    response_url = res['response_url']
    response_body = str(res['response_body'])
    response_rawheaders = res['response_meta']['headers']
    status = res['response_meta']['status']
    response_headers = Headers(headers_raw_to_dict(response_rawheaders))
    response_cls = responsetypes.from_args(headers=response_headers,
                                           url=response_url)
    response = response_cls(url=response_url, headers=response_headers,
                            status=status, body=response_body)
    return response
def _cb_bodydone(self, result, request, url):
    txresponse, body, flags = result
    status = int(txresponse.code)
    headers = Headers(txresponse.headers.getAllRawHeaders())
    respcls = responsetypes.from_args(headers=headers, url=url, body=body)
    # respcls is e.g. <class 'scrapy.http.response.html.HtmlResponse'>
    return respcls(url=url, status=status, headers=headers, body=body, flags=flags)
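responsetypes.from_args, used here to pick the response class, can also be exercised on its own. A small sketch against the public Scrapy API (the URL is illustrative):

from scrapy.http import Headers
from scrapy.responsetypes import responsetypes

cls = responsetypes.from_args(
    headers=Headers({'Content-Type': 'text/html; charset=utf-8'}),
    url='http://www.example.com/',
)
# for an HTML content type this resolves to scrapy.http.HtmlResponse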
def headers_to_scrapy(headers):
    """
    Return a scrapy.http.Headers instance built from headers data.
    Three data formats are supported:

    * {name: value, ...} dict;
    * [(name, value), ...] list;
    * [{'name': name, 'value': value}, ...] list (HAR headers format).
    """
    if isinstance(headers or {}, dict):
        return Headers(headers or {})
    if isinstance(headers[0], dict):
        return Headers([(d['name'], d.get('value', '')) for d in headers])
    return Headers(headers)
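For instance, a HAR-style header list converts like this (a quick illustrative call, matching the test shown earlier):

h = headers_to_scrapy([{'name': 'Content-Type', 'value': 'text/html'}])
# equivalent to Headers({'Content-Type': 'text/html'})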
def test_iterables(self):
    idict = {'Content-Type': 'text/html', 'X-Forwarded-For': ['ip1', 'ip2']}

    h = Headers(idict)
    self.assertDictEqual(dict(h),
                         {b'Content-Type': [b'text/html'],
                          b'X-Forwarded-For': [b'ip1', b'ip2']})
    self.assertSortedEqual(h.keys(), [b'X-Forwarded-For', b'Content-Type'])
    self.assertSortedEqual(h.items(),
                           [(b'X-Forwarded-For', [b'ip1', b'ip2']),
                            (b'Content-Type', [b'text/html'])])
    self.assertSortedEqual(h.iteritems(),
                           [(b'X-Forwarded-For', [b'ip1', b'ip2']),
                            (b'Content-Type', [b'text/html'])])
    self.assertSortedEqual(h.values(), [b'ip2', b'text/html'])
def download_request(self, request, spider):
    # Remove the 'phantomjs-' prefix
    url = request.url[10:]
    # Alternative driver setups, kept for reference:
    # driver = webdriver.PhantomJS(
    #     service_args=['--ignore-ssl-errors=true', '--load-images=false'],
    #     executable_path='E:/learn/software/phantomjs-2.0.0-windows/bin/phantomjs',
    #     port=65000)  # windows
    # driver = webdriver.Chrome(executable_path="C:/Users/LENOVO/Desktop/to/chromedriver.exe")
    driver = webdriver.PhantomJS(executable_path='phantomjs')  # centos
    driver.get(url)
    # body = driver.find_element_by_xpath('//*').get_attribute("outerHTML")
    body = driver.find_element_by_xpath('//body').get_attribute("innerHTML")
    driver.quit()
    # Set header so httpcache chooses the appropriate Response class
    headers = Headers({'Content-Type': 'text/html'})
    body = body.encode('utf-8')
    return HtmlResponse(url=url, headers=headers, body=body)
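Note that PhantomJS support and the find_element_by_* helpers were removed in Selenium 4. A rough headless-Chrome equivalent of the scraping part, sketched under that assumption:

from selenium import webdriver
from selenium.webdriver.common.by import By

options = webdriver.ChromeOptions()
options.add_argument('--headless')
driver = webdriver.Chrome(options=options)
driver.get(url)  # url as computed above
body = driver.find_element(By.XPATH, '//body').get_attribute('innerHTML')
driver.quit()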
def __init__(self, request, timeout=180):
    self.url = urldefrag(request.url)[0]
    self.method = request.method
    self.body = request.body or None
    self.headers = Headers(request.headers)
    self.response_headers = None
    self.timeout = request.meta.get('download_timeout') or timeout
    self.start_time = time()
    self.deferred = defer.Deferred().addCallback(self._build_response, request)

    # Fixes Twisted 11.1.0+ support as HTTPClientFactory is expected
    # to have _disconnectedDeferred. See Twisted r32329.
    # Since Scrapy implements its own redirect logic, there is no need
    # to add the _waitForDisconnect callback.
    # Specifically, this avoids the AttributeError exception when the
    # clientConnectionFailed method is called.
    self._disconnectedDeferred = defer.Deferred()

    self._set_connection_attributes(request)

    # set Host header based on url
    self.headers.setdefault('Host', self.netloc)

    # set Content-Length based on the length of the body
    if self.body is not None:
        self.headers['Content-Length'] = len(self.body)
        # just in case a broken http/1.1 decides to keep connection alive
        self.headers.setdefault("Connection", "close")
def process_request(self, request, spider):
    if not isinstance(request, PuppeteerRequest):
        return

    action = request.action
    service_url = urljoin(self.service_base_url, action.endpoint)
    service_params = self._encode_service_params(request)
    if service_params:
        service_url += '?' + service_params

    return Request(
        url=service_url,
        method='POST',
        headers=Headers({'Content-Type': action.content_type}),
        body=self._serialize_body(action, request),
        dont_filter=True,
        cookies=request.cookies,
        priority=request.priority,
        callback=request.callback,
        cb_kwargs=request.cb_kwargs,
        errback=request.errback,
        meta={
            'puppeteer_request': request,
            'dont_obey_robotstxt': True,
            'proxy': None,
        },
    )
def test_netscape_example_2(self):
    # Second Example transaction sequence:
    #
    # Assume all mappings from above have been cleared.
    #
    # Client receives:
    #
    #       Set-Cookie: PART_NUMBER=ROCKET_LAUNCHER_0001; path=/
    #
    # When client requests a URL in path "/" on this server, it sends:
    #
    #       Cookie: PART_NUMBER=ROCKET_LAUNCHER_0001
    #
    # Client receives:
    #
    #       Set-Cookie: PART_NUMBER=RIDING_ROCKET_0023; path=/ammo
    #
    # When client requests a URL in path "/ammo" on this server, it sends:
    #
    #       Cookie: PART_NUMBER=RIDING_ROCKET_0023; PART_NUMBER=ROCKET_LAUNCHER_0001
    #
    # NOTE: There are two name/value pairs named "PART_NUMBER" due to
    # the inheritance of the "/" mapping in addition to the "/ammo" mapping.
    c = CookieJar()
    headers = Headers({'Set-Cookie': 'PART_NUMBER=ROCKET_LAUNCHER_0001; path=/'})

    req = Request("http://www.acme.com/")
    res = Response("http://www.acme.com/", headers=headers)

    c.extract_cookies(res, req)

    req = Request("http://www.acme.com/")
    c.add_cookie_header(req)

    self.assertEqual(req.headers.get("Cookie"),
                     "PART_NUMBER=ROCKET_LAUNCHER_0001")

    headers.appendlist("Set-Cookie",
                       "PART_NUMBER=RIDING_ROCKET_0023; path=/ammo")
    res = Response("http://www.acme.com/", headers=headers)
    c.extract_cookies(res, req)

    req = Request("http://www.acme.com/ammo")
    c.add_cookie_header(req)

    self.assertTrue(re.search(r"PART_NUMBER=RIDING_ROCKET_0023;\s*"
                              "PART_NUMBER=ROCKET_LAUNCHER_0001",
                              req.headers.get("Cookie")))
def test_headers(self):
    # Different ways of setting headers attribute
    url = "http://www.scrapy.org"
    headers = {"Accept": "gzip", "Custom-Header": "nothing to tell you"}
    r = self.request_class(url=url, headers=headers)
    p = self.request_class(url=url, headers=r.headers)

    self.assertEqual(r.headers, p.headers)
    self.assertFalse(r.headers is headers)
    self.assertFalse(p.headers is r.headers)

    # headers must not be unicode
    h = Headers({"key1": u"val1", u"key2": "val2"})
    h[u"newkey"] = u"newval"
    for k, v in h.iteritems():
        self.assertTrue(isinstance(k, str))
        for s in v:
            self.assertTrue(isinstance(s, str))
def test_headers(self):
    # Different ways of setting headers attribute
    url = 'http://www.scrapy.org'
    headers = {'Accept': 'gzip', 'Custom-Header': 'nothing to tell you'}
    r = self.request_class(url=url, headers=headers)
    p = self.request_class(url=url, headers=r.headers)

    self.assertEqual(r.headers, p.headers)
    self.assertFalse(r.headers is headers)
    self.assertFalse(p.headers is r.headers)

    # headers must not be unicode
    h = Headers({'key1': u'val1', u'key2': 'val2'})
    h[u'newkey'] = u'newval'
    for k, v in h.iteritems():
        self.assertTrue(isinstance(k, str))
        for s in v:
            self.assertTrue(isinstance(s, str))
def test_int_value(self):
    h1 = Headers({'hey': 5})
    h1['foo'] = 1
    h1.setdefault('bar', 2)
    h1.setlist('buz', [1, 'dos', 3])
    self.assertEqual(h1.getlist('foo'), [b'1'])
    self.assertEqual(h1.getlist('bar'), [b'2'])
    self.assertEqual(h1.getlist('buz'), [b'1', b'dos', b'3'])
    self.assertEqual(h1.getlist('hey'), [b'5'])
def connectionMade(self):
    self.headers = Headers()  # bucket for response headers
    if self.factory.use_tunnel:
        log.msg("Sending CONNECT", log.DEBUG)
        self.tunnel_started = False
        self.sendCommand("CONNECT", "%s:%s" % (self.factory.tunnel_to_host,
                                               self.factory.tunnel_to_port))
        self.sendHeaders(only=['Host', 'Proxy-Connection', 'User-Agent'])
        del self.factory.headers['Proxy-Connection']
    else:
        self.sendEverything()
def test_netscape_misc(self):
    # Some additional Netscape cookies tests.
    c = CookieJar()
    headers = Headers()
    req = Request("http://foo.bar.acme.com/foo")

    # Netscape allows a host part that contains dots
    headers.appendlist("Set-Cookie", "Customer=WILE_E_COYOTE; domain=.acme.com")
    res = Response("http://www.acme.com/foo", headers=headers)
    c.extract_cookies(res, req)

    # and that the domain is the same as the host without adding a leading
    # dot to the domain.  Should not quote even if strange chars are used
    # in the cookie value.
    headers.appendlist("Set-Cookie", "PART_NUMBER=3,4; domain=foo.bar.acme.com")
    res = Response("http://www.acme.com/foo", headers=headers)
    c.extract_cookies(res, req)

    req = Request("http://foo.bar.acme.com/foo")
    c.add_cookie_header(req)
    self.assertTrue("PART_NUMBER=3,4" in req.headers.get("Cookie")
                    and "Customer=WILE_E_COYOTE" in req.headers.get("Cookie"))
def test_basics(self):
    h = Headers({'Content-Type': 'text/html', 'Content-Length': 1234})
    assert h['Content-Type']
    assert h['Content-Length']

    self.assertRaises(KeyError, h.__getitem__, 'Accept')
    self.assertEqual(h.get('Accept'), None)
    self.assertEqual(h.getlist('Accept'), [])

    self.assertEqual(h.get('Accept', '*/*'), '*/*')
    self.assertEqual(h.getlist('Accept', '*/*'), ['*/*'])
    self.assertEqual(h.getlist('Accept', ['text/html', 'images/jpeg']),
                     ['text/html', 'images/jpeg'])
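The lookups above rely on Headers being a caseless mapping. A short sketch of that behavior, assuming a Python 3 Scrapy where keys are title-cased and values are stored as bytes:

from scrapy.http import Headers

h = Headers({'content-type': 'text/html'})
assert h['Content-Type'] == b'text/html'     # key lookup ignores case
assert h.get('CONTENT-TYPE') == b'text/html'
assert list(h.keys()) == [b'Content-Type']   # keys normalized, stored as bytes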
def __init__(self, request, timeout=180):
    self.url = urldefrag(request.url)[0]
    self.method = request.method
    self.body = request.body or None
    self.headers = Headers(request.headers)
    self.response_headers = None
    self.timeout = request.meta.get('download_timeout') or timeout
    self.deferred = defer.Deferred().addCallback(self._build_response)
    self._set_connection_attributes(request)

    # set Host header based on url
    self.headers.setdefault('Host', self.netloc)

    # set Content-Length based on the length of the body
    if self.body is not None:
        self.headers['Content-Length'] = len(self.body)
        # just in case a broken http/1.1 decides to keep connection alive
        self.headers.setdefault("Connection", "close")
def test_update(self):
    h = Headers()
    h.update({'Content-Type': 'text/html',
              'X-Forwarded-For': ['ip1', 'ip2']})
    self.assertEqual(h.getlist('Content-Type'), ['text/html'])
    self.assertEqual(h.getlist('X-Forwarded-For'), ['ip1', 'ip2'])
def test_netscape_example_1(self):
    #-------------------------------------------------------------------
    # First we check that it works for the original example at
    # http://www.netscape.com/newsref/std/cookie_spec.html

    # Client requests a document, and receives in the response:
    #
    #       Set-Cookie: CUSTOMER=WILE_E_COYOTE; path=/; expires=Wednesday, 09-Nov-99 23:12:40 GMT
    #
    # When client requests a URL in path "/" on this server, it sends:
    #
    #       Cookie: CUSTOMER=WILE_E_COYOTE
    #
    # Client requests a document, and receives in the response:
    #
    #       Set-Cookie: PART_NUMBER=ROCKET_LAUNCHER_0001; path=/
    #
    # When client requests a URL in path "/" on this server, it sends:
    #
    #       Cookie: CUSTOMER=WILE_E_COYOTE; PART_NUMBER=ROCKET_LAUNCHER_0001
    #
    # Client receives:
    #
    #       Set-Cookie: SHIPPING=FEDEX; path=/foo
    #
    # When client requests a URL in path "/" on this server, it sends:
    #
    #       Cookie: CUSTOMER=WILE_E_COYOTE; PART_NUMBER=ROCKET_LAUNCHER_0001
    #
    # When client requests a URL in path "/foo" on this server, it sends:
    #
    #       Cookie: CUSTOMER=WILE_E_COYOTE; PART_NUMBER=ROCKET_LAUNCHER_0001; SHIPPING=FEDEX
    #
    # The last Cookie is buggy, because both specifications say that the
    # most specific cookie must be sent first.  SHIPPING=FEDEX is the
    # most specific and should thus be first.

    year_plus_one = time.localtime()[0] + 1

    c = CookieJar(DefaultCookiePolicy(rfc2965=True))

    #req = Request("http://1.1.1.1/",
    #              headers={"Host": "www.acme.com:80"})
    req = Request("http://www.acme.com:80/",
                  headers={"Host": "www.acme.com:80"})

    headers = Headers()
    headers['Set-Cookie'] = ('CUSTOMER=WILE_E_COYOTE; path=/ ; '
                             'expires=Wednesday, 09-Nov-%d 23:12:40 GMT' % year_plus_one)
    res = Response("http://www.acme.com/", headers=headers)
    c.extract_cookies(res, req)

    req = Request("http://www.acme.com/")
    c.add_cookie_header(req)

    self.assertEqual(req.headers.get("Cookie"), "CUSTOMER=WILE_E_COYOTE")
    self.assertEqual(req.headers.get("Cookie2"), '$Version="1"')

    headers.appendlist("Set-Cookie", "PART_NUMBER=ROCKET_LAUNCHER_0001; path=/")
    res = Response("http://www.acme.com/", headers=headers)
    c.extract_cookies(res, req)

    req = Request("http://www.acme.com/foo/bar")
    c.add_cookie_header(req)

    h = req.headers.get("Cookie")
    self.assertTrue("PART_NUMBER=ROCKET_LAUNCHER_0001" in h
                    and "CUSTOMER=WILE_E_COYOTE" in h)

    headers.appendlist('Set-Cookie', 'SHIPPING=FEDEX; path=/foo')
    res = Response("http://www.acme.com", headers=headers)
    c.extract_cookies(res, req)

    req = Request("http://www.acme.com/")
    c.add_cookie_header(req)

    h = req.headers.get("Cookie")
    self.assertTrue("PART_NUMBER=ROCKET_LAUNCHER_0001" in h
                    and "CUSTOMER=WILE_E_COYOTE" in h
                    and "SHIPPING=FEDEX" not in h)

    req = Request("http://www.acme.com/foo/")
    c.add_cookie_header(req)

    h = req.headers.get("Cookie")
    self.assertTrue("PART_NUMBER=ROCKET_LAUNCHER_0001" in h
                    and "CUSTOMER=WILE_E_COYOTE" in h
                    and h.startswith("SHIPPING=FEDEX;"))
def test_session_cookies(self):
    year_plus_one = time.localtime()[0] + 1

    # Check session cookies are deleted properly by the
    # CookieJar.clear_session_cookies method
    req = Request('http://www.perlmeister.com/scripts')
    headers = Headers()
    headers.appendlist("Set-Cookie", "s1=session;Path=/scripts")
    headers.appendlist("Set-Cookie",
                       "p1=perm; Domain=.perlmeister.com;"
                       "Path=/;expires=Fri, 02-Feb-%d 23:24:20 GMT" % year_plus_one)
    headers.appendlist("Set-Cookie",
                       "p2=perm;Path=/;expires=Fri, 02-Feb-%d 23:24:20 GMT" % year_plus_one)
    headers.appendlist("Set-Cookie",
                       "s2=session;Path=/scripts;Domain=.perlmeister.com")
    headers.appendlist('Set-Cookie2', 's3=session;Version=1;Discard;Path="/"')
    res = Response('http://www.perlmeister.com/scripts', headers=headers)

    c = CookieJar()
    c.extract_cookies(res, req)
    # How many session/permanent cookies do we have?
    counter = {"session_after": 0,
               "perm_after": 0,
               "session_before": 0,
               "perm_before": 0}
    for cookie in c:
        key = "%s_before" % cookie.value
        counter[key] = counter[key] + 1
    c.clear_session_cookies()
    # How many now?
    for cookie in c:
        key = "%s_after" % cookie.value
        counter[key] = counter[key] + 1

    self.assertTrue(not (
        # a permanent cookie got lost accidentally
        counter["perm_after"] != counter["perm_before"]
        # a session cookie hasn't been cleared
        or counter["session_after"] != 0
        # we didn't have session cookies in the first place
        or counter["session_before"] == 0))
class ScrapyHTTPPageGetter(HTTPClient):

    delimiter = '\n'

    def connectionMade(self):
        self.headers = Headers()  # bucket for response headers
        if self.factory.use_tunnel:
            log.msg("Sending CONNECT", log.DEBUG)
            self.tunnel_started = False
            self.sendCommand("CONNECT", "%s:%s" % (self.factory.tunnel_to_host,
                                                   self.factory.tunnel_to_port))
            self.sendHeaders(only=['Host', 'Proxy-Connection', 'User-Agent'])
            del self.factory.headers['Proxy-Connection']
        else:
            self.sendEverything()

    def sendCommand(self, command, path):
        if self.factory.use_tunnel and not self.tunnel_started:
            http_version = "1.1"
        else:
            http_version = "1.0"
        self.transport.write('%s %s HTTP/%s\r\n' % (command, path, http_version))

    def sendEverything(self):
        self.sendMethod()
        self.sendHeaders()
        self.sendBody()

    def sendMethod(self):
        # Method command
        self.sendCommand(self.factory.method, self.factory.path)

    def sendHeaders(self, only=None):
        # Note: it's a Headers object, not a dict
        keys = only if only is not None else self.factory.headers.keys()
        for key in keys:
            for value in self.factory.headers.getlist(key):
                self.sendHeader(key, value)
        self.endHeaders()

    def sendBody(self):
        # Body
        if self.factory.body is not None:
            self.transport.write(self.factory.body)

    def lineReceived(self, line):
        if self.factory.use_tunnel and not self.tunnel_started:
            log.msg("LINE: %s" % line)
        if self.factory.use_tunnel and not self.tunnel_started and not line.rstrip():
            # End of headers from the proxy in response to our CONNECT request.
            # Skip the call to HTTPClient.lineReceived for now, since otherwise
            # it would switch to raw mode.
            self.startTunnel()
        else:
            return HTTPClient.lineReceived(self, line.rstrip())

    def startTunnel(self):
        log.msg("starting Tunnel")
        # We'll get a new batch of headers through the tunnel. This sets us
        # up to capture them.
        self.firstLine = True
        self.tunnel_started = True
        # Switch to SSL
        ctx = ClientContextFactory()
        self.transport.startTLS(ctx, self.factory)
        # And send the normal request:
        self.sendEverything()

    def handleHeader(self, key, value):
        if self.factory.use_tunnel and not self.tunnel_started:
            pass  # maybe log headers for CONNECT request?
        else:
            self.headers.appendlist(key, value)

    def handleStatus(self, version, status, message):
        if self.factory.use_tunnel and not self.tunnel_started:
            self.tunnel_status = status
        else:
            self.factory.gotStatus(version, status, message)

    def handleEndHeaders(self):
        self.factory.gotHeaders(self.headers)

    def connectionLost(self, reason):
        HTTPClient.connectionLost(self, reason)
        self.factory.noPage(reason)

    def handleResponse(self, response):
        if self.factory.method.upper() == 'HEAD':
            self.factory.page('')
        else:
            self.factory.page(response)
        self.transport.loseConnection()

    def timeout(self):
        self.transport.loseConnection()
        self.factory.noPage(
            defer.TimeoutError("Getting %s took longer than %s seconds." %
                               (self.factory.url, self.factory.timeout)))
def test_setlistdefault(self):
    h1 = Headers({'header1': 'value1'})
    h1.setlistdefault('header1', ['value2', 'value3'])
    h1.setlistdefault('header2', ['value2', 'value3'])
    self.assertEqual(h1.getlist('header1'), ['value1'])
    self.assertEqual(h1.getlist('header2'), ['value2', 'value3'])
def test_single_value(self):
    h = Headers()
    h['Content-Type'] = 'text/html'
    self.assertEqual(h['Content-Type'], 'text/html')
    self.assertEqual(h.get('Content-Type'), 'text/html')
    self.assertEqual(h.getlist('Content-Type'), ['text/html'])
def _test(response):
    import json
    headers = Headers(json.loads(response.text)['headers'])
    contentlengths = headers.getlist('Content-Length')
    self.assertEqual(len(contentlengths), 1)
    self.assertEqual(contentlengths, [b"0"])