def test_delete_and_contains(self):
    h = Headers()
    h['Content-Type'] = 'text/html'
    assert 'Content-Type' in h
    del h['Content-Type']
    assert 'Content-Type' not in h
def test_multivalue(self):
    h = Headers()
    h['X-Forwarded-For'] = hlist = ['ip1', 'ip2']
    self.assertEqual(h['X-Forwarded-For'], 'ip2')
    self.assertEqual(h.get('X-Forwarded-For'), 'ip2')
    self.assertEqual(h.getlist('X-Forwarded-For'), hlist)
    assert h.getlist('X-Forwarded-For') is not hlist
def test_update(self):
    h = Headers()
    h.update({
        "Content-Type": "text/html",
        "X-Forwarded-For": ["ip1", "ip2"],
    })
    self.assertEqual(h.getlist("Content-Type"), [b"text/html"])
    self.assertEqual(h.getlist("X-Forwarded-For"), [b"ip1", b"ip2"])
def test_update(self):
    h = Headers()
    h.update({
        'Content-Type': 'text/html',
        'X-Forwarded-For': ['ip1', 'ip2'],
    })
    self.assertEqual(h.getlist('Content-Type'), ['text/html'])
    self.assertEqual(h.getlist('X-Forwarded-For'), ['ip1', 'ip2'])
def process_request(self, request, spider):
    """
    The request will be passed to the AutoExtract server only if the request
    is explicitly enabled with `{'autoextract': {'enabled': True}}` meta.
    The page type value must also be present, either in the
    AUTOEXTRACT_PAGE_TYPE option, or in `{'autoextract': {'pageType': '...'}}` meta.
    """
    if not self._is_enabled_for_request(request):
        return

    # If the request was already processed by AutoExtract
    if request.meta.get(AUTOEXTRACT_META_KEY):
        return

    if request.method != 'GET':
        raise AutoExtractError('Only GET requests are supported by AutoExtract')

    request.meta[AUTOEXTRACT_META_KEY] = {
        'original_url': request.url,
        'timing': {
            'start_ts': time.time()
        },
    }

    # Maybe overwrite the page type value from the request
    page_type = self._check_page_type(request)
    logger.debug('Process AutoExtract request for %s URL %s',
                 page_type, request, extra={'spider': spider})

    # Define request timeout
    request.meta['download_timeout'] = self.timeout

    # Define concurrency settings
    self._set_download_slot(request, request.meta)

    payload = {'url': request.url, 'pageType': page_type}

    # Add the extra payload, if available
    extra_payload = self._get_extra_payload(request)
    if extra_payload:
        payload.update(extra_payload)

    headers = Headers({
        'Content-Type': 'application/json',
        'Authorization': basic_auth_header(self._api_user, self._api_pass),
    })

    new_request = request.replace(
        url=self._api_url,
        method='POST',
        headers=headers,
        body=json.dumps([payload], sort_keys=True),
    )

    self.inc_metric('autoextract/request_count')
    return new_request
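# Usage sketch (hypothetical spider, not part of the middleware above): per
# the docstring, a request opts in via the 'autoextract' meta key, with the
# page type given inline when the AUTOEXTRACT_PAGE_TYPE option is not set.
import scrapy

class ExampleSpider(scrapy.Spider):
    name = 'example'

    def start_requests(self):
        yield scrapy.Request(
            'http://example.com/some-product',
            meta={'autoextract': {'enabled': True, 'pageType': 'product'}},
        )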
def test_int_value(self):
    h1 = Headers({'hey': 5})
    h1['foo'] = 1
    h1.setdefault('bar', 2)
    h1.setlist('buz', [1, 'dos', 3])
    self.assertEqual(h1.getlist('foo'), [b'1'])
    self.assertEqual(h1.getlist('bar'), [b'2'])
    self.assertEqual(h1.getlist('buz'), [b'1', b'dos', b'3'])
    self.assertEqual(h1.getlist('hey'), [b'5'])
def _build_response(self, body, request):
    request.meta['download_latency'] = self.headers_time - self.start_time
    status = int(self.status)
    headers = Headers(self.response_headers)
    respcls = responsetypes.from_args(headers=headers, url=self._url)
    return respcls(url=self._url, status=status, headers=headers, body=body)
def test_int_value(self):
    h1 = Headers({"hey": 5})
    h1["foo"] = 1
    h1.setdefault("bar", 2)
    h1.setlist("buz", [1, "dos", 3])
    self.assertEqual(h1.getlist("foo"), [b"1"])
    self.assertEqual(h1.getlist("bar"), [b"2"])
    self.assertEqual(h1.getlist("buz"), [b"1", b"dos", b"3"])
    self.assertEqual(h1.getlist("hey"), [b"5"])
def close_used_contexts(self, spider):
    contexts = list(self.used_contexts[id(spider)])
    if contexts:
        request = Request(urljoin(self.service_base_url, '/close_context'),
                          method='POST',
                          headers=Headers({'Content-Type': 'application/json'}),
                          meta={"proxy": None},
                          body=json.dumps(contexts))
        return self.crawler.engine.downloader.fetch(request, None)
def _cb_bodydone(self, result, request, url):
    txresponse, body, flags = result
    status = int(txresponse.code)
    headers = Headers(txresponse.headers.getAllRawHeaders())
    respcls = responsetypes.from_args(headers=headers, url=url, body=body)
    return respcls(url=url, status=status, headers=headers, body=body, flags=flags)
def headers_to_scrapy(headers):
    """
    Return a scrapy.http.Headers instance from headers data.
    Three data formats are supported:

    * {name: value, ...} dict;
    * [(name, value), ...] list;
    * [{'name': name, 'value': value}, ...] list (HAR headers format).
    """
    if isinstance(headers or {}, dict):
        return Headers(headers or {})
    if isinstance(headers[0], dict):
        return Headers([
            (d['name'], d.get('value', ''))
            for d in headers
        ])
    return Headers(headers)
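# Quick illustration of the three accepted formats; all three calls below
# should yield equivalent Headers objects, since Headers normalizes keys
# case-insensitively regardless of the input shape.
h1 = headers_to_scrapy({'Content-Type': 'text/html'})
h2 = headers_to_scrapy([('Content-Type', 'text/html')])
h3 = headers_to_scrapy([{'name': 'Content-Type', 'value': 'text/html'}])
assert h1 == h2 == h3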
def test_from_headers(self):
    mappings = [
        ({'Content-Type': ['text/html; charset=utf-8']}, HtmlResponse),
        ({'Content-Type': ['application/octet-stream'],
          'Content-Disposition': ['attachment; filename=data.txt']}, TextResponse),
        ({'Content-Type': ['text/html; charset=utf-8'],
          'Content-Encoding': ['gzip']}, Response),
    ]
    for source, cls in mappings:
        source = Headers(source)
        retcls = responsetypes.from_headers(source)
        assert retcls is cls, "%s ==> %s != %s" % (source, retcls, cls)
def test_iterables(self):
    idict = {'Content-Type': 'text/html', 'X-Forwarded-For': ['ip1', 'ip2']}
    h = Headers(idict)
    self.assertEqual(dict(h),
                     {'Content-Type': ['text/html'],
                      'X-Forwarded-For': ['ip1', 'ip2']})
    self.assertEqual(h.keys(), ['X-Forwarded-For', 'Content-Type'])
    self.assertEqual(h.items(),
                     [('X-Forwarded-For', ['ip1', 'ip2']),
                      ('Content-Type', ['text/html'])])
    self.assertEqual(list(h.iteritems()),
                     [('X-Forwarded-For', ['ip1', 'ip2']),
                      ('Content-Type', ['text/html'])])
    self.assertEqual(h.values(), ['ip2', 'text/html'])
def retrieve_response(self, spider, request):
    data = self._read_data(spider, request)
    if data is None:
        return  # not cached
    url = data['url']
    status = data['status']
    headers = Headers(data['headers'])
    body = data['body']
    respcls = responsetypes.from_args(headers=headers, url=url)
    response = respcls(url=url, headers=headers, status=status, body=body)
    return response
def response_from_dict(response, spider=None, **kwargs):
    """Return a Response object built from a dict representation of a spider response."""
    url = response.get("url")
    status = "200"
    headers = Headers([(x, list(map(str, y)))
                       for x, y in response.get("headers").items()])
    body = response.get("body")
    respcls = responsetypes.from_args(headers=headers, url=url)
    response = respcls(url=url, headers=headers, status=status, body=body)
    return response
def retrieve_response(self, spider, request):
    doc = self.col.find_one({'url': request.url})
    if doc is None:
        return  # not cached
    status = 200
    url = doc['url']
    headers = Headers(doc['headers'])
    body = doc['body'].encode('utf-8')
    respcls = responsetypes.from_args(headers=headers, url=url)
    response = respcls(url=url, headers=headers, status=status, body=body)
    return response
def retrieve_response(self, spider, request):
    key = self._request_key(spider, request)
    gf = self._get_file(spider, key)
    if gf is None:
        return  # not cached
    url = str(gf.url)
    status = str(gf.status)
    headers = Headers([(x, map(str, y)) for x, y in gf.headers.iteritems()])
    body = gf.read()
    respcls = responsetypes.from_args(headers=headers, url=url)
    response = respcls(url=url, headers=headers, status=status, body=body)
    return response
def test_basics(self):
    h = Headers({'Content-Type': 'text/html', 'Content-Length': 1234})
    assert h['Content-Type']
    assert h['Content-Length']

    self.assertRaises(KeyError, h.__getitem__, 'Accept')
    self.assertEqual(h.get('Accept'), None)
    self.assertEqual(h.getlist('Accept'), [])

    self.assertEqual(h.get('Accept', '*/*'), '*/*')
    self.assertEqual(h.getlist('Accept', '*/*'), ['*/*'])
    self.assertEqual(h.getlist('Accept', ['text/html', 'images/jpeg']),
                     ['text/html', 'images/jpeg'])
def connectionMade(self):
    self.headers = Headers()  # bucket for response headers

    # Method command
    self.sendCommand(self.factory.method, self.factory.path)
    # Headers
    for key, values in self.factory.headers.items():
        for value in values:
            self.sendHeader(key, value)
    self.endHeaders()
    # Body
    if self.factory.body is not None:
        self.transport.write(self.factory.body)
def test_iterables(self):
    idict = {'Content-Type': 'text/html', 'X-Forwarded-For': ['ip1', 'ip2']}
    h = Headers(idict)
    self.assertDictEqual(dict(h),
                         {b'Content-Type': [b'text/html'],
                          b'X-Forwarded-For': [b'ip1', b'ip2']})
    self.assertSortedEqual(h.keys(), [b'X-Forwarded-For', b'Content-Type'])
    self.assertSortedEqual(h.items(),
                           [(b'X-Forwarded-For', [b'ip1', b'ip2']),
                            (b'Content-Type', [b'text/html'])])
    self.assertSortedEqual(h.values(), [b'ip2', b'text/html'])
def _cb_bodydone(self, result, request, url):
    headers = Headers(result["txresponse"].headers.getAllRawHeaders())
    respcls = responsetypes.from_args(headers=headers, url=url, body=result["body"])
    return respcls(
        url=url,
        status=int(result["txresponse"].code),
        headers=headers,
        body=result["body"],
        flags=result["flags"],
        certificate=result["certificate"],
        ip_address=result["ip_address"],
    )
def test_basics(self):
    h = Headers({"Content-Type": "text/html", "Content-Length": 1234})
    assert h["Content-Type"]
    assert h["Content-Length"]

    self.assertRaises(KeyError, h.__getitem__, "Accept")
    self.assertEqual(h.get("Accept"), None)
    self.assertEqual(h.getlist("Accept"), [])

    self.assertEqual(h.get("Accept", "*/*"), b"*/*")
    self.assertEqual(h.getlist("Accept", "*/*"), [b"*/*"])
    self.assertEqual(
        h.getlist("Accept", ["text/html", "images/jpeg"]),
        [b"text/html", b"images/jpeg"],
    )
def test_replace(self):
    """Test Response.replace() method"""
    hdrs = Headers({"key": "value"})
    r1 = self.response_class("http://www.example.com")
    r2 = r1.replace(status=301, body=b"New body", headers=hdrs)
    assert r1.body == b''
    self.assertEqual(r1.url, r2.url)
    self.assertEqual((r1.status, r2.status), (200, 301))
    self.assertEqual((r1.body, r2.body), (b'', b"New body"))
    self.assertEqual((r1.headers, r2.headers), ({}, hdrs))

    # Empty attributes (which may fail if not compared properly)
    r3 = self.response_class("http://www.example.com", flags=['cached'])
    r4 = r3.replace(body=b'', flags=[])
    self.assertEqual(r4.body, b'')
    self.assertEqual(r4.flags, [])
def retrieve_response(self, spider, request):
    """Return response if present in cache, or None otherwise."""
    metadata = self._read_meta(spider, request)
    if metadata is None:
        return  # not cached
    rpath = self._get_request_path(spider, request)
    with self._open(os.path.join(rpath, 'response_body'), 'rb') as f:
        body = f.read()
    with self._open(os.path.join(rpath, 'response_headers'), 'rb') as f:
        rawheaders = f.read()
    url = metadata.get('response_url')
    status = metadata['status']
    headers = Headers(headers_raw_to_dict(rawheaders))
    respcls = responsetypes.from_args(headers=headers, url=url)
    response = respcls(url=url, headers=headers, status=status, body=body)
    return response
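# Sketch of the headers round-trip this backend relies on (assuming
# w3lib-style headers_raw_to_dict semantics): raw bytes read from the
# 'response_headers' file parse into a name -> list-of-values mapping,
# which Headers accepts directly.
raw = b'Content-Type: text/html\r\nContent-Encoding: gzip\r\n'
parsed = headers_raw_to_dict(raw)  # e.g. {b'Content-Type': [b'text/html'], ...}
headers = Headers(parsed)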
def retrieve_response(self, spider, request):
    key = request_fingerprint(request)
    value = self.conn.hget(self.name, key)
    if not value:
        return
    value_arr = value.split(self.separator)
    stored_data = json.loads(value_arr[0])
    metadata = stored_data['metadata']
    body = str(value_arr[2])
    rawheaders = stored_data['response_headers']
    url = str(metadata['response_url'])
    status = str(metadata['status'])
    headers = Headers(rawheaders)
    respcls = responsetypes.from_args(headers=headers, url=url)
    response = respcls(url=url, headers=headers, status=status, body=body)
    return response
def process_request(self, request, spider):
    def get_ua():
        return getattr(self.ua, self.ua_type)

    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Connection': 'keep-alive',
        'Referer': 'https://search.jd.com/Search?keyword=%E5%9B%BE%E4%B9%A6&enc=utf-8&wq=%E5%9B%BE%E4%B9%A6&page=1',
    }
    headers['User-Agent'] = get_ua()
    request.headers = Headers(headers)
    print(request.headers)
def test_replace(self):
    """Test Request.replace() method"""
    r1 = self.request_class("http://www.example.com", method='GET')
    hdrs = Headers(dict(r1.headers, key='value'))
    r2 = r1.replace(method="POST", body="New body", headers=hdrs)
    self.assertEqual(r1.url, r2.url)
    self.assertEqual((r1.method, r2.method), ("GET", "POST"))
    self.assertEqual((r1.body, r2.body), ('', "New body"))
    self.assertEqual((r1.headers, r2.headers), (self.default_headers, hdrs))

    # Empty attributes (which may fail if not compared properly)
    r3 = self.request_class("http://www.example.com", meta={'a': 1}, dont_filter=True)
    r4 = r3.replace(url="http://www.example.com/2", body='', meta={}, dont_filter=False)
    self.assertEqual(r4.url, "http://www.example.com/2")
    self.assertEqual(r4.body, '')
    self.assertEqual(r4.meta, {})
    assert r4.dont_filter is False
def retrieve_response(rpath):
    """Return response if present in cache, or None otherwise."""
    metadata = read_meta(rpath)
    if metadata is None:
        return  # not cached
    with open(join(rpath, 'response_body'), 'rb') as f:
        body = f.read()
    with open(join(rpath, 'response_headers'), 'rb') as f:
        rawheaders = f.read()
    url = metadata.get('response_url')
    status = metadata['status']
    headers = Headers(headers_raw_to_dict(rawheaders))
    respcls = responsetypes.from_args(headers=headers, url=url)
    response = respcls(url=url, headers=headers, status=status, body=body)
    return response
def retrieve_response(self, spider, request):
    """Return response if present in cache, or None otherwise."""
    metadata = self._read_meta(spider, request)
    if metadata is None:
        return  # not cached
    rpath = self._get_request_path(spider, request)
    with open(join(rpath, 'response_body'), 'rb') as f:
        body = f.read()
    with open(join(rpath, 'response_headers'), 'rb') as f:
        rawheaders = f.read()
    # We fall back to metadata['url'] to support old generated caches. TODO: remove for Scrapy 0.11
    url = metadata.get('response_url') or metadata['url']
    status = metadata['status']
    headers = Headers(headers_raw_to_dict(rawheaders))
    respcls = responsetypes.from_args(headers=headers, url=url)
    response = respcls(url=url, headers=headers, status=status, body=body)
    return response
def test_headers(self):
    # Different ways of setting headers attribute
    url = 'http://www.scrapy.org'
    headers = {b'Accept': 'gzip', b'Custom-Header': 'nothing to tell you'}
    r = self.request_class(url=url, headers=headers)
    p = self.request_class(url=url, headers=r.headers)

    self.assertEqual(r.headers, p.headers)
    self.assertFalse(r.headers is headers)
    self.assertFalse(p.headers is r.headers)

    # headers must not be unicode
    h = Headers({'key1': u'val1', u'key2': 'val2'})
    h[u'newkey'] = u'newval'
    for k, v in h.iteritems():
        self.assert_(isinstance(k, bytes))
        for s in v:
            self.assert_(isinstance(s, bytes))