def retrieve_response(self, spider, request):
    """Return the cached response for *request*, or None otherwise.

    First consults the local filesystem cache (via the superclass); on a
    miss, looks the entry up in S3 and rebuilds the response from the
    stored metadata, headers and body.
    """
    response = super(S3CacheStorage, self).retrieve_response(spider, request)
    if response is None:
        # Not in local filesystem cache, so try copying from S3.
        local_path = self._get_request_path(spider, request)
        remote_path = os.path.relpath(local_path, self.tmpcachedir).lower()
        bucket = self.conn.get_bucket(self.bucket_name, validate=False)

        def _get_key(filename):
            # Returns the bucket Key for this entry's file, or None if absent.
            key_name = os.path.join(remote_path, filename)
            return bucket.get_key(key_name)

        # Check if the entry exists at all.
        metadata_key = _get_key('pickled_meta')
        if metadata_key is None:
            return None  # key not found
        # Check if the cache entry has expired (0 means "never expire").
        mtime = parse_ts(metadata_key.last_modified)
        if 0 < self.expiration_secs < (datetime.datetime.utcnow() - mtime).total_seconds():
            return None  # expired
        # Deserialise the cached response.
        # SECURITY NOTE: pickle.loads on bucket contents is only safe while
        # the bucket is fully trusted and write-controlled.
        metadata = pickle.loads(metadata_key.get_contents_as_string())
        body_key = _get_key('response_body')
        headers_key = _get_key('response_headers')
        if body_key is None or headers_key is None:
            # Partial cache entry (meta present but body/headers missing):
            # previously this raised AttributeError; treat it as a miss.
            return None
        body = body_key.get_contents_as_string()
        rawheaders = headers_key.get_contents_as_string()
        url = metadata.get('response_url')
        status = metadata['status']
        headers = Headers(headers_raw_to_dict(rawheaders))
        respcls = responsetypes.from_args(headers=headers, url=url)
        response = respcls(url=url, headers=headers, status=status, body=body)
    return response
def retrieve_response(self, spider, request):
    """Look the request up in the cache and rebuild its response, or return None."""
    cache_key = self._get_request_key(spider, request)
    expiry = self._get_expiration_time(spider)
    metadata = self._read_meta(cache_key, expiry)
    if metadata is None:
        return  # not cached
    spider.log("%s: found cache for %s" % (self.__class__.__name__, request.url))
    cached = self._get_cached_data(cache_key, spider)
    if cached is None:
        return None
    spider.log("%s: got response from cache for %s" % (self.__class__.__name__, request.url))
    url = cached['response_url']
    body = str(cached['response_body'])
    raw_headers = cached['response_meta']['headers']
    status = cached['response_meta']['status']
    headers = Headers(headers_raw_to_dict(raw_headers))
    response_cls = responsetypes.from_args(headers=headers, url=url)
    return response_cls(url=url, headers=headers, status=status, body=body)
def test_headers_raw_to_dict(self):
    """Repeated header names collapse into one key with a list of values."""
    raw = (
        b"Content-type: text/html\n\r"
        b"Accept: gzip\n\r"
        b"Cache-Control: no-cache\n\r"
        b"Cache-Control: no-store\n\n"
    )
    expected = {
        b'Content-type': [b'text/html'],
        b'Accept': [b'gzip'],
        b'Cache-Control': [b'no-cache', b'no-store'],
    }
    self.assertEqual(headers_raw_to_dict(raw), expected)
def test_headers_raw_to_dict(self):
    """Parsing raw headers yields lists of values, merging duplicate names."""
    raw_bytes = (
        b"Content-type: text/html\n\r"
        b"Accept: gzip\n\r"
        b"Cache-Control: no-cache\n\r"
        b"Cache-Control: no-store\n\n"
    )
    self.assertEqual(
        headers_raw_to_dict(raw_bytes),
        {
            b"Content-type": [b"text/html"],
            b"Accept": [b"gzip"],
            b"Cache-Control": [b"no-cache", b"no-store"],
        },
    )
def retrieve_response(self, spider, request):
    """Rebuild and return the cached response, or None on a cache miss."""
    metadata = self._read_meta(spider, request)
    if metadata is None:
        return  # not cached
    cache_dir = self._get_request_path(spider, request)
    with open(join(cache_dir, 'response_body'), 'rb') as body_file:
        body = body_file.read()
    with open(join(cache_dir, 'response_headers'), 'rb') as headers_file:
        raw_headers = headers_file.read()
    url = metadata.get('response_url')
    headers = Headers(headers_raw_to_dict(raw_headers))
    cls = responsetypes.from_args(headers=headers, url=url)
    return cls(url=url, headers=headers, status=metadata['status'], body=body)
def retrieve_response(self, spider, request):
    """Return response if present in cache, or None otherwise."""
    try:
        # bucket.get(...) returns a tuple; element 2 is the stored JSON
        # document keyed by the inverted request URL.
        doc = json.loads(self.bucket.get(self._inverse_url(request.url))[2])
    except couchbase.exception.MemcachedError:
        # Key not found in Couchbase -> treat as a cache miss.
        # NOTE(review): newer couchbase clients expose `couchbase.exceptions`
        # (plural); confirm this module path matches the installed client.
        return
    # @TODO expiration
    body = doc['response_body']
    url = doc['response_url']
    status = doc['status']
    headers = Headers(headers_raw_to_dict(doc['response_headers']))
    encoding = doc['encoding']
    respcls = responsetypes.from_args(headers=headers, url=url)
    response = respcls(url=url, headers=headers, status=status, body=body, encoding=encoding)
    return response
def retrieve_response(self, spider, request):
    """Rebuild the cached response for *request*, or return None on a miss."""
    metadata = self._read_meta(spider, request)
    if metadata is None:
        return  # not cached
    entry_dir = self._get_request_path(spider, request)
    with self._open(os.path.join(entry_dir, 'response_body'), 'rb') as fp:
        body = fp.read()
    with self._open(os.path.join(entry_dir, 'response_headers'), 'rb') as fp:
        raw_headers = fp.read()
    url = metadata.get('response_url')
    headers = Headers(headers_raw_to_dict(raw_headers))
    respcls = responsetypes.from_args(headers=headers, url=url)
    return respcls(url=url, headers=headers, status=metadata['status'], body=body)
def retrieve_response(rpath):
    """Return the response cached under *rpath*, or None when absent."""
    metadata = read_meta(rpath)
    if metadata is None:
        return  # not cached
    with open(join(rpath, 'response_body'), 'rb') as fp:
        body = fp.read()
    with open(join(rpath, 'response_headers'), 'rb') as fp:
        raw_headers = fp.read()
    headers = Headers(headers_raw_to_dict(raw_headers))
    url = metadata.get('response_url')
    respcls = responsetypes.from_args(headers=headers, url=url)
    return respcls(url=url, headers=headers, status=metadata['status'], body=body)
def retrieve_response(self, spider, request):
    """Return the cached response, or None if the request is not cached."""
    metadata = self._read_meta(spider, request)
    if metadata is None:
        return  # not cached
    entry_path = self._get_request_path(spider, request)
    with open(join(entry_path, 'response_body'), 'rb') as fp:
        body = fp.read()
    with open(join(entry_path, 'response_headers'), 'rb') as fp:
        raw_headers = fp.read()
    # Fall back to metadata['url'] to support old generated caches.
    # TODO: remove for Scrapy 0.11
    url = metadata.get('response_url') or metadata['url']
    headers = Headers(headers_raw_to_dict(raw_headers))
    respcls = responsetypes.from_args(headers=headers, url=url)
    return respcls(url=url, headers=headers, status=metadata['status'], body=body)
def retrieve_response(self, spider: TSpider, request: TRequest) -> Optional[TResponse]:
    """Return response if present in cache, or None otherwise."""
    metadata = self._read_meta(spider, request)
    if metadata is None:
        return  # not cached
    entry_dir = self._get_request_path(spider, request)
    with self._open(os.path.join(entry_dir, "response_body"), "rb") as stream:
        body = stream.read()
    with self._open(os.path.join(entry_dir, "response_headers"), "rb") as stream:
        raw_headers = stream.read()
    url = metadata.get("response_url")
    headers = Headers(headers_raw_to_dict(raw_headers))
    respcls = responsetypes.from_args(headers=headers, url=url)
    return respcls(url=url, headers=headers, status=metadata["status"], body=body)
def retrieve_response(self, spider, request):
    """Return response if present in cache, or None otherwise."""
    if self.dont_retrieve:
        return
    key_name = self._get_request_path(request)
    raw = self.get_object_from_key(self.bucket_name, key_name)
    if not raw:
        return  # not cached
    # SECURITY NOTE: unpickling bucket contents is only safe while the
    # bucket is fully trusted and write-controlled.
    entry = pickle.loads(raw)
    metadata = entry['meta']
    url = metadata.get('response_url')
    headers = Headers(headers_raw_to_dict(entry['response_headers']))
    respcls = responsetypes.from_args(headers=headers, url=url)
    return respcls(url=url, headers=headers,
                   status=metadata['status'], body=entry['response_body'])
def retrieve_response(self, spider, request):
    """Return the cached response for *request* from MongoDB, or None.

    Looks up the document keyed by the request fingerprint and rebuilds a
    response object from the stored metadata, headers and body.
    """
    key = request_fingerprint(request)
    value = self.collection.find_one({"_id": key})
    if not value:
        return
    stored_data = value["value"]
    metadata = stored_data['metadata']
    url = str(metadata['response_url'])
    rawheaders = stored_data['response_headers']
    rawheaders = headers_raw_to_dict(rawheaders)
    # Strip Content-Encoding, presumably because the stored body is already
    # decoded and re-decompression must be avoided.
    # NOTE(review): headers_raw_to_dict may return bytes keys, in which case
    # this str-key membership test never matches -- confirm key types.
    if "Content-Encoding" in rawheaders:
        del rawheaders["Content-Encoding"]
    body = stored_data["response_body"]
    # NOTE(review): "w3lib_replace" is a non-standard codec error handler; it
    # must be registered elsewhere or this encode() raises LookupError.
    body = body.encode("utf-8", "w3lib_replace")
    status = str(metadata['status'])
    headers = Headers(rawheaders)
    respcls = responsetypes.from_args(headers=headers, url=url)
    response = respcls(url=url, headers=headers, status=status, body=body)
    return response
def retrieve_response(self, spider, request):
    """Return response if present in cache, or None otherwise."""
    cache_key = self._inverse_url(request.url)
    try:
        doc = json.loads(self.bucket.get(cache_key)[2])
    except couchbase.exception.MemcachedError:
        return  # not cached
    # @TODO expiration
    url = doc['response_url']
    headers = Headers(headers_raw_to_dict(doc['response_headers']))
    respcls = responsetypes.from_args(headers=headers, url=url)
    return respcls(
        url=url,
        headers=headers,
        status=doc['status'],
        body=doc['response_body'],
        encoding=doc['encoding'],
    )
def retrieve_response(self, spider, request):
    """Return the cached response for *request*, or None on a miss."""
    raw_meta = self.client.get(self.key_for(spider, request, "meta_data"))
    if not raw_meta:
        return
    meta = json.loads(raw_meta)
    if not meta:
        return
    body = self.client.get(self.key_for(spider, request, "response_body"))
    raw_headers = self.client.get(self.key_for(spider, request, "response_headers"))
    if not body or not raw_headers:
        return
    url = meta.get("response_url")
    headers = Headers(headers_raw_to_dict(raw_headers))
    response_class = responsetypes.from_args(headers=headers, url=url)
    return response_class(url=url, headers=headers,
                          status=meta.get("status"), body=body)
def retrieve_response(self, spider, request):
    """Load a cached response from S3; None on a miss or for cached 302s."""
    path = functools.partial(storage_path, request)
    try:
        pickled_meta = get_s3_text(self.bucket, path('pickled_meta'))
        body = get_s3_text(self.bucket, path('response_body'))
        raw_headers = get_s3_text(self.bucket, path('response_headers'))
    except botocore.exceptions.ClientError as err:
        # A missing key is an ordinary cache miss; anything else is real.
        if err.response['Error']['Code'] == "404":
            return None
        raise
    meta = pickle.loads(pickled_meta)
    url = meta.get('response_url')
    headers = Headers(headers_raw_to_dict(raw_headers))
    respcls = responsetypes.from_args(headers=headers, url=url)
    response = respcls(url=url, headers=headers, status=meta['status'], body=body)
    if response.status == 302:
        # Cached redirects are purged from the bucket and treated as misses.
        self.bucket.delete_objects(Delete=make_delete_objects(path))
        return None
    return response
def test_headers_raw_dict_none(self):
    """Both header converters pass None through unchanged."""
    for convert in (headers_raw_to_dict, headers_dict_to_raw):
        self.assertIsNone(convert(None))
def test_headers_raw_to_dict(self):
    """Raw header bytes parse into a name -> list-of-values mapping."""
    raw_headers = b"Content-type: text/html\n\rAccept: gzip\n\n"
    expected = {
        b'Content-type': [b'text/html'],
        b'Accept': [b'gzip'],
    }
    self.assertEqual(headers_raw_to_dict(raw_headers), expected)
def test_headers_raw_to_dict(self):
    """Repeated header names accumulate into a single multi-valued entry."""
    raw = (b"Content-type: text/html\n\r"
           b"Accept: gzip\n\r"
           b"Cache-Control: no-cache\n\r"
           b"Cache-Control: no-store\n\n")
    self.assertEqual(
        headers_raw_to_dict(raw),
        {b'Content-type': [b'text/html'],
         b'Accept': [b'gzip'],
         b'Cache-Control': [b'no-cache', b'no-store']},
    )