Ejemplo n.º 1
0
    def retrieve_response(self, spider, request):
        """Return the cached response for ``request``, or None on a cache miss.

        The parent class (the local filesystem cache) is consulted first.  Only
        when it has no entry are the three objects that make up a cache entry
        (``pickled_meta``, ``response_body`` and ``response_headers``) looked
        up in the S3 bucket and the response rebuilt from them.

        Returns None when the entry is absent from S3, is incomplete (any of
        the three keys is missing), or is older than ``self.expiration_secs``
        seconds.  An ``expiration_secs`` that is not positive disables the
        expiry check.
        """
        response = super(S3CacheStorage, self).retrieve_response(spider, request)
        if response is not None:
            return response  # served from the local filesystem cache

        local_path = self._get_request_path(spider, request)
        remote_path = os.path.relpath(local_path, self.tmpcachedir).lower()
        bucket = self.conn.get_bucket(self.bucket_name, validate=False)

        def _get_key(filename):
            # Returns None when the bucket holds no such key.
            key_name = os.path.join(remote_path, filename)
            return bucket.get_key(key_name)

        # check if the key exists
        metadata_key = _get_key('pickled_meta')
        if metadata_key is None:
            return None  # key not found

        # check if the cache entry has expired
        mtime = parse_ts(metadata_key.last_modified)
        if 0 < self.expiration_secs < (datetime.datetime.utcnow() - mtime).total_seconds():
            return None  # expired

        # A half-written or partially deleted entry must count as a miss
        # instead of failing with AttributeError on a missing key.
        body_key = _get_key('response_body')
        headers_key = _get_key('response_headers')
        if body_key is None or headers_key is None:
            return None  # incomplete entry

        # deserialise the cached response
        # NOTE(review): pickle.loads runs arbitrary code from its input; this
        # is only safe while the bucket contents are trusted.
        metadata = pickle.loads(metadata_key.get_contents_as_string())
        body = body_key.get_contents_as_string()
        rawheaders = headers_key.get_contents_as_string()
        url = metadata.get('response_url')
        status = metadata['status']
        headers = Headers(headers_raw_to_dict(rawheaders))
        respcls = responsetypes.from_args(headers=headers, url=url)
        return respcls(url=url, headers=headers, status=status, body=body)
    def retrieve_response(self, spider, request):
        """Look up ``request`` in the cache and rebuild its response.

        Returns None when no metadata is found for the request key, or when
        no cached data can be loaded for that key.  Each successful stage is
        reported through ``spider.log``.
        """
        cache_key = self._get_request_key(spider, request)
        expiry = self._get_expiration_time(spider)
        if self._read_meta(cache_key, expiry) is None:
            return None  # not cached

        storage_name = self.__class__.__name__
        spider.log("%s: found cache for %s" % (storage_name, request.url))

        cached = self._get_cached_data(cache_key, spider)
        if cached is None:
            return None

        spider.log("%s: got response from cache for %s" % (storage_name, request.url))

        url = cached['response_url']
        body = str(cached['response_body'])
        raw_headers = cached['response_meta']['headers']
        status = cached['response_meta']['status']
        headers = Headers(headers_raw_to_dict(raw_headers))
        # The concrete response class is chosen from the headers and the URL.
        build = responsetypes.from_args(headers=headers, url=url)
        return build(url=url, headers=headers, status=status, body=body)
Ejemplo n.º 3
0
    def test_headers_raw_to_dict(self):
        """Raw header bytes parse to a dict; a repeated header keeps all values."""
        expected = {
            b'Cache-Control': [b'no-cache', b'no-store'],
            b'Accept': [b'gzip'],
            b'Content-type': [b'text/html'],
        }
        # The backslash continuation keeps the next line's indentation inside
        # the literal, so the Cache-Control header is preceded by spaces.
        raw = b"Content-type: text/html\n\rAccept: gzip\n\r\
                Cache-Control: no-cache\n\rCache-Control: no-store\n\n"
        parsed = headers_raw_to_dict(raw)
        self.assertEqual(parsed, expected)
Ejemplo n.º 4
0
    def test_headers_raw_to_dict(self):
        """Check parsing of raw headers, including a header given twice."""
        # NOTE: the indentation after the backslash continuation is part of
        # the bytes literal and therefore part of the input under test.
        raw = b"Content-type: text/html\n\rAccept: gzip\n\r\
                Cache-Control: no-cache\n\rCache-Control: no-store\n\n"

        want = {}
        want[b"Content-type"] = [b"text/html"]
        want[b"Accept"] = [b"gzip"]
        want[b"Cache-Control"] = [b"no-cache", b"no-store"]

        self.assertEqual(headers_raw_to_dict(raw), want)
Ejemplo n.º 5
0
 def retrieve_response(self, spider, request):
     """Rebuild the cached response for ``request``; None when not cached.

     The body and the raw headers are read from the ``response_body`` and
     ``response_headers`` files in the request's cache directory; URL and
     status come from the stored metadata.
     """
     meta = self._read_meta(spider, request)
     if meta is None:
         return None  # not cached
     cache_dir = self._get_request_path(spider, request)

     def _slurp(name):
         # Read one file of the cache entry as raw bytes.
         with open(join(cache_dir, name), 'rb') as fh:
             return fh.read()

     body = _slurp('response_body')
     rawheaders = _slurp('response_headers')
     url = meta.get('response_url')
     status = meta['status']
     headers = Headers(headers_raw_to_dict(rawheaders))
     response_cls = responsetypes.from_args(headers=headers, url=url)
     return response_cls(url=url, headers=headers, status=status, body=body)
Ejemplo n.º 6
0
 def retrieve_response(self, spider, request):
     """Fetch the cached document for ``request.url`` and rebuild the response.

     The element at index 2 of the value returned by ``self.bucket.get`` is
     parsed as JSON.  Returns None when the lookup raises
     ``couchbase.exception.MemcachedError``.
     """
     try:
         doc_id = self._inverse_url(request.url)
         fetched = self.bucket.get(doc_id)
         doc = json.loads(fetched[2])
     except couchbase.exception.MemcachedError:
         # TODO: expiration is not handled yet
         return None
     body = doc['response_body']
     url = doc['response_url']
     status = doc['status']
     headers = Headers(headers_raw_to_dict(doc['response_headers']))
     encoding = doc['encoding']
     response_cls = responsetypes.from_args(headers=headers, url=url)
     return response_cls(
         url=url,
         headers=headers,
         status=status,
         body=body,
         encoding=encoding,
     )
Ejemplo n.º 7
0
 def retrieve_response(self, spider, request):
     """Load the cached response for ``request`` from its cache directory.

     Returns None when ``self._read_meta`` finds no metadata for the request.
     Files are opened through ``self._open`` in binary mode.
     """
     meta = self._read_meta(spider, request)
     if meta is None:
         return None  # cache miss
     base = self._get_request_path(spider, request)

     body_path = os.path.join(base, 'response_body')
     with self._open(body_path, 'rb') as body_file:
         body = body_file.read()

     headers_path = os.path.join(base, 'response_headers')
     with self._open(headers_path, 'rb') as headers_file:
         rawheaders = headers_file.read()

     url = meta.get('response_url')
     status = meta['status']
     headers = Headers(headers_raw_to_dict(rawheaders))
     # The response class is picked from the headers and the URL.
     make_response = responsetypes.from_args(headers=headers, url=url)
     return make_response(url=url, headers=headers, status=status, body=body)
Ejemplo n.º 8
0
def retrieve_response(rpath):
    """
    Rebuild a cached response from the cache directory ``rpath``.

    Returns None when ``read_meta`` finds no metadata there.
    """
    meta = read_meta(rpath)
    if meta is None:
        return None  # not cached

    # Read both payload files as raw bytes, body first.
    payload = {}
    for part in ('response_body', 'response_headers'):
        with open(join(rpath, part), 'rb') as fh:
            payload[part] = fh.read()

    url = meta.get('response_url')
    status = meta['status']
    headers = Headers(headers_raw_to_dict(payload['response_headers']))
    response_cls = responsetypes.from_args(headers=headers, url=url)
    return response_cls(url=url, headers=headers, status=status,
                        body=payload['response_body'])
Ejemplo n.º 9
0
 def retrieve_response(self, spider, request):
     """Rebuild the cached response for ``request``; None when not cached."""
     meta = self._read_meta(spider, request)
     if meta is None:
         return None  # not cached
     entry_dir = self._get_request_path(spider, request)
     with open(join(entry_dir, 'response_body'), 'rb') as body_file:
         body = body_file.read()
     with open(join(entry_dir, 'response_headers'), 'rb') as headers_file:
         rawheaders = headers_file.read()
     # Fall back to metadata['url'] to support caches generated by older
     # versions. TODO: remove for Scrapy 0.11
     url = meta.get('response_url') or meta['url']
     status = meta['status']
     headers = Headers(headers_raw_to_dict(rawheaders))
     response_cls = responsetypes.from_args(headers=headers, url=url)
     return response_cls(url=url, headers=headers, status=status, body=body)
Ejemplo n.º 10
0
 def retrieve_response(self, spider, request):
     """Return the response stored for ``request``, or None on a cache miss.

     URL and status are taken from the stored metadata; body and raw headers
     are read from files in the request's cache directory.
     """
     stored_meta = self._read_meta(spider, request)
     if stored_meta is None:
         return None  # not cached
     rpath = self._get_request_path(spider, request)

     def _read_part(filename):
         # One file of the cache entry, as raw bytes.
         with open(join(rpath, filename), 'rb') as handle:
             return handle.read()

     body = _read_part('response_body')
     rawheaders = _read_part('response_headers')
     # Older caches stored the address under 'url' only, so fall back to it.
     # TODO: remove for Scrapy 0.11
     url = stored_meta.get('response_url') or stored_meta['url']
     response_kwargs = {'url': url, 'status': stored_meta['status']}
     response_kwargs['headers'] = Headers(headers_raw_to_dict(rawheaders))
     respcls = responsetypes.from_args(headers=response_kwargs['headers'], url=url)
     response_kwargs['body'] = body
     return respcls(**response_kwargs)
Ejemplo n.º 11
0
 def retrieve_response(self, spider: TSpider,
                       request: TRequest) -> Optional[TResponse]:
     """Rebuild the cached response for ``request``; None when not cached.

     Body and raw headers are read (binary mode, via ``self._open``) from the
     request's cache directory; URL and status come from the metadata.
     """
     meta = self._read_meta(spider, request)
     if meta is None:
         return None  # not cached
     cache_dir = self._get_request_path(spider, request)

     body_path = os.path.join(cache_dir, "response_body")
     with self._open(body_path, "rb") as body_file:
         body = body_file.read()

     headers_path = os.path.join(cache_dir, "response_headers")
     with self._open(headers_path, "rb") as headers_file:
         rawheaders = headers_file.read()

     url = meta.get("response_url")
     status = meta["status"]
     headers = Headers(headers_raw_to_dict(rawheaders))
     response_cls = responsetypes.from_args(headers=headers, url=url)
     return response_cls(url=url, headers=headers, status=status, body=body)
Ejemplo n.º 12
0
 def retrieve_response(self, spider, request):
     """Rebuild the cached response for ``request`` from one pickled object.

     Returns None when ``self.dont_retrieve`` is set or when nothing is stored
     under the request's key.
     """
     if self.dont_retrieve:
         return None
     object_key = self._get_request_path(request)
     blob = self.get_object_from_key(self.bucket_name, object_key)
     if not blob:
         return None  # not cached
     # NOTE(review): pickle.loads runs arbitrary code from its input; this is
     # only safe while the stored objects are trusted.
     entry = pickle.loads(blob)
     meta = entry['meta']
     body = entry['response_body']
     rawheaders = entry['response_headers']
     url = meta.get('response_url')
     status = meta['status']
     headers = Headers(headers_raw_to_dict(rawheaders))
     response_cls = responsetypes.from_args(headers=headers, url=url)
     return response_cls(url=url, headers=headers, status=status, body=body)
Ejemplo n.º 13
0
 def retrieve_response(self, spider, request):
     """Rebuild the cached response for ``request`` from ``self.collection``.

     The document is looked up by the request fingerprint.  Returns None when
     no document is found.
     """
     record = self.collection.find_one({"_id": request_fingerprint(request)})
     if not record:
         return None
     stored = record["value"]
     meta = stored['metadata']
     url = str(meta['response_url'])
     header_map = headers_raw_to_dict(stored['response_headers'])
     # Content-Encoding is dropped before the headers are rebuilt.
     # NOTE(review): presumably because the stored body is already decoded
     # text - confirm against the code that writes the cache.
     if "Content-Encoding" in header_map:
         del header_map["Content-Encoding"]
     # "w3lib_replace" is the name of a codec error handler - presumably the
     # one registered by w3lib; verify it is registered before this runs.
     body = stored["response_body"].encode("utf-8", "w3lib_replace")
     status = str(meta['status'])
     headers = Headers(header_map)
     response_cls = responsetypes.from_args(headers=headers, url=url)
     return response_cls(url=url, headers=headers, status=status, body=body)
Ejemplo n.º 14
0
 def retrieve_response(self, spider, request):
     """Return the response cached for ``request.url``, or None.

     None is returned when the bucket lookup raises
     ``couchbase.exception.MemcachedError``.  Otherwise the element at index 2
     of the lookup result is parsed as JSON and used to rebuild the response.
     """
     try:
         lookup = self.bucket.get(self._inverse_url(request.url))
         doc = json.loads(lookup[2])
     except couchbase.exception.MemcachedError:
         # TODO: expiration is not handled yet
         return None
     fields = {}
     fields['body'] = doc['response_body']
     fields['url'] = doc['response_url']
     fields['status'] = doc['status']
     fields['headers'] = Headers(headers_raw_to_dict(doc['response_headers']))
     fields['encoding'] = doc['encoding']
     response_cls = responsetypes.from_args(headers=fields['headers'],
                                            url=fields['url'])
     return response_cls(**fields)
    def retrieve_response(self, spider, request):
        """Return response if present in cache, or None otherwise.

        A cache entry consists of three values fetched through
        ``self.client.get``: JSON metadata, the response body and the raw
        response headers, each stored under the key that ``self.key_for``
        builds for it.

        Returns None when the metadata is missing or empty, or when the body
        or the headers value is missing.
        """
        raw_meta_data = self.client.get(
            self.key_for(spider, request, "meta_data"))
        meta_data = json.loads(raw_meta_data) if raw_meta_data else None
        if not meta_data:
            return None

        body = self.client.get(self.key_for(spider, request, "response_body"))
        raw_headers = self.client.get(
            self.key_for(spider, request, "response_headers"))
        # Only an absent value is a miss.  An empty body (or empty header
        # blob) is a valid cached payload and must not force a re-download.
        # NOTE(review): assumes self.client.get returns None for a missing
        # key - confirm for the client in use.
        if body is None or raw_headers is None:
            return None

        kwargs = {
            "url": meta_data.get("response_url"),
            "headers": Headers(headers_raw_to_dict(raw_headers)),
        }
        # The response class is chosen from the URL and headers only; status
        # and body are added afterwards for the constructor call.
        response_class = responsetypes.from_args(**kwargs)
        kwargs.update({"status": meta_data.get("status"), "body": body})
        return response_class(**kwargs)
Ejemplo n.º 16
0
    def retrieve_response(self, spider, request):
        """Rebuild the cached response for ``request`` from ``self.bucket``.

        Returns None when fetching any of the three stored objects raises a
        ``ClientError`` whose error code is "404"; any other ``ClientError``
        is re-raised.  A cached response with status 302 is deleted from the
        bucket and reported as a miss.
        """
        key_for = functools.partial(storage_path, request)
        try:
            pickled_meta = get_s3_text(self.bucket, key_for('pickled_meta'))
            body = get_s3_text(self.bucket, key_for('response_body'))
            rawheaders = get_s3_text(self.bucket, key_for('response_headers'))
        except botocore.exceptions.ClientError as err:
            if err.response['Error']['Code'] != "404":
                raise
            return None

        # NOTE(review): pickle.loads runs arbitrary code from its input; this
        # is only safe while the bucket contents are trusted.
        meta = pickle.loads(pickled_meta)

        url = meta.get('response_url')
        status = meta['status']
        headers = Headers(headers_raw_to_dict(rawheaders))
        response_cls = responsetypes.from_args(headers=headers, url=url)
        cached = response_cls(url=url, headers=headers, status=status, body=body)
        if cached.status != 302:
            return cached
        # Cached redirects are purged so the request is fetched again.
        self.bucket.delete_objects(Delete=make_delete_objects(key_for))
        return None
Ejemplo n.º 17
0
    def retrieve_response(self, spider, request):
        """Return the cached response for ``request``, or None on a cache miss.

        The parent class (the local filesystem cache) is tried first.  If it
        has nothing, the entry's three objects (``pickled_meta``,
        ``response_body``, ``response_headers``) are looked up in the S3
        bucket and the response is rebuilt from them.

        Returns None when the entry is absent from S3, when any of its three
        keys is missing, or when it is older than ``self.expiration_secs``
        seconds (a non-positive value disables the expiry check).
        """
        response = super(S3CacheStorage,
                         self).retrieve_response(spider, request)
        if response is not None:
            return response  # found in the local filesystem cache

        local_path = self._get_request_path(spider, request)
        remote_path = os.path.relpath(local_path, self.tmpcachedir).lower()
        bucket = self.conn.get_bucket(self.bucket_name, validate=False)

        def _get_key(filename):
            # Returns None when the bucket holds no such key.
            key_name = os.path.join(remote_path, filename)
            return bucket.get_key(key_name)

        # check if the key exists
        metadata_key = _get_key('pickled_meta')
        if metadata_key is None:
            return None  # key not found

        # check if the cache entry has expired
        mtime = parse_ts(metadata_key.last_modified)
        if 0 < self.expiration_secs < (datetime.datetime.utcnow() -
                                       mtime).total_seconds():
            return None  # expired

        # An entry whose body or headers object is missing is treated as a
        # miss; previously it raised AttributeError on the None key.
        body_key = _get_key('response_body')
        headers_key = _get_key('response_headers')
        if body_key is None or headers_key is None:
            return None  # incomplete entry

        # deserialise the cached response
        # NOTE(review): pickle.loads runs arbitrary code from its input; this
        # is only safe while the bucket contents are trusted.
        metadata = pickle.loads(metadata_key.get_contents_as_string())
        body = body_key.get_contents_as_string()
        rawheaders = headers_key.get_contents_as_string()
        url = metadata.get('response_url')
        status = metadata['status']
        headers = Headers(headers_raw_to_dict(rawheaders))
        respcls = responsetypes.from_args(headers=headers, url=url)
        return respcls(url=url,
                       headers=headers,
                       status=status,
                       body=body)
Ejemplo n.º 18
0
 def test_headers_raw_dict_none(self):
     """Both header converters return None when given None."""
     for convert in (headers_raw_to_dict, headers_dict_to_raw):
         self.assertIsNone(convert(None))
Ejemplo n.º 19
0
 def test_headers_raw_to_dict(self):
     """Two raw header lines parse into a dict of single-value lists."""
     expected = {b'Accept': [b'gzip'], b'Content-type': [b'text/html']}
     parsed = headers_raw_to_dict(b"Content-type: text/html\n\rAccept: gzip\n\n")
     self.assertEqual(parsed, expected)
Ejemplo n.º 20
0
 def test_headers_raw_dict_none(self):
     """None passes through both header conversion helpers unchanged."""
     parsed = headers_raw_to_dict(None)
     self.assertIsNone(parsed)
     serialized = headers_dict_to_raw(None)
     self.assertIsNone(serialized)
Ejemplo n.º 21
0
 def test_headers_raw_to_dict(self):
     """Check that raw header bytes are parsed into the expected dict."""
     raw_headers = b"Content-type: text/html\n\rAccept: gzip\n\n"
     want = {}
     want[b'Content-type'] = [b'text/html']
     want[b'Accept'] = [b'gzip']
     self.assertEqual(headers_raw_to_dict(raw_headers), want)
Ejemplo n.º 22
0
 def test_headers_raw_to_dict(self):
     """Raw header bytes parse to a dict; a repeated header keeps all values."""
     expected = {
         b'Cache-Control': [b'no-cache', b'no-store'],
         b'Accept': [b'gzip'],
         b'Content-type': [b'text/html'],
     }
     # The indentation after the backslash continuation is part of the bytes
     # literal, so the Cache-Control header is preceded by spaces.
     raw = b"Content-type: text/html\n\rAccept: gzip\n\r\
             Cache-Control: no-cache\n\rCache-Control: no-store\n\n"
     parsed = headers_raw_to_dict(raw)
     self.assertEqual(parsed, expected)