コード例 #1
0
 def process_request(self, request: Request, spider):
     """Add an OAuth ``Authorization`` header taken from ``request.meta['auth']``.

     Returns ``None`` (request left untouched) when the request already has a
     truthy ``Authorization`` header or when its meta holds no truthy ``auth``
     value. Otherwise returns a copy of the request whose ``Authorization``
     header is ``OAuth <auth>``; the original request is not modified.
     """
     # An existing header wins; meta is only consulted when none is present.
     token = None if request.headers.get('Authorization') else request.meta.get('auth')
     if not token:
         return None
     authorized = request.copy()
     authorized.headers['Authorization'] = f'OAuth {token}'
     return authorized
コード例 #2
0
    def test_request_fingerprint(self):
        # Exercises request_fingerprint: query-argument order, header
        # inclusion, method/body sensitivity and the fingerprint cache.
        r1 = Request("http://www.example.com/query?id=111&cat=222")
        r2 = Request("http://www.example.com/query?cat=222&id=111")
        self.assertEqual(request_fingerprint(r1), request_fingerprint(r1))
        self.assertEqual(request_fingerprint(r1), request_fingerprint(r2))

        r1 = Request('http://www.example.com/hnnoticiaj1.aspx?78132,199')
        r2 = Request('http://www.example.com/hnnoticiaj1.aspx?78160,199')
        self.assertNotEqual(request_fingerprint(r1), request_fingerprint(r2))

        # make sure caching is working
        self.assertEqual(request_fingerprint(r1), _fingerprint_cache[r1][None])

        r1 = Request("http://www.example.com/members/offers.html")
        r2 = Request("http://www.example.com/members/offers.html")
        r2.headers['SESSIONID'] = "somehash"
        self.assertEqual(request_fingerprint(r1), request_fingerprint(r2))

        r1 = Request("http://www.example.com/")
        r2 = Request("http://www.example.com/")
        r2.headers['Accept-Language'] = 'en'
        r3 = Request("http://www.example.com/")
        r3.headers['Accept-Language'] = 'en'
        r3.headers['SESSIONID'] = "somehash"

        # Compare pairwise: assertEqual's third positional argument is the
        # failure message, so passing three fingerprints in one call would
        # leave r3 unchecked.
        self.assertEqual(request_fingerprint(r1), request_fingerprint(r2))
        self.assertEqual(request_fingerprint(r1), request_fingerprint(r3))

        self.assertEqual(request_fingerprint(r1),
                         request_fingerprint(r1, include_headers=['Accept-Language']))

        self.assertNotEqual(request_fingerprint(r1),
                            request_fingerprint(r2, include_headers=['Accept-Language']))

        # header names in include_headers are matched regardless of case/order
        self.assertEqual(request_fingerprint(r3, include_headers=['accept-language', 'sessionid']),
                         request_fingerprint(r3, include_headers=['SESSIONID', 'Accept-Language']))

        r1 = Request("http://www.example.com")
        r2 = Request("http://www.example.com", method='POST')
        r3 = Request("http://www.example.com", method='POST', body='request body')

        self.assertNotEqual(request_fingerprint(r1), request_fingerprint(r2))
        self.assertNotEqual(request_fingerprint(r2), request_fingerprint(r3))

        # cached fingerprint must be cleared on request copy
        r1 = Request("http://www.example.com")
        fp1 = request_fingerprint(r1)
        r2 = r1.copy()
        r2.url = "http://www.example.com/other"
        fp2 = request_fingerprint(r2)
        self.assertNotEqual(fp1, fp2)
コード例 #3
0
class HttpCacheMiddlewareTest(unittest.TestCase):
    """Tests for HttpCacheMiddleware and the cache storage in ``storage_class``.

    Every test runs against a fresh temporary cache directory.
    """

    storage_class = FilesystemCacheStorage

    def setUp(self):
        """Create the crawler, spider, temp cache dir and a sample request/response."""
        self.crawler = get_crawler()
        self.spider = BaseSpider('example.com')
        self.tmpdir = tempfile.mkdtemp()
        self.request = Request('http://www.example.com', headers={'User-Agent': 'test'})
        self.response = Response('http://www.example.com', headers={'Content-Type': 'text/html'}, body='test body', status=202)
        self.crawler.stats.open_spider(self.spider)

    def tearDown(self):
        """Close the spider's stats and remove the temporary cache dir."""
        try:
            self.crawler.stats.close_spider(self.spider, '')
        finally:
            # Remove the cache dir even when closing the stats fails.
            shutil.rmtree(self.tmpdir)

    def _get_settings(self, **new_settings):
        """Return a Settings object with the test defaults overridden by ``new_settings``."""
        settings = {
            'HTTPCACHE_ENABLED': True,
            'HTTPCACHE_DIR': self.tmpdir,
            'HTTPCACHE_EXPIRATION_SECS': 1,
            'HTTPCACHE_IGNORE_HTTP_CODES': [],
        }
        settings.update(new_settings)
        return Settings(settings)

    def _get_storage(self, **new_settings):
        """Instantiate ``storage_class`` with the (optionally overridden) settings."""
        return self.storage_class(self._get_settings(**new_settings))

    def _get_middleware(self, **new_settings):
        """Instantiate HttpCacheMiddleware with the settings and the crawler's stats."""
        return HttpCacheMiddleware(self._get_settings(**new_settings), self.crawler.stats)

    def test_storage(self):
        storage = self._get_storage()
        request2 = self.request.copy()
        self.assertIsNone(storage.retrieve_response(self.spider, request2))
        storage.store_response(self.spider, self.request, self.response)
        response2 = storage.retrieve_response(self.spider, request2)
        self.assertIsInstance(response2, HtmlResponse)  # inferred from content-type header
        self.assertEqualResponse(self.response, response2)
        time.sleep(2)  # wait for cache to expire (HTTPCACHE_EXPIRATION_SECS is 1)
        self.assertIsNone(storage.retrieve_response(self.spider, request2))

    def test_storage_never_expire(self):
        storage = self._get_storage(HTTPCACHE_EXPIRATION_SECS=0)
        self.assertIsNone(storage.retrieve_response(self.spider, self.request))
        storage.store_response(self.spider, self.request, self.response)
        time.sleep(0.5)  # give the chance to expire
        self.assertTrue(storage.retrieve_response(self.spider, self.request))

    def test_middleware(self):
        mw = self._get_middleware()
        self.assertIsNone(mw.process_request(self.request, self.spider))
        mw.process_response(self.request, self.response, self.spider)
        response = mw.process_request(self.request, self.spider)
        self.assertIsInstance(response, HtmlResponse)
        self.assertEqualResponse(self.response, response)
        self.assertIn('cached', response.flags)

    def test_different_request_response_urls(self):
        mw = self._get_middleware()
        req = Request('http://host.com/path')
        res = Response('http://host2.net/test.html')
        self.assertIsNone(mw.process_request(req, self.spider))
        mw.process_response(req, res, self.spider)
        cached = mw.process_request(req, self.spider)
        self.assertIsInstance(cached, Response)
        self.assertEqualResponse(res, cached)
        self.assertIn('cached', cached.flags)

    def test_middleware_ignore_missing(self):
        mw = self._get_middleware(HTTPCACHE_IGNORE_MISSING=True)
        # nothing cached yet, so the request must be refused
        self.assertRaises(IgnoreRequest, mw.process_request, self.request, self.spider)
        mw.process_response(self.request, self.response, self.spider)
        response = mw.process_request(self.request, self.spider)
        self.assertIsInstance(response, HtmlResponse)
        self.assertEqualResponse(self.response, response)
        self.assertIn('cached', response.flags)

    def test_middleware_ignore_schemes(self):
        # http responses are cached by default
        req, res = Request('http://test.com/'), Response('http://test.com/')
        mw = self._get_middleware()
        self.assertIsNone(mw.process_request(req, self.spider))
        mw.process_response(req, res, self.spider)
        cached = mw.process_request(req, self.spider)
        self.assertIsInstance(cached, Response)
        self.assertEqualResponse(res, cached)
        self.assertIn('cached', cached.flags)

        # file response is not cached by default
        req, res = Request('file:///tmp/t.txt'), Response('file:///tmp/t.txt')
        mw = self._get_middleware()
        self.assertIsNone(mw.process_request(req, self.spider))
        mw.process_response(req, res, self.spider)
        self.assertIsNone(mw.storage.retrieve_response(self.spider, req))
        self.assertIsNone(mw.process_request(req, self.spider))

        # s3 scheme response is cached by default
        req, res = Request('s3://bucket/key'), Response('http://bucket/key')
        mw = self._get_middleware()
        self.assertIsNone(mw.process_request(req, self.spider))
        mw.process_response(req, res, self.spider)
        cached = mw.process_request(req, self.spider)
        self.assertIsInstance(cached, Response)
        self.assertEqualResponse(res, cached)
        self.assertIn('cached', cached.flags)

        # ignore s3 scheme
        req, res = Request('s3://bucket/key2'), Response('http://bucket/key2')
        mw = self._get_middleware(HTTPCACHE_IGNORE_SCHEMES=['s3'])
        self.assertIsNone(mw.process_request(req, self.spider))
        mw.process_response(req, res, self.spider)
        self.assertIsNone(mw.storage.retrieve_response(self.spider, req))
        self.assertIsNone(mw.process_request(req, self.spider))

    def test_middleware_ignore_http_codes(self):
        # test response is not cached (its status, 202, is in the ignore list)
        mw = self._get_middleware(HTTPCACHE_IGNORE_HTTP_CODES=[202])
        self.assertIsNone(mw.process_request(self.request, self.spider))
        mw.process_response(self.request, self.response, self.spider)
        self.assertIsNone(mw.storage.retrieve_response(self.spider, self.request))
        self.assertIsNone(mw.process_request(self.request, self.spider))

        # test response is cached
        mw = self._get_middleware(HTTPCACHE_IGNORE_HTTP_CODES=[203])
        mw.process_response(self.request, self.response, self.spider)
        response = mw.process_request(self.request, self.spider)
        self.assertIsInstance(response, HtmlResponse)
        self.assertEqualResponse(self.response, response)
        self.assertIn('cached', response.flags)

    def assertEqualResponse(self, response1, response2):
        """Assert that two responses share url, status, headers and body."""
        self.assertEqual(response1.url, response2.url)
        self.assertEqual(response1.status, response2.status)
        self.assertEqual(response1.headers, response2.headers)
        self.assertEqual(response1.body, response2.body)
class S3CacheStorageTest(unittest.TestCase):
    """Tests for the S3 HTTP cache storage, driven through HttpCacheMiddleware.

    S3 calls are answered by the stubber exposed as ``storage._client_stubber``
    (NOTE(review): looks like a botocore ``Stubber`` -- confirm), so no network
    access should be needed.
    """

    storage_class = 'scrapy_s3_http_cache.S3CacheStorage'
    policy_class = 'scrapy.extensions.httpcache.DummyPolicy'
    spider_name = 's3cache_spider'

    def setUp(self):
        """Build the crawler/spider, a sample request/response and canned S3 payloads."""
        self.crawler = get_crawler(Spider)
        self.spider = self.crawler._create_spider(self.spider_name)
        self.tmpdir = tempfile.mkdtemp()
        self.request = Request('http://www.example.com',
                               headers={'User-Agent': 'test'})
        self.response = Response('http://www.example.com',
                                 headers={'Content-Type': 'text/html'},
                                 body=b'test body',
                                 status=202)
        self.crawler.stats.open_spider(self.spider)
        # Dict that is pickled below and served back as the stubbed get_object body.
        self.cached_response = {
            'meta': {
                'url': self.request.url,
                'method': self.request.method,
                'status': self.response.status,
                'response_url': self.response.url,
                'timestamp': time.time(),
            },
            'response_headers': headers_dict_to_raw(self.response.headers),
            'response_body': self.response.body,
            'request_headers': headers_dict_to_raw(self.request.headers),
            'request_body': self.request.body
        }
        self.pickled_cached_response = pickle.dumps(self.cached_response)
        self.get_object_response = {
            'Body': StreamingBody(
                io.BytesIO(self.pickled_cached_response),
                len(self.pickled_cached_response)
            )
        }
        # Same payload, gzip-compressed.
        self.gzipped_pickled_cached_response = gzip.compress(self.pickled_cached_response)
        self.get_object_response_gziped = {
            'Body': StreamingBody(
                io.BytesIO(self.gzipped_pickled_cached_response),
                len(self.gzipped_pickled_cached_response)
            )
        }

    def tearDown(self):
        """Close the spider's stats and remove the temporary directory."""
        try:
            self.crawler.stats.close_spider(self.spider, '')
        finally:
            # Remove the temp dir even when closing the stats fails.
            shutil.rmtree(self.tmpdir)

    def _get_settings(self, **new_settings):
        """Return a Settings object with the test defaults overridden by ``new_settings``."""
        settings = {
            'HTTPCACHE_ENABLED': True,
            'HTTPCACHE_DIR': self.tmpdir,
            'HTTPCACHE_EXPIRATION_SECS': 1,
            'HTTPCACHE_IGNORE_HTTP_CODES': [],
            'HTTPCACHE_POLICY': self.policy_class,
            'HTTPCACHE_STORAGE': self.storage_class,
            'AWS_ACCESS_KEY_ID': 'dummy',
            'AWS_SECRET_ACCESS_KEY': 'dummy',
            'S3CACHE_URI': 's3://bucket/%(name)s/%(day)s/%(name)s-%(version)s-%(time)s',
        }
        settings.update(new_settings)
        return Settings(settings)

    @contextmanager
    def _storage(self, **new_settings):
        """Yield the storage of a middleware that is opened for the test spider."""
        with self._middleware(**new_settings) as mw:
            yield mw.storage

    @contextmanager
    def _policy(self, **new_settings):
        """Yield the policy of a middleware that is opened for the test spider."""
        with self._middleware(**new_settings) as mw:
            yield mw.policy

    @contextmanager
    def _middleware(self, **new_settings):
        """Yield an HttpCacheMiddleware opened for the spider; close it on exit."""
        settings = self._get_settings(**new_settings)
        mw = HttpCacheMiddleware(settings, self.crawler.stats)
        mw.spider_opened(self.spider)
        try:
            yield mw
        finally:
            mw.spider_closed(self.spider)

    @contextmanager
    def _client_stubber(self, storage):
        """Activate the storage's client stubber for the duration of the block.

        After the block completes without raising, verifies that every queued
        stub response was consumed. The stubber is always deactivated.
        """
        stubber = storage._client_stubber
        stubber.activate()
        try:
            yield stubber
            # Only reached when the block did not raise, so a failing test is
            # not masked by a secondary "pending responses" error.
            stubber.assert_no_pending_responses()
        finally:
            # Always deactivate, even when the pending-responses check fails.
            stubber.deactivate()

    def assertEqualResponse(self, response1, response2):
        """Assert that two responses share url, status, headers and body."""
        self.assertEqual(response1.url, response2.url)
        self.assertEqual(response1.status, response2.status)
        self.assertEqual(response1.headers, response2.headers)
        self.assertEqual(response1.body, response2.body)

    # These tests are the same from FilesystemCacheStorage, just to make sure
    # our class doesn't break anything.
    def test_storage(self):
        with self._storage() as storage:
            with self._client_stubber(storage) as stubber:
                # a failing get_object must be reported as a cache miss
                stubber.add_client_error('get_object')
                request2 = self.request.copy()
                assert storage.retrieve_response(self.spider, request2) is None

            with self._client_stubber(storage) as stubber:
                stubber.add_response('put_object', {})
                stubber.add_response('get_object', self.get_object_response)
                storage.store_response(self.spider, self.request, self.response)
                response2 = storage.retrieve_response(self.spider, request2)
                assert isinstance(response2, HtmlResponse)  # content-type header
                self.assertEqualResponse(self.response, response2)

    def test_dont_cache(self):
        with self._middleware() as mw:
            with self._client_stubber(mw.storage) as stubber:
                stubber.add_client_error('get_object')
                self.request.meta['dont_cache'] = True
                mw.process_response(self.request, self.response, self.spider)
                self.assertEqual(mw.storage.retrieve_response(self.spider, self.request), None)

        with self._middleware() as mw:
            with self._client_stubber(mw.storage) as stubber:
                stubber.add_response('put_object', {})
                stubber.add_response('get_object', self.get_object_response)
                self.request.meta['dont_cache'] = False
                mw.process_response(self.request, self.response, self.spider)
                if mw.policy.should_cache_response(self.response, self.request):
                    self.assertIsInstance(
                        mw.storage.retrieve_response(self.spider, self.request),
                        self.response.__class__
                    )

    def test_dont_retrieve(self):
        settings = {
            'S3CACHE_DONT_RETRIEVE': True
        }
        with self._storage(**settings) as storage:
            assert storage.retrieve_response(self.spider, self.request) is None

    def test_disabled(self):
        settings = {
            'HTTPCACHE_ENABLED': False
        }
        with pytest.raises(NotConfigured):
            with self._storage(**settings) as storage:
                storage.__init__(settings)

    def test_s3cache_uri_not_configured(self):
        settings = {
            'S3CACHE_URI': None
        }
        with pytest.raises(NotConfigured, match='S3CACHE_URI must be specified'):
            with self._storage(**settings) as storage:
                storage.__init__(settings)

    def test_keypath_not_configured(self):
        settings = {
            'S3CACHE_URI': 's3://user:pass@bucket',
        }
        with pytest.raises(NotConfigured, match='Could not get key path from S3CACHE_URI'):
            with self._storage(**settings) as storage:
                storage.__init__(settings)

    def test_aws_credentials_not_configured(self):
        # access key missing from both the URI and the settings
        settings = {
            'S3CACHE_URI': 's3://:aws_secret_access_key@bucket/keypath',
            'AWS_ACCESS_KEY_ID': None,
        }
        with pytest.raises(NotConfigured, match='AWS_ACCESS_KEY_ID must be specified'):
            with self._storage(**settings) as storage:
                storage.__init__(settings)

        # secret key missing from both the URI and the settings
        settings = {
            'S3CACHE_URI': 's3://aws_access_key_id@bucket/keypath',
            'AWS_SECRET_ACCESS_KEY': None,
        }
        with pytest.raises(NotConfigured, match='AWS_SECRET_ACCESS_KEY must be specified'):
            with self._storage(**settings) as storage:
                storage.__init__(settings)

    def test_bucket_name_not_configured(self):
        settings = {
            'S3CACHE_URI': 's3://user:pass@/heypath',
        }
        with pytest.raises(NotConfigured, match='Could not get bucket name from S3CACHE_URI'):
            with self._storage(**settings) as storage:
                storage.__init__(settings)

    def test_uri_parse(self):
        settings = {
            'S3CACHE_URI': 's3://aws_access_key:aws_secret_key@bucketname/%(name)s/%(name)s-%(time)s.zip'
        }
        with self._storage(**settings) as storage:
            self.assertEqual(storage.bucket_name, 'bucketname')
            self.assertEqual(storage.access_key, 'aws_access_key')
            self.assertEqual(storage.secret_key, 'aws_secret_key')
            self.assertEqual(storage.keypath_fmt, '%(name)s/%(name)s-%(time)s.zip')

    def test_keypath(self):
        settings = {
            'S3CACHE_URI': 's3://aws_access_key:aws_secret_key@bucketname/%(name)s/%(name)s-%(time)s.zip'
        }
        with self._storage(**settings) as storage:
            # expected prefix: "<spider>/<spider>-" followed by a timestamp
            keypath_regex = '{spider_name}/{spider_name}-'.format(spider_name=self.spider_name)
            date_regex = r'\d{4}-\d{2}-\d{2}T\d{2}-\d{2}-\d{2}'
            keypath_regex = keypath_regex + date_regex
            assert re.match(keypath_regex, storage.keypath) is not None

        # a key path without placeholders is used verbatim
        keypath = '1234567'
        settings = {
            'S3CACHE_URI': 's3://bucketname/{keypath}'.format(keypath=keypath)
        }
        with self._storage(**settings) as storage:
            assert keypath == storage.keypath

    def test_invalid_spider(self):
        with pytest.raises(NotConfigured, match='Could not get spider! Aborting...'):
            with self._storage() as storage:
                storage.open_spider(None)
                # NOTE(review): presumably this attribute access is what raises
                # when no spider is set -- confirm against the storage class.
                storage.spider

    def test_failed_to_get_object_from_s3(self):
        # No assertion: the call is only expected not to raise on a client error.
        with self._storage() as storage:
            with self._client_stubber(storage) as stubber:
                stubber.add_client_error('get_object')
                storage.get_object_from_key('bucket', 'key')

    def test_failed_to_put_object_to_s3(self):
        # No assertion: the call is only expected not to raise on a client error.
        with self._storage() as storage:
            with self._client_stubber(storage) as stubber:
                stubber.add_client_error('put_object')
                storage.put_object_to_key(b'file', 'bucket', 'key')
コード例 #5
0
class HttpCacheMiddlewareTest(unittest.TestCase):
    """Tests for HttpCacheMiddleware and the cache storage in ``storage_class``.

    Every test runs against a fresh temporary cache directory.
    """

    storage_class = FilesystemCacheStorage

    def setUp(self):
        """Create the crawler, spider, temp cache dir and a sample request/response."""
        self.crawler = get_crawler()
        self.spider = BaseSpider('example.com')
        self.tmpdir = tempfile.mkdtemp()
        self.request = Request('http://www.example.com',
                               headers={'User-Agent': 'test'})
        self.response = Response('http://www.example.com',
                                 headers={'Content-Type': 'text/html'},
                                 body='test body',
                                 status=202)
        self.crawler.stats.open_spider(self.spider)

    def tearDown(self):
        """Close the spider's stats and remove the temporary cache dir."""
        try:
            self.crawler.stats.close_spider(self.spider, '')
        finally:
            # Remove the cache dir even when closing the stats fails.
            shutil.rmtree(self.tmpdir)

    def _get_settings(self, **new_settings):
        """Return a Settings object with the test defaults overridden by ``new_settings``."""
        settings = {
            'HTTPCACHE_ENABLED': True,
            'HTTPCACHE_DIR': self.tmpdir,
            'HTTPCACHE_EXPIRATION_SECS': 1,
            'HTTPCACHE_IGNORE_HTTP_CODES': [],
        }
        settings.update(new_settings)
        return Settings(settings)

    def _get_storage(self, **new_settings):
        """Instantiate ``storage_class`` with the (optionally overridden) settings."""
        return self.storage_class(self._get_settings(**new_settings))

    def _get_middleware(self, **new_settings):
        """Instantiate HttpCacheMiddleware with the settings and the crawler's stats."""
        return HttpCacheMiddleware(self._get_settings(**new_settings),
                                   self.crawler.stats)

    def test_storage(self):
        storage = self._get_storage()
        request2 = self.request.copy()
        self.assertIsNone(storage.retrieve_response(self.spider, request2))
        storage.store_response(self.spider, self.request, self.response)
        response2 = storage.retrieve_response(self.spider, request2)
        # response class is inferred from content-type header
        self.assertIsInstance(response2, HtmlResponse)
        self.assertEqualResponse(self.response, response2)
        time.sleep(2)  # wait for cache to expire (HTTPCACHE_EXPIRATION_SECS is 1)
        self.assertIsNone(storage.retrieve_response(self.spider, request2))

    def test_storage_never_expire(self):
        storage = self._get_storage(HTTPCACHE_EXPIRATION_SECS=0)
        self.assertIsNone(storage.retrieve_response(self.spider, self.request))
        storage.store_response(self.spider, self.request, self.response)
        time.sleep(0.5)  # give the chance to expire
        self.assertTrue(storage.retrieve_response(self.spider, self.request))

    def test_middleware(self):
        mw = self._get_middleware()
        self.assertIsNone(mw.process_request(self.request, self.spider))
        mw.process_response(self.request, self.response, self.spider)
        response = mw.process_request(self.request, self.spider)
        self.assertIsInstance(response, HtmlResponse)
        self.assertEqualResponse(self.response, response)
        self.assertIn('cached', response.flags)

    def test_different_request_response_urls(self):
        mw = self._get_middleware()
        req = Request('http://host.com/path')
        res = Response('http://host2.net/test.html')
        self.assertIsNone(mw.process_request(req, self.spider))
        mw.process_response(req, res, self.spider)
        cached = mw.process_request(req, self.spider)
        self.assertIsInstance(cached, Response)
        self.assertEqualResponse(res, cached)
        self.assertIn('cached', cached.flags)

    def test_middleware_ignore_missing(self):
        mw = self._get_middleware(HTTPCACHE_IGNORE_MISSING=True)
        # nothing cached yet, so the request must be refused
        self.assertRaises(IgnoreRequest, mw.process_request, self.request,
                          self.spider)
        mw.process_response(self.request, self.response, self.spider)
        response = mw.process_request(self.request, self.spider)
        self.assertIsInstance(response, HtmlResponse)
        self.assertEqualResponse(self.response, response)
        self.assertIn('cached', response.flags)

    def test_middleware_ignore_schemes(self):
        # http responses are cached by default
        req, res = Request('http://test.com/'), Response('http://test.com/')
        mw = self._get_middleware()
        self.assertIsNone(mw.process_request(req, self.spider))
        mw.process_response(req, res, self.spider)
        cached = mw.process_request(req, self.spider)
        self.assertIsInstance(cached, Response)
        self.assertEqualResponse(res, cached)
        self.assertIn('cached', cached.flags)

        # file response is not cached by default
        req, res = Request('file:///tmp/t.txt'), Response('file:///tmp/t.txt')
        mw = self._get_middleware()
        self.assertIsNone(mw.process_request(req, self.spider))
        mw.process_response(req, res, self.spider)
        self.assertIsNone(mw.storage.retrieve_response(self.spider, req))
        self.assertIsNone(mw.process_request(req, self.spider))

        # s3 scheme response is cached by default
        req, res = Request('s3://bucket/key'), Response('http://bucket/key')
        mw = self._get_middleware()
        self.assertIsNone(mw.process_request(req, self.spider))
        mw.process_response(req, res, self.spider)
        cached = mw.process_request(req, self.spider)
        self.assertIsInstance(cached, Response)
        self.assertEqualResponse(res, cached)
        self.assertIn('cached', cached.flags)

        # ignore s3 scheme
        req, res = Request('s3://bucket/key2'), Response('http://bucket/key2')
        mw = self._get_middleware(HTTPCACHE_IGNORE_SCHEMES=['s3'])
        self.assertIsNone(mw.process_request(req, self.spider))
        mw.process_response(req, res, self.spider)
        self.assertIsNone(mw.storage.retrieve_response(self.spider, req))
        self.assertIsNone(mw.process_request(req, self.spider))

    def test_middleware_ignore_http_codes(self):
        # test response is not cached (its status, 202, is in the ignore list)
        mw = self._get_middleware(HTTPCACHE_IGNORE_HTTP_CODES=[202])
        self.assertIsNone(mw.process_request(self.request, self.spider))
        mw.process_response(self.request, self.response, self.spider)
        self.assertIsNone(mw.storage.retrieve_response(self.spider, self.request))
        self.assertIsNone(mw.process_request(self.request, self.spider))

        # test response is cached
        mw = self._get_middleware(HTTPCACHE_IGNORE_HTTP_CODES=[203])
        mw.process_response(self.request, self.response, self.spider)
        response = mw.process_request(self.request, self.spider)
        self.assertIsInstance(response, HtmlResponse)
        self.assertEqualResponse(self.response, response)
        self.assertIn('cached', response.flags)

    def assertEqualResponse(self, response1, response2):
        """Assert that two responses share url, status, headers and body."""
        self.assertEqual(response1.url, response2.url)
        self.assertEqual(response1.status, response2.status)
        self.assertEqual(response1.headers, response2.headers)
        self.assertEqual(response1.body, response2.body)
コード例 #6
0
class HttpCacheMiddlewareTest(unittest.TestCase):
    """Tests for HttpCacheMiddleware and the cache storage in ``storage_class``.

    Every test runs against a fresh temporary cache directory.
    """

    storage_class = FilesystemCacheStorage

    def setUp(self):
        """Create the spider, temp cache dir and a sample request/response."""
        self.spider = BaseSpider("example.com")
        self.tmpdir = tempfile.mkdtemp()
        self.request = Request("http://www.example.com", headers={"User-Agent": "test"})
        self.response = Response(
            "http://www.example.com", headers={"Content-Type": "text/html"}, body="test body", status=202
        )

    def tearDown(self):
        """Remove the temporary cache dir."""
        shutil.rmtree(self.tmpdir)

    def _get_settings(self, **new_settings):
        """Return a Settings object with the test defaults overridden by ``new_settings``."""
        settings = {
            "HTTPCACHE_ENABLED": True,
            "HTTPCACHE_DIR": self.tmpdir,
            "HTTPCACHE_EXPIRATION_SECS": 1,
            "HTTPCACHE_IGNORE_HTTP_CODES": [],
        }
        settings.update(new_settings)
        return Settings(settings)

    def _get_storage(self, **new_settings):
        """Instantiate ``storage_class`` with the (optionally overridden) settings."""
        return self.storage_class(self._get_settings(**new_settings))

    def _get_middleware(self, **new_settings):
        """Instantiate HttpCacheMiddleware with the (optionally overridden) settings."""
        return HttpCacheMiddleware(self._get_settings(**new_settings))

    def test_storage(self):
        storage = self._get_storage()
        request2 = self.request.copy()
        self.assertIsNone(storage.retrieve_response(self.spider, request2))
        storage.store_response(self.spider, self.request, self.response)
        response2 = storage.retrieve_response(self.spider, request2)
        self.assertIsInstance(response2, HtmlResponse)  # inferred from content-type header
        self.assertEqualResponse(self.response, response2)
        time.sleep(2)  # wait for cache to expire (HTTPCACHE_EXPIRATION_SECS is 1)
        self.assertIsNone(storage.retrieve_response(self.spider, request2))

    def test_storage_never_expire(self):
        storage = self._get_storage(HTTPCACHE_EXPIRATION_SECS=0)
        self.assertIsNone(storage.retrieve_response(self.spider, self.request))
        storage.store_response(self.spider, self.request, self.response)
        time.sleep(0.5)  # give the chance to expire
        self.assertTrue(storage.retrieve_response(self.spider, self.request))

    def test_middleware(self):
        mw = self._get_middleware()
        self.assertIsNone(mw.process_request(self.request, self.spider))
        mw.process_response(self.request, self.response, self.spider)
        response = mw.process_request(self.request, self.spider)
        self.assertIsInstance(response, HtmlResponse)
        self.assertEqualResponse(self.response, response)
        self.assertIn("cached", response.flags)

    def test_different_request_response_urls(self):
        mw = self._get_middleware()
        req = Request("http://host.com/path")
        res = Response("http://host2.net/test.html")
        self.assertIsNone(mw.process_request(req, self.spider))
        mw.process_response(req, res, self.spider)
        cached = mw.process_request(req, self.spider)
        self.assertIsInstance(cached, Response)
        self.assertEqualResponse(res, cached)
        self.assertIn("cached", cached.flags)

    def test_middleware_ignore_missing(self):
        mw = self._get_middleware(HTTPCACHE_IGNORE_MISSING=True)
        # nothing cached yet, so the request must be refused
        self.assertRaises(IgnoreRequest, mw.process_request, self.request, self.spider)
        mw.process_response(self.request, self.response, self.spider)
        response = mw.process_request(self.request, self.spider)
        self.assertIsInstance(response, HtmlResponse)
        self.assertEqualResponse(self.response, response)
        self.assertIn("cached", response.flags)

    def test_middleware_ignore_schemes(self):
        # http responses are cached by default
        req, res = Request("http://test.com/"), Response("http://test.com/")
        mw = self._get_middleware()
        self.assertIsNone(mw.process_request(req, self.spider))
        mw.process_response(req, res, self.spider)
        cached = mw.process_request(req, self.spider)
        self.assertIsInstance(cached, Response)
        self.assertEqualResponse(res, cached)
        self.assertIn("cached", cached.flags)

        # file response is not cached by default
        req, res = Request("file:///tmp/t.txt"), Response("file:///tmp/t.txt")
        mw = self._get_middleware()
        self.assertIsNone(mw.process_request(req, self.spider))
        mw.process_response(req, res, self.spider)
        self.assertIsNone(mw.storage.retrieve_response(self.spider, req))
        self.assertIsNone(mw.process_request(req, self.spider))

        # s3 scheme response is cached by default
        req, res = Request("s3://bucket/key"), Response("http://bucket/key")
        mw = self._get_middleware()
        self.assertIsNone(mw.process_request(req, self.spider))
        mw.process_response(req, res, self.spider)
        cached = mw.process_request(req, self.spider)
        self.assertIsInstance(cached, Response)
        self.assertEqualResponse(res, cached)
        self.assertIn("cached", cached.flags)

        # ignore s3 scheme
        req, res = Request("s3://bucket/key2"), Response("http://bucket/key2")
        mw = self._get_middleware(HTTPCACHE_IGNORE_SCHEMES=["s3"])
        self.assertIsNone(mw.process_request(req, self.spider))
        mw.process_response(req, res, self.spider)
        self.assertIsNone(mw.storage.retrieve_response(self.spider, req))
        self.assertIsNone(mw.process_request(req, self.spider))

    def test_middleware_ignore_http_codes(self):
        # test response is not cached (its status, 202, is in the ignore list)
        mw = self._get_middleware(HTTPCACHE_IGNORE_HTTP_CODES=[202])
        self.assertIsNone(mw.process_request(self.request, self.spider))
        mw.process_response(self.request, self.response, self.spider)
        self.assertIsNone(mw.storage.retrieve_response(self.spider, self.request))
        self.assertIsNone(mw.process_request(self.request, self.spider))

        # test response is cached
        mw = self._get_middleware(HTTPCACHE_IGNORE_HTTP_CODES=[203])
        mw.process_response(self.request, self.response, self.spider)
        response = mw.process_request(self.request, self.spider)
        self.assertIsInstance(response, HtmlResponse)
        self.assertEqualResponse(self.response, response)
        self.assertIn("cached", response.flags)

    def assertEqualResponse(self, response1, response2):
        """Assert that two responses share url, status, headers and body."""
        self.assertEqual(response1.url, response2.url)
        self.assertEqual(response1.status, response2.status)
        self.assertEqual(response1.headers, response2.headers)
        self.assertEqual(response1.body, response2.body)
コード例 #7
0
class HttpCacheMiddlewareTest(unittest.TestCase):
    """Tests for HttpCacheMiddleware and the cache storage in ``storage_class``.

    Every test runs against a fresh temporary cache directory.
    """

    storage_class = FilesystemCacheStorage

    def setUp(self):
        """Create the spider, temp cache dir and a sample request/response."""
        self.spider = BaseSpider('example.com')
        self.tmpdir = tempfile.mkdtemp()
        self.request = Request('http://www.example.com', headers={'User-Agent': 'test'})
        self.response = Response('http://www.example.com', headers={'Content-Type': 'text/html'}, body='test body', status=202)

    def tearDown(self):
        """Remove the temporary cache dir."""
        shutil.rmtree(self.tmpdir)

    def _get_settings(self, **new_settings):
        """Return a Settings object with the test defaults overridden by ``new_settings``."""
        settings = {
            'HTTPCACHE_DIR': self.tmpdir,
            'HTTPCACHE_EXPIRATION_SECS': 1,
        }
        settings.update(new_settings)
        return Settings(settings)

    def _get_storage(self, **new_settings):
        """Instantiate ``storage_class`` with the (optionally overridden) settings."""
        return self.storage_class(self._get_settings(**new_settings))

    def _get_middleware(self, **new_settings):
        """Instantiate HttpCacheMiddleware with the (optionally overridden) settings."""
        return HttpCacheMiddleware(self._get_settings(**new_settings))

    def test_storage(self):
        storage = self._get_storage()
        request2 = self.request.copy()
        self.assertIsNone(storage.retrieve_response(self.spider, request2))
        storage.store_response(self.spider, self.request, self.response)
        response2 = storage.retrieve_response(self.spider, request2)
        self.assertIsInstance(response2, HtmlResponse)  # inferred from content-type header
        self.assertEqualResponse(self.response, response2)
        time.sleep(2)  # wait for cache to expire (HTTPCACHE_EXPIRATION_SECS is 1)
        self.assertIsNone(storage.retrieve_response(self.spider, request2))

    def test_storage_expire_immediately(self):
        storage = self._get_storage(HTTPCACHE_EXPIRATION_SECS=0)
        self.assertIsNone(storage.retrieve_response(self.spider, self.request))
        storage.store_response(self.spider, self.request, self.response)
        time.sleep(0.1)  # required for win32
        self.assertIsNone(storage.retrieve_response(self.spider, self.request))

    def test_storage_never_expire(self):
        storage = self._get_storage(HTTPCACHE_EXPIRATION_SECS=-1)
        self.assertIsNone(storage.retrieve_response(self.spider, self.request))
        storage.store_response(self.spider, self.request, self.response)
        self.assertTrue(storage.retrieve_response(self.spider, self.request))

    def test_middleware(self):
        mw = self._get_middleware()
        self.assertIsNone(mw.process_request(self.request, self.spider))
        mw.process_response(self.request, self.response, self.spider)
        response = mw.process_request(self.request, self.spider)
        self.assertIsInstance(response, HtmlResponse)
        self.assertEqualResponse(self.response, response)
        self.assertIn('cached', response.flags)

    def test_middleware_ignore_missing(self):
        mw = self._get_middleware(HTTPCACHE_IGNORE_MISSING=True)
        # nothing cached yet, so the request must be refused
        self.assertRaises(IgnoreRequest, mw.process_request, self.request, self.spider)
        mw.process_response(self.request, self.response, self.spider)
        response = mw.process_request(self.request, self.spider)
        self.assertIsInstance(response, HtmlResponse)
        self.assertEqualResponse(self.response, response)
        self.assertIn('cached', response.flags)

    def assertEqualResponse(self, response1, response2):
        """Assert that two responses share url, status, headers and body."""
        self.assertEqual(response1.url, response2.url)
        self.assertEqual(response1.status, response2.status)
        self.assertEqual(response1.headers, response2.headers)
        self.assertEqual(response1.body, response2.body)