def _middleware(self, **new_settings):
    """Yield an ``HttpCacheMiddleware`` that is opened for ``self.spider``.

    The middleware is built from ``self._get_settings(**new_settings)`` and
    ``self.crawler.stats``. ``spider_opened`` is called before the middleware
    is yielded, and ``spider_closed`` is always called afterwards, even when
    the consumer of the generator raises.

    NOTE(review): this is a bare generator; presumably it is wrapped by a
    ``contextlib.contextmanager`` decorator outside this view -- confirm.
    """
    middleware = HttpCacheMiddleware(
        self._get_settings(**new_settings),
        self.crawler.stats,
    )
    middleware.spider_opened(self.spider)
    try:
        yield middleware
    finally:
        # Runs on normal exit and on errors alike, so the middleware is
        # never left open.
        middleware.spider_closed(self.spider)
def test_magic_response_caching(tmpdir):
    """Check that a cached Splash response still gets "magic" processing.

    The same ``SplashRequest`` is pushed through the cookie, Splash and HTTP
    cache middlewares twice. On the first pass the cache is empty and a
    hand-built JSON response is fed back through the chain. On the second
    pass the cache middleware must return the stored response, and the final
    response must expose the same html, data and headers as the first one.
    """
    # prepare middlewares
    spider = scrapy.Spider(name='foo')
    cache_settings = {
        'HTTPCACHE_DIR': str(tmpdir.join('cache')),
        'HTTPCACHE_STORAGE': 'scrapy_splash.SplashAwareFSCacheStorage',
        'HTTPCACHE_ENABLED': True
    }
    cache_mw = HttpCacheMiddleware.from_crawler(_get_crawler(cache_settings))
    splash_mw = _get_mw()
    cookie_mw = _get_cookie_mw()

    def _splash_ready_request():
        # Build a fresh SplashRequest and run it through the cookie and
        # Splash middlewares (everything that precedes the cache).
        request = SplashRequest(
            url="http://example.com",
            endpoint='execute',
            magic_response=True,
            args={'lua_source': 'function main(splash) end'},
        )
        request = cookie_mw.process_request(request, spider) or request
        return splash_mw.process_request(request, spider)

    def _run_response_chain(request, response):
        # Responses travel in the reverse order: cache, Splash, cookies.
        response = cache_mw.process_response(request, response, spider)
        response = splash_mw.process_response(request, response, spider)
        return cookie_mw.process_response(request, response, spider)

    # Emulate Scrapy middleware chain.

    # first call
    first_req = _splash_ready_request()
    first_req = cache_mw.process_request(first_req, spider) or first_req
    # first call; the cache is empty
    assert isinstance(first_req, scrapy.Request)

    resp_data = {
        'html': "<html><body>Hello</body></html>",
        'render_time': 0.5,
    }
    resp_body = json.dumps(resp_data).encode('utf8')
    raw_resp = TextResponse(
        "http://example.com",
        headers={b'Content-Type': b'application/json'},
        body=resp_body,
    )

    fresh = _run_response_chain(first_req, raw_resp)
    assert fresh.text == "<html><body>Hello</body></html>"
    assert fresh.css("body").extract_first() == "<body>Hello</body>"
    assert fresh.data['render_time'] == 0.5

    # second call
    second_req = _splash_ready_request()
    cached_resp = cache_mw.process_request(second_req, spider) or second_req

    # response should be from cache:
    assert cached_resp.__class__ is TextResponse
    assert cached_resp.body == resp_body

    replayed = _run_response_chain(second_req, cached_resp)
    assert isinstance(replayed, scrapy_splash.SplashJsonResponse)
    assert replayed.body == b"<html><body>Hello</body></html>"
    assert replayed.text == "<html><body>Hello</body></html>"
    assert replayed.css("body").extract_first() == "<body>Hello</body>"
    assert replayed.data['render_time'] == 0.5
    assert replayed.headers[b'Content-Type'] == b'text/html; charset=utf-8'