def test_basic(self):
    """Stats are logged only while the engine runs, once per 30s period."""
    # engine is stopped -> nothing gets logged
    self.clock.advance(60)
    self.assertEqual(self.lw.get_first_line(), '')
    # start the engine; the first log line appears only after a full period
    self.engine.signals.send(signals.engine_started)
    self.clock.advance(29)
    self.assertEqual(self.lw.get_first_line(), '')
    self.clock.advance(1)
    self.assertEqual(self.lw.get_first_line(),
                     '[crawlmi] INFO: Crawled 0 pages (at 0 pages/min).')
    # download some responses; 2 downloaded in 30s -> 4 pages/min
    self.engine.signals.send(signals.response_downloaded, response=Response(url=''))
    self.engine.signals.send(signals.response_downloaded, response=Response(url=''))
    self.engine.signals.send(signals.response_received, response=Response(url=''))
    self.clock.advance(30)
    self.assertEqual(self.lw.get_first_line(),
                     '[crawlmi] INFO: Crawled 2 pages (at 4 pages/min).')
    # stop the engine -> logging stops again
    self.engine.signals.send(signals.engine_stopped)
    self.clock.advance(60)
    self.assertEqual(self.lw.get_first_line(), '')
def test_download(self):
    """Engine.download() routes requests through the pipeline.

    A pipeline returning None drops the request; returning a Response
    short-circuits the download and enqueues the response directly.
    """
    self.engine.start()
    del self.sp.received[:]
    req = Request('http://github.com/')
    # plain download: the request lands in the request queue
    self.engine.download(req)
    self.clock.advance(0)
    self.check_signals([signals.request_received])
    self.assertEqual(len(self.engine.request_queue), 1)
    # pipeline None
    self.pipeline.req = lambda req: None
    self.engine.download(req)
    self.clock.advance(0)
    self.assertEqual(len(self.engine.request_queue), 1)
    # pipeline response
    self.pipeline.req = lambda req: Response('')
    self.engine.download(req)
    self.clock.advance(0)
    self.assertEqual(len(self.engine.response_queue), 1)
    # download and stop
    self.pipeline.req = lambda req: Response('')
    d = self.engine.download(req)
    self.engine.stop('finished')
    self.clock.advance(0)
    return d
def test_request_cacheability(self):
    """Request-side Cache-Control: no-store forbids storing, no-cache
    forbids serving from cache but allows storing the fresh response."""
    res0 = Response(self.request.url, status=200,
                    headers={'Expires': self.tomorrow})
    req0 = Request('http://example.com')
    req1 = req0.replace(headers={'Cache-Control': 'no-store'})
    req2 = req0.replace(headers={'Cache-Control': 'no-cache'})
    with self._middleware() as mw:
        # response for a request with no-store must not be cached
        res1 = self._process_requestresponse(mw, req1, res0)
        self.assertEqualResponse(res1, res0)
        self.assertIsNone(mw.storage.retrieve_response(req1))
        # re-do request without no-store and expect it to be cached
        res2 = self._process_requestresponse(mw, req0, res0)
        self.assertNotIn('cached', res2.flags)
        res3 = mw.process_request(req0)
        self.assertIn('cached', res3.flags)
        self.assertEqualResponse(res2, res3)
        # request with no-cache directive must not return cached response,
        # but it allows the new response to be stored
        res0b = res0.replace(body='foo')
        res4 = self._process_requestresponse(mw, req2, res0b)
        self.assertEqualResponse(res4, res0b)
        self.assertNotIn('cached', res4.flags)
        res5 = self._process_requestresponse(mw, req0, None)
        self.assertEqualResponse(res5, res0b)
        self.assertIn('cached', res5.flags)
def test_fail(self):
    """A failing request must not disturb the other in-flight requests."""
    received = []

    def downloaded(result):
        received.append(result)

    # enqueue 3 requests
    reqs = []
    for name in ('1', '2', '3'):
        req, dfd = get_request(name, func=downloaded)
        self.slot.enqueue(req, dfd)
        reqs.append(req)
    r1, r2, r3 = reqs
    # fail the first request
    err = ValueError('my bad')
    self.handler.fail(r1, err)
    self.assertEqual(received[-1].value, err)
    # other requests should be ok
    self.assertEqual(len(self.slot.in_progress), 2)
    self.assertEqual(len(self.slot.transferring), 2)
    self.handler.call(r2, Response(''))
    self.assertEqual(received[-1].request, r2)
    self.handler.call(r3, Response(''))
    self.assertEqual(received[-1].request, r3)
    self.assertEqual(len(self.slot.in_progress), 0)
    self.assertEqual(len(self.slot.transferring), 0)
def test_repr(self):
    """repr() shows url, status, known reason phrase and any flags."""
    resp_200 = Response('a', status=200)
    self.assertEqual(repr(resp_200), '<Response a [200 (OK)]>')
    # flags are appended after the closing bracket
    resp_301 = Response('a', status=301, flags=['cached'])
    self.assertEqual(
        repr(resp_301),
        "<Response a [301 (Moved Permanently)]> ['cached']")
    # unknown status code -> no reason phrase
    resp_999 = Response('a', status=999)
    self.assertEqual(repr(resp_999), '<Response a [999]>')
def test_no_limit(self):
    """A queue constructed with limit 0 never requests a backout."""
    queue = ResponseQueue(0)
    first = Response('', body='x' * 50)
    second = Response('', body='y' * 50)
    self.assertFalse(queue.needs_backout())
    queue.push(first)
    queue.push(second)
    self.assertFalse(queue.needs_backout())
def test_response_status(self):
    """FILTER_RESPONSE_STATUS predicate decides which statuses to reject."""
    mw = Filter(
        self._get_engine(FILTER_RESPONSE_STATUS=lambda x: x != 201))
    req = Request('http://github.com/')
    # status 201 passes through untouched
    accepted = Response('', request=req, status=201)
    self.assertIs(mw.process_response(accepted), accepted)
    # status 200 is rejected by the predicate
    rejected = Response('', request=req, status=200)
    self.assertRaises(FilterError, mw.process_response, rejected)
def test_limit(self):
    """Backout is requested once queued body sizes reach the limit."""
    queue = ResponseQueue(10)
    first = Response('', body='x' * 5)
    second = Response('', body='y' * 5)
    self.assertFalse(queue.needs_backout())
    queue.push(first)
    self.assertFalse(queue.needs_backout())
    queue.push(second)
    self.assertTrue(queue.needs_backout())
    # popping frees capacity again
    queue.pop()
    self.assertFalse(queue.needs_backout())
def test_copy(self):
    """copy() duplicates all response attributes; the request is shared."""
    req = Request('http://gh.com/')
    original = Response(url='http://hey.com/', status=201,
                        headers={'a': 'b'}, body='hey',
                        request=req, flags=['cached'])
    clone = original.copy()
    self.assertEqual(original.url, clone.url)
    self.assertEqual(original.status, clone.status)
    self.assertEqual(original.body, clone.body)
    # the request object itself is shared, not copied
    self.assertIs(original.request, clone.request)
    self.assertIsInstance(clone.headers, Headers)
    self.assertDictEqual(original.headers, clone.headers)
    self.assertListEqual(original.flags, clone.flags)
def test_filter_non_200(self):
    """FILTER_NON_200_RESPONSE_STATUS toggles rejection of non-200s."""
    mw = Filter(self._get_engine(FILTER_NON_200_RESPONSE_STATUS=True))
    req = Request('http://github.com/')
    ok = Response('', request=req, status=200)
    self.assertIs(mw.process_response(ok), ok)
    not_found = Response('', request=req, status=404)
    self.assertRaises(FilterError, mw.process_response, not_found)
    # with the setting disabled, the 404 passes through untouched
    mw = Filter(self._get_engine(FILTER_NON_200_RESPONSE_STATUS=False))
    self.assertIs(mw.process_response(not_found), not_found)
def test_max_redirect_times(self):
    """Redirects beyond max_redirect_times are dropped (None returned)."""
    self.mw.max_redirect_times = 1
    req = Request('http://crawlmitest.org/302')
    resp = Response('http://crawlmitest.org/302',
                    headers={'Location': '/redirected'},
                    status=302, request=req)
    # first redirect is followed
    redirected = self.mw.process_response(resp)
    self.assertIsInstance(redirected, Request)
    self.assertListEqual(redirected.history, ['http://crawlmitest.org/302'])
    # second redirect exceeds the limit and is discarded
    resp2 = Response('http://crawlmitest.org/302',
                     headers={'Location': '/redirected'},
                     status=302, request=redirected)
    self.assertIsNone(self.mw.process_response(resp2))
def test_redirect_302(self):
    """A 302 turns POST into GET and strips the body and its headers."""
    url = 'http://www.example.com/302'
    url2 = 'http://www.example.com/redirected2'
    req = Request(url, method='POST', body='test', headers={
        'Content-Type': 'text/plain',
        'Content-length': '4',
    })
    resp = Response(url, headers={'Location': url2}, status=302, request=req)
    req2 = self.mw.process_response(resp)
    self.assertIsInstance(req2, Request)
    self.assertEqual(req2.url, url2)
    self.assertEqual(req2.method, 'GET')
    self.assertNotIn(
        'Content-Type', req2.headers,
        'Content-Type header must not be present in redirected request')
    self.assertNotIn(
        'Content-Length', req2.headers,
        'Content-Length header must not be present in redirected request')
    self.assertEqual(req2.body, '',
                     'Redirected body must be empty, not `%s`' % req2.body)
    # response without Location header but with status code is 3XX should be ignored
    del resp.headers['Location']
    self.assertIs(self.mw.process_response(resp), resp)
def test_request(self):
    """Response proxies meta/history/original_url to its request and
    raises AttributeError for them when no request is attached."""
    req = Request(url='http://github.com', meta={'a': 'b'})
    req.history = ['a', 'b']
    resp = Response(url='', request=req)
    self.assertIs(resp.request, req)
    self.assertIs(resp.meta, req.meta)
    self.assertIs(resp.history, req.history)
    self.assertIs(resp.original_url, req.original_url)
    # without a request the proxied attributes must raise
    resp = Response(url='')
    from crawlmi.http.response.response import _no_request_error
    self.assertRaisesRegexp(AttributeError, _no_request_error,
                            lambda: resp.meta)
    self.assertRaisesRegexp(AttributeError, _no_request_error,
                            lambda: resp.history)
    self.assertRaisesRegexp(AttributeError, _no_request_error,
                            lambda: resp.original_url)
def test_req_or_resp(self):
    """settings.get(req_or_resp=...) reads values from the request meta,
    whether given the request itself or a response carrying it."""
    req = Request('http://github.com/', meta={'a': 10, 'x': 'y'})
    self.assertEqual(self.settings.get('a', req_or_resp=req), 10)
    self.assertEqual(self.settings.get('x', req_or_resp=req), 'y')
    resp = Response('', request=req)
    self.assertEqual(self.settings.get('a', req_or_resp=resp), 10)
    self.assertEqual(self.settings.get('x', req_or_resp=resp), 'y')
def setUp(self):
    """Create a DownloaderStats middleware plus a sample request/response."""
    eng = get_engine()
    self.stats = eng.stats
    self.mw = DownloaderStats(eng)
    self.req = Request('http://github.com')
    self.resp = Response('scrapytest.org', status=400, request=self.req)
def test_404(self):
    """A 404 is not retried; the response passes through unchanged."""
    req = Request('http://www.scrapytest.org/404')
    resp = Response('http://www.scrapytest.org/404', body='',
                    status=404, request=req)
    self.assertIs(self.mw.process_response(resp), resp)
def test_fail(self):
    """Failures propagate per-request while other requests still complete.

    Uses 3 requests ('a', 'a', 'b') under CONCURRENT_REQUESTS_PER_DOMAIN=2
    so all of them can be in flight at once.
    """
    self._update_dwn(CONCURRENT_REQUESTS=3, CONCURRENT_REQUESTS_PER_DOMAIN=2)
    # `rid` instead of `id` to avoid shadowing the builtin
    requests = [get_request(rid)[0] for rid in 'aab']
    # enqueue with an explicit loop: the original `map(lambda r: ...)` was
    # used only for its side effect, and on Python 3 `map` is lazy, so the
    # pushes would silently never happen
    for req in requests:
        self.request_queue.push(req)
    self.clock.advance(0)
    # fail 1st request
    err = ValueError('my bad')
    self.handler.fail(requests[0], err)
    self.assertEqual(self.dwn.free_slots, 1)
    fail = self.response_queue.pop()
    self.assertIs(fail.request, requests[0])
    self.assertIs(fail.value, err)
    # fail 3rd request
    self.handler.fail(requests[2], err)
    fail = self.response_queue.pop()
    self.assertIs(fail.request, requests[2])
    self.assertIs(fail.value, err)
    # succeed 2nd request
    self.handler.call(requests[1], Response('nice!', request=requests[1]))
    resp = self.response_queue.pop()
    self.assertIs(resp.request, requests[1])
    self.assertEqual(resp.url, 'nice!')
def setUp(self):
    """Common fixtures: a request/response pair and a Failure bound to it."""
    self.mws = []
    self.actions = []
    self.req = Request('http://gh.com/')
    self.resp = Response('http://gh.com/', request=self.req)
    # failure carries the originating request, mirroring downloader output
    self.fail = Failure(Exception())
    self.fail.request = self.req
def test_priority_adjust(self):
    """A redirect request must get a higher priority than the original."""
    req = Request('http://a.com')
    resp = Response('http://a.com',
                    headers={'Location': 'http://a.com/redirected'},
                    status=301, request=req)
    req2 = self.mw.process_response(resp)
    # assertGreater instead of a bare `assert`: bare asserts are stripped
    # under `python -O` and give no diagnostic on failure
    self.assertGreater(req2.priority, req.priority)
def test_priority_adjust(self):
    """A retried request must get a lower priority than the original."""
    req = Request('http://www.scrapytest.org/503')
    rsp = Response('http://www.scrapytest.org/503', body='', status=503,
                   request=req)
    req2 = self.mw.process_response(rsp)
    # assertLess instead of assertTrue(a < b): reports both operands on
    # failure instead of just "False is not true"
    self.assertLess(req2.priority, req.priority)
def test_properties(self):
    """Response.body is readable but not assignable."""
    resp = Response('', body='hey')

    def assign_body():
        resp.body = ''

    self.assertEqual(resp.body, 'hey')
    self.assertRaises(AttributeError, assign_body)
def test_cookiejar_key(self):
    """meta['cookiejar'] selects an isolated cookie jar per key; jars are
    carried across requests via meta, ports are ignored for cookie scope,
    and non-http schemes skip cookie handling entirely.

    Uses `assertEqual` throughout instead of the deprecated `assertEquals`
    alias, consistent with the rest of the file.
    """
    # jar 'store1'
    req = Request('http://test.org/', cookies={'galleta': 'salada'},
                  meta={'cookiejar': 'store1'})
    self.assertIs(self.mw.process_request(req), req)
    self.assertEqual(req.headers.get('Cookie'), 'galleta=salada')
    headers = {'Set-Cookie': 'C1=value1; path=/'}
    res = Response('http://test.org/', headers=headers, request=req)
    self.assertIs(self.mw.process_response(res), res)
    req2 = Request('http://test.org/', meta=res.meta)
    self.assertIs(self.mw.process_request(req2), req2)
    self.assertEqual(req2.headers.get('Cookie'),
                     'C1=value1; galleta=salada')
    # jar 'store2' is independent of 'store1'
    req3 = Request('http://test.org/', cookies={'galleta': 'dulce'},
                   meta={'cookiejar': 'store2'})
    self.assertIs(self.mw.process_request(req3), req3)
    self.assertEqual(req3.headers.get('Cookie'), 'galleta=dulce')
    headers = {'Set-Cookie': 'C2=value2; path=/'}
    res2 = Response('http://test.org/', headers=headers, request=req3)
    self.assertIs(self.mw.process_response(res2), res2)
    req4 = Request('http://test.org/', meta=res2.meta)
    self.assertIs(self.mw.process_request(req4), req4)
    self.assertEqual(req4.headers.get('Cookie'),
                     'C2=value2; galleta=dulce')
    # cookies from hosts with port
    req5_1 = Request('http://test.org:1104/')
    self.assertIs(self.mw.process_request(req5_1), req5_1)
    headers = {'Set-Cookie': 'C1=value1; path=/'}
    res5_1 = Response('http://test.org:1104/', headers=headers,
                      request=req5_1)
    self.assertIs(self.mw.process_response(res5_1), res5_1)
    req5_2 = Request('http://test.org:1104/some-redirected-path')
    self.assertIs(self.mw.process_request(req5_2), req5_2)
    self.assertEqual(req5_2.headers.get('Cookie'), 'C1=value1')
    req5_3 = Request('http://test.org/some-redirected-path')
    self.assertIs(self.mw.process_request(req5_3), req5_3)
    self.assertEqual(req5_3.headers.get('Cookie'), 'C1=value1')
    # skip cookie retrieval for not http request
    req6 = Request('file:///crawlmi/sometempfile')
    self.assertIs(self.mw.process_request(req6), req6)
    self.assertEqual(req6.headers.get('Cookie'), None)
def test_clear_slots(self):
    """Finished download slots are pruned to a bounded count.

    After completing 30 requests one by one, the downloader must not keep
    more than 2 * total_concurrency slots around.
    """
    # `range` instead of `xrange`: identical values here and keeps the
    # test Python 3 compatible; `rid` avoids shadowing the builtin `id`
    requests = [get_request(rid)[0] for rid in range(30)]
    for req in requests:
        self.request_queue.push(req)
        self.clock.advance(Downloader.QUEUE_CHECK_FREQUENCY)
        self.handler.call(req, Response(''))
    self.assertLessEqual(len(self.dwn.slots),
                         2 * self.dwn.total_concurrency)
def test_503(self):
    """A 503 is retried twice (retry_times 1 and 2), then given up."""
    req = Request('http://www.scrapytest.org/503')
    rsp = Response('http://www.scrapytest.org/503', body='', status=503,
                   request=req)
    # first retry
    retry1 = self.mw.process_response(rsp)
    self.assertIsInstance(retry1, Request)
    self.assertEqual(retry1.meta['retry_times'], 1)
    # second retry
    rsp.request = retry1
    retry2 = self.mw.process_response(rsp)
    self.assertIsInstance(retry2, Request)
    self.assertEqual(retry2.meta['retry_times'], 2)
    # retries exhausted -> the response is passed through
    rsp.request = retry2
    self.assertIs(self.mw.process_response(rsp), rsp)
def test_middleware_ignore_schemes(self):
    """HTTP_CACHE_IGNORE_SCHEMES controls which url schemes get cached."""
    # http responses are cached by default
    req = Request('http://test.com/')
    res = Response('http://test.com/', request=req)
    with self._middleware() as mw:
        self.assertIs(mw.process_request(req), req)
        mw.process_response(res)
        cached = mw.process_request(req)
        self.assertIsInstance(cached, Response, type(cached))
        self.assertEqualResponse(res, cached)
        self.assertIn('cached', cached.flags)
    # file response is not cached by default
    req = Request('file:///tmp/t.txt')
    res = Response('file:///tmp/t.txt', request=req)
    with self._middleware() as mw:
        self.assertIs(mw.process_request(req), req)
        mw.process_response(res)
        self.assertIsNone(mw.storage.retrieve_response(req))
        self.assertIs(mw.process_request(req), req)
    # s3 scheme response is cached by default
    req = Request('s3://bucket/key')
    res = Response('http://bucket/key', request=req)
    with self._middleware() as mw:
        self.assertIs(mw.process_request(req), req)
        mw.process_response(res)
        cached = mw.process_request(req)
        self.assertIsInstance(cached, Response, type(cached))
        self.assertEqualResponse(res, cached)
        self.assertIn('cached', cached.flags)
    # ignore s3 scheme
    req = Request('s3://bucket/key2')
    res = Response('http://bucket/key2', request=req)
    with self._middleware(HTTP_CACHE_IGNORE_SCHEMES=['s3']) as mw:
        self.assertIs(mw.process_request(req), req)
        mw.process_response(res)
        self.assertIsNone(mw.storage.retrieve_response(req))
        self.assertIs(mw.process_request(req), req)
def test_response_cacheability(self):
    """Table-driven check of which (status, headers) combinations the
    cache policy stores; each row says whether caching is expected."""
    responses = [
        # 304 is not cacheable no matter what servers sends
        (False, 304, {}),
        (False, 304, {'Last-Modified': self.yesterday}),
        (False, 304, {'Expires': self.tomorrow}),
        (False, 304, {'Etag': 'bar'}),
        (False, 304, {'Cache-Control': 'max-age=3600'}),
        # Always obey no-store cache control
        (False, 200, {'Cache-Control': 'no-store'}),
        (False, 200, {'Cache-Control': 'no-store, max-age=300'}),  # invalid
        (False, 200, {'Cache-Control': 'no-store',
                      'Expires': self.tomorrow}),  # invalid
        # Ignore responses missing expiration and/or validation headers
        (False, 200, {}),
        (False, 302, {}),
        (False, 307, {}),
        (False, 404, {}),
        # Cache responses with expiration and/or validation headers
        (True, 200, {'Last-Modified': self.yesterday}),
        (True, 203, {'Last-Modified': self.yesterday}),
        (True, 300, {'Last-Modified': self.yesterday}),
        (True, 301, {'Last-Modified': self.yesterday}),
        (True, 401, {'Last-Modified': self.yesterday}),
        (True, 404, {'Cache-Control': 'public, max-age=600'}),
        (True, 302, {'Expires': self.tomorrow}),
        (True, 200, {'Etag': 'foo'}),
    ]
    with self._middleware() as mw:
        for idx, (shouldcache, status, headers) in enumerate(responses):
            req0 = Request('http://example-%d.com' % idx)
            res0 = Response(req0.url, status=status, headers=headers)
            res1 = self._process_requestresponse(mw, req0, res0)
            # when cached, a 304 revalidation must serve the stored copy
            res304 = res0.replace(status=304)
            res2 = self._process_requestresponse(
                mw, req0, res304 if shouldcache else res0)
            self.assertEqualResponse(res1, res0)
            self.assertEqualResponse(res2, res0)
            resc = mw.storage.retrieve_response(req0)
            if shouldcache:
                self.assertEqualResponse(resc, res1)
                self.assertTrue('cached' in res2.flags and res2.status != 304)
            else:
                self.assertFalse(resc)
                self.assertNotIn('cached', res2.flags)
def test_different_request_response_urls(self):
    """The cache is keyed by the request, even when the response url
    differs from the request url."""
    with self._middleware() as mw:
        req = Request('http://host.com/path')
        res = Response('http://host2.net/test.html', request=req)
        self.assertIs(mw.process_request(req), req)
        mw.process_response(res)
        cached = mw.process_request(req)
        self.assertIsInstance(cached, Response)
        self.assertEqualResponse(res, cached)
        self.assertIn('cached', cached.flags)
def _getresponse(self, coding):
    """Build a Response whose body is the stored sample for `coding`.

    Raises ValueError for codings with no entry in FORMAT.
    """
    if coding not in FORMAT:
        raise ValueError()
    sample_file, content_encoding = FORMAT[coding]
    with open(join(SAMPLE_DIR, sample_file), "rb") as sample:
        body = sample.read()
    response = Response(
        "http://github.com/",
        body=body,
        headers={
            "Server": "Yaws/1.49 Yet Another Web Server",
            "Date": "Sun, 08 Mar 2009 00:41:03 GMT",
            "Content-Length": len(body),
            "Content-Type": "text/html",
            "Content-Encoding": content_encoding,
        })
    # the originating request advertises the encodings under test
    response.request = Request(
        "http://github.com/",
        headers={"Accept-Encoding": "gzip,deflate"})
    return response
def test_redirect_urls(self):
    """Each followed redirect appends the previous url to the history."""
    req1 = Request('http://crawlmitest.org/first')
    resp1 = Response('http://crawlmitest.org/first',
                     headers={'Location': '/redirected'},
                     status=302, request=req1)
    req2 = self.mw.process_response(resp1)
    resp2 = Response('http://crawlmitest.org/redirected',
                     headers={'Location': '/redirected2'},
                     status=302, request=req2)
    req3 = self.mw.process_response(resp2)
    self.assertEqual(req2.url, 'http://crawlmitest.org/redirected')
    self.assertListEqual(req2.history, ['http://crawlmitest.org/first'])
    self.assertEqual(req3.url, 'http://crawlmitest.org/redirected2')
    self.assertListEqual(req3.history, [
        'http://crawlmitest.org/first',
        'http://crawlmitest.org/redirected',
    ])
def test_header(self):
    """A `Link: ...; rel="canonical"` header fills meta['canonical_url'];
    relative targets are resolved against the response url."""
    # absolute url
    req = Request('http://a.com/pom')
    rsp = Response(
        req.url,
        headers={'Link': '<https://b.sk/hello>; rel="canonical"'},
        request=req)
    rsp2 = self.mw.process_response(rsp)
    self.assertIs(rsp, rsp2)
    self.assertEqual(rsp.meta['canonical_url'], 'https://b.sk/hello')
    # relative url
    req = Request('http://a.com/pom')
    rsp = Response(req.url,
                   headers={'Link': '</hello/world>; rel="canonical"'},
                   request=req)
    rsp2 = self.mw.process_response(rsp)
    self.assertIs(rsp, rsp2)
    self.assertEqual(rsp.meta['canonical_url'], 'http://a.com/hello/world')
def test_stop_engine(self):
    """StopEngine raised from a callback halts processing mid-queue."""
    def _stop_engine(response):
        raise StopEngine()

    def _engine_stopped():
        # the second response must still be queued when the engine stops
        self.assertEqual(len(self.engine.response_queue), 1)

    req1 = Request('http://github.com/', callback=_stop_engine)
    self.engine.response_queue.push(Response('', request=req1))
    req2 = Request('http://github.com/')
    self.engine.response_queue.push(Response('', request=req2))
    self.engine.signals.connect(_engine_stopped,
                                signal=signals.engine_stopped)
    self.engine.start()
    self.assertTrue(self.engine.running)
    self.clock.pump([self.engine.QUEUE_CHECK_FREQUENCY, 0, 0, 0])
    self.assertFalse(self.engine.running)
def _getresponse(self, coding):
    """Build a Response whose body is the stored sample for `coding`.

    Raises ValueError for codings with no entry in FORMAT.
    """
    if coding not in FORMAT:
        raise ValueError()
    sample_file, content_encoding = FORMAT[coding]
    with open(join(SAMPLE_DIR, sample_file), 'rb') as sample:
        body = sample.read()
    response = Response(
        'http://github.com/',
        body=body,
        headers={
            'Server': 'Yaws/1.49 Yet Another Web Server',
            'Date': 'Sun, 08 Mar 2009 00:41:03 GMT',
            'Content-Length': len(body),
            'Content-Type': 'text/html',
            'Content-Encoding': content_encoding,
        })
    # the originating request advertises the encodings under test
    response.request = Request(
        'http://github.com/',
        headers={'Accept-Encoding': 'gzip,deflate'})
    return response
def test_cached_and_stale(self):
    """Stale cached entries are refreshed from the network; when the
    stored response carries validators, a 304 serves the cached copy."""
    sample_data = [
        (200, {'Date': self.today, 'Expires': self.yesterday}),
        (200, {'Date': self.today, 'Expires': self.yesterday,
               'Last-Modified': self.yesterday}),
        (200, {'Expires': self.yesterday}),
        (200, {'Expires': self.yesterday, 'ETag': 'foo'}),
        (200, {'Expires': self.yesterday, 'Last-Modified': self.yesterday}),
        (200, {'Expires': self.tomorrow, 'Age': '86405'}),
        (200, {'Cache-Control': 'max-age=86400', 'Age': '86405'}),
        # no-cache forces expiration, also revalidation if validators exists
        (200, {'Cache-Control': 'no-cache'}),
        (200, {'Cache-Control': 'no-cache', 'ETag': 'foo'}),
        (200, {'Cache-Control': 'no-cache',
               'Last-Modified': self.yesterday}),
    ]
    with self._middleware() as mw:
        for idx, (status, headers) in enumerate(sample_data):
            req0 = Request('http://example-%d.com' % idx)
            res0a = Response(req0.url, status=status, headers=headers)
            # cache expired response
            res1 = self._process_requestresponse(mw, req0, res0a)
            self.assertEqualResponse(res1, res0a)
            self.assertNotIn('cached', res1.flags)
            # same request, but as the cached response is stale a new
            # response must be returned
            res0b = res0a.replace(body='bar')
            res2 = self._process_requestresponse(mw, req0, res0b)
            self.assertEqualResponse(res2, res0b)
            self.assertNotIn('cached', res2.flags)
            # previous response expired too; a subsequent request to the
            # same resource must revalidate and succeed on 304 if
            # validators are present
            if 'ETag' in headers or 'Last-Modified' in headers:
                res0c = res0b.replace(status=304)
                res3 = self._process_requestresponse(mw, req0, res0c)
                self.assertEqualResponse(res3, res0b)
                self.assertIn('cached', res3.flags)