def test_request_cacheability(self):
    """Cache middleware honours request Cache-Control directives:
    no-store prevents storing, no-cache bypasses the cached copy but
    still allows the fresh response to be stored."""
    # Cacheable response: Expires set in the future.
    res0 = Response(self.request.url, status=200,
                    headers={'Expires': self.tomorrow})
    req0 = Request('http://example.com')
    req1 = req0.replace(headers={'Cache-Control': 'no-store'})
    req2 = req0.replace(headers={'Cache-Control': 'no-cache'})
    with self._middleware() as mw:
        # response for a request with no-store must not be cached
        res1 = self._process_requestresponse(mw, req1, res0)
        self.assertEqualResponse(res1, res0)
        self.assertIsNone(mw.storage.retrieve_response(req1))
        # Re-do request without no-store and expect it to be cached
        res2 = self._process_requestresponse(mw, req0, res0)
        self.assertNotIn('cached', res2.flags)
        res3 = mw.process_request(req0)
        self.assertIn('cached', res3.flags)
        self.assertEqualResponse(res2, res3)
        # request with no-cache directive must not return cached response
        # but it allows new response to be stored
        res0b = res0.replace(body='foo')
        res4 = self._process_requestresponse(mw, req2, res0b)
        self.assertEqualResponse(res4, res0b)
        self.assertNotIn('cached', res4.flags)
        # A later plain request now gets the freshly stored res0b.
        res5 = self._process_requestresponse(mw, req0, None)
        self.assertEqualResponse(res5, res0b)
        self.assertIn('cached', res5.flags)
def test_complex_cookies(self):
    """Cookies scoped by path/domain are only attached to matching requests."""
    # merge some cookies into jar
    cookies = [{'name': 'C1', 'value': 'value1', 'path': '/foo', 'domain': 'test.org'},
               {'name': 'C2', 'value': 'value2', 'path': '/bar', 'domain': 'test.org'},
               {'name': 'C3', 'value': 'value3', 'path': '/foo', 'domain': 'test.org'},
               {'name': 'C4', 'value': 'value4', 'path': '/foo', 'domain': 't.org'}]
    req = Request('http://test.org/', cookies=cookies)
    self.mw.process_request(req)
    # embed C1 and C3 for test.org/foo (header ordering is unspecified)
    req = Request('http://test.org/foo')
    self.mw.process_request(req)
    self.assertIn(req.headers.get('Cookie'),
                  ('C1=value1; C3=value3', 'C3=value3; C1=value1'))
    # embed C2 for test.org/bar
    req = Request('http://test.org/bar')
    self.mw.process_request(req)
    # assertEquals is a deprecated alias of assertEqual; use the canonical name
    self.assertEqual(req.headers.get('Cookie'), 'C2=value2')
    # embed nothing for test.org/baz
    req = Request('http://test.org/baz')
    self.mw.process_request(req)
    self.assertNotIn('Cookie', req.headers)
def test_copy(self):
    """Request.copy() clones every attribute; mutable fields (headers, meta,
    history) are copied, not shared."""
    def somecallback():
        pass
    r1 = Request('http://www.example.com',
                 callback=somecallback, errback=somecallback,
                 method='post', headers={'hello': 'world'},
                 params={'a': 'b'}, body='blablabla', meta={'c': 'd'},
                 proxy='123', priority=10, history=[1, 2, 3],
                 encoding='latin1')
    r2 = r1.copy()
    self.assertIs(r1.callback, somecallback)
    self.assertIs(r1.errback, somecallback)
    self.assertIs(r2.callback, r1.callback)
    # BUG FIX: original compared r2.errback with itself, which is always
    # true; the copy's errback must be checked against the original's.
    self.assertIs(r2.errback, r1.errback)
    self.assertEqual(r1.url, r2.url)
    self.assertEqual(r1.method, r2.method)
    # mutable fields must be distinct objects with equal content
    self.assertIsNot(r1.headers, r2.headers)
    self.assertDictEqual(r1.headers, r2.headers)
    self.assertIsNot(r1.meta, r2.meta)
    self.assertDictEqual(r1.meta, r2.meta)
    self.assertIsNot(r1.history, r2.history)
    self.assertListEqual(r1.history, r2.history)
    self.assertEqual(r1.body, r2.body)
    self.assertEqual(r1.proxy, r2.proxy)
    self.assertEqual(r1.priority, r2.priority)
    self.assertEqual(r1.encoding, r2.encoding)
def test_prepare_method(self):
    """HTTP method strings are normalized to uppercase native str."""
    r = Request(url=gh_url)
    self.assertEqual(r.method, 'GET')
    for raw, normalized in [('gEt', 'GET'),
                            ('post', 'POST'),
                            ('f.adsf/dsaf,', 'F.ADSF/DSAF,')]:
        self.assertEqual(r._prepare_method(raw), normalized)
    # unicode input must come back as a plain str
    self.assertIsInstance(r._prepare_method(u'get'), str)
def test_get_handler(self):
    """The download handler is chosen by URL scheme; unknown schemes raise."""
    file_handler = self.handler._get_handler(Request('file:///etc/fstab'))
    self.assertIsInstance(file_handler, FileDownloadHandler)
    http_handler = self.handler._get_handler(Request('http://www.github.com/'))
    self.assertIsInstance(http_handler, HttpDownloadHandler)
    # https has no registered handler here
    self.assertRaises(NotSupported, self.handler._get_handler,
                      Request('https://www.githib.com/'))
def test_bad_scheme(self):
    """Requests whose scheme is listed in FILTER_SCHEMES are rejected."""
    mw = Filter(self._get_engine(FILTER_SCHEMES=['mailto']))
    allowed = Request('http://a.b/')
    # an accepted request passes through unchanged (same object)
    self.assertIs(allowed, mw.process_request(allowed))
    rejected = Request('mailto:[email protected]?subject=News')
    self.assertRaises(FilterError, mw.process_request, rejected)
def _process_site_url(self, url):
    """Build a Request for the first rule whose regex matches `url`.

    The request is passed through process_site_request(); if that keeps it,
    its callback/errback are wrapped so the site-level handlers run first.
    Returns None when no rule matches.
    """
    # NOTE(review): source formatting was collapsed; the placement of
    # `return req` (first-match-wins vs. fall-through on a dropped request)
    # is reconstructed here — confirm against upstream history.
    for rule, cb in self._cbs:
        if rule.search(url):
            req = Request(url, callback=cb)
            req = self.process_site_request(req)
            if req:
                # wrap so the original callbacks are invoked via the
                # site-level dispatcher
                req.callback = partial(self._site_request_callback, req.callback)
                req.errback = partial(self._site_request_errback, req.errback)
            return req
def test_url_length_limit(self):
    """URLs longer than FILTER_URL_LENGTH_LIMIT are rejected."""
    mw = Filter(self._get_engine(FILTER_URL_LENGTH_LIMIT=11))
    within_limit = Request('http://a.b/')
    # an accepted request passes through unchanged (same object)
    self.assertIs(within_limit, mw.process_request(within_limit))
    over_limit = Request('http://a.bc/')
    self.assertRaises(FilterError, mw.process_request, over_limit)
def test_timeout_download_from_spider(self): meta = {'DOWNLOAD_TIMEOUT': 0.2} # client connects but no data is received request = Request(self.getURL('wait'), meta=meta) d = self.download_request(request) yield self.assertFailure(d, defer.TimeoutError, error.TimeoutError) # client connects, server send headers and some body bytes but hangs request = Request(self.getURL('hang-after-headers'), meta=meta) d = self.download_request(request) yield self.assertFailure(d, defer.TimeoutError, error.TimeoutError)
def test_basic(self):
    """A cookie set by a response is sent back on later same-domain requests."""
    headers = {'Set-Cookie': 'C1=value1; path=/'}
    req = Request('http://test.org/')
    self.assertIs(req, self.mw.process_request(req))
    # nothing in the jar yet
    self.assertNotIn('Cookie', req.headers)
    res = Response('http://test.org/', request=req, headers=headers)
    self.assertIs(res, self.mw.process_response(res))
    req2 = Request('http://test.org/sub1/')
    self.assertIs(req2, self.mw.process_request(req2))
    # assertEquals is a deprecated alias of assertEqual; use the canonical name
    self.assertEqual(req2.headers.get('Cookie'), 'C1=value1')
def test_merge_request_cookies(self):
    """Cookies passed on the request are merged into the jar together with
    cookies set by responses."""
    req = Request('http://test.org/', cookies={'galleta': 'salada'})
    self.assertIs(self.mw.process_request(req), req)
    # assertEquals is a deprecated alias of assertEqual; use the canonical name
    self.assertEqual(req.headers.get('Cookie'), 'galleta=salada')
    headers = {'Set-Cookie': 'C1=value1; path=/'}
    res = Response('http://test.org/', request=req, headers=headers)
    self.assertIs(self.mw.process_response(res), res)
    req2 = Request('http://test.org/sub1/')
    self.assertIs(self.mw.process_request(req2), req2)
    # both the request-supplied and the response-set cookie are sent
    self.assertEqual(req2.headers.get('Cookie'), 'C1=value1; galleta=salada')
def test_request(self):
    """Response proxies meta/history/original_url to its request; without a
    request, accessing them raises AttributeError with _no_request_error."""
    req = Request(url='http://github.com', meta={'a': 'b'})
    req.history = ['a', 'b']
    r = Response(url='', request=req)
    self.assertIs(r.request, req)
    # proxied attributes are the very same objects as the request's
    self.assertIs(r.meta, req.meta)
    self.assertIs(r.history, req.history)
    self.assertIs(r.original_url, req.original_url)
    # a response without a request must raise a descriptive error
    r = Response(url='')
    from crawlmi.http.response.response import _no_request_error
    self.assertRaisesRegexp(AttributeError, _no_request_error, lambda: r.meta)
    self.assertRaisesRegexp(AttributeError, _no_request_error, lambda: r.history)
    self.assertRaisesRegexp(AttributeError, _no_request_error, lambda: r.original_url)
def test_replace(self):
    """Request.replace() swaps only the given fields and leaves the original
    request untouched; falsy values ('' and {}) must be honoured, not treated
    as "keep the old value"."""
    r1 = Request('http://www.example.com', method='GET')
    headers = Headers(dict(r1.headers, key='value'))
    r2 = r1.replace(method='POST', body='New body', headers=headers)
    self.assertEqual(r1.url, r2.url)
    self.assertEqual((r1.method, r2.method), ('GET', 'POST'))
    self.assertEqual((r1.body, r2.body), ('', 'New body'))
    self.assertEqual((r1.headers, r2.headers), (Headers(), headers))
    # falsy replacement values take effect
    r3 = Request('http://www.example.com', meta={'a': 1})
    r4 = r3.replace(url='http://www.example.com/2', body='', meta={})
    self.assertEqual(r4.url, 'http://www.example.com/2')
    self.assertEqual(r4.body, '')
    self.assertEqual(r4.meta, {})
def test_process_reqeust_restart(self):
    """Raising RestartPipeline restarts request processing from the first
    middleware with the new request."""
    # NOTE: the method name carries a historical typo ("reqeust"); it is
    # kept unchanged because it is the public test identifier.
    old_request = Request('http://gh.com/')
    new_request = Request('http://new.com/')

    def preq(r):
        if r is not old_request:
            return r
        raise RestartPipeline(new_request)

    pm = self._get_pm(self._build('M1', preq=preq),
                      self._build('M2', preq=True))
    result = pm.process_request(old_request)
    self.assertIs(result, new_request)
    # M1 runs twice (before and after the restart), M2 only once
    self.assertListEqual(self.mws, ['M1', 'M1', 'M2'])
def test_process_request(self):
    """Duplicate URLs are dropped until the filter is cleared via signal."""
    engine = get_engine()
    mw = DuplicateFilter(engine)
    first = Request('http://test.org/1')
    second = Request('http://test.org/2')
    duplicate = Request('http://test.org/2')
    for req in (first, second):
        self.assertIs(mw.process_request(req), req)
    self.assertIsNone(mw.process_request(duplicate))
    # clearing the filter lets the previously seen url through again
    engine.signals.send(clear_duplicate_filter)
    self.assertIs(mw.process_request(duplicate), duplicate)
def test_404(self):
    """A 404 response is passed through the middleware unchanged."""
    req = Request('http://www.scrapytest.org/404')
    rsp = Response('http://www.scrapytest.org/404', body='', status=404,
                   request=req)
    self.assertIs(self.mw.process_response(rsp), rsp)
def setUp(self):
    """Create a fresh engine, stats middleware and a request/response pair
    for each test."""
    engine = get_engine()
    self.stats = engine.stats
    self.mw = DownloaderStats(engine)
    self.req = Request('http://github.com')
    # 400 response tied to the request above
    self.resp = Response('scrapytest.org', status=400, request=self.req)
def test_payload(self):
    """A POST body is transmitted to the server unchanged."""
    body = '1' * 100  # PayloadResource requires body length to be 100
    request = Request(self.getURL('payload'), method='POST', body=body)
    d = self.download_request(request)
    d.addCallback(lambda r: r.body)
    # assertEquals is a deprecated alias of assertEqual; use the canonical name
    d.addCallback(self.assertEqual, body)
    return d
def test_host_header_seted_in_request_headers(self):
    """An explicit Host header on the request is sent and preserved."""
    # NOTE: name typo ("seted") kept — it is the public test identifier.
    def _test(response):
        # assertEquals is a deprecated alias of assertEqual
        self.assertEqual(response.body, 'example.com')
        self.assertEqual(request.headers.get('Host'), 'example.com')

    request = Request(self.getURL('host'), headers={'Host': 'example.com'})
    return self.download_request(request).addCallback(_test)
def test_host_header_not_in_request_headers(self):
    """Without an explicit Host header the server sees host:port, and the
    request's own headers stay empty."""
    def _test(response):
        # assertEquals is a deprecated alias of assertEqual
        self.assertEqual(response.body, '127.0.0.1:%d' % self.portno)
        self.assertEqual(request.headers, {})

    request = Request(self.getURL('host'))
    return self.download_request(request).addCallback(_test)
def test_redirect_302(self):
    """A 302 redirect of a POST becomes a GET to the Location URL with the
    body and entity headers dropped; 3XX without Location is ignored."""
    url = 'http://www.example.com/302'
    url2 = 'http://www.example.com/redirected2'
    req = Request(url, method='POST', body='test', headers={
        'Content-Type': 'text/plain',
        'Content-length': '4'
    })
    resp = Response(url, headers={'Location': url2}, status=302, request=req)
    req2 = self.mw.process_response(resp)
    self.assertIsInstance(req2, Request)
    self.assertEqual(req2.url, url2)
    # 302 of a POST is re-issued as GET
    self.assertEqual(req2.method, 'GET')
    self.assertNotIn(
        'Content-Type', req2.headers,
        'Content-Type header must not be present in redirected request')
    self.assertNotIn(
        'Content-Length', req2.headers,
        'Content-Length header must not be present in redirected request')
    self.assertEqual(req2.body, '',
                     'Redirected body must be empty, not `%s`' % req2.body)
    # response without Location header but with status code is 3XX should be ignored
    del resp.headers['Location']
    self.assertIs(self.mw.process_response(resp), resp)
def test_req_or_resp(self):
    """settings.get() falls back to request meta, whether given the request
    itself or a response carrying it."""
    req = Request('http://github.com/', meta={'a': 10, 'x': 'y'})
    for key, expected in (('a', 10), ('x', 'y')):
        self.assertEqual(self.settings.get(key, req_or_resp=req), expected)
    # a response resolves through its request's meta
    resp = Response('', request=req)
    for key, expected in (('a', 10), ('x', 'y')):
        self.assertEqual(self.settings.get(key, req_or_resp=resp), expected)
def test_download(self):
    """engine.download() routes requests through the pipeline: a pass-through
    queues the request, None drops it, a Response goes straight to the
    response queue; downloading still resolves after engine.stop()."""
    self.engine.start()
    del self.sp.received[:]
    req = Request('http://github.com/')
    self.engine.download(req)
    # advance the fake clock so queued work runs
    self.clock.advance(0)
    self.check_signals([signals.request_received])
    self.assertEqual(len(self.engine.request_queue), 1)
    # pipeline None
    self.pipeline.req = lambda req: None
    self.engine.download(req)
    self.clock.advance(0)
    # dropped request: queue length unchanged
    self.assertEqual(len(self.engine.request_queue), 1)
    # pipeline response
    self.pipeline.req = lambda req: Response('')
    self.engine.download(req)
    self.clock.advance(0)
    self.assertEqual(len(self.engine.response_queue), 1)
    # download and stop
    self.pipeline.req = lambda req: Response('')
    d = self.engine.download(req)
    self.engine.stop('finished')
    self.clock.advance(0)
    return d
def test_update_headers(self):
    """Default headers are merged into the request by process_request(),
    with the request's own headers taking precedence."""
    custom = {'Accept-Language': ['es'], 'Test-Header': ['test']}
    req = Request('http://github.com/', headers=custom)
    self.assertDictEqual(req.headers, custom)
    req = self.dh.process_request(req)
    # expected result: defaults overlaid with the request's own headers
    self.defaults.update(custom)
    self.assertDictEqual(req.headers, self.defaults)
def setUp(self):
    """Reset bookkeeping lists and build a request/response/failure trio."""
    self.mws = []
    self.actions = []
    self.req = Request('http://gh.com/')
    self.resp = Response('http://gh.com/', request=self.req)
    # failure carries the originating request, like the real downloader's
    failure = Failure(Exception())
    failure.request = self.req
    self.fail = failure
def test_nothing(self):
    """A page without a canonical link leaves the response untouched."""
    markup = '''<html><head></head><body></body></html>'''
    req = Request('http://a.com')
    rsp = HtmlResponse(req.url, body=markup, request=req)
    # same object comes back, and no canonical_url key is planted in meta
    self.assertIs(rsp, self.mw.process_response(rsp))
    self.assertNotIn('canonical_url', rsp.meta)
def test_priority_adjust(self):
    """A redirect request is scheduled with a higher priority than the
    request that produced it."""
    req = Request('http://a.com')
    resp = Response('http://a.com',
                    headers={'Location': 'http://a.com/redirected'},
                    status=301, request=req)
    req2 = self.mw.process_response(resp)
    # bare `assert` gives no failure diagnostics and is stripped under -O;
    # use the unittest assertion instead
    self.assertGreater(req2.priority, req.priority)
def test_meta_refresh_with_high_interval(self):
    """A meta-refresh with a very high interval must not trigger a redirect."""
    req = Request(url='http://example.org')
    rsp = HtmlResponse(url='http://example.org',
                       body=self._body(interval=1000), request=req)
    # the very same response object is returned, i.e. no redirect request
    self.assertIs(rsp, self.mw.process_response(rsp))
def _clientfactory(*args, **kwargs):
    """Build a client factory for a Request made from *args/**kwargs.

    `timeout` and `download_size` are popped from kwargs so the remainder
    can be forwarded to Request(); the factory's deferred is rewired to
    fire with only the response body.
    """
    # NOTE(review): "HTPP" in CrawlmiHTPPClientFactory looks like a typo of
    # "HTTP" — the class is defined elsewhere, so it cannot be fixed here.
    timeout = kwargs.pop('timeout', 0)
    download_size = kwargs.pop('download_size', 0)
    f = CrawlmiHTPPClientFactory(Request(*args, **kwargs),
                                 timeout=timeout,
                                 download_size=download_size)
    f.deferred.addCallback(lambda r: r.body)
    return f
def test_tags(self):
    """df_tag meta partitions the duplicate filter; clearing one tag only
    forgets fingerprints recorded under that tag."""
    engine = get_engine()
    mw = DuplicateFilter(engine)
    r1 = Request('http://test.org/', meta={'df_tag': '1'})
    r2 = Request('http://test.org/', meta={'df_tag': '2'})
    r3 = Request('http://test.org/', meta={'df_tag': '2'})
    for req in (r1, r2):
        self.assertIs(mw.process_request(req), req)
    # same url under the same tag is a duplicate
    self.assertIsNone(mw.process_request(r3))
    # only tag '2' is cleared; tag '1' still remembers its url
    engine.signals.send(clear_duplicate_filter, df_tag='2')
    self.assertIsNone(mw.process_request(r1))
    self.assertIs(mw.process_request(r2), r2)
    self.assertIsNone(mw.process_request(r3))
def test_download_without_proxy(self):
    """A direct (non-proxied) download returns the resource path as body."""
    def _test(response):
        # assertEquals is a deprecated alias of assertEqual
        self.assertEqual(response.status, 200)
        self.assertEqual(response.url, request.url)
        self.assertEqual(response.body, '/path/to/resource')

    request = Request(self.getURL('path/to/resource'))
    return self.download_request(request).addCallback(_test)
def run(self, args, options):
    """Download the single URL given in args and open the result in a
    browser.

    Raises UsageError unless exactly one positional argument is supplied.
    """
    if len(args) != 1:
        raise UsageError()
    # normalize the command-line argument into a proper URI
    url = any_to_uri(args[0])
    request = Request(url, callback=open_in_browser)
    self.engine.download(request)
    self.process.start()
def test_priority_adjust(self):
    """A retry request for a 503 is scheduled with a lower priority than
    the original request."""
    req = Request('http://www.scrapytest.org/503')
    rsp = Response('http://www.scrapytest.org/503', body='', status=503,
                   request=req)
    req2 = self.mw.process_response(rsp)
    # assertTrue(a < b) reports only "False is not true" on failure;
    # assertLess shows both values
    self.assertLess(req2.priority, req.priority)
def test_encode_params(self):
    """_encode_params accepts str/unicode/dict/pair-list inputs, encodes
    unicode with the request encoding, and percent-quotes reserved chars."""
    r = Request(url=gh_url)
    r_latin1 = Request(url=gh_url, encoding='latin1')
    # test interface
    self.assertEqual(r._encode_params('mimino'), 'mimino')
    # unicode is encoded per the request's encoding (utf-8 vs latin1)
    self.assertEqual(r._encode_params(u'mi\xa3no'), 'mi\xc2\xa3no')
    self.assertEqual(r_latin1._encode_params(u'mi\xa3no'), 'mi\xa3no')
    self.assertEqual(r._encode_params({'hello': 'world'}), 'hello=world')
    # dict ordering is not guaranteed, so accept either serialization
    self.assertIn(r._encode_params({'a': 'b', 'c': 'd'}),
                  ['a=b&c=d', 'c=d&a=b'])
    self.assertEqual(r._encode_params([('a', 'b'), ('c', 'd')]), 'a=b&c=d')
    self.assertEqual(r._encode_params([('a', ''), ('c', '10')]), 'a=&c=10')
    # invalid inputs must raise
    self.assertRaises(Exception, r._encode_params)
    self.assertRaises(Exception, r._encode_params, 10)
    self.assertRaises(Exception, r._encode_params, ['hello', 'world'])
    # test quoting
    self.assertEqual(r._encode_params(
        [('a', u'mi\xa3no'), ('b', 'mi\xc2\xa3no')]),
        'a=mi%C2%A3no&b=mi%C2%A3no')
    self.assertEqual(r._encode_params(
        {'! #$%&\'()*+,': '/:;=?@[]~'}),
        '%21+%23%24%25%26%27%28%29%2A%2B%2C=%2F%3A%3B%3D%3F%40%5B%5D%7E')
def test_prepare_body(self):
    """Body preparation encodes unicode using the request's encoding and
    stringifies non-string values."""
    utf8_req = Request(url=gh_url)
    latin1_req = Request(url=gh_url, encoding='latin1')
    self.assertEqual(utf8_req._prepare_body(''), '')
    encoded = utf8_req._prepare_body(u'Price: \xa3100')
    self.assertIsInstance(encoded, str)
    self.assertEqual(encoded, 'Price: \xc2\xa3100')
    # latin1-encoded request keeps the single-byte pound sign
    self.assertEqual(latin1_req._prepare_body(u'Price: \xa3100'),
                     'Price: \xa3100')
    # non-string bodies are converted via str()
    self.assertEqual(utf8_req._prepare_body(10), '10')
def test_request_fingerprint(self):
    """Fingerprints ignore query order and (by default) headers, but depend
    on method, body, selected headers, and are re-computed after replace()."""
    r1 = Request('http://www.example.com/query?id=111&cat=222')
    r2 = Request('http://www.example.com/query?cat=222&id=111')
    self.assertEqual(request_fingerprint(r1), request_fingerprint(r1))
    # query-string parameter order does not matter
    self.assertEqual(request_fingerprint(r1), request_fingerprint(r2))
    r1 = Request('http://www.example.com/hnnoticiaj1.aspx?78132,199')
    r2 = Request('http://www.example.com/hnnoticiaj1.aspx?78160,199')
    self.assertNotEqual(request_fingerprint(r1), request_fingerprint(r2))
    # make sure caching is working
    self.assertEqual(request_fingerprint(r1), _fingerprint_cache[r1][None])
    r1 = Request('http://www.example.com/members/offers.html')
    r2 = Request('http://www.example.com/members/offers.html')
    r2.headers['SESSIONID'] = 'somehash'
    # headers are ignored by default
    self.assertEqual(request_fingerprint(r1), request_fingerprint(r2))
    r1 = Request('http://www.example.com/')
    r2 = Request('http://www.example.com/')
    r2.headers['Accept-Language'] = 'en'
    r3 = Request('http://www.example.com/')
    r3.headers['Accept-Language'] = 'en'
    r3.headers['SESSIONID'] = 'somehash'
    # BUG FIX: the original passed three fingerprints to one assertEqual,
    # so the third was silently used as the failure *message* and r3 was
    # never actually checked. Assert pairwise instead.
    self.assertEqual(request_fingerprint(r1), request_fingerprint(r2))
    self.assertEqual(request_fingerprint(r2), request_fingerprint(r3))
    self.assertEqual(request_fingerprint(r1),
                     request_fingerprint(r1, include_headers=['Accept-Language']))
    self.assertNotEqual(request_fingerprint(r1),
                        request_fingerprint(r2, include_headers=['Accept-Language']))
    # include_headers is case- and order-insensitive
    self.assertEqual(request_fingerprint(r3, include_headers=['accept-language', 'sessionid']),
                     request_fingerprint(r3, include_headers=['SESSIONID', 'Accept-Language']))
    r1 = Request('http://www.example.com')
    r2 = Request('http://www.example.com', method='POST')
    r3 = Request('http://www.example.com', method='POST', body='request body')
    self.assertNotEqual(request_fingerprint(r1), request_fingerprint(r2))
    self.assertNotEqual(request_fingerprint(r2), request_fingerprint(r3))
    # cached fingerprint must be cleared on request copy
    r1 = Request('http://www.example.com')
    fp1 = request_fingerprint(r1)
    r2 = r1.replace(url='http://www.example.com/other')
    fp2 = request_fingerprint(r2)
    self.assertNotEqual(fp1, fp2)