class TestHttpErrorMiddlewareIntegrational(TrialTestCase):

    def setUp(self):
        self.mockserver = MockServer()
        self.mockserver.__enter__()

    def tearDown(self):
        self.mockserver.__exit__(None, None, None)

    @defer.inlineCallbacks
    def test_middleware_works(self):
        crawler = get_crawler(_HttpErrorSpider)
        yield crawler.crawl()
        assert not crawler.spider.skipped, crawler.spider.skipped
        self.assertEqual(crawler.spider.parsed, {'200'})
        self.assertEqual(crawler.spider.failed, {'404', '402', '500'})

    @defer.inlineCallbacks
    def test_logging(self):
        crawler = get_crawler(_HttpErrorSpider)
        yield crawler.crawl(bypass_status_codes={402})
        # print(get_testlog())
        self.assertEqual(crawler.spider.parsed, {'200', '402'})
        self.assertEqual(crawler.spider.skipped, {'402'})
        self.assertEqual(crawler.spider.failed, {'404', '500'})
        log = get_testlog()
        self.assertIn('Ignoring response <404', log)
        self.assertIn('Ignoring response <500', log)
        self.assertNotIn('Ignoring response <200', log)
        self.assertNotIn('Ignoring response <402', log)

class TestHttpErrorMiddlewareIntegrational(TrialTestCase):

    def setUp(self):
        self.mockserver = MockServer()
        self.mockserver.__enter__()

    def tearDown(self):
        self.mockserver.__exit__(None, None, None)

    @defer.inlineCallbacks
    def test_middleware_works(self):
        crawler = get_crawler(_HttpErrorSpider)
        yield crawler.crawl()
        assert not crawler.spider.skipped, crawler.spider.skipped
        self.assertEqual(crawler.spider.parsed, {"200"})
        self.assertEqual(crawler.spider.failed, {"404", "402", "500"})

    @defer.inlineCallbacks
    def test_logging(self):
        crawler = get_crawler(_HttpErrorSpider)
        with LogCapture() as log:
            yield crawler.crawl(bypass_status_codes={402})
        self.assertEqual(crawler.spider.parsed, {"200", "402"})
        self.assertEqual(crawler.spider.skipped, {"402"})
        self.assertEqual(crawler.spider.failed, {"404", "500"})
        self.assertIn("Ignoring response <404", str(log))
        self.assertIn("Ignoring response <500", str(log))
        self.assertNotIn("Ignoring response <200", str(log))
        self.assertNotIn("Ignoring response <402", str(log))

class CallbackKeywordArgumentsTestCase(TestCase):

    maxDiff = None

    def setUp(self):
        self.mockserver = MockServer()
        self.mockserver.__enter__()
        self.runner = CrawlerRunner()

    def tearDown(self):
        self.mockserver.__exit__(None, None, None)

    @defer.inlineCallbacks
    def test_callback_kwargs(self):
        crawler = self.runner.create_crawler(KeywordArgumentsSpider)
        with LogCapture() as log:
            yield crawler.crawl(mockserver=self.mockserver)
        self.assertTrue(all(crawler.spider.checks))
        self.assertEqual(len(crawler.spider.checks),
                         crawler.stats.get_value('boolean_checks'))
        # check exceptions for argument mismatch
        exceptions = {}
        for line in log.records:
            for key in ('takes_less', 'takes_more'):
                if key in line.getMessage():
                    exceptions[key] = line
        self.assertEqual(exceptions['takes_less'].exc_info[0], TypeError)
        self.assertEqual(
            str(exceptions['takes_less'].exc_info[1]),
            "parse_takes_less() got an unexpected keyword argument 'number'")
        self.assertEqual(exceptions['takes_more'].exc_info[0], TypeError)
        self.assertEqual(
            str(exceptions['takes_more'].exc_info[1]),
            "parse_takes_more() missing 1 required positional argument: 'other'")

class ShowOrSkipMessagesTestCase(TwistedTestCase):

    def setUp(self):
        self.mockserver = MockServer()
        self.mockserver.__enter__()
        self.base_settings = {
            'LOG_LEVEL': 'DEBUG',
            'ITEM_PIPELINES': {
                __name__ + '.DropSomeItemsPipeline': 300,
            },
        }

    def tearDown(self):
        self.mockserver.__exit__(None, None, None)

    @defer.inlineCallbacks
    def test_show_messages(self):
        crawler = CrawlerRunner(self.base_settings).create_crawler(ItemSpider)
        with LogCapture() as lc:
            yield crawler.crawl(mockserver=self.mockserver)
        self.assertIn("Scraped from <200 http://127.0.0.1:", str(lc))
        self.assertIn("Crawled (200) <GET http://127.0.0.1:", str(lc))
        self.assertIn("Dropped: Ignoring item", str(lc))

    @defer.inlineCallbacks
    def test_skip_messages(self):
        settings = self.base_settings.copy()
        settings['LOG_FORMATTER'] = __name__ + '.SkipMessagesLogFormatter'
        crawler = CrawlerRunner(settings).create_crawler(ItemSpider)
        with LogCapture() as lc:
            yield crawler.crawl(mockserver=self.mockserver)
        self.assertNotIn("Scraped from <200 http://127.0.0.1:", str(lc))
        self.assertNotIn("Crawled (200) <GET http://127.0.0.1:", str(lc))
        self.assertNotIn("Dropped: Ignoring item", str(lc))

class TestCatching(TestCase):

    def setUp(self):
        self.mockserver = MockServer()
        self.mockserver.__enter__()

    def tearDown(self):
        self.mockserver.__exit__(None, None, None)

    @defer.inlineCallbacks
    def test_success(self):
        crawler = get_crawler(SignalCatcherSpider)
        yield crawler.crawl(self.mockserver.url("/status?n=200"))
        self.assertEqual(crawler.spider.caught_times, 1)

    @defer.inlineCallbacks
    def test_timeout(self):
        crawler = get_crawler(SignalCatcherSpider, {"DOWNLOAD_TIMEOUT": 0.1})
        yield crawler.crawl(self.mockserver.url("/delay?n=0.2"))
        self.assertEqual(crawler.spider.caught_times, 1)

    @defer.inlineCallbacks
    def test_disconnect(self):
        crawler = get_crawler(SignalCatcherSpider)
        yield crawler.crawl(self.mockserver.url("/drop"))
        self.assertEqual(crawler.spider.caught_times, 1)

    @defer.inlineCallbacks
    def test_noconnect(self):
        crawler = get_crawler(SignalCatcherSpider)
        yield crawler.crawl("http://thereisdefinetelynosuchdomain.com")
        self.assertEqual(crawler.spider.caught_times, 1)

class PipelineTestCase(unittest.TestCase):

    def setUp(self):
        self.mockserver = MockServer()
        self.mockserver.__enter__()

    def tearDown(self):
        self.mockserver.__exit__(None, None, None)

    def _on_item_scraped(self, item):
        self.assertIsInstance(item, dict)
        self.assertTrue(item.get('pipeline_passed'))
        self.items.append(item)

    def _create_crawler(self, pipeline_class):
        settings = {
            'ITEM_PIPELINES': {
                __name__ + '.' + pipeline_class.__name__: 1,
            },
        }
        crawler = get_crawler(ItemSpider, settings)
        crawler.signals.connect(self._on_item_scraped, signals.item_scraped)
        self.items = []
        return crawler

    @defer.inlineCallbacks
    def test_simple_pipeline(self):
        crawler = self._create_crawler(SimplePipeline)
        yield crawler.crawl(mockserver=self.mockserver)
        self.assertEqual(len(self.items), 1)

    @defer.inlineCallbacks
    def test_deferred_pipeline(self):
        crawler = self._create_crawler(DeferredPipeline)
        yield crawler.crawl(mockserver=self.mockserver)
        self.assertEqual(len(self.items), 1)

class Http11MockServerTestCase(unittest.TestCase):
    """HTTP 1.1 test case with MockServer"""

    if twisted_version < (11, 1, 0):
        skip = 'HTTP1.1 not supported in twisted < 11.1.0'

    def setUp(self):
        self.mockserver = MockServer()
        self.mockserver.__enter__()

    def tearDown(self):
        self.mockserver.__exit__(None, None, None)

    @defer.inlineCallbacks
    def test_download_with_content_length(self):
        crawler = get_crawler(SingleRequestSpider)
        # http://localhost:8998/partial sets Content-Length to 1024;
        # use download_maxsize=1000 to avoid downloading it
        yield crawler.crawl(seed=Request(url='http://localhost:8998/partial',
                                         meta={'download_maxsize': 1000}))
        failure = crawler.spider.meta['failure']
        self.assertIsInstance(failure.value, defer.CancelledError)

    @defer.inlineCallbacks
    def test_download(self):
        crawler = get_crawler(SingleRequestSpider)
        yield crawler.crawl(seed=Request(url='http://localhost:8998'))
        failure = crawler.spider.meta.get('failure')
        self.assertIsNone(failure)
        reason = crawler.spider.meta['close_reason']
        self.assertEqual(reason, 'finished')

    @defer.inlineCallbacks
    def test_download_gzip_response(self):
        if six.PY2 and twisted_version > (12, 3, 0):
            crawler = get_crawler(SingleRequestSpider)
            body = '1' * 100  # PayloadResource requires body length to be 100
            request = Request('http://localhost:8998/payload', method='POST',
                              body=body, meta={'download_maxsize': 50})
            yield crawler.crawl(seed=request)
            failure = crawler.spider.meta['failure']
            # download_maxsize < 100, hence the CancelledError
            self.assertIsInstance(failure.value, defer.CancelledError)
            request.headers.setdefault('Accept-Encoding', 'gzip,deflate')
            request = request.replace(url='http://localhost:8998/xpayload')
            yield crawler.crawl(seed=request)
            # download_maxsize = 50 is enough for the gzipped response
            failure = crawler.spider.meta.get('failure')
            self.assertIsNone(failure)
            reason = crawler.spider.meta['close_reason']
            self.assertEqual(reason, 'finished')
        else:
            raise unittest.SkipTest(
                "xpayload and payload endpoint only enabled for twisted > 12.3.0 and python 2.x")

class TestAuthCookiesFromUrl(unittest.TestCase):

    base_url = 'http://127.0.0.1:{}'.format(PORT)
    url = base_url + Login.url
    url_no_change_cookie = base_url + LoginNoChangeCookie.url
    url_check_proxy = base_url + LoginCheckProxy.url

    def setUp(self):
        self.al = AutoLogin()
        self.mockserver = MockServer()
        self.mockserver.__enter__()

    def tearDown(self):
        self.mockserver.__exit__(None, None, None)

    def test_no_login_form(self):
        with pytest.raises(AutoLoginException) as e:
            self.al.auth_cookies_from_url(self.url + '?hide=', 'admin', 'secret')
        assert e.value.args[0] == 'nologinform'

    def test_wrong_password(self):
        with pytest.raises(AutoLoginException) as e:
            self.al.auth_cookies_from_url(self.url, 'admin', 'wrong')
        assert e.value.args[0] == 'badauth'

    def test_normal_auth(self):
        cookies = self.al.auth_cookies_from_url(self.url + '?foo=', 'admin', 'secret')
        assert {c.name: c.value for c in cookies} == {'_auth': 'yes'}

    def test_redirect_to_same_url(self):
        cookies = self.al.auth_cookies_from_url(self.url, 'admin', 'secret')
        assert {c.name: c.value for c in cookies} == {'_auth': 'yes'}

    def test_proxy(self):
        assert 'localhost' not in self.url, 'proxy_bypass bypasses localhost'
        with MockServer('tests.proxy'):
            with pytest.raises(AutoLoginException) as e:
                self.al.auth_cookies_from_url(
                    self.url_check_proxy, 'admin', 'secret')
            cookies = self.al.auth_cookies_from_url(
                self.url_check_proxy, 'admin', 'secret',
                settings={
                    'HTTP_PROXY': 'http://127.0.0.1:{}'.format(PROXY_PORT),
                },
            )
            assert {c.name: c.value for c in cookies} == {'_auth': 'yes'}

    def test_no_change_cookie(self):
        cookies = self.al.auth_cookies_from_url(
            self.url_no_change_cookie, 'admin', 'secret')
        assert {c.name: c.value for c in cookies} == {'session': '1'}

    def test_no_change_cookie_wrong_password(self):
        with pytest.raises(AutoLoginException) as e:
            self.al.auth_cookies_from_url(
                self.url_no_change_cookie, 'admin', 'wrong')
        assert e.value.args[0] == 'badauth'

class ProxyConnectTestCase(TestCase):

    def setUp(self):
        self.mockserver = MockServer()
        self.mockserver.__enter__()
        self._oldenv = os.environ.copy()
        self._proxy = MitmProxy()
        proxy_url = self._proxy.start()
        os.environ['https_proxy'] = proxy_url
        os.environ['http_proxy'] = proxy_url

    def tearDown(self):
        self.mockserver.__exit__(None, None, None)
        self._proxy.stop()
        os.environ = self._oldenv

    @defer.inlineCallbacks
    def test_https_connect_tunnel(self):
        crawler = get_crawler(SimpleSpider)
        with LogCapture() as l:
            yield crawler.crawl(
                self.mockserver.url("/status?n=200", is_secure=True))
        self._assert_got_response_code(200, l)

    @pytest.mark.xfail(reason='Python 3.6+ fails this earlier',
                       condition=sys.version_info.minor >= 6)
    @defer.inlineCallbacks
    def test_https_connect_tunnel_error(self):
        crawler = get_crawler(SimpleSpider)
        with LogCapture() as l:
            yield crawler.crawl("https://localhost:99999/status?n=200")
        self._assert_got_tunnel_error(l)

    @defer.inlineCallbacks
    def test_https_tunnel_auth_error(self):
        os.environ['https_proxy'] = _wrong_credentials(os.environ['https_proxy'])
        crawler = get_crawler(SimpleSpider)
        with LogCapture() as l:
            yield crawler.crawl(
                self.mockserver.url("/status?n=200", is_secure=True))
        # The proxy returns a 407 error code, but it does not reach the client;
        # the client just sees a TunnelError.
        self._assert_got_tunnel_error(l)

    @defer.inlineCallbacks
    def test_https_tunnel_without_leak_proxy_authorization_header(self):
        request = Request(self.mockserver.url("/echo", is_secure=True))
        crawler = get_crawler(SingleRequestSpider)
        with LogCapture() as l:
            yield crawler.crawl(seed=request)
        self._assert_got_response_code(200, l)
        echo = json.loads(crawler.spider.meta['responses'][0].text)
        self.assertTrue('Proxy-Authorization' not in echo['headers'])

    def _assert_got_response_code(self, code, log):
        print(log)
        self.assertEqual(str(log).count('Crawled (%d)' % code), 1)

    def _assert_got_tunnel_error(self, log):
        print(log)
        self.assertIn('TunnelError', str(log))

class ProxyConnectTestCase(TestCase):

    def setUp(self):
        try:
            import mitmproxy  # noqa: F401
        except ImportError:
            self.skipTest('mitmproxy is not installed')
        self.mockserver = MockServer()
        self.mockserver.__enter__()
        self._oldenv = os.environ.copy()
        self._proxy = MitmProxy()
        proxy_url = self._proxy.start()
        os.environ['https_proxy'] = proxy_url
        os.environ['http_proxy'] = proxy_url

    def tearDown(self):
        self.mockserver.__exit__(None, None, None)
        self._proxy.stop()
        os.environ = self._oldenv

    @defer.inlineCallbacks
    def test_https_connect_tunnel(self):
        crawler = get_crawler(SimpleSpider)
        with LogCapture() as log:
            yield crawler.crawl(
                self.mockserver.url("/status?n=200", is_secure=True))
        self._assert_got_response_code(200, log)

    @defer.inlineCallbacks
    def test_https_tunnel_auth_error(self):
        os.environ['https_proxy'] = _wrong_credentials(
            os.environ['https_proxy'])
        crawler = get_crawler(SimpleSpider)
        with LogCapture() as log:
            yield crawler.crawl(
                self.mockserver.url("/status?n=200", is_secure=True))
        # The proxy returns a 407 error code, but it does not reach the client;
        # the client just sees a TunnelError.
        self._assert_got_tunnel_error(log)

    @defer.inlineCallbacks
    def test_https_tunnel_without_leak_proxy_authorization_header(self):
        request = Request(self.mockserver.url("/echo", is_secure=True))
        crawler = get_crawler(SingleRequestSpider)
        with LogCapture() as log:
            yield crawler.crawl(seed=request)
        self._assert_got_response_code(200, log)
        echo = json.loads(crawler.spider.meta['responses'][0].text)
        self.assertTrue('Proxy-Authorization' not in echo['headers'])

    def _assert_got_response_code(self, code, log):
        print(log)
        self.assertEqual(str(log).count(f'Crawled ({code})'), 1)

    def _assert_got_tunnel_error(self, log):
        print(log)
        self.assertIn('TunnelError', str(log))

class Http11MockServerTestCase(unittest.TestCase):
    """HTTP 1.1 test case with MockServer"""

    if twisted_version < (11, 1, 0):
        skip = "HTTP1.1 not supported in twisted < 11.1.0"

    def setUp(self):
        self.mockserver = MockServer()
        self.mockserver.__enter__()

    def tearDown(self):
        self.mockserver.__exit__(None, None, None)

    @defer.inlineCallbacks
    def test_download_with_content_length(self):
        crawler = get_crawler(SingleRequestSpider)
        # http://localhost:8998/partial sets Content-Length to 1024;
        # use download_maxsize=1000 to avoid downloading it
        yield crawler.crawl(seed=Request(url="http://localhost:8998/partial",
                                         meta={"download_maxsize": 1000}))
        failure = crawler.spider.meta["failure"]
        self.assertIsInstance(failure.value, defer.CancelledError)

    @defer.inlineCallbacks
    def test_download(self):
        crawler = get_crawler(SingleRequestSpider)
        yield crawler.crawl(seed=Request(url="http://localhost:8998"))
        failure = crawler.spider.meta.get("failure")
        self.assertIsNone(failure)
        reason = crawler.spider.meta["close_reason"]
        self.assertEqual(reason, "finished")

    @defer.inlineCallbacks
    def test_download_gzip_response(self):
        if twisted_version > (12, 3, 0):
            crawler = get_crawler(SingleRequestSpider)
            body = b"1" * 100  # PayloadResource requires body length to be 100
            request = Request("http://localhost:8998/payload", method="POST",
                              body=body, meta={"download_maxsize": 50})
            yield crawler.crawl(seed=request)
            failure = crawler.spider.meta["failure"]
            # download_maxsize < 100, hence the CancelledError
            self.assertIsInstance(failure.value, defer.CancelledError)
            if six.PY2:
                request.headers.setdefault(b"Accept-Encoding", b"gzip,deflate")
                request = request.replace(url="http://localhost:8998/xpayload")
                yield crawler.crawl(seed=request)
                # download_maxsize = 50 is enough for the gzipped response
                failure = crawler.spider.meta.get("failure")
                self.assertIsNone(failure)
                reason = crawler.spider.meta["close_reason"]
                self.assertEqual(reason, "finished")
            else:
                # See issue https://twistedmatrix.com/trac/ticket/8175
                raise unittest.SkipTest("xpayload only enabled for PY2")
        else:
            raise unittest.SkipTest(
                "xpayload and payload endpoint only enabled for twisted > 12.3.0")

class TestCloseSpider(TestCase):

    def setUp(self):
        self.mockserver = MockServer()
        self.mockserver.__enter__()

    def tearDown(self):
        self.mockserver.__exit__(None, None, None)

    @defer.inlineCallbacks
    def test_closespider_itemcount(self):
        close_on = 5
        crawler = get_crawler(ItemSpider, {'CLOSESPIDER_ITEMCOUNT': close_on})
        yield crawler.crawl(mockserver=self.mockserver)
        reason = crawler.spider.meta['close_reason']
        self.assertEqual(reason, 'closespider_itemcount')
        itemcount = crawler.stats.get_value('item_scraped_count')
        self.assertTrue(itemcount >= close_on)

    @defer.inlineCallbacks
    def test_closespider_pagecount(self):
        close_on = 5
        crawler = get_crawler(FollowAllSpider, {'CLOSESPIDER_PAGECOUNT': close_on})
        yield crawler.crawl(mockserver=self.mockserver)
        reason = crawler.spider.meta['close_reason']
        self.assertEqual(reason, 'closespider_pagecount')
        pagecount = crawler.stats.get_value('response_received_count')
        self.assertTrue(pagecount >= close_on)

    @defer.inlineCallbacks
    def test_closespider_errorcount(self):
        close_on = 5
        crawler = get_crawler(ErrorSpider, {'CLOSESPIDER_ERRORCOUNT': close_on})
        yield crawler.crawl(total=1000000, mockserver=self.mockserver)
        reason = crawler.spider.meta['close_reason']
        self.assertEqual(reason, 'closespider_errorcount')
        key = 'spider_exceptions/{name}'.format(
            name=crawler.spider.exception_cls.__name__)
        errorcount = crawler.stats.get_value(key)
        self.assertTrue(errorcount >= close_on)

    @defer.inlineCallbacks
    def test_closespider_timeout(self):
        close_on = 0.1
        crawler = get_crawler(FollowAllSpider, {'CLOSESPIDER_TIMEOUT': close_on})
        yield crawler.crawl(total=1000000, mockserver=self.mockserver)
        reason = crawler.spider.meta['close_reason']
        self.assertEqual(reason, 'closespider_timeout')
        stats = crawler.stats
        start = stats.get_value('start_time')
        stop = stats.get_value('finish_time')
        diff = stop - start
        total_seconds = diff.total_seconds()
        self.assertTrue(total_seconds >= close_on)

class TestCloseSpider(TestCase):

    def setUp(self):
        self.mockserver = MockServer()
        self.mockserver.__enter__()

    def tearDown(self):
        self.mockserver.__exit__(None, None, None)

    @defer.inlineCallbacks
    def test_closespider_itemcount(self):
        close_on = 5
        crawler = get_crawler(ItemSpider, {'CLOSESPIDER_ITEMCOUNT': close_on})
        yield crawler.crawl()
        reason = crawler.spider.meta['close_reason']
        self.assertEqual(reason, 'closespider_itemcount')
        itemcount = crawler.stats.get_value('item_scraped_count')
        self.assertTrue(itemcount >= close_on)

    @defer.inlineCallbacks
    def test_closespider_pagecount(self):
        close_on = 5
        crawler = get_crawler(FollowAllSpider, {'CLOSESPIDER_PAGECOUNT': close_on})
        yield crawler.crawl()
        reason = crawler.spider.meta['close_reason']
        self.assertEqual(reason, 'closespider_pagecount')
        pagecount = crawler.stats.get_value('response_received_count')
        self.assertTrue(pagecount >= close_on)

    @defer.inlineCallbacks
    def test_closespider_errorcount(self):
        close_on = 5
        crawler = get_crawler(ErrorSpider, {'CLOSESPIDER_ERRORCOUNT': close_on})
        yield crawler.crawl(total=1000000)
        self.flushLoggedErrors(crawler.spider.exception_cls)
        reason = crawler.spider.meta['close_reason']
        self.assertEqual(reason, 'closespider_errorcount')
        key = 'spider_exceptions/{name}'.format(
            name=crawler.spider.exception_cls.__name__)
        errorcount = crawler.stats.get_value(key)
        self.assertTrue(errorcount >= close_on)

    @defer.inlineCallbacks
    def test_closespider_timeout(self):
        close_on = 0.1
        crawler = get_crawler(FollowAllSpider, {'CLOSESPIDER_TIMEOUT': close_on})
        yield crawler.crawl(total=1000000)
        reason = crawler.spider.meta['close_reason']
        self.assertEqual(reason, 'closespider_timeout')
        stats = crawler.stats
        start = stats.get_value('start_time')
        stop = stats.get_value('finish_time')
        diff = stop - start
        total_seconds = diff.total_seconds()
        self.assertTrue(total_seconds >= close_on)

class TestHttpErrorMiddlewareIntegrational(TrialTestCase):

    def setUp(self):
        self.mockserver = MockServer()
        self.mockserver.__enter__()

    def tearDown(self):
        self.mockserver.__exit__(None, None, None)

    @defer.inlineCallbacks
    def test_middleware_works(self):
        crawler = get_crawler(_HttpErrorSpider)
        yield crawler.crawl()
        assert not crawler.spider.skipped, crawler.spider.skipped
        self.assertEqual(crawler.spider.parsed, {'200'})
        self.assertEqual(crawler.spider.failed, {'404', '402', '500'})

    @defer.inlineCallbacks
    def test_logging(self):
        crawler = get_crawler(_HttpErrorSpider)
        with LogCapture() as log:
            yield crawler.crawl(bypass_status_codes={402})
        self.assertEqual(crawler.spider.parsed, {'200', '402'})
        self.assertEqual(crawler.spider.skipped, {'402'})
        self.assertEqual(crawler.spider.failed, {'404', '500'})
        self.assertIn('Ignoring response <404', str(log))
        self.assertIn('Ignoring response <500', str(log))
        self.assertNotIn('Ignoring response <200', str(log))
        self.assertNotIn('Ignoring response <402', str(log))

    @defer.inlineCallbacks
    def test_logging_level(self):
        # HttpError logs ignored responses with level INFO
        crawler = get_crawler(_HttpErrorSpider)
        with LogCapture(level=logging.INFO) as log:
            yield crawler.crawl()
        self.assertEqual(crawler.spider.parsed, {'200'})
        self.assertEqual(crawler.spider.failed, {'404', '402', '500'})
        self.assertIn('Ignoring response <402', str(log))
        self.assertIn('Ignoring response <404', str(log))
        self.assertIn('Ignoring response <500', str(log))
        self.assertNotIn('Ignoring response <200', str(log))

        # with level WARNING, we shouldn't capture anything from HttpError
        crawler = get_crawler(_HttpErrorSpider)
        with LogCapture(level=logging.WARNING) as log:
            yield crawler.crawl()
        self.assertEqual(crawler.spider.parsed, {'200'})
        self.assertEqual(crawler.spider.failed, {'404', '402', '500'})
        self.assertNotIn('Ignoring response <402', str(log))
        self.assertNotIn('Ignoring response <404', str(log))
        self.assertNotIn('Ignoring response <500', str(log))
        self.assertNotIn('Ignoring response <200', str(log))

class Http11MockServerTestCase(unittest.TestCase):
    """HTTP 1.1 test case with MockServer"""

    if 'http11' not in optional_features:
        skip = 'HTTP1.1 not supported in twisted < 11.1.0'

    def setUp(self):
        self.mockserver = MockServer()
        self.mockserver.__enter__()

    def tearDown(self):
        self.mockserver.__exit__(None, None, None)

    @defer.inlineCallbacks
    def test_download_with_content_length(self):
        crawler = get_crawler(SingleRequestSpider)
        # http://localhost:8998/partial sets Content-Length to 1024;
        # use download_maxsize=1000 to avoid downloading it
        yield crawler.crawl(seed=Request(url='http://localhost:8998/partial',
                                         meta={'download_maxsize': 1000}))
        failure = crawler.spider.meta['failure']
        self.assertIsInstance(failure.value, defer.CancelledError)

    @defer.inlineCallbacks
    def test_download(self):
        crawler = get_crawler(SingleRequestSpider)
        yield crawler.crawl(seed=Request(url='http://localhost:8998'))
        failure = crawler.spider.meta.get('failure')
        self.assertIsNone(failure)
        reason = crawler.spider.meta['close_reason']
        self.assertEqual(reason, 'finished')

    @defer.inlineCallbacks
    def test_download_gzip_response(self):
        if six.PY2 and twisted_version > (12, 3, 0):
            crawler = get_crawler(SingleRequestSpider)
            body = '1' * 100  # PayloadResource requires body length to be 100
            request = Request('http://localhost:8998/payload', method='POST',
                              body=body, meta={'download_maxsize': 50})
            yield crawler.crawl(seed=request)
            failure = crawler.spider.meta['failure']
            # download_maxsize < 100, hence the CancelledError
            self.assertIsInstance(failure.value, defer.CancelledError)
            request.headers.setdefault('Accept-Encoding', 'gzip,deflate')
            request = request.replace(url='http://localhost:8998/xpayload')
            yield crawler.crawl(seed=request)
            # download_maxsize = 50 is enough for the gzipped response
            failure = crawler.spider.meta.get('failure')
            self.assertIsNone(failure)
            reason = crawler.spider.meta['close_reason']
            self.assertEqual(reason, 'finished')
        else:
            raise unittest.SkipTest(
                "xpayload and payload endpoint only enabled for twisted > 12.3.0 and python 2.x")

class Http11MockServerTestCase(unittest.TestCase):
    """HTTP 1.1 test case with MockServer"""

    def setUp(self):
        self.mockserver = MockServer()
        self.mockserver.__enter__()

    def tearDown(self):
        self.mockserver.__exit__(None, None, None)

    @defer.inlineCallbacks
    def test_download_with_content_length(self):
        crawler = get_crawler(SingleRequestSpider)
        # /partial sets Content-Length to 1024; use download_maxsize=1000
        # to avoid downloading it
        yield crawler.crawl(seed=Request(url=self.mockserver.url('/partial'),
                                         meta={'download_maxsize': 1000}))
        failure = crawler.spider.meta['failure']
        self.assertIsInstance(failure.value, defer.CancelledError)

    @defer.inlineCallbacks
    def test_download(self):
        crawler = get_crawler(SingleRequestSpider)
        yield crawler.crawl(seed=Request(url=self.mockserver.url('')))
        failure = crawler.spider.meta.get('failure')
        self.assertIsNone(failure)
        reason = crawler.spider.meta['close_reason']
        self.assertEqual(reason, 'finished')

    @defer.inlineCallbacks
    def test_download_gzip_response(self):
        crawler = get_crawler(SingleRequestSpider)
        body = b'1' * 100  # PayloadResource requires body length to be 100
        request = Request(self.mockserver.url('/payload'), method='POST',
                          body=body, meta={'download_maxsize': 50})
        yield crawler.crawl(seed=request)
        failure = crawler.spider.meta['failure']
        # download_maxsize < 100, hence the CancelledError
        self.assertIsInstance(failure.value, defer.CancelledError)
        if six.PY2:
            request.headers.setdefault(b'Accept-Encoding', b'gzip,deflate')
            request = request.replace(url=self.mockserver.url('/xpayload'))
            yield crawler.crawl(seed=request)
            # download_maxsize = 50 is enough for the gzipped response
            failure = crawler.spider.meta.get('failure')
            self.assertIsNone(failure)
            reason = crawler.spider.meta['close_reason']
            self.assertEqual(reason, 'finished')
        else:
            # See issue https://twistedmatrix.com/trac/ticket/8175
            raise unittest.SkipTest("xpayload only enabled for PY2")

class AsyncSignalTestCase(unittest.TestCase):

    def setUp(self):
        self.mockserver = MockServer()
        self.mockserver.__enter__()
        self.items = []

    def tearDown(self):
        self.mockserver.__exit__(None, None, None)

    async def _on_item_scraped(self, item):
        item = await get_from_asyncio_queue(item)
        self.items.append(item)

    @mark.only_asyncio()
    @defer.inlineCallbacks
    def test_simple_pipeline(self):
        crawler = get_crawler(ItemSpider)
        crawler.signals.connect(self._on_item_scraped, signals.item_scraped)
        yield crawler.crawl(mockserver=self.mockserver)
        self.assertEqual(len(self.items), 10)
        for index in range(10):
            self.assertIn({'index': index}, self.items)

class CrawlTestCase(TestCase):

    def setUp(self):
        self.mockserver = MockServer()
        self.mockserver.__enter__()

    def tearDown(self):
        self.mockserver.__exit__(None, None, None)

    @defer.inlineCallbacks
    def test_follow_all(self):
        spider = FollowAllSpider()
        yield docrawl(spider)
        self.assertEqual(len(spider.urls_visited), 11)  # 10 + start_url

    @defer.inlineCallbacks
    def test_delay(self):
        # short to long delays
        yield self._test_delay(0.2, False)
        yield self._test_delay(1, False)
        # randoms
        yield self._test_delay(0.2, True)
        yield self._test_delay(1, True)

    @defer.inlineCallbacks
    def _test_delay(self, delay, randomize):
        settings = {"DOWNLOAD_DELAY": delay, 'RANDOMIZE_DOWNLOAD_DELAY': randomize}
        spider = FollowAllSpider(maxlatency=delay * 2)
        yield docrawl(spider, settings)
        t = spider.times
        totaltime = t[-1] - t[0]
        avgd = totaltime / (len(t) - 1)
        tolerance = 0.6 if randomize else 0.2
        self.assertTrue(avgd > delay * (1 - tolerance),
                        "download delay too small: %s" % avgd)

    @defer.inlineCallbacks
    def test_timeout_success(self):
        spider = DelaySpider(n=0.5)
        yield docrawl(spider)
        self.assertTrue(spider.t1 > 0)
        self.assertTrue(spider.t2 > 0)
        self.assertTrue(spider.t2 > spider.t1)

    @defer.inlineCallbacks
    def test_timeout_failure(self):
        spider = DelaySpider(n=0.5)
        yield docrawl(spider, {"DOWNLOAD_TIMEOUT": 0.35})
        self.assertTrue(spider.t1 > 0)
        self.assertTrue(spider.t2 == 0)
        self.assertTrue(spider.t2_err > 0)
        self.assertTrue(spider.t2_err > spider.t1)
        # server hangs after receiving response headers
        spider = DelaySpider(n=0.5, b=1)
        yield docrawl(spider, {"DOWNLOAD_TIMEOUT": 0.35})
        self.assertTrue(spider.t1 > 0)
        self.assertTrue(spider.t2 == 0)
        self.assertTrue(spider.t2_err > 0)
        self.assertTrue(spider.t2_err > spider.t1)

    @defer.inlineCallbacks
    def test_retry_503(self):
        spider = SimpleSpider("http://localhost:8998/status?n=503")
        yield docrawl(spider)
        self._assert_retried()

    @defer.inlineCallbacks
    def test_retry_conn_failed(self):
        spider = SimpleSpider("http://localhost:65432/status?n=503")
        yield docrawl(spider)
        self._assert_retried()

    @defer.inlineCallbacks
    def test_retry_dns_error(self):
        with mock.patch('socket.gethostbyname',
                        side_effect=socket.gaierror(-5, 'No address associated with hostname')):
            spider = SimpleSpider("http://example.com/")
            yield docrawl(spider)
            self._assert_retried()

    @defer.inlineCallbacks
    def test_start_requests_bug_before_yield(self):
        spider = BrokenStartRequestsSpider(fail_before_yield=1)
        yield docrawl(spider)
        errors = self.flushLoggedErrors(ZeroDivisionError)
        self.assertEqual(len(errors), 1)

    @defer.inlineCallbacks
    def test_start_requests_bug_yielding(self):
        spider = BrokenStartRequestsSpider(fail_yielding=1)
        yield docrawl(spider)
        errors = self.flushLoggedErrors(ZeroDivisionError)
        self.assertEqual(len(errors), 1)

    @defer.inlineCallbacks
    def test_start_requests_lazyness(self):
        settings = {"CONCURRENT_REQUESTS": 1}
        spider = BrokenStartRequestsSpider()
        yield docrawl(spider, settings)
        #self.assertTrue(False, spider.seedsseen)
        #self.assertTrue(spider.seedsseen.index(None) < spider.seedsseen.index(99),
        #                spider.seedsseen)

    @defer.inlineCallbacks
    def test_start_requests_dupes(self):
        settings = {"CONCURRENT_REQUESTS": 1}
        spider = DuplicateStartRequestsSpider(dont_filter=True,
                                              distinct_urls=2,
                                              dupe_factor=3)
        yield docrawl(spider, settings)
        self.assertEqual(spider.visited, 6)

        spider = DuplicateStartRequestsSpider(dont_filter=False,
                                              distinct_urls=3,
                                              dupe_factor=4)
        yield docrawl(spider, settings)
        self.assertEqual(spider.visited, 3)

    @defer.inlineCallbacks
    def test_unbounded_response(self):
        # Completeness of responses without Content-Length or Transfer-Encoding
        # cannot be determined; we treat them as valid but flag them as "partial"
        from urllib import urlencode
        query = urlencode({'raw': '''\
HTTP/1.1 200 OK
Server: Apache-Coyote/1.1
X-Powered-By: Servlet 2.4; JBoss-4.2.3.GA (build: SVNTag=JBoss_4_2_3_GA date=200807181417)/JBossWeb-2.0
Set-Cookie: JSESSIONID=08515F572832D0E659FD2B0D8031D75F; Path=/
Pragma: no-cache
Expires: Thu, 01 Jan 1970 00:00:00 GMT
Cache-Control: no-cache
Cache-Control: no-store
Content-Type: text/html;charset=UTF-8
Content-Language: en
Date: Tue, 27 Aug 2013 13:05:05 GMT
Connection: close

foo body
with multiples lines
'''})
        spider = SimpleSpider("http://localhost:8998/raw?{0}".format(query))
        yield docrawl(spider)
        log = get_testlog()
        self.assertEqual(log.count("Got response 200"), 1)

    @defer.inlineCallbacks
    def test_retry_conn_lost(self):
        # connection lost after receiving data
        spider = SimpleSpider("http://localhost:8998/drop?abort=0")
        yield docrawl(spider)
        self._assert_retried()

    @defer.inlineCallbacks
    def test_retry_conn_aborted(self):
        # connection lost before receiving data
        spider = SimpleSpider("http://localhost:8998/drop?abort=1")
        yield docrawl(spider)
        self._assert_retried()

    def _assert_retried(self):
        log = get_testlog()
        self.assertEqual(log.count("Retrying"), 2)
        self.assertEqual(log.count("Gave up retrying"), 1)

    @defer.inlineCallbacks
    def test_referer_header(self):
        """Referer header is set by RefererMiddleware unless it is already set"""
        req0 = Request('http://localhost:8998/echo?headers=1&body=0', dont_filter=1)
        req1 = req0.replace()
        req2 = req0.replace(headers={'Referer': None})
        req3 = req0.replace(headers={'Referer': 'http://example.com'})
        req0.meta['next'] = req1
        req1.meta['next'] = req2
        req2.meta['next'] = req3
        spider = SingleRequestSpider(seed=req0)
        yield docrawl(spider)
        # basic asserts in case of weird communication errors
        self.assertIn('responses', spider.meta)
        self.assertNotIn('failures', spider.meta)
        # start requests doesn't set Referer header
        echo0 = json.loads(spider.meta['responses'][0].body)
        self.assertNotIn('Referer', echo0['headers'])
        # following request sets Referer to start request url
        echo1 = json.loads(spider.meta['responses'][1].body)
        self.assertEqual(echo1['headers'].get('Referer'), [req0.url])
        # next request avoids Referer header
        echo2 = json.loads(spider.meta['responses'][2].body)
        self.assertNotIn('Referer', echo2['headers'])
        # last request explicitly sets a Referer header
        echo3 = json.loads(spider.meta['responses'][3].body)
        self.assertEqual(echo3['headers'].get('Referer'), ['http://example.com'])

    @defer.inlineCallbacks
    def test_engine_status(self):
        from pyrake.utils.engine import get_engine_status
        est = []

        def cb(response):
            est.append(get_engine_status(spider.crawler.engine))

        spider = SingleRequestSpider(seed='http://localhost:8998/', callback_func=cb)
        yield docrawl(spider)
        self.assertEqual(len(est), 1, est)
        s = dict(est[0])
        self.assertEqual(s['engine.spider.name'], spider.name)
        self.assertEqual(s['len(engine.scraper.slot.active)'], 1)

class CrawlTestCase(TestCase):

    def setUp(self):
        self.mockserver = MockServer()
        self.mockserver.__enter__()
        self.runner = CrawlerRunner()

    def tearDown(self):
        self.mockserver.__exit__(None, None, None)

    @defer.inlineCallbacks
    def test_follow_all(self):
        crawler = self.runner.create_crawler(FollowAllSpider)
        yield crawler.crawl(mockserver=self.mockserver)
        self.assertEqual(len(crawler.spider.urls_visited), 11)  # 10 + start_url

    @defer.inlineCallbacks
    def test_delay(self):
        # short to long delays
        yield self._test_delay(0.2, False)
        yield self._test_delay(1, False)
        # randoms
        yield self._test_delay(0.2, True)
        yield self._test_delay(1, True)

    @defer.inlineCallbacks
    def _test_delay(self, delay, randomize):
        settings = {"DOWNLOAD_DELAY": delay, 'RANDOMIZE_DOWNLOAD_DELAY': randomize}
        crawler = CrawlerRunner(settings).create_crawler(FollowAllSpider)
        yield crawler.crawl(maxlatency=delay * 2, mockserver=self.mockserver)
        t = crawler.spider.times
        totaltime = t[-1] - t[0]
        avgd = totaltime / (len(t) - 1)
        tolerance = 0.6 if randomize else 0.2
        self.assertTrue(avgd > delay * (1 - tolerance),
                        "download delay too small: %s" % avgd)

    @defer.inlineCallbacks
    def test_timeout_success(self):
        crawler = self.runner.create_crawler(DelaySpider)
        yield crawler.crawl(n=0.5, mockserver=self.mockserver)
        self.assertTrue(crawler.spider.t1 > 0)
        self.assertTrue(crawler.spider.t2 > 0)
        self.assertTrue(crawler.spider.t2 > crawler.spider.t1)

    @defer.inlineCallbacks
    def test_timeout_failure(self):
        crawler = CrawlerRunner({"DOWNLOAD_TIMEOUT": 0.35}).create_crawler(DelaySpider)
        yield crawler.crawl(n=0.5, mockserver=self.mockserver)
        self.assertTrue(crawler.spider.t1 > 0)
        self.assertTrue(crawler.spider.t2 == 0)
        self.assertTrue(crawler.spider.t2_err > 0)
        self.assertTrue(crawler.spider.t2_err > crawler.spider.t1)
        # server hangs after receiving response headers
        yield crawler.crawl(n=0.5, b=1, mockserver=self.mockserver)
        self.assertTrue(crawler.spider.t1 > 0)
        self.assertTrue(crawler.spider.t2 == 0)
        self.assertTrue(crawler.spider.t2_err > 0)
        self.assertTrue(crawler.spider.t2_err > crawler.spider.t1)

    @defer.inlineCallbacks
    def test_retry_503(self):
        crawler = self.runner.create_crawler(SimpleSpider)
        with LogCapture() as l:
            yield crawler.crawl(self.mockserver.url("/status?n=503"),
                                mockserver=self.mockserver)
        self._assert_retried(l)

    @defer.inlineCallbacks
    def test_retry_conn_failed(self):
        crawler = self.runner.create_crawler(SimpleSpider)
        with LogCapture() as l:
            yield crawler.crawl("http://localhost:65432/status?n=503",
                                mockserver=self.mockserver)
        self._assert_retried(l)

    @defer.inlineCallbacks
    def test_retry_dns_error(self):
        crawler = self.runner.create_crawler(SimpleSpider)
        with LogCapture() as l:
            # try to fetch the homepage of a non-existent domain
            yield crawler.crawl("http://dns.resolution.invalid./",
                                mockserver=self.mockserver)
        self._assert_retried(l)

    @defer.inlineCallbacks
    def test_start_requests_bug_before_yield(self):
        with LogCapture('scrapy', level=logging.ERROR) as l:
            crawler = self.runner.create_crawler(BrokenStartRequestsSpider)
            yield crawler.crawl(fail_before_yield=1, mockserver=self.mockserver)
        self.assertEqual(len(l.records), 1)
        record = l.records[0]
        self.assertIsNotNone(record.exc_info)
        self.assertIs(record.exc_info[0], ZeroDivisionError)

    @defer.inlineCallbacks
    def test_start_requests_bug_yielding(self):
        with LogCapture('scrapy', level=logging.ERROR) as l:
            crawler = self.runner.create_crawler(BrokenStartRequestsSpider)
            yield crawler.crawl(fail_yielding=1, mockserver=self.mockserver)
        self.assertEqual(len(l.records), 1)
        record = l.records[0]
        self.assertIsNotNone(record.exc_info)
        self.assertIs(record.exc_info[0], ZeroDivisionError)

    @defer.inlineCallbacks
    def test_start_requests_lazyness(self):
        settings = {"CONCURRENT_REQUESTS": 1}
        crawler = CrawlerRunner(settings).create_crawler(BrokenStartRequestsSpider)
        yield crawler.crawl(mockserver=self.mockserver)
        #self.assertTrue(False, crawler.spider.seedsseen)
        #self.assertTrue(crawler.spider.seedsseen.index(None) < crawler.spider.seedsseen.index(99),
        #                crawler.spider.seedsseen)

    @defer.inlineCallbacks
    def test_start_requests_dupes(self):
        settings = {"CONCURRENT_REQUESTS": 1}
        crawler = CrawlerRunner(settings).create_crawler(DuplicateStartRequestsSpider)
        yield crawler.crawl(dont_filter=True, distinct_urls=2, dupe_factor=3,
                            mockserver=self.mockserver)
        self.assertEqual(crawler.spider.visited, 6)

        yield crawler.crawl(dont_filter=False, distinct_urls=3, dupe_factor=4,
                            mockserver=self.mockserver)
        self.assertEqual(crawler.spider.visited, 3)

    @defer.inlineCallbacks
    def test_unbounded_response(self):
        # Completeness of responses without Content-Length or Transfer-Encoding
        # cannot be determined; we treat them as valid but flag them as "partial"
        from six.moves.urllib.parse import urlencode
        query = urlencode({'raw': '''\
HTTP/1.1 200 OK
Server: Apache-Coyote/1.1
X-Powered-By: Servlet 2.4; JBoss-4.2.3.GA (build: SVNTag=JBoss_4_2_3_GA date=200807181417)/JBossWeb-2.0
Set-Cookie: JSESSIONID=08515F572832D0E659FD2B0D8031D75F; Path=/
Pragma: no-cache
Expires: Thu, 01 Jan 1970 00:00:00 GMT
Cache-Control: no-cache
Cache-Control: no-store
Content-Type: text/html;charset=UTF-8
Content-Language: en
Date: Tue, 27 Aug 2013 13:05:05 GMT
Connection: close

foo body
with multiples lines
'''})
        crawler = self.runner.create_crawler(SimpleSpider)
        with LogCapture() as l:
            yield crawler.crawl(self.mockserver.url("/raw?{0}".format(query)),
                                mockserver=self.mockserver)
        self.assertEqual(str(l).count("Got response 200"), 1)

    @defer.inlineCallbacks
    def test_retry_conn_lost(self):
        # connection lost after receiving data
        crawler = self.runner.create_crawler(SimpleSpider)
        with LogCapture() as l:
            yield crawler.crawl(self.mockserver.url("/drop?abort=0"),
                                mockserver=self.mockserver)
        self._assert_retried(l)

    @defer.inlineCallbacks
    def test_retry_conn_aborted(self):
        # connection lost before receiving data
        crawler = self.runner.create_crawler(SimpleSpider)
        with LogCapture() as l:
            yield crawler.crawl(self.mockserver.url("/drop?abort=1"),
                                mockserver=self.mockserver)
        self._assert_retried(l)

    def _assert_retried(self, log):
        self.assertEqual(str(log).count("Retrying"), 2)
        self.assertEqual(str(log).count("Gave up retrying"), 1)

    @defer.inlineCallbacks
    def test_referer_header(self):
        """Referer header is set by RefererMiddleware unless it is already set"""
        req0 = Request(self.mockserver.url('/echo?headers=1&body=0'), dont_filter=1)
        req1 = req0.replace()
        req2 = req0.replace(headers={'Referer': None})
        req3 = req0.replace(headers={'Referer': 'http://example.com'})
        req0.meta['next'] = req1
        req1.meta['next'] = req2
        req2.meta['next'] = req3
        crawler = self.runner.create_crawler(SingleRequestSpider)
        yield crawler.crawl(seed=req0, mockserver=self.mockserver)
        # basic asserts in case of weird communication errors
        self.assertIn('responses', crawler.spider.meta)
        self.assertNotIn('failures', crawler.spider.meta)
        # start requests doesn't set Referer header
        echo0 = json.loads(to_unicode(crawler.spider.meta['responses'][0].body))
        self.assertNotIn('Referer', echo0['headers'])
        # following request sets Referer to start request url
        echo1 = json.loads(to_unicode(crawler.spider.meta['responses'][1].body))
        self.assertEqual(echo1['headers'].get('Referer'), [req0.url])
        # next request avoids Referer header
        echo2 = json.loads(to_unicode(crawler.spider.meta['responses'][2].body))
        self.assertNotIn('Referer', echo2['headers'])
        # last request explicitly sets a Referer header
        echo3 = json.loads(to_unicode(crawler.spider.meta['responses'][3].body))
        self.assertEqual(echo3['headers'].get('Referer'), ['http://example.com'])

    @defer.inlineCallbacks
    def test_engine_status(self):
        from scrapy.utils.engine import get_engine_status
        est = []

        def cb(response):
            est.append(get_engine_status(crawler.engine))

        crawler = self.runner.create_crawler(SingleRequestSpider)
        yield crawler.crawl(seed=self.mockserver.url('/'), callback_func=cb,
                            mockserver=self.mockserver)
        self.assertEqual(len(est), 1, est)
        s = dict(est[0])
        self.assertEqual(s['engine.spider.name'], crawler.spider.name)
        self.assertEqual(s['len(engine.scraper.slot.active)'], 1)

    @defer.inlineCallbacks
    def test_graceful_crawl_error_handling(self):
        """
        Test whether errors happening anywhere in Crawler.crawl() are properly
        reported (and not somehow swallowed) after a graceful engine shutdown.
        The errors should not come from within Scrapy's core but from within
        spiders/middlewares/etc., e.g. raised in Spider.start_requests(),
        SpiderMiddleware.process_start_requests(), etc.
        """
        class TestError(Exception):
            pass

        class FaultySpider(SimpleSpider):
            def start_requests(self):
                raise TestError

        crawler = self.runner.create_crawler(FaultySpider)
        yield self.assertFailure(crawler.crawl(mockserver=self.mockserver), TestError)
        self.assertFalse(crawler.crawling)

    @defer.inlineCallbacks
    def test_open_spider_error_on_faulty_pipeline(self):
        settings = {
            "ITEM_PIPELINES": {
                "tests.pipelines.ZeroDivisionErrorPipeline": 300,
            }
        }
        crawler = CrawlerRunner(settings).create_crawler(SimpleSpider)
        yield self.assertFailure(
            self.runner.crawl(crawler, self.mockserver.url("/status?n=200"),
                              mockserver=self.mockserver),
            ZeroDivisionError)
        self.assertFalse(crawler.crawling)

    @defer.inlineCallbacks
    def test_crawlerrunner_accepts_crawler(self):
        crawler = self.runner.create_crawler(SimpleSpider)
        with LogCapture() as log:
            yield self.runner.crawl(crawler, self.mockserver.url("/status?n=200"),
                                    mockserver=self.mockserver)
        self.assertIn("Got response 200", str(log))

    @defer.inlineCallbacks
    def test_crawl_multiple(self):
        self.runner.crawl(SimpleSpider, self.mockserver.url("/status?n=200"),
                          mockserver=self.mockserver)
        self.runner.crawl(SimpleSpider, self.mockserver.url("/status?n=503"),
                          mockserver=self.mockserver)
        with LogCapture() as log:
            yield self.runner.join()
        self._assert_retried(log)
        self.assertIn("Got response 200", str(log))

class TestHttpErrorMiddlewareIntegrational(TrialTestCase):

    def setUp(self):
        self.mockserver = MockServer()
        self.mockserver.__enter__()

    def tearDown(self):
        self.mockserver.__exit__(None, None, None)

    @defer.inlineCallbacks
    def test_middleware_works(self):
        crawler = get_crawler(_HttpErrorSpider)
        yield crawler.crawl(mockserver=self.mockserver)
        assert not crawler.spider.skipped, crawler.spider.skipped
        self.assertEqual(crawler.spider.parsed, {"200"})
        self.assertEqual(crawler.spider.failed, {"404", "402", "500"})
        get_value = crawler.stats.get_value
        self.assertEqual(get_value("httperror/response_ignored_count"), 3)
        self.assertEqual(get_value("httperror/response_ignored_status_count/404"), 1)
        self.assertEqual(get_value("httperror/response_ignored_status_count/402"), 1)
        self.assertEqual(get_value("httperror/response_ignored_status_count/500"), 1)

    @defer.inlineCallbacks
    def test_logging(self):
        crawler = get_crawler(_HttpErrorSpider)
        with LogCapture() as log:
            yield crawler.crawl(mockserver=self.mockserver,
                                bypass_status_codes={402})
        self.assertEqual(crawler.spider.parsed, {"200", "402"})
        self.assertEqual(crawler.spider.skipped, {"402"})
        self.assertEqual(crawler.spider.failed, {"404", "500"})
        self.assertIn("Ignoring response <404", str(log))
        self.assertIn("Ignoring response <500", str(log))
        self.assertNotIn("Ignoring response <200", str(log))
        self.assertNotIn("Ignoring response <402", str(log))

    @defer.inlineCallbacks
    def test_logging_level(self):
        # HttpError logs ignored responses with level INFO
        crawler = get_crawler(_HttpErrorSpider)
        with LogCapture(level=logging.INFO) as log:
            yield crawler.crawl(mockserver=self.mockserver)
        self.assertEqual(crawler.spider.parsed, {"200"})
        self.assertEqual(crawler.spider.failed, {"404", "402", "500"})
        self.assertIn("Ignoring response <402", str(log))
        self.assertIn("Ignoring response <404", str(log))
        self.assertIn("Ignoring response <500", str(log))
        self.assertNotIn("Ignoring response <200", str(log))

        # with level WARNING, we shouldn't capture anything from HttpError
        crawler = get_crawler(_HttpErrorSpider)
        with LogCapture(level=logging.WARNING) as log:
            yield crawler.crawl(mockserver=self.mockserver)
        self.assertEqual(crawler.spider.parsed, {"200"})
        self.assertEqual(crawler.spider.failed, {"404", "402", "500"})
        self.assertNotIn("Ignoring response <402", str(log))
        self.assertNotIn("Ignoring response <404", str(log))
        self.assertNotIn("Ignoring response <500", str(log))
        self.assertNotIn("Ignoring response <200", str(log))

class CrawlSpiderTestCase(TestCase):

    def setUp(self):
        self.mockserver = MockServer()
        self.mockserver.__enter__()
        self.runner = CrawlerRunner()

    def tearDown(self):
        self.mockserver.__exit__(None, None, None)

    @defer.inlineCallbacks
    def _run_spider(self, spider_cls):
        items = []

        def _on_item_scraped(item):
            items.append(item)

        crawler = self.runner.create_crawler(spider_cls)
        crawler.signals.connect(_on_item_scraped, signals.item_scraped)
        with LogCapture() as log:
            yield crawler.crawl(self.mockserver.url("/status?n=200"),
                                mockserver=self.mockserver)
        return log, items, crawler.stats

    @defer.inlineCallbacks
    def test_crawlspider_with_parse(self):
        self.runner.crawl(CrawlSpiderWithParseMethod, mockserver=self.mockserver)
        with LogCapture() as log:
            yield self.runner.join()
        self.assertIn("[parse] status 200 (foo: None)", str(log))
        self.assertIn("[parse] status 201 (foo: None)", str(log))
        self.assertIn("[parse] status 202 (foo: bar)", str(log))

    @defer.inlineCallbacks
    def test_crawlspider_with_errback(self):
        self.runner.crawl(CrawlSpiderWithErrback, mockserver=self.mockserver)
        with LogCapture() as log:
            yield self.runner.join()
        self.assertIn("[parse] status 200 (foo: None)", str(log))
        self.assertIn("[parse] status 201 (foo: None)", str(log))
        self.assertIn("[parse] status 202 (foo: bar)", str(log))
        self.assertIn("[errback] status 404", str(log))
        self.assertIn("[errback] status 500", str(log))
        self.assertIn("[errback] status 501", str(log))

    @defer.inlineCallbacks
    def test_async_def_parse(self):
        self.runner.crawl(AsyncDefSpider, self.mockserver.url("/status?n=200"),
                          mockserver=self.mockserver)
        with LogCapture() as log:
            yield self.runner.join()
        self.assertIn("Got response 200", str(log))

    @mark.only_asyncio()
    @defer.inlineCallbacks
    def test_async_def_asyncio_parse(self):
        runner = CrawlerRunner({
            "TWISTED_REACTOR": "twisted.internet.asyncioreactor.AsyncioSelectorReactor",
        })
        runner.crawl(AsyncDefAsyncioSpider, self.mockserver.url("/status?n=200"),
                     mockserver=self.mockserver)
        with LogCapture() as log:
            yield runner.join()
        self.assertIn("Got response 200", str(log))

    @mark.only_asyncio()
    @defer.inlineCallbacks
    def test_async_def_asyncio_parse_items_list(self):
        log, items, _ = yield self._run_spider(AsyncDefAsyncioReturnSpider)
        self.assertIn("Got response 200", str(log))
        self.assertIn({'id': 1}, items)
        self.assertIn({'id': 2}, items)

    @mark.only_asyncio()
    @defer.inlineCallbacks
    def test_async_def_asyncio_parse_items_single_element(self):
        items = []

        def _on_item_scraped(item):
            items.append(item)

        crawler = self.runner.create_crawler(AsyncDefAsyncioReturnSingleElementSpider)
        crawler.signals.connect(_on_item_scraped, signals.item_scraped)
        with LogCapture() as log:
            yield crawler.crawl(self.mockserver.url("/status?n=200"),
                                mockserver=self.mockserver)
        self.assertIn("Got response 200", str(log))
        self.assertIn({"foo": 42}, items)

    @mark.only_asyncio()
    @defer.inlineCallbacks
    def test_async_def_asyncgen_parse(self):
        log, _, stats = yield self._run_spider(AsyncDefAsyncioGenSpider)
        self.assertIn("Got response 200", str(log))
        itemcount = stats.get_value('item_scraped_count')
        self.assertEqual(itemcount, 1)

    @mark.only_asyncio()
    @defer.inlineCallbacks
    def test_async_def_asyncgen_parse_loop(self):
        log, items, stats = yield self._run_spider(AsyncDefAsyncioGenLoopSpider)
        self.assertIn("Got response 200", str(log))
        itemcount = stats.get_value('item_scraped_count')
        self.assertEqual(itemcount, 10)
        for i in range(10):
            self.assertIn({'foo': i}, items)

    @mark.only_asyncio()
    @defer.inlineCallbacks
    def test_async_def_asyncgen_parse_complex(self):
        _, items, stats = yield self._run_spider(AsyncDefAsyncioGenComplexSpider)
        itemcount = stats.get_value('item_scraped_count')
        self.assertEqual(itemcount, 156)
        # some random items
        for i in [1, 4, 21, 22, 207, 311]:
            self.assertIn({'index': i}, items)
        for i in [10, 30, 122]:
            self.assertIn({'index2': i}, items)

    @mark.only_asyncio()
    @defer.inlineCallbacks
    def test_async_def_asyncio_parse_reqs_list(self):
        log, *_ = yield self._run_spider(AsyncDefAsyncioReqsReturnSpider)
        for req_id in range(3):
            self.assertIn(f"Got response 200, req_id {req_id}", str(log))

    @defer.inlineCallbacks
    def test_response_ssl_certificate_none(self):
        crawler = self.runner.create_crawler(SingleRequestSpider)
        url = self.mockserver.url("/echo?body=test", is_secure=False)
        yield crawler.crawl(seed=url, mockserver=self.mockserver)
        self.assertIsNone(crawler.spider.meta['responses'][0].certificate)

    @defer.inlineCallbacks
    def test_response_ssl_certificate(self):
        crawler = self.runner.create_crawler(SingleRequestSpider)
        url = self.mockserver.url("/echo?body=test", is_secure=True)
        yield crawler.crawl(seed=url, mockserver=self.mockserver)
        cert = crawler.spider.meta['responses'][0].certificate
        self.assertIsInstance(cert, Certificate)
        self.assertEqual(cert.getSubject().commonName, b"localhost")
        self.assertEqual(cert.getIssuer().commonName, b"localhost")

    @mark.xfail(reason="Responses with no body return early and contain no certificate")
    @defer.inlineCallbacks
    def test_response_ssl_certificate_empty_response(self):
        crawler = self.runner.create_crawler(SingleRequestSpider)
        url = self.mockserver.url("/status?n=200", is_secure=True)
        yield crawler.crawl(seed=url, mockserver=self.mockserver)
        cert = crawler.spider.meta['responses'][0].certificate
        self.assertIsInstance(cert, Certificate)
        self.assertEqual(cert.getSubject().commonName, b"localhost")
        self.assertEqual(cert.getIssuer().commonName, b"localhost")

    @defer.inlineCallbacks
    def test_dns_server_ip_address_none(self):
        crawler = self.runner.create_crawler(SingleRequestSpider)
        url = self.mockserver.url('/status?n=200')
        yield crawler.crawl(seed=url, mockserver=self.mockserver)
        ip_address = crawler.spider.meta['responses'][0].ip_address
        self.assertIsNone(ip_address)

    @defer.inlineCallbacks
    def test_dns_server_ip_address(self):
        crawler = self.runner.create_crawler(SingleRequestSpider)
        url = self.mockserver.url('/echo?body=test')
        expected_netloc, _ = urlparse(url).netloc.split(':')
        yield crawler.crawl(seed=url, mockserver=self.mockserver)
        ip_address = crawler.spider.meta['responses'][0].ip_address
        self.assertIsInstance(ip_address, IPv4Address)
        self.assertEqual(str(ip_address), gethostbyname(expected_netloc))

    @defer.inlineCallbacks
    def test_bytes_received_stop_download_callback(self):
        crawler = self.runner.create_crawler(BytesReceivedCallbackSpider)
        yield crawler.crawl(mockserver=self.mockserver)
        self.assertIsNone(crawler.spider.meta.get("failure"))
        self.assertIsInstance(crawler.spider.meta["response"], Response)
        self.assertEqual(crawler.spider.meta["response"].body,
                         crawler.spider.meta.get("bytes_received"))
        self.assertLess(len(crawler.spider.meta["response"].body),
                        crawler.spider.full_response_length)

    @defer.inlineCallbacks
    def test_bytes_received_stop_download_errback(self):
        crawler = self.runner.create_crawler(BytesReceivedErrbackSpider)
        yield crawler.crawl(mockserver=self.mockserver)
        self.assertIsNone(crawler.spider.meta.get("response"))
        self.assertIsInstance(crawler.spider.meta["failure"], Failure)
        self.assertIsInstance(crawler.spider.meta["failure"].value, StopDownload)
        self.assertIsInstance(crawler.spider.meta["failure"].value.response, Response)
        self.assertEqual(crawler.spider.meta["failure"].value.response.body,
                         crawler.spider.meta.get("bytes_received"))
        self.assertLess(len(crawler.spider.meta["failure"].value.response.body),
                        crawler.spider.full_response_length)

    @defer.inlineCallbacks
    def test_headers_received_stop_download_callback(self):
        crawler = self.runner.create_crawler(HeadersReceivedCallbackSpider)
        yield crawler.crawl(mockserver=self.mockserver)
        self.assertIsNone(crawler.spider.meta.get("failure"))
        self.assertIsInstance(crawler.spider.meta["response"], Response)
        self.assertEqual(crawler.spider.meta["response"].headers,
                         crawler.spider.meta.get("headers_received"))

    @defer.inlineCallbacks
    def test_headers_received_stop_download_errback(self):
        crawler = self.runner.create_crawler(HeadersReceivedErrbackSpider)
        yield crawler.crawl(mockserver=self.mockserver)
        self.assertIsNone(crawler.spider.meta.get("response"))
        self.assertIsInstance(crawler.spider.meta["failure"], Failure)
        self.assertIsInstance(crawler.spider.meta["failure"].value, StopDownload)
        self.assertIsInstance(crawler.spider.meta["failure"].value.response, Response)
        self.assertEqual(crawler.spider.meta["failure"].value.response.headers,
                         crawler.spider.meta.get("headers_received"))

class ProxyConnectTestCase(TestCase):

    def setUp(self):
        self.mockserver = MockServer()
        self.mockserver.__enter__()
        self._oldenv = os.environ.copy()
        self._proxy = MitmProxy()
        proxy_url = self._proxy.start()
        os.environ['https_proxy'] = proxy_url
        os.environ['http_proxy'] = proxy_url

    def tearDown(self):
        self.mockserver.__exit__(None, None, None)
        self._proxy.stop()
        os.environ = self._oldenv

    @defer.inlineCallbacks
    def test_https_connect_tunnel(self):
        crawler = get_crawler(SimpleSpider)
        with LogCapture() as l:
            yield crawler.crawl(
                self.mockserver.url("/status?n=200", is_secure=True))
        self._assert_got_response_code(200, l)

    @pytest.mark.xfail(reason='Python 3.6+ fails this earlier',
                       condition=sys.version_info.minor >= 6)
    @defer.inlineCallbacks
    def test_https_connect_tunnel_error(self):
        crawler = get_crawler(SimpleSpider)
        with LogCapture() as l:
            yield crawler.crawl("https://localhost:99999/status?n=200")
        self._assert_got_tunnel_error(l)

    @defer.inlineCallbacks
    def test_https_tunnel_auth_error(self):
        os.environ['https_proxy'] = _wrong_credentials(
            os.environ['https_proxy'])
        crawler = get_crawler(SimpleSpider)
        with LogCapture() as l:
            yield crawler.crawl(
                self.mockserver.url("/status?n=200", is_secure=True))
        # The proxy returns a 407 error code, but it does not reach the client;
        # the client just sees a TunnelError.
        self._assert_got_tunnel_error(l)

    @defer.inlineCallbacks
    def test_https_tunnel_without_leak_proxy_authorization_header(self):
        request = Request(self.mockserver.url("/echo", is_secure=True))
        crawler = get_crawler(SingleRequestSpider)
        with LogCapture() as l:
            yield crawler.crawl(seed=request)
        self._assert_got_response_code(200, l)
        echo = json.loads(crawler.spider.meta['responses'][0].text)
        self.assertTrue('Proxy-Authorization' not in echo['headers'])

    # The noconnect mode isn't supported by the current mitmproxy, which returns
    # "Invalid request scheme: https" as it doesn't seem to support full URLs in
    # GET at all, and it's not clear what behavior is intended by Scrapy and by
    # mitmproxy here. https://github.com/mitmproxy/mitmproxy/issues/848 may be
    # related. The Scrapy noconnect mode was required, at least in the past, to
    # work with Crawlera, and
    # https://github.com/scrapy-plugins/scrapy-crawlera/pull/44 seems to be related.
    @pytest.mark.xfail(reason='mitmproxy gives an error for noconnect requests')
    @defer.inlineCallbacks
    def test_https_noconnect(self):
        proxy = os.environ['https_proxy']
        os.environ['https_proxy'] = proxy + '?noconnect'
        crawler = get_crawler(SimpleSpider)
        with LogCapture() as l:
            yield crawler.crawl(
                self.mockserver.url("/status?n=200", is_secure=True))
        self._assert_got_response_code(200, l)

    @pytest.mark.xfail(reason='mitmproxy gives an error for noconnect requests')
    @defer.inlineCallbacks
    def test_https_noconnect_auth_error(self):
        os.environ['https_proxy'] = _wrong_credentials(
            os.environ['https_proxy']) + '?noconnect'
        crawler = get_crawler(SimpleSpider)
        with LogCapture() as l:
            yield crawler.crawl(
                self.mockserver.url("/status?n=200", is_secure=True))
        self._assert_got_response_code(407, l)

    def _assert_got_response_code(self, code, log):
        print(log)
        self.assertEqual(str(log).count('Crawled (%d)' % code), 1)

    def _assert_got_tunnel_error(self, log):
        print(log)
        self.assertIn('TunnelError', str(log))

class ProxyConnectTestCase(TestCase): def setUp(self): self.mockserver = MockServer() self.mockserver.__enter__() self._oldenv = os.environ.copy() self._proxy = HTTPSProxy(8888) self._proxy.start() # Wait for the proxy to start. time.sleep(1.0) os.environ['http_proxy'] = 'http://*****:*****@localhost:8888' os.environ['https_proxy'] = 'http://*****:*****@localhost:8888' def tearDown(self): self.mockserver.__exit__(None, None, None) self._proxy.shutdown() os.environ = self._oldenv @defer.inlineCallbacks def test_https_connect_tunnel(self): crawler = get_crawler(SimpleSpider) with LogCapture() as l: yield crawler.crawl("https://localhost:8999/status?n=200") self._assert_got_response_code(200, l) @defer.inlineCallbacks def test_https_noconnect(self): os.environ['https_proxy'] = 'http://*****:*****@localhost:8888?noconnect' crawler = get_crawler(SimpleSpider) with LogCapture() as l: yield crawler.crawl("https://localhost:8999/status?n=200") self._assert_got_response_code(200, l) os.environ['https_proxy'] = 'http://*****:*****@localhost:8888' @defer.inlineCallbacks def test_https_connect_tunnel_error(self): crawler = get_crawler(SimpleSpider) with LogCapture() as l: yield crawler.crawl("https://localhost:99999/status?n=200") self._assert_got_tunnel_error(l) @defer.inlineCallbacks def test_https_tunnel_auth_error(self): os.environ['https_proxy'] = 'http://*****:*****@localhost:8888' crawler = get_crawler(SimpleSpider) with LogCapture() as l: yield crawler.crawl("https://localhost:8999/status?n=200") self._assert_got_tunnel_error(l) os.environ['https_proxy'] = 'http://*****:*****@localhost:8888' @defer.inlineCallbacks def test_https_tunnel_without_leak_proxy_authorization_header(self): request = Request("https://localhost:8999/echo") crawler = get_crawler(SingleRequestSpider) with LogCapture() as l: yield crawler.crawl(seed=request) self._assert_got_response_code(200, l) echo = json.loads(crawler.spider.meta['responses'][0].body) self.assertNotIn('Proxy-Authorization', echo['headers']) @defer.inlineCallbacks def test_https_noconnect_auth_error(self): os.environ['https_proxy'] = 'http://*****:*****@localhost:8888?noconnect' crawler = get_crawler(SimpleSpider) with LogCapture() as l: yield crawler.crawl("https://localhost:8999/status?n=200") self._assert_got_response_code(407, l) def _assert_got_response_code(self, code, log): print(log) self.assertEqual(str(log).count('Crawled (%d)' % code), 1) def _assert_got_tunnel_error(self, log): print(log) self.assertIn('TunnelError', str(log))
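The time.sleep(1.0) in setUp() above is a fixed-delay guess that can flake on slow machines. A sturdier stdlib-only alternative is to poll the proxy port until it accepts connections; wait_for_port below is a hypothetical helper, not part of this suite:

import socket
import time


def wait_for_port(host, port, timeout=5.0):
    # Poll until a TCP connection succeeds instead of sleeping a fixed time.
    deadline = time.monotonic() + timeout
    while time.monotonic() < deadline:
        try:
            with socket.create_connection((host, port), timeout=0.2):
                return
        except OSError:
            time.sleep(0.05)
    raise RuntimeError('port %s:%d not ready after %.1fs' % (host, port, timeout))

# In setUp(): wait_for_port('localhost', 8888) instead of time.sleep(1.0)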
class ProxyConnectTestCase(TestCase): def setUp(self): self.mockserver = MockServer() self.mockserver.__enter__() self._oldenv = os.environ.copy() self._proxy = HTTPSProxy(8888) self._proxy.start() # Wait for the proxy to start. time.sleep(1.0) os.environ['http_proxy'] = 'http://*****:*****@localhost:8888' os.environ['https_proxy'] = 'http://*****:*****@localhost:8888' def tearDown(self): self.mockserver.__exit__(None, None, None) self._proxy.shutdown() os.environ = self._oldenv @defer.inlineCallbacks def test_https_connect_tunnel(self): crawler = get_crawler(SimpleSpider) with LogCapture() as l: yield crawler.crawl("https://localhost:8999/status?n=200") self._assert_got_response_code(200, l) @defer.inlineCallbacks def test_https_noconnect(self): os.environ['https_proxy'] = 'http://*****:*****@localhost:8888?noconnect' crawler = get_crawler(SimpleSpider) with LogCapture() as l: yield crawler.crawl("https://localhost:8999/status?n=200") self._assert_got_response_code(200, l) os.environ['https_proxy'] = 'http://*****:*****@localhost:8888' @defer.inlineCallbacks def test_https_connect_tunnel_error(self): crawler = get_crawler(SimpleSpider) with LogCapture() as l: yield crawler.crawl("https://localhost:99999/status?n=200") self._assert_got_tunnel_error(l) @defer.inlineCallbacks def test_https_tunnel_auth_error(self): os.environ['https_proxy'] = 'http://*****:*****@localhost:8888' crawler = get_crawler(SimpleSpider) with LogCapture() as l: yield crawler.crawl("https://localhost:8999/status?n=200") self._assert_got_tunnel_error(l) os.environ['https_proxy'] = 'http://*****:*****@localhost:8888' @defer.inlineCallbacks def test_https_tunnel_without_leak_proxy_authorization_header(self): request = Request("https://localhost:8999/echo") crawler = get_crawler(SingleRequestSpider) with LogCapture() as l: yield crawler.crawl(seed=request) self._assert_got_response_code(200, l) echo = json.loads(crawler.spider.meta['responses'][0].body) self.assertNotIn('Proxy-Authorization', echo['headers']) @defer.inlineCallbacks def test_https_noconnect_auth_error(self): os.environ['https_proxy'] = 'http://*****:*****@localhost:8888?noconnect' crawler = get_crawler(SimpleSpider) with LogCapture() as l: yield crawler.crawl("https://localhost:8999/status?n=200") self._assert_got_response_code(407, l) def _assert_got_response_code(self, code, log): self.assertEqual(str(log).count('Crawled (%d)' % code), 1) def _assert_got_tunnel_error(self, log): self.assertEqual(str(log).count('TunnelError'), 1)
class CrawlTestCase(TestCase): def setUp(self): self.mockserver = MockServer() self.mockserver.__enter__() def tearDown(self): self.mockserver.__exit__(None, None, None) @defer.inlineCallbacks def test_follow_all(self): crawler = get_crawler(FollowAllSpider) yield crawler.crawl() self.assertEqual(len(crawler.spider.urls_visited), 11) # 10 + start_url @defer.inlineCallbacks def test_delay(self): # short to long delays yield self._test_delay(0.2, False) yield self._test_delay(1, False) # randoms yield self._test_delay(0.2, True) yield self._test_delay(1, True) @defer.inlineCallbacks def _test_delay(self, delay, randomize): settings = {"DOWNLOAD_DELAY": delay, 'RANDOMIZE_DOWNLOAD_DELAY': randomize} crawler = get_crawler(FollowAllSpider, settings) yield crawler.crawl(maxlatency=delay * 2) t = crawler.spider.times totaltime = t[-1] - t[0] avgd = totaltime / (len(t) - 1) tolerance = 0.6 if randomize else 0.2 self.assertTrue(avgd > delay * (1 - tolerance), "download delay too small: %s" % avgd) @defer.inlineCallbacks def test_timeout_success(self): crawler = get_crawler(DelaySpider) yield crawler.crawl(n=0.5) self.assertTrue(crawler.spider.t1 > 0) self.assertTrue(crawler.spider.t2 > 0) self.assertTrue(crawler.spider.t2 > crawler.spider.t1) @defer.inlineCallbacks def test_timeout_failure(self): crawler = get_crawler(DelaySpider, {"DOWNLOAD_TIMEOUT": 0.35}) yield crawler.crawl(n=0.5) self.assertTrue(crawler.spider.t1 > 0) self.assertTrue(crawler.spider.t2 == 0) self.assertTrue(crawler.spider.t2_err > 0) self.assertTrue(crawler.spider.t2_err > crawler.spider.t1) # server hangs after receiving response headers yield crawler.crawl(n=0.5, b=1) self.assertTrue(crawler.spider.t1 > 0) self.assertTrue(crawler.spider.t2 == 0) self.assertTrue(crawler.spider.t2_err > 0) self.assertTrue(crawler.spider.t2_err > crawler.spider.t1) @defer.inlineCallbacks def test_retry_503(self): crawler = get_crawler(SimpleSpider) yield crawler.crawl("http://localhost:8998/status?n=503") self._assert_retried() @defer.inlineCallbacks def test_retry_conn_failed(self): crawler = get_crawler(SimpleSpider) yield crawler.crawl("http://localhost:65432/status?n=503") self._assert_retried() @defer.inlineCallbacks def test_retry_dns_error(self): with mock.patch('socket.gethostbyname', side_effect=socket.gaierror(-5, 'No address associated with hostname')): crawler = get_crawler(SimpleSpider) yield crawler.crawl("http://example.com/") self._assert_retried() @defer.inlineCallbacks def test_start_requests_bug_before_yield(self): crawler = get_crawler(BrokenStartRequestsSpider) yield crawler.crawl(fail_before_yield=1) errors = self.flushLoggedErrors(ZeroDivisionError) self.assertEqual(len(errors), 1) @defer.inlineCallbacks def test_start_requests_bug_yielding(self): crawler = get_crawler(BrokenStartRequestsSpider) yield crawler.crawl(fail_yielding=1) errors = self.flushLoggedErrors(ZeroDivisionError) self.assertEqual(len(errors), 1) @defer.inlineCallbacks def test_start_requests_lazyness(self): settings = {"CONCURRENT_REQUESTS": 1} crawler = get_crawler(BrokenStartRequestsSpider, settings) yield crawler.crawl() #self.assertTrue(False, crawler.spider.seedsseen) #self.assertTrue(crawler.spider.seedsseen.index(None) < crawler.spider.seedsseen.index(99), # crawler.spider.seedsseen) @defer.inlineCallbacks def test_start_requests_dupes(self): settings = {"CONCURRENT_REQUESTS": 1} crawler = get_crawler(DuplicateStartRequestsSpider, settings) yield crawler.crawl(dont_filter=True, distinct_urls=2, dupe_factor=3) 
self.assertEqual(crawler.spider.visited, 6) yield crawler.crawl(dont_filter=False, distinct_urls=3, dupe_factor=4) self.assertEqual(crawler.spider.visited, 3) @defer.inlineCallbacks def test_unbounded_response(self): # Completeness of responses without Content-Length or Transfer-Encoding # cannot be determined; we treat them as valid but flag them as "partial" from urllib import urlencode query = urlencode({'raw': '''\ HTTP/1.1 200 OK Server: Apache-Coyote/1.1 X-Powered-By: Servlet 2.4; JBoss-4.2.3.GA (build: SVNTag=JBoss_4_2_3_GA date=200807181417)/JBossWeb-2.0 Set-Cookie: JSESSIONID=08515F572832D0E659FD2B0D8031D75F; Path=/ Pragma: no-cache Expires: Thu, 01 Jan 1970 00:00:00 GMT Cache-Control: no-cache Cache-Control: no-store Content-Type: text/html;charset=UTF-8 Content-Language: en Date: Tue, 27 Aug 2013 13:05:05 GMT Connection: close foo body with multiples lines '''}) crawler = get_crawler(SimpleSpider) yield crawler.crawl("http://localhost:8998/raw?{0}".format(query)) log = get_testlog() self.assertEqual(log.count("Got response 200"), 1) @defer.inlineCallbacks def test_retry_conn_lost(self): # connection lost after receiving data crawler = get_crawler(SimpleSpider) yield crawler.crawl("http://localhost:8998/drop?abort=0") self._assert_retried() @defer.inlineCallbacks def test_retry_conn_aborted(self): # connection lost before receiving data crawler = get_crawler(SimpleSpider) yield crawler.crawl("http://localhost:8998/drop?abort=1") self._assert_retried() def _assert_retried(self): log = get_testlog() self.assertEqual(log.count("Retrying"), 2) self.assertEqual(log.count("Gave up retrying"), 1) @defer.inlineCallbacks def test_referer_header(self): """Referer header is set by RefererMiddleware unless it is already set""" req0 = Request('http://localhost:8998/echo?headers=1&body=0', dont_filter=1) req1 = req0.replace() req2 = req0.replace(headers={'Referer': None}) req3 = req0.replace(headers={'Referer': 'http://example.com'}) req0.meta['next'] = req1 req1.meta['next'] = req2 req2.meta['next'] = req3 crawler = get_crawler(SingleRequestSpider) yield crawler.crawl(seed=req0) # basic asserts in case of weird communication errors self.assertIn('responses', crawler.spider.meta) self.assertNotIn('failures', crawler.spider.meta) # start requests doesn't set Referer header echo0 = json.loads(crawler.spider.meta['responses'][2].body) self.assertNotIn('Referer', echo0['headers']) # following request sets Referer to start request url echo1 = json.loads(crawler.spider.meta['responses'][1].body) self.assertEqual(echo1['headers'].get('Referer'), [req0.url]) # next request avoids Referer header echo2 = json.loads(crawler.spider.meta['responses'][2].body) self.assertNotIn('Referer', echo2['headers']) # last request explicitly sets a Referer header echo3 = json.loads(crawler.spider.meta['responses'][3].body) self.assertEqual(echo3['headers'].get('Referer'), ['http://example.com']) @defer.inlineCallbacks def test_engine_status(self): from scrapy.utils.engine import get_engine_status est = [] def cb(response): est.append(get_engine_status(crawler.engine)) crawler = get_crawler(SingleRequestSpider) yield crawler.crawl(seed='http://localhost:8998/', callback_func=cb) self.assertEqual(len(est), 1, est) s = dict(est[0]) self.assertEqual(s['engine.spider.name'], crawler.spider.name) self.assertEqual(s['len(engine.scraper.slot.active)'], 1)
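As the test_unbounded_response comment notes, a response that carries neither Content-Length nor Transfer-Encoding is still delivered, but Scrapy marks it with the 'partial' response flag. A minimal sketch of a spider reacting to that flag (illustrative only, not part of the suite):

from scrapy import Spider


class PartialAwareSpider(Spider):
    # Illustrative spider, not part of the test suite.
    name = 'partial_aware'

    def parse(self, response):
        # 'partial' means completeness could not be verified: the body
        # may have been truncated by a dropped connection.
        if 'partial' in response.flags:
            self.logger.warning('possibly incomplete response: %s', response.url)
        yield {'url': response.url, 'partial': 'partial' in response.flags}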
class CrawlTestCase(TestCase): def setUp(self): self.mockserver = MockServer() self.mockserver.__enter__() self.runner = CrawlerRunner() def tearDown(self): self.mockserver.__exit__(None, None, None) @defer.inlineCallbacks def test_follow_all(self): crawler = self.runner.create_crawler(FollowAllSpider) yield crawler.crawl(mockserver=self.mockserver) self.assertEqual(len(crawler.spider.urls_visited), 11) # 10 + start_url @defer.inlineCallbacks def test_fixed_delay(self): yield self._test_delay(total=3, delay=0.1) @defer.inlineCallbacks def test_randomized_delay(self): yield self._test_delay(total=3, delay=0.1, randomize=True) @defer.inlineCallbacks def _test_delay(self, total, delay, randomize=False): crawl_kwargs = dict( maxlatency=delay * 2, mockserver=self.mockserver, total=total, ) tolerance = (1 - (0.6 if randomize else 0.2)) settings = { "DOWNLOAD_DELAY": delay, 'RANDOMIZE_DOWNLOAD_DELAY': randomize } crawler = CrawlerRunner(settings).create_crawler(FollowAllSpider) yield crawler.crawl(**crawl_kwargs) times = crawler.spider.times total_time = times[-1] - times[0] average = total_time / (len(times) - 1) self.assertTrue(average > delay * tolerance, "download delay too small: %s" % average) # Ensure that the same test parameters would cause a failure if no # download delay is set. Otherwise, it means we are using a combination # of ``total`` and ``delay`` values that are too small for the test # code above to have any meaning. settings["DOWNLOAD_DELAY"] = 0 crawler = CrawlerRunner(settings).create_crawler(FollowAllSpider) yield crawler.crawl(**crawl_kwargs) times = crawler.spider.times total_time = times[-1] - times[0] average = total_time / (len(times) - 1) self.assertFalse(average > delay / tolerance, "test total or delay values are too small") @defer.inlineCallbacks def test_timeout_success(self): crawler = self.runner.create_crawler(DelaySpider) yield crawler.crawl(n=0.5, mockserver=self.mockserver) self.assertTrue(crawler.spider.t1 > 0) self.assertTrue(crawler.spider.t2 > 0) self.assertTrue(crawler.spider.t2 > crawler.spider.t1) @defer.inlineCallbacks def test_timeout_failure(self): crawler = CrawlerRunner({ "DOWNLOAD_TIMEOUT": 0.35 }).create_crawler(DelaySpider) yield crawler.crawl(n=0.5, mockserver=self.mockserver) self.assertTrue(crawler.spider.t1 > 0) self.assertTrue(crawler.spider.t2 == 0) self.assertTrue(crawler.spider.t2_err > 0) self.assertTrue(crawler.spider.t2_err > crawler.spider.t1) # server hangs after receiving response headers yield crawler.crawl(n=0.5, b=1, mockserver=self.mockserver) self.assertTrue(crawler.spider.t1 > 0) self.assertTrue(crawler.spider.t2 == 0) self.assertTrue(crawler.spider.t2_err > 0) self.assertTrue(crawler.spider.t2_err > crawler.spider.t1) @defer.inlineCallbacks def test_retry_503(self): crawler = self.runner.create_crawler(SimpleSpider) with LogCapture() as l: yield crawler.crawl(self.mockserver.url("/status?n=503"), mockserver=self.mockserver) self._assert_retried(l) @defer.inlineCallbacks def test_retry_conn_failed(self): crawler = self.runner.create_crawler(SimpleSpider) with LogCapture() as l: yield crawler.crawl("http://localhost:65432/status?n=503", mockserver=self.mockserver) self._assert_retried(l) @defer.inlineCallbacks def test_retry_dns_error(self): crawler = self.runner.create_crawler(SimpleSpider) with LogCapture() as l: # try to fetch the homepage of a non-existent domain yield crawler.crawl("http://dns.resolution.invalid./", mockserver=self.mockserver) self._assert_retried(l) @defer.inlineCallbacks def 
test_start_requests_bug_before_yield(self): with LogCapture('scrapy', level=logging.ERROR) as l: crawler = self.runner.create_crawler(BrokenStartRequestsSpider) yield crawler.crawl(fail_before_yield=1, mockserver=self.mockserver) self.assertEqual(len(l.records), 1) record = l.records[0] self.assertIsNotNone(record.exc_info) self.assertIs(record.exc_info[0], ZeroDivisionError) @defer.inlineCallbacks def test_start_requests_bug_yielding(self): with LogCapture('scrapy', level=logging.ERROR) as l: crawler = self.runner.create_crawler(BrokenStartRequestsSpider) yield crawler.crawl(fail_yielding=1, mockserver=self.mockserver) self.assertEqual(len(l.records), 1) record = l.records[0] self.assertIsNotNone(record.exc_info) self.assertIs(record.exc_info[0], ZeroDivisionError) @defer.inlineCallbacks def test_start_requests_lazyness(self): settings = {"CONCURRENT_REQUESTS": 1} crawler = CrawlerRunner(settings).create_crawler( BrokenStartRequestsSpider) yield crawler.crawl(mockserver=self.mockserver) #self.assertTrue(False, crawler.spider.seedsseen) #self.assertTrue(crawler.spider.seedsseen.index(None) < crawler.spider.seedsseen.index(99), # crawler.spider.seedsseen) @defer.inlineCallbacks def test_start_requests_dupes(self): settings = {"CONCURRENT_REQUESTS": 1} crawler = CrawlerRunner(settings).create_crawler( DuplicateStartRequestsSpider) yield crawler.crawl(dont_filter=True, distinct_urls=2, dupe_factor=3, mockserver=self.mockserver) self.assertEqual(crawler.spider.visited, 6) yield crawler.crawl(dont_filter=False, distinct_urls=3, dupe_factor=4, mockserver=self.mockserver) self.assertEqual(crawler.spider.visited, 3) @defer.inlineCallbacks def test_unbounded_response(self): # Completeness of responses without Content-Length or Transfer-Encoding # cannot be determined; we treat them as valid but flag them as "partial" from urllib.parse import urlencode query = urlencode({ 'raw': '''\ HTTP/1.1 200 OK Server: Apache-Coyote/1.1 X-Powered-By: Servlet 2.4; JBoss-4.2.3.GA (build: SVNTag=JBoss_4_2_3_GA date=200807181417)/JBossWeb-2.0 Set-Cookie: JSESSIONID=08515F572832D0E659FD2B0D8031D75F; Path=/ Pragma: no-cache Expires: Thu, 01 Jan 1970 00:00:00 GMT Cache-Control: no-cache Cache-Control: no-store Content-Type: text/html;charset=UTF-8 Content-Language: en Date: Tue, 27 Aug 2013 13:05:05 GMT Connection: close foo body with multiples lines ''' }) crawler = self.runner.create_crawler(SimpleSpider) with LogCapture() as l: yield crawler.crawl(self.mockserver.url("/raw?{0}".format(query)), mockserver=self.mockserver) self.assertEqual(str(l).count("Got response 200"), 1) @defer.inlineCallbacks def test_retry_conn_lost(self): # connection lost after receiving data crawler = self.runner.create_crawler(SimpleSpider) with LogCapture() as l: yield crawler.crawl(self.mockserver.url("/drop?abort=0"), mockserver=self.mockserver) self._assert_retried(l) @defer.inlineCallbacks def test_retry_conn_aborted(self): # connection lost before receiving data crawler = self.runner.create_crawler(SimpleSpider) with LogCapture() as l: yield crawler.crawl(self.mockserver.url("/drop?abort=1"), mockserver=self.mockserver) self._assert_retried(l) def _assert_retried(self, log): self.assertEqual(str(log).count("Retrying"), 2) self.assertEqual(str(log).count("Gave up retrying"), 1) @defer.inlineCallbacks def test_referer_header(self): """Referer header is set by RefererMiddleware unless it is already set""" req0 = Request(self.mockserver.url('/echo?headers=1&body=0'), dont_filter=1) req1 = req0.replace() req2 =
req0.replace(headers={'Referer': None}) req3 = req0.replace(headers={'Referer': 'http://example.com'}) req0.meta['next'] = req1 req1.meta['next'] = req2 req2.meta['next'] = req3 crawler = self.runner.create_crawler(SingleRequestSpider) yield crawler.crawl(seed=req0, mockserver=self.mockserver) # basic asserts in case of weird communication errors self.assertIn('responses', crawler.spider.meta) self.assertNotIn('failures', crawler.spider.meta) # start requests doesn't set Referer header echo0 = json.loads(to_unicode( crawler.spider.meta['responses'][2].body)) self.assertNotIn('Referer', echo0['headers']) # following request sets Referer to start request url echo1 = json.loads(to_unicode( crawler.spider.meta['responses'][1].body)) self.assertEqual(echo1['headers'].get('Referer'), [req0.url]) # next request avoids Referer header echo2 = json.loads(to_unicode( crawler.spider.meta['responses'][2].body)) self.assertNotIn('Referer', echo2['headers']) # last request explicitly sets a Referer header echo3 = json.loads(to_unicode( crawler.spider.meta['responses'][3].body)) self.assertEqual(echo3['headers'].get('Referer'), ['http://example.com']) @defer.inlineCallbacks def test_engine_status(self): from scrapy.utils.engine import get_engine_status est = [] def cb(response): est.append(get_engine_status(crawler.engine)) crawler = self.runner.create_crawler(SingleRequestSpider) yield crawler.crawl(seed=self.mockserver.url('/'), callback_func=cb, mockserver=self.mockserver) self.assertEqual(len(est), 1, est) s = dict(est[0]) self.assertEqual(s['engine.spider.name'], crawler.spider.name) self.assertEqual(s['len(engine.scraper.slot.active)'], 1) @defer.inlineCallbacks def test_graceful_crawl_error_handling(self): """ Test whether errors happening anywhere in Crawler.crawl() are properly reported (and not somehow swallowed) after a graceful engine shutdown. The errors should not come from within Scrapy's core but from within spiders/middlewares/etc., e.g. raised in Spider.start_requests(), SpiderMiddleware.process_start_requests(), etc. 
""" class TestError(Exception): pass class FaultySpider(SimpleSpider): def start_requests(self): raise TestError crawler = self.runner.create_crawler(FaultySpider) yield self.assertFailure(crawler.crawl(mockserver=self.mockserver), TestError) self.assertFalse(crawler.crawling) @defer.inlineCallbacks def test_open_spider_error_on_faulty_pipeline(self): settings = { "ITEM_PIPELINES": { "tests.pipelines.ZeroDivisionErrorPipeline": 300, } } crawler = CrawlerRunner(settings).create_crawler(SimpleSpider) yield self.assertFailure( self.runner.crawl(crawler, self.mockserver.url("/status?n=200"), mockserver=self.mockserver), ZeroDivisionError) self.assertFalse(crawler.crawling) @defer.inlineCallbacks def test_crawlerrunner_accepts_crawler(self): crawler = self.runner.create_crawler(SimpleSpider) with LogCapture() as log: yield self.runner.crawl(crawler, self.mockserver.url("/status?n=200"), mockserver=self.mockserver) self.assertIn("Got response 200", str(log)) @defer.inlineCallbacks def test_crawl_multiple(self): self.runner.crawl(SimpleSpider, self.mockserver.url("/status?n=200"), mockserver=self.mockserver) self.runner.crawl(SimpleSpider, self.mockserver.url("/status?n=503"), mockserver=self.mockserver) with LogCapture() as log: yield self.runner.join() self._assert_retried(log) self.assertIn("Got response 200", str(log)) @defer.inlineCallbacks def test_crawlspider_with_errback(self): self.runner.crawl(CrawlSpiderWithErrback, mockserver=self.mockserver) with LogCapture() as log: yield self.runner.join() self.assertIn("[callback] status 200", str(log)) self.assertIn("[callback] status 201", str(log)) self.assertIn("[errback] status 404", str(log)) self.assertIn("[errback] status 500", str(log)) @defer.inlineCallbacks def test_async_def_parse(self): self.runner.crawl(AsyncDefSpider, self.mockserver.url("/status?n=200"), mockserver=self.mockserver) with LogCapture() as log: yield self.runner.join() self.assertIn("Got response 200", str(log)) @mark.only_asyncio() @defer.inlineCallbacks def test_async_def_asyncio_parse(self): runner = CrawlerRunner({"ASYNCIO_REACTOR": True}) runner.crawl(AsyncDefAsyncioSpider, self.mockserver.url("/status?n=200"), mockserver=self.mockserver) with LogCapture() as log: yield runner.join() self.assertIn("Got response 200", str(log)) @mark.only_asyncio() @defer.inlineCallbacks def test_async_def_asyncio_parse_list(self): items = [] def _on_item_scraped(item): items.append(item) crawler = self.runner.create_crawler(AsyncDefAsyncioReturnSpider) crawler.signals.connect(_on_item_scraped, signals.item_scraped) with LogCapture() as log: yield crawler.crawl(self.mockserver.url("/status?n=200"), mockserver=self.mockserver) self.assertIn("Got response 200", str(log)) self.assertIn({'id': 1}, items) self.assertIn({'id': 2}, items)
class FileDownloadCrawlTestCase(TestCase): pipeline_class = 'scrapy.pipelines.files.FilesPipeline' store_setting_key = 'FILES_STORE' media_key = 'files' media_urls_key = 'file_urls' expected_checksums = { '5547178b89448faf0015a13f904c936e', 'c2281c83670e31d8aaab7cb642b824db', 'ed3f6538dc15d4d9179dae57319edc5f' } def setUp(self): self.mockserver = MockServer() self.mockserver.__enter__() # prepare a directory for storing files self.tmpmediastore = self.mktemp() os.mkdir(self.tmpmediastore) self.settings = { 'ITEM_PIPELINES': { self.pipeline_class: 1 }, self.store_setting_key: self.tmpmediastore, } self.runner = CrawlerRunner(self.settings) self.items = [] def tearDown(self): shutil.rmtree(self.tmpmediastore) self.items = [] self.mockserver.__exit__(None, None, None) def _on_item_scraped(self, item): self.items.append(item) def _create_crawler(self, spider_class, **kwargs): crawler = self.runner.create_crawler(spider_class, **kwargs) crawler.signals.connect(self._on_item_scraped, signals.item_scraped) return crawler def _assert_files_downloaded(self, items, logs): self.assertEqual(len(items), 1) self.assertIn(self.media_key, items[0]) # check that logs show the expected number of successful file downloads file_dl_success = 'File (downloaded): Downloaded file from' self.assertEqual(logs.count(file_dl_success), 3) # check that the images/files status is `downloaded` for item in items: for i in item[self.media_key]: self.assertEqual(i['status'], 'downloaded') # check that the images/files checksums are what we know they should be if self.expected_checksums is not None: checksums = set(i['checksum'] for item in items for i in item[self.media_key]) self.assertEqual(checksums, self.expected_checksums) # check that the image files were actually written to the media store for item in items: for i in item[self.media_key]: self.assertTrue( os.path.exists(os.path.join(self.tmpmediastore, i['path']))) def _assert_files_download_failure(self, crawler, items, code, logs): # check that the item does NOT have the "images/files" field populated self.assertEqual(len(items), 1) self.assertIn(self.media_key, items[0]) self.assertFalse(items[0][self.media_key]) # check that there was 1 successful fetch and 3 other responses with non-200 code self.assertEqual( crawler.stats.get_value('downloader/request_method_count/GET'), 4) self.assertEqual(crawler.stats.get_value('downloader/response_count'), 4) self.assertEqual( crawler.stats.get_value('downloader/response_status_count/200'), 1) self.assertEqual( crawler.stats.get_value('downloader/response_status_count/%d' % code), 3) # check that logs do show the failure on the file downloads file_dl_failure = 'File (code: %d): Error downloading file from' % code self.assertEqual(logs.count(file_dl_failure), 3) # check that no files were written to the media store self.assertEqual(os.listdir(self.tmpmediastore), []) @defer.inlineCallbacks def test_download_media(self): crawler = self._create_crawler(MediaDownloadSpider) with LogCapture() as log: yield crawler.crawl(self.mockserver.url("/files/images/"), media_key=self.media_key, media_urls_key=self.media_urls_key) self._assert_files_downloaded(self.items, str(log)) @defer.inlineCallbacks def test_download_media_wrong_urls(self): crawler = self._create_crawler(BrokenLinksMediaDownloadSpider) with LogCapture() as log: yield crawler.crawl(self.mockserver.url("/files/images/"), media_key=self.media_key, media_urls_key=self.media_urls_key) self._assert_files_download_failure(crawler, self.items, 404, str(log))
@defer.inlineCallbacks def test_download_media_redirected_default_failure(self): crawler = self._create_crawler(RedirectedMediaDownloadSpider) with LogCapture() as log: yield crawler.crawl(self.mockserver.url("/files/images/"), media_key=self.media_key, media_urls_key=self.media_urls_key, mockserver=self.mockserver) self._assert_files_download_failure(crawler, self.items, 302, str(log)) @defer.inlineCallbacks def test_download_media_redirected_allowed(self): settings = dict(self.settings) settings.update({'MEDIA_ALLOW_REDIRECTS': True}) self.runner = CrawlerRunner(settings) crawler = self._create_crawler(RedirectedMediaDownloadSpider) with LogCapture() as log: yield crawler.crawl(self.mockserver.url("/files/images/"), media_key=self.media_key, media_urls_key=self.media_urls_key, mockserver=self.mockserver) self._assert_files_downloaded(self.items, str(log)) self.assertEqual( crawler.stats.get_value('downloader/response_status_count/302'), 3)
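The two redirect tests above differ only in one setting. For reference, a minimal settings sketch of the pipeline configuration this test case exercises (the store path is a placeholder):

settings = {
    # Enable the files pipeline at the head of the pipeline chain.
    'ITEM_PIPELINES': {'scrapy.pipelines.files.FilesPipeline': 1},
    # Filesystem directory where downloaded files are stored (placeholder).
    'FILES_STORE': '/tmp/media-store',
    # Off by default: redirected media URLs fail with the redirect status
    # unless this is enabled, which is exactly what the two tests assert.
    'MEDIA_ALLOW_REDIRECTS': True,
}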
class CrawlTestCase(TestCase): def setUp(self): self.mockserver = MockServer() self.mockserver.__enter__() def tearDown(self): self.mockserver.__exit__(None, None, None) @defer.inlineCallbacks def test_response_200(self): url = self.mockserver.url("/status?n=200") crawler = CrawlerRunner().create_crawler(SingleRequestSpider) yield crawler.crawl(seed=url, mockserver=self.mockserver) response = crawler.spider.meta["responses"][0] self.assertEqual(response.request.url, url) @defer.inlineCallbacks def test_response_error(self): for status in ("404", "500"): url = self.mockserver.url(f"/status?n={status}") crawler = CrawlerRunner().create_crawler(SingleRequestSpider) yield crawler.crawl(seed=url, mockserver=self.mockserver) failure = crawler.spider.meta["failure"] response = failure.value.response self.assertEqual(failure.request.url, url) self.assertEqual(response.request.url, url) @defer.inlineCallbacks def test_downloader_middleware_raise_exception(self): url = self.mockserver.url("/status?n=200") runner = CrawlerRunner(settings={ "DOWNLOADER_MIDDLEWARES": { RaiseExceptionRequestMiddleware: 590, }, }) crawler = runner.create_crawler(SingleRequestSpider) yield crawler.crawl(seed=url, mockserver=self.mockserver) failure = crawler.spider.meta["failure"] self.assertEqual(failure.request.url, url) self.assertIsInstance(failure.value, ZeroDivisionError) @defer.inlineCallbacks def test_downloader_middleware_override_request_in_process_response(self): """ Downloader middleware which returns a response with a specific 'request' attribute. * The spider callback should receive the overridden response.request * Handlers listening to the response_received signal should receive the overridden response.request * The "crawled" log message should show the overridden response.request """ signal_params = {} def signal_handler(response, request, spider): signal_params["response"] = response signal_params["request"] = request url = self.mockserver.url("/status?n=200") runner = CrawlerRunner(settings={ "DOWNLOADER_MIDDLEWARES": { ProcessResponseMiddleware: 595, } }) crawler = runner.create_crawler(SingleRequestSpider) crawler.signals.connect(signal_handler, signal=signals.response_received) with LogCapture() as log: yield crawler.crawl(seed=url, mockserver=self.mockserver) response = crawler.spider.meta["responses"][0] self.assertEqual(response.request.url, OVERRIDEN_URL) self.assertEqual(signal_params["response"].url, url) self.assertEqual(signal_params["request"].url, OVERRIDEN_URL) log.check_present( ("scrapy.core.engine", "DEBUG", f"Crawled (200) <GET {OVERRIDEN_URL}> (referer: None)"), ) @defer.inlineCallbacks def test_downloader_middleware_override_in_process_exception(self): """ An exception is raised but caught by the next middleware, which returns a Response with a specific 'request' attribute.
The spider callback should receive the overridden response.request """ url = self.mockserver.url("/status?n=200") runner = CrawlerRunner( settings={ "DOWNLOADER_MIDDLEWARES": { RaiseExceptionRequestMiddleware: 590, CatchExceptionOverrideRequestMiddleware: 595, }, }) crawler = runner.create_crawler(SingleRequestSpider) yield crawler.crawl(seed=url, mockserver=self.mockserver) response = crawler.spider.meta["responses"][0] self.assertEqual(response.body, b"Caught ZeroDivisionError") self.assertEqual(response.request.url, OVERRIDEN_URL) @defer.inlineCallbacks def test_downloader_middleware_do_not_override_in_process_exception(self): """ An exception is raised but caught by the next middleware, which returns a Response without a specific 'request' attribute. The spider callback should receive the original response.request """ url = self.mockserver.url("/status?n=200") runner = CrawlerRunner( settings={ "DOWNLOADER_MIDDLEWARES": { RaiseExceptionRequestMiddleware: 590, CatchExceptionDoNotOverrideRequestMiddleware: 595, }, }) crawler = runner.create_crawler(SingleRequestSpider) yield crawler.crawl(seed=url, mockserver=self.mockserver) response = crawler.spider.meta["responses"][0] self.assertEqual(response.body, b"Caught ZeroDivisionError") self.assertEqual(response.request.url, url) @defer.inlineCallbacks def test_downloader_middleware_alternative_callback(self): """ Downloader middleware which returns a response with a specific 'request' attribute, with an alternative callback """ runner = CrawlerRunner(settings={ "DOWNLOADER_MIDDLEWARES": { AlternativeCallbacksMiddleware: 595, } }) crawler = runner.create_crawler(AlternativeCallbacksSpider) with LogCapture() as log: url = self.mockserver.url("/status?n=200") yield crawler.crawl(seed=url, mockserver=self.mockserver) log.check_present(("alternative_callbacks_spider", "INFO", "alt_callback was invoked with foo=bar"), )
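The middleware pairs used in the last few tests follow the downloader-middleware contract: process_request() may raise, and a later middleware's process_exception() may return a Response to stop exception processing. A sketch of that pattern (the suite's real RaiseExceptionRequestMiddleware/CatchExceptionOverrideRequestMiddleware may differ; OVERRIDEN_URL is defined elsewhere in the suite and given a placeholder value here):

from scrapy.http import Request, Response

OVERRIDEN_URL = 'http://example.com/overriden'  # placeholder value


class RaiseExceptionRequestMiddleware:
    def process_request(self, request, spider):
        # Simulate a failure while handling the request.
        1 / 0


class CatchExceptionOverrideRequestMiddleware:
    def process_exception(self, request, exception, spider):
        # Returning a Response swallows the exception; attaching a
        # different Request object is what the override tests observe
        # through response.request in the spider callback.
        return Response(
            url=request.url,
            body=f'Caught {exception.__class__.__name__}'.encode(),
            request=Request(OVERRIDEN_URL),
        )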
class FileDownloadCrawlTestCase(TestCase): pipeline_class = 'scrapy.pipelines.files.FilesPipeline' store_setting_key = 'FILES_STORE' media_key = 'files' media_urls_key = 'file_urls' expected_checksums = set([ '5547178b89448faf0015a13f904c936e', 'c2281c83670e31d8aaab7cb642b824db', 'ed3f6538dc15d4d9179dae57319edc5f']) def setUp(self): self.mockserver = MockServer() self.mockserver.__enter__() # prepare a directory for storing files self.tmpmediastore = self.mktemp() os.mkdir(self.tmpmediastore) self.settings = { 'ITEM_PIPELINES': {self.pipeline_class: 1}, self.store_setting_key: self.tmpmediastore, } self.runner = CrawlerRunner(self.settings) self.items = [] def tearDown(self): shutil.rmtree(self.tmpmediastore) self.items = [] self.mockserver.__exit__(None, None, None) def _on_item_scraped(self, item): self.items.append(item) def _create_crawler(self, spider_class, **kwargs): crawler = self.runner.create_crawler(spider_class, **kwargs) crawler.signals.connect(self._on_item_scraped, signals.item_scraped) return crawler def _assert_files_downloaded(self, items, logs): self.assertEqual(len(items), 1) self.assertIn(self.media_key, items[0]) # check that logs show the expected number of successful file downloads file_dl_success = 'File (downloaded): Downloaded file from' self.assertEqual(logs.count(file_dl_success), 3) # check that the images/files checksums are what we know they should be if self.expected_checksums is not None: checksums = set( i['checksum'] for item in items for i in item[self.media_key]) self.assertEqual(checksums, self.expected_checksums) # check that the image files were actually written to the media store for item in items: for i in item[self.media_key]: self.assertTrue( os.path.exists( os.path.join(self.tmpmediastore, i['path']))) def _assert_files_download_failure(self, crawler, items, code, logs): # check that the item does NOT have the "images/files" field populated self.assertEqual(len(items), 1) self.assertIn(self.media_key, items[0]) self.assertFalse(items[0][self.media_key]) # check that there was 1 successful fetch and 3 other responses with non-200 code self.assertEqual(crawler.stats.get_value('downloader/request_method_count/GET'), 4) self.assertEqual(crawler.stats.get_value('downloader/response_count'), 4) self.assertEqual(crawler.stats.get_value('downloader/response_status_count/200'), 1) self.assertEqual(crawler.stats.get_value('downloader/response_status_count/%d' % code), 3) # check that logs do show the failure on the file downloads file_dl_failure = 'File (code: %d): Error downloading file from' % code self.assertEqual(logs.count(file_dl_failure), 3) # check that no files were written to the media store self.assertEqual(os.listdir(self.tmpmediastore), []) @defer.inlineCallbacks def test_download_media(self): crawler = self._create_crawler(MediaDownloadSpider) with LogCapture() as log: yield crawler.crawl("http://localhost:8998/files/images/", media_key=self.media_key, media_urls_key=self.media_urls_key) self._assert_files_downloaded(self.items, str(log)) @defer.inlineCallbacks def test_download_media_wrong_urls(self): crawler = self._create_crawler(BrokenLinksMediaDownloadSpider) with LogCapture() as log: yield crawler.crawl("http://localhost:8998/files/images/", media_key=self.media_key, media_urls_key=self.media_urls_key) self._assert_files_download_failure(crawler, self.items, 404, str(log)) @defer.inlineCallbacks def test_download_media_redirected_default_failure(self): crawler = self._create_crawler(RedirectedMediaDownloadSpider) with LogCapture() as
log: yield crawler.crawl("http://localhost:8998/files/images/", media_key=self.media_key, media_urls_key=self.media_urls_key) self._assert_files_download_failure(crawler, self.items, 302, str(log)) @defer.inlineCallbacks def test_download_media_redirected_allowed(self): settings = dict(self.settings) settings.update({'MEDIA_ALLOW_REDIRECTS': True}) self.runner = CrawlerRunner(settings) crawler = self._create_crawler(RedirectedMediaDownloadSpider) with LogCapture() as log: yield crawler.crawl("http://localhost:8998/files/images/", media_key=self.media_key, media_urls_key=self.media_urls_key) self._assert_files_downloaded(self.items, str(log)) self.assertEqual(crawler.stats.get_value('downloader/response_status_count/302'), 3)
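The expected_checksums values in both versions of FileDownloadCrawlTestCase are MD5 digests of the fixture files; FilesPipeline computes them while downloading, using scrapy.utils.misc.md5sum. A small sketch of reproducing one locally (the path is a placeholder):

from scrapy.utils.misc import md5sum

with open('/path/to/fixture/image.png', 'rb') as f:  # placeholder path
    # md5sum() reads the file in chunks and returns the hex digest,
    # matching the strings listed in expected_checksums.
    print(md5sum(f))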
class ProxyConnectTestCase(TestCase): def setUp(self): self.mockserver = MockServer() self.mockserver.__enter__() self._oldenv = os.environ.copy() self._proxy = HTTPSProxy(8888) self._proxy.start() # Wait for the proxy to start. time.sleep(1.0) os.environ['http_proxy'] = 'http://*****:*****@localhost:8888' os.environ['https_proxy'] = 'http://*****:*****@localhost:8888' def tearDown(self): self.mockserver.__exit__(None, None, None) self._proxy.shutdown() os.environ = self._oldenv @defer.inlineCallbacks def test_https_connect_tunnel(self): spider = SimpleSpider("https://localhost:8999/status?n=200") yield docrawl(spider) self._assert_got_response_code(200) @defer.inlineCallbacks def test_https_noconnect(self): os.environ['https_proxy'] = 'http://*****:*****@localhost:8888?noconnect' spider = SimpleSpider("https://localhost:8999/status?n=200") yield docrawl(spider) self._assert_got_response_code(200) os.environ['https_proxy'] = 'http://*****:*****@localhost:8888' @defer.inlineCallbacks def test_https_connect_tunnel_error(self): spider = SimpleSpider("https://localhost:99999/status?n=200") yield docrawl(spider) self._assert_got_tunnel_error() @defer.inlineCallbacks def test_https_tunnel_auth_error(self): os.environ['https_proxy'] = 'http://*****:*****@localhost:8888' spider = SimpleSpider("https://localhost:8999/status?n=200") yield docrawl(spider) self._assert_got_tunnel_error() os.environ['https_proxy'] = 'http://*****:*****@localhost:8888' @defer.inlineCallbacks def test_https_noconnect_auth_error(self): os.environ['https_proxy'] = 'http://*****:*****@localhost:8888?noconnect' spider = SimpleSpider("https://localhost:8999/status?n=200") yield docrawl(spider) self._assert_got_response_code(407) def _assert_got_response_code(self, code): log = get_testlog() self.assertEqual(log.count('Crawled (%d)' % code), 1) def _assert_got_tunnel_error(self): log = get_testlog() self.assertEqual(log.count('TunnelError'), 1)