def test_log_debug(self):
    with LogCapture() as log:
        settings = {'DUPEFILTER_DEBUG': True,
                    'DUPEFILTER_CLASS': __name__ + '.FromCrawlerRFPDupeFilter'}
        crawler = get_crawler(SimpleSpider, settings_dict=settings)
        scheduler = Scheduler.from_crawler(crawler)
        spider = SimpleSpider.from_crawler(crawler)
        dupefilter = scheduler.df
        dupefilter.open()

        r1 = Request('http://scrapytest.org/index.html')
        r2 = Request('http://scrapytest.org/index.html',
                     headers={'Referer': 'http://scrapytest.org/INDEX.html'})

        dupefilter.log(r1, spider)
        dupefilter.log(r2, spider)

        assert crawler.stats.get_value('dupefilter/filtered') == 2
        log.check_present(('scrapy.dupefilters', 'DEBUG',
                           'Filtered duplicate request: <GET http://scrapytest.org/index.html>'
                           ' (referer: None)'))
        log.check_present(('scrapy.dupefilters', 'DEBUG',
                           'Filtered duplicate request: <GET http://scrapytest.org/index.html>'
                           ' (referer: http://scrapytest.org/INDEX.html)'))

        dupefilter.close('finished')
def test_log(self):
    with LogCapture() as log:
        settings = {'DUPEFILTER_DEBUG': False,
                    'DUPEFILTER_CLASS': __name__ + '.FromCrawlerRFPDupeFilter'}
        crawler = get_crawler(SimpleSpider, settings_dict=settings)
        scheduler = Scheduler.from_crawler(crawler)
        spider = SimpleSpider.from_crawler(crawler)
        dupefilter = scheduler.df
        dupefilter.open()

        r1 = Request('http://scrapytest.org/index.html')
        r2 = Request('http://scrapytest.org/index.html')

        dupefilter.log(r1, spider)
        dupefilter.log(r2, spider)

        assert crawler.stats.get_value('dupefilter/filtered') == 2
        log.check_present(('scrapy.dupefilters', 'DEBUG',
                           'Filtered duplicate request: <GET http://scrapytest.org/index.html>'
                           ' - no more duplicates will be shown'
                           ' (see DUPEFILTER_DEBUG to show all duplicates)'))

        dupefilter.close('finished')
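# The dupefilter tests above reference a module-level FromCrawlerRFPDupeFilter
# helper that is not shown in this excerpt. A minimal sketch of what it might
# look like, assuming a Scrapy version where RFPDupeFilter supports
# from_crawler; the real helper may record additional state:
class FromCrawlerRFPDupeFilter(RFPDupeFilter):

    @classmethod
    def from_crawler(cls, crawler, *args, **kwargs):
        dupefilter = super().from_crawler(crawler, *args, **kwargs)
        # Record which constructor was used, so tests can assert on it.
        dupefilter.method = 'from_crawler'
        return dupefilter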
def test_retry_dns_error(self):
    with mock.patch('socket.gethostbyname',
                    side_effect=socket.gaierror(-5, 'No address associated with hostname')):
        spider = SimpleSpider("http://example.com/")
        yield docrawl(spider)
        self._assert_retried()
def test_unbounded_response(self):
    # Completeness of responses without Content-Length or Transfer-Encoding
    # cannot be determined; we treat them as valid, but flag them as "partial".
    from urllib.parse import urlencode
    query = urlencode({
        'raw': '''\
HTTP/1.1 200 OK
Server: Apache-Coyote/1.1
X-Powered-By: Servlet 2.4; JBoss-4.2.3.GA (build: SVNTag=JBoss_4_2_3_GA date=200807181417)/JBossWeb-2.0
Set-Cookie: JSESSIONID=08515F572832D0E659FD2B0D8031D75F; Path=/
Pragma: no-cache
Expires: Thu, 01 Jan 1970 00:00:00 GMT
Cache-Control: no-cache
Cache-Control: no-store
Content-Type: text/html;charset=UTF-8
Content-Language: en
Date: Tue, 27 Aug 2013 13:05:05 GMT
Connection: close

foo body
with multiples lines
'''
    })
    spider = SimpleSpider("http://localhost:8998/raw?{0}".format(query))
    yield docrawl(spider)
    log = get_testlog()
    self.assertEqual(log.count("Got response 200"), 1)
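# SimpleSpider is not defined in this excerpt. The "Got response 200" log
# assertion above implies a spider that logs the status code of each response
# it receives; a minimal sketch under that assumption:
class SimpleSpider(Spider):
    name = 'simple'

    def __init__(self, url="http://localhost:8998", *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.start_urls = [url]

    def parse(self, response):
        self.log("Got response %d" % response.status)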
def test_log(self):
    with LogCapture() as log:
        settings = {
            'DUPEFILTER_DEBUG': False,
            'DUPEFILTER_CLASS': FromCrawlerRFPDupeFilter,
            'REQUEST_FINGERPRINTER_IMPLEMENTATION': 'VERSION',
        }
        crawler = get_crawler(SimpleSpider, settings_dict=settings)
        spider = SimpleSpider.from_crawler(crawler)
        dupefilter = _get_dupefilter(crawler=crawler)

        r1 = Request('http://scrapytest.org/index.html')
        r2 = Request('http://scrapytest.org/index.html')

        dupefilter.log(r1, spider)
        dupefilter.log(r2, spider)

        assert crawler.stats.get_value('dupefilter/filtered') == 2
        log.check_present((
            'scrapy.dupefilters', 'DEBUG',
            'Filtered duplicate request: <GET http://scrapytest.org/index.html> - no more'
            ' duplicates will be shown (see DUPEFILTER_DEBUG to show all duplicates)',
        ))

        dupefilter.close('finished')
def test_log_debug_default_dupefilter(self):
    with LogCapture() as log:
        settings = {
            'DUPEFILTER_DEBUG': True,
            'REQUEST_FINGERPRINTER_IMPLEMENTATION': 'VERSION',
        }
        crawler = get_crawler(SimpleSpider, settings_dict=settings)
        spider = SimpleSpider.from_crawler(crawler)
        dupefilter = _get_dupefilter(crawler=crawler)

        r1 = Request('http://scrapytest.org/index.html')
        r2 = Request('http://scrapytest.org/index.html',
                     headers={'Referer': 'http://scrapytest.org/INDEX.html'})

        dupefilter.log(r1, spider)
        dupefilter.log(r2, spider)

        assert crawler.stats.get_value('dupefilter/filtered') == 2
        log.check_present((
            'scrapy.dupefilters', 'DEBUG',
            'Filtered duplicate request: <GET http://scrapytest.org/index.html> (referer: None)',
        ))
        log.check_present((
            'scrapy.dupefilters', 'DEBUG',
            'Filtered duplicate request: <GET http://scrapytest.org/index.html>'
            ' (referer: http://scrapytest.org/INDEX.html)',
        ))

        dupefilter.close('finished')
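# _get_dupefilter is assumed to be a small module-level helper that builds a
# crawler's scheduler and opens its dupefilter; a minimal sketch consistent
# with how the tests above use it:
def _get_dupefilter(*, crawler=None, settings=None, open=True):
    if crawler is None:
        crawler = get_crawler(settings_dict=settings)
    scheduler = Scheduler.from_crawler(crawler)
    dupefilter = scheduler.df
    if open:
        dupefilter.open()
    return dupefilter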
def test_https_noconnect(self):
    os.environ['https_proxy'] = 'http://*****:*****@localhost:8888?noconnect'
    spider = SimpleSpider("https://localhost:8999/status?n=200")
    yield docrawl(spider)
    self._assert_got_response_code(200)
    os.environ['https_proxy'] = 'http://*****:*****@localhost:8888'
def test_https_tunnel_auth_error(self):
    os.environ['https_proxy'] = 'http://*****:*****@localhost:8888'
    spider = SimpleSpider("https://localhost:8999/status?n=200")
    yield docrawl(spider)
    # The proxy replies with a 407 error code, but it never reaches the
    # client, which only sees a TunnelError.
    self._assert_got_tunnel_error()
    os.environ['https_proxy'] = 'http://*****:*****@localhost:8888'
def test_retry_conn_failed(self):
    # Nothing listens on port 65432, so every connection attempt fails.
    spider = SimpleSpider("http://localhost:65432/status?n=503")
    yield docrawl(spider)
    self._assert_retried()
def test_retry_503(self):
    spider = SimpleSpider("http://localhost:8998/status?n=503")
    yield docrawl(spider)
    self._assert_retried()
def test_retry_conn_aborted(self):
    # Connection lost before receiving data.
    spider = SimpleSpider("http://localhost:8998/drop?abort=1")
    yield docrawl(spider)
    self._assert_retried()
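# The retry tests above share an _assert_retried helper defined on the test
# case. A minimal sketch, assuming the default RetryMiddleware log messages
# and retry count (RETRY_TIMES = 2):
def _assert_retried(self):
    log = get_testlog()
    self.assertEqual(log.count("Retrying"), 2)
    self.assertEqual(log.count("Gave up retrying"), 1)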
def test_https_noconnect_auth_error(self):
    os.environ['https_proxy'] = 'http://*****:*****@localhost:8888?noconnect'
    spider = SimpleSpider("https://localhost:8999/status?n=200")
    yield docrawl(spider)
    self._assert_got_response_code(407)
def test_https_connect_tunnel_error(self):
    # Port 99999 is out of range, so establishing the tunnel fails.
    spider = SimpleSpider("https://localhost:99999/status?n=200")
    yield docrawl(spider)
    self._assert_got_tunnel_error()
def test_https_connect_tunnel(self):
    spider = SimpleSpider("https://localhost:8999/status?n=200")
    yield docrawl(spider)
    self._assert_got_response_code(200)
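# The proxy tests above rely on two assertion helpers defined on the test
# case. Minimal sketches, assuming crawled responses and tunnel failures show
# up in the captured log in the standard Scrapy format:
def _assert_got_response_code(self, code):
    log = get_testlog()
    self.assertEqual(log.count("Crawled (%d)" % code), 1)

def _assert_got_tunnel_error(self):
    log = get_testlog()
    self.assertEqual(log.count("TunnelError"), 1)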