def _retry(self, request, reason, spider):
    """Return a retry of ``request``, or a sentinel response when giving up.

    Args:
        request: The failed request. Its ``meta['retry_times']`` counter
            (0 when absent) is incremented to decide whether another
            attempt is still allowed.
        reason: Why the request failed. An ``Exception`` instance is
            reduced to its class name (via ``global_object_name``) before
            being used in the per-reason stats key.
        spider: The running spider; provides ``spider.logger`` and the
            stats collector at ``spider.crawler.stats``.

    Returns:
        While the incremented counter is at most ``self.max_retry_times``:
        a copy of ``request`` carrying the new counter, with
        ``dont_filter`` set and the priority shifted by
        ``self.priority_adjust``. Otherwise: a ``Response`` for the empty
        URL whose status is set to ``12138``.
    """
    retries = request.meta.get('retry_times', 0) + 1
    stats = spider.crawler.stats
    # Both log messages interpolate the same mapping; ``reason`` is logged
    # in its original form (exception instance or plain value).
    log_args = {'request': request, 'retries': retries, 'reason': reason}

    if retries > self.max_retry_times:
        stats.inc_value('retry/max_reached')
        spider.logger.debug(
            "Gave up retrying %(request)s (failed %(retries)d times): "
            "%(reason)s",
            log_args,
            extra={'spider': spider},
        )
        # Response.replace() returns a new object instead of mutating the
        # receiver (Scrapy Response API), so the response is used exactly
        # as constructed and only its status is overridden.
        # NOTE(review): 12138 looks like a sentinel status that a consumer
        # elsewhere recognizes as "retries exhausted" - confirm.
        response = Response('')
        response.status = 12138
        return response

    spider.logger.debug(
        "Retrying %(request)s (failed %(retries)d times): %(reason)s",
        log_args,
        extra={'spider': spider},
    )
    retryreq = request.copy()
    retryreq.meta['retry_times'] = retries
    # Set so the repeated URL is not dropped by duplicate filtering
    # (presumably the scheduler's dupefilter - confirm).
    retryreq.dont_filter = True
    retryreq.priority = request.priority + self.priority_adjust

    if isinstance(reason, Exception):
        reason = global_object_name(reason.__class__)

    stats.inc_value('retry/count')
    stats.inc_value('retry/reason_count/%s' % reason)
    return retryreq
def test_spider_crawls_links(spider, scrape_request, html_headers,
                             mock_html_twolinks):
    """The spider emits one request per link found in an HTML page.

    Feeds the spider a mock HTML response containing two links and checks
    that the URLs of the emitted ``Request`` objects are exactly those two
    links. Anything else the spider emits is ignored.
    """
    # A single user agent keeps the number of emitted requests predictable
    agent = factories.BatchUserAgentFactory.build(ua_string='Firefox / 11.0')
    spider.batch_user_agents = [agent]

    # Build the mock response around the two-link HTML fixture. The request
    # is attached before ``meta`` is touched, as in the original setup.
    mock_response = Response('http://test:12345', body=mock_html_twolinks)
    mock_response.request = scrape_request
    mock_response.headers = html_headers
    mock_response.meta['user_agent'] = agent
    mock_response.status = 200
    mock_response.encoding = u'utf-8'
    mock_response.flags = []

    # Hand the mock response to the spider
    emitted = spider.parse(mock_response)

    # The only requests expected are the two links from the page
    sites_expected = {
        mock_response.url + '/link1.html',
        mock_response.url + '/link2.html',
    }
    sites_collected = [
        entry.url for entry in emitted if isinstance(entry, Request)
    ]

    assert sites_expected == set(sites_collected)
def test_useragents_spider(spider, scrape_request, html_headers,
                           mock_html_nolinks):
    """The spider emits one request per configured user agent.

    Configures two user agents, feeds the spider a link-free page and
    checks that only ``Request`` objects come back and that their
    ``meta['user_agent']`` strings cover both configured agents.
    """
    firefox = factories.BatchUserAgentFactory.build(
        ua_string='Firefox / 11.0')
    chrome = factories.BatchUserAgentFactory.build(
        ua_string='Chrome / 20.0')
    spider.batch_user_agents = [firefox, chrome]

    # Build the mock response around the link-free HTML fixture
    mock_response = Response('http://test:12345', body=mock_html_nolinks)
    mock_response.request = scrape_request
    mock_response.headers = html_headers
    mock_response.status = 200
    mock_response.encoding = u'utf-8'
    mock_response.flags = []

    # Hand the mock response to the spider
    emitted = spider.parse(mock_response)

    # With no links to follow, the page should produce exactly one request
    # for each user agent we inserted
    request_uas = []
    for entry in emitted:
        # We're not expecting anything other than Requests
        assert isinstance(entry, Request)
        request_uas.append(entry.meta['user_agent'].ua_string)

    assert set(request_uas) == {u'Firefox / 11.0', u'Chrome / 20.0'}
def test_js_item_emission(spider, linked_js_request, js_headers, mock_js):
    """A JS response is turned into exactly one matching ``MarkupItem``.

    Feeds the spider a mock JavaScript response and checks that the only
    thing it emits equals a hand-built ``MarkupItem`` describing that
    response.
    """
    mock_url = 'http://test:12345/default.js'

    # Build the mock response around the JS fixture
    mock_response = Response(mock_url, body=mock_js)
    mock_response.request = linked_js_request
    mock_response.headers = js_headers
    mock_response.status = 200
    mock_response.encoding = u'ascii'
    mock_response.flags = []

    # Create the urlscan record that the expected item refers to
    mock_urlscan = model.URLScan.objects.create(
        site_scan=linked_js_request.meta['sitescan'],
        page_url_hash=sha256("http://test:12345/").hexdigest(),
        page_url=mock_response.url,
        timestamp=spider.get_now_time())

    # Hand the mock response to the spider
    pipeline_generator = spider.parse(mock_response)

    # Describe the item we expect, field by field
    expected_fields = (
        ('content_type', spider.get_content_type(js_headers)),
        ('filename', os.path.basename(urlparse(mock_url).path)),
        ('headers', unicode(js_headers)),
        ('meta', mock_response.meta),
        ('raw_content', mock_response.body),
        ('sitescan', linked_js_request.meta['sitescan']),
        ('urlscan', mock_urlscan),
        ('url', mock_response.url),
        ('user_agent', mock_response.meta['user_agent']),
        ('redirected_from', ''),
    )
    item_expected = MarkupItem()
    for field_name, field_value in expected_fields:
        item_expected[field_name] = field_value

    assert list(pipeline_generator) == [item_expected]
def test_css_item_emission(spider, linked_css_request, css_headers,
                           mock_css):
    """A CSS response is turned into a matching ``MarkupItem``.

    Feeds the spider a mock CSS response, requires everything it emits to
    be a ``MarkupItem`` and checks that the last one emitted equals a
    hand-built item describing that response.
    """
    # A single user agent keeps the number of emitted items predictable.
    # NOTE(review): sibling tests assign ``spider.batch_user_agents`` and
    # use ``BatchUserAgentFactory.build`` - confirm ``user_agents`` and the
    # direct factory call are intended here.
    agent = factories.BatchUserAgentFactory(ua_string='Firefox / 11.0')
    spider.user_agents = [agent]

    mock_url = 'http://test:12345/default.css'

    # Build the mock response around the CSS fixture
    mock_response = Response(mock_url, body=mock_css)
    mock_response.request = linked_css_request
    mock_response.headers = css_headers
    mock_response.status = 200
    mock_response.encoding = u'ascii'
    mock_response.flags = []

    # Create the urlscan record that the expected item refers to
    mock_urlscan = model.URLScan.objects.create(
        site_scan=linked_css_request.meta['sitescan'],
        page_url_hash=sha256("http://test:12345/").hexdigest(),
        page_url=mock_response.url,
        timestamp=spider.get_now_time())

    # Hand the mock response to the spider
    pipeline_generator = spider.parse(mock_response)

    # Describe the item we expect, field by field
    expected_fields = (
        ('content_type', spider.get_content_type(css_headers)),
        ('filename', os.path.basename(urlparse(mock_url).path)),
        ('headers', unicode(css_headers)),
        ('meta', mock_response.meta),
        ('raw_content', mock_response.body),
        ('sitescan', linked_css_request.meta['sitescan']),
        ('urlscan', mock_urlscan),
        ('url', mock_response.url),
        ('user_agent', mock_response.meta['user_agent']),
    )
    item_expected = MarkupItem()
    for field_name, field_value in expected_fields:
        item_expected[field_name] = field_value

    item_collected = None
    for emitted in pipeline_generator:
        # Anything other than a MarkupItem is unexpected
        assert isinstance(emitted, MarkupItem)
        item_collected = emitted

    assert item_expected == item_collected