def test_useragents_spider(spider, scrape_request, html_headers,
                           mock_html_nolinks):
    """Ensure multiple requests with different user agent strings emitted"""
    ua1 = factories.BatchUserAgentFactory.build(ua_string='Firefox / 11.0')
    ua2 = factories.BatchUserAgentFactory.build(ua_string='Chrome / 20.0')
    spider.batch_user_agents = [ua1, ua2]

    # Generate a mock response
    mock_response = Response('http://test:12345', body=mock_html_nolinks)
    mock_response.request = scrape_request
    mock_response.headers = html_headers
    mock_response.status = 200
    mock_response.encoding = u'utf-8'
    mock_response.flags = []

    # Call the spider on the mock response
    pipeline_generator = spider.parse(mock_response)

    # Assert that we have two requests for this linkless page, one for each
    # of the user agents we inserted
    request_uas = []
    for new_request in pipeline_generator:
        if isinstance(new_request, Request):
            request_uas.append(new_request.meta['user_agent'].ua_string)
        else:
            # We're not expecting anything other than Requests
            assert False

    assert set(request_uas) == set([u'Firefox / 11.0', u'Chrome / 20.0'])
def test_spider_crawls_links(spider, scrape_request, html_headers,
                             mock_html_twolinks):
    """Ensure spider always picks up relevant links to HTML pages"""
    # Use only 1 user agent for easier counting
    ua = factories.BatchUserAgentFactory.build(ua_string='Firefox / 11.0')
    spider.batch_user_agents = [ua]

    # Generate a mock response based on html containing two links
    mock_response = Response('http://test:12345', body=mock_html_twolinks)
    mock_response.request = scrape_request
    mock_response.headers = html_headers
    mock_response.meta['user_agent'] = ua
    mock_response.status = 200
    mock_response.encoding = u'utf-8'
    mock_response.flags = []

    # Call spider on the mock response
    pipeline_generator = spider.parse(mock_response)

    # Assert that we got the expected set of new requests generated in the
    # spider and nothing else
    sites_expected = set([
        mock_response.url + '/link1.html',
        mock_response.url + '/link2.html',
    ])

    sites_collected = []
    for new_request in pipeline_generator:
        if isinstance(new_request, Request):
            sites_collected.append(new_request.url)

    assert sites_expected == set(sites_collected)
def test_js_item_emission(spider, linked_js_request, js_headers, mock_js):
    """JS items are emitted correctly"""
    # Generate a mock response based on JS
    mock_url = 'http://test:12345/default.js'
    mock_response = Response(mock_url, body=mock_js)
    mock_response.request = linked_js_request
    mock_response.headers = js_headers
    mock_response.status = 200
    mock_response.encoding = u'ascii'
    mock_response.flags = []

    # Generate a fake urlscan to use in our item comparison
    mock_urlscan = model.URLScan.objects.create(
        site_scan=linked_js_request.meta['sitescan'],
        page_url_hash=sha256("http://test:12345/").hexdigest(),
        page_url=mock_response.url,
        timestamp=spider.get_now_time())

    # Send the mocks to the spider for processing
    pipeline_generator = spider.parse(mock_response)

    # Verify the item returned is what we expected
    item_expected = MarkupItem()
    item_expected['content_type'] = spider.get_content_type(js_headers)
    item_expected['filename'] = os.path.basename(urlparse(mock_url).path)
    item_expected['headers'] = unicode(js_headers)
    item_expected['meta'] = mock_response.meta
    item_expected['raw_content'] = mock_response.body
    item_expected['sitescan'] = linked_js_request.meta['sitescan']
    item_expected['urlscan'] = mock_urlscan
    item_expected['url'] = mock_response.url
    item_expected['user_agent'] = mock_response.meta['user_agent']
    item_expected['redirected_from'] = ''

    assert list(pipeline_generator) == [item_expected]
def _responses(request, status_codes):
    responses = []
    for code in status_codes:
        response = Response(request.url, status=code)
        response.request = request
        responses.append(response)
    return responses
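# Hedged usage sketch for the _responses helper above: the test name and the
# status codes are illustrative assumptions, not taken from the original
# suite. It simply shows the helper fanning one request out into several
# mock responses that all share the same originating request.
def test_responses_share_request_sketch():
    request = Request('http://example.com')
    responses = _responses(request, [500, 502, 200])
    assert [r.status for r in responses] == [500, 502, 200]
    assert all(r.request is request for r in responses)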
def pytest_funcarg__mock_response(request):
    """
    Fake response to the scrape request -- we only fill out the fields
    used by the middleware for testing purposes
    """
    scrape_request = request.getfuncargvalue("scrape_request")
    mock_response = Response('http://test.com')
    mock_response.request = scrape_request
    return mock_response
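# Note: the ``pytest_funcarg__<name>`` prefix and ``request.getfuncargvalue``
# above are pytest's legacy funcarg protocol, removed in modern pytest. A
# minimal sketch of the equivalent modern fixture, assuming the same
# ``scrape_request`` fixture is available:
import pytest

@pytest.fixture
def mock_response(scrape_request):
    """Fake response to the scrape request -- we only fill out the fields
    used by the middleware for testing purposes."""
    mock_response = Response('http://test.com')
    mock_response.request = scrape_request
    return mock_response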
def test_empty_content_type(self):
    name = "ebay4"
    spider = self.smanager.create(name)
    generic_form_request = list(spider.start_requests())[0]

    response = Response(
        url="http://www.ebay.com/sch/ebayadvsearch/?rt=nc",
        body=open(join(_PATH, "data", "ebay_advanced_search.html")).read())
    response.request = generic_form_request

    # must not raise an error
    for result in spider.parse(response):
        pass
def test_hs_mware_process_spider_output_filter_request(hs_mware):
    response = Response('http://resp-url')
    # provide a response and a new request in result
    child_response = Response('http://resp-url-child')
    child_response.request = Request('http://resp-url-child-req')
    child_request = Request('http://req-url-child')
    hs_mware._seen = WeakKeyDictionary({response: 'riq'})
    result = list(hs_mware.process_spider_output(
        response, [child_response, child_request], Spider('test')))
    assert len(result) == 2
    # make sure that we update hsparent meta only for requests
    assert result[0].meta.get(HS_PARENT_ID_KEY) is None
    assert result[1].meta[HS_PARENT_ID_KEY] == 'riq'
def test_hs_middlewares(hs_downloader_middleware, hs_spider_middleware):
    assert hs_spider_middleware._seen_requests == WeakKeyDictionary()
    assert hs_downloader_middleware._seen_requests == WeakKeyDictionary()
    assert (hs_spider_middleware._seen_requests is
            hs_downloader_middleware._seen_requests)

    spider = Spider('test')
    url = 'http://resp-url'
    request_0 = Request(url)
    response_0 = Response(url)

    hs_downloader_middleware.process_request(request_0, spider)

    assert HS_REQUEST_ID_KEY not in request_0.meta
    assert HS_PARENT_ID_KEY not in request_0.meta
    assert len(hs_spider_middleware._seen_requests) == 0
    assert len(hs_downloader_middleware._seen_requests) == 0

    hs_downloader_middleware.process_response(request_0, response_0, spider)

    assert request_0.meta[HS_REQUEST_ID_KEY] == 0
    assert request_0.meta[HS_PARENT_ID_KEY] is None
    assert hs_spider_middleware._seen_requests[request_0] == 0

    response_0.request = request_0
    request_1 = Request(url)
    request_2 = Request(url)
    item1 = {}
    item2 = Item()
    output = [request_1, request_2, item1, item2]
    processed_output = list(hs_spider_middleware.process_spider_output(
        response_0, output, spider))

    assert processed_output[0] is request_1
    assert request_1.meta[HS_PARENT_ID_KEY] == 0
    assert processed_output[1] is request_2
    assert request_2.meta[HS_PARENT_ID_KEY] == 0
    assert processed_output[2] is item1
    assert processed_output[3] is item2

    response_1 = Response(url)
    hs_downloader_middleware.process_request(request_1, spider)
    hs_downloader_middleware.process_response(request_1, response_1, spider)

    assert request_1.meta[HS_REQUEST_ID_KEY] == 1
    assert request_1.meta[HS_PARENT_ID_KEY] == 0

    response_2 = Response(url)
    hs_downloader_middleware.process_request(request_2, spider)
    hs_downloader_middleware.process_response(request_2, response_2, spider)

    assert request_2.meta[HS_REQUEST_ID_KEY] == 2
    assert request_2.meta[HS_PARENT_ID_KEY] == 0
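# What follows is a minimal sketch (NOT the real middleware implementation)
# that is consistent with the behaviour test_hs_middlewares above asserts:
# the downloader middleware numbers each request once its response arrives,
# recording the number in a WeakKeyDictionary shared with the spider
# middleware, which then stamps that number onto child requests as their
# parent ID. Class names here are illustrative only.
class SketchDownloaderMiddleware(object):
    def __init__(self, seen_requests):
        self._seen_requests = seen_requests  # shared WeakKeyDictionary
        self._next_id = 0

    def process_request(self, request, spider):
        # IDs are only assigned once a response comes back
        return None

    def process_response(self, request, response, spider):
        request.meta[HS_REQUEST_ID_KEY] = self._next_id
        request.meta.setdefault(HS_PARENT_ID_KEY, None)
        self._seen_requests[request] = self._next_id
        self._next_id += 1
        return response


class SketchSpiderMiddleware(object):
    def __init__(self, seen_requests):
        self._seen_requests = seen_requests  # same dict as the downloader's

    def process_spider_output(self, response, result, spider):
        parent = self._seen_requests.get(response.request)
        for entry in result:
            if isinstance(entry, Request):
                # only requests inherit the parent ID; items pass through
                entry.meta[HS_PARENT_ID_KEY] = parent
            yield entry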
def test_hs_mware_process_spider_input(hs_mware):
    response = Response('http://resp-url')
    response.request = Request('http://req-url')
    hs_mware.process_spider_input(response, Spider('test'))
    assert hs_mware.pipe_writer.write_request.call_count == 1
    args = hs_mware.pipe_writer.write_request.call_args[1]
    assert args == {
        'duration': 0,
        'fp': request_fingerprint(response.request),
        'method': 'GET',
        'parent': None,
        'rs': 0,
        'status': 200,
        'url': 'http://resp-url',
    }
    assert hs_mware._seen == WeakKeyDictionary({response: 0})
def test_css_item_emission(spider, linked_css_request, css_headers, mock_css):
    """CSS items are emitted correctly"""
    # Use only 1 user agent for easier counting
    ua1 = factories.BatchUserAgentFactory(ua_string='Firefox / 11.0')
    spider.user_agents = [ua1]

    # Generate a mock response based on CSS
    mock_url = 'http://test:12345/default.css'
    mock_response = Response(mock_url, body=mock_css)
    mock_response.request = linked_css_request
    mock_response.headers = css_headers
    mock_response.status = 200
    mock_response.encoding = u'ascii'
    mock_response.flags = []

    # Generate a fake urlscan to use in our item comparison
    mock_urlscan = model.URLScan.objects.create(
        site_scan=linked_css_request.meta['sitescan'],
        page_url_hash=sha256("http://test:12345/").hexdigest(),
        page_url=mock_response.url,
        timestamp=spider.get_now_time())

    # Send the mocks to the spider for processing
    pipeline_generator = spider.parse(mock_response)

    # Verify the item returned is what we expected
    item_expected = MarkupItem()
    item_expected['content_type'] = spider.get_content_type(css_headers)
    item_expected['filename'] = os.path.basename(urlparse(mock_url).path)
    item_expected['headers'] = unicode(css_headers)
    item_expected['meta'] = mock_response.meta
    item_expected['raw_content'] = mock_response.body
    item_expected['sitescan'] = linked_css_request.meta['sitescan']
    item_expected['urlscan'] = mock_urlscan
    item_expected['url'] = mock_response.url
    item_expected['user_agent'] = mock_response.meta['user_agent']

    item_collected = None
    for item in pipeline_generator:
        if isinstance(item, MarkupItem):
            item_collected = item
        else:
            assert False

    assert item_expected == item_collected
def test_parse_declaration_doc(self):
    response = Response('http://old.vtek.lt/vtek/.../deklaracija2012.doc',
                        body='msword msword msword')
    response.request = scrapy.Request(response.url)
    response.request.meta['year'] = '2012'

    def mock_doc2xml(msword):
        assert msword == 'msword msword msword'
        return 'xml xml xml'

    with mock.patch('manoseimas.scrapy.spiders.lobbyist_declarations.doc2xml',
                    mock_doc2xml):
        with mock.patch.object(self.spider, 'parse_declaration_xml') as p_d_x:
            list(self.spider.parse_declaration_doc(response))

    assert p_d_x.call_count == 1
    new_response = p_d_x.call_args[0][0]
    assert new_response.meta['year'] == '2012'
    assert new_response.body == 'xml xml xml'
    assert isinstance(new_response, XmlResponse)
def test_hs_mware_process_spider_input(hs_mware):
    response = Response('http://resp-url')
    response.request = Request('http://req-url')
    hs_mware.hsref.job.requests.add.return_value = 'riq'
    hs_mware.process_spider_input(response, Spider('test'))
    assert hs_mware.hsref.job.requests.add.call_count == 1
    args = hs_mware.hsref.job.requests.add.call_args[1]
    ts = args.pop('ts', None)
    assert isinstance(ts, float)
    assert args == {
        'duration': 0,
        'fp': request_fingerprint(response.request),
        'method': 'GET',
        'parent': None,
        'rs': 0,
        'status': 200,
        'url': 'http://resp-url',
    }
    assert hs_mware._seen == WeakKeyDictionary({response: 'riq'})
def test_process_spider_output(self):
    req = Request('http://scrapytest.org')
    resp = Response('http://scrapytest.org')
    resp.request = req
    result = [Request('http://scrapytest.org')]

    out = list(self.mw.process_spider_output(resp, result, self.spider))
    self.assertEquals(out, result)

    rdc = self.stats.get_value('request_depth_count/1', spider=self.spider)
    self.assertEquals(rdc, 1)

    req.meta['depth'] = 1

    out2 = list(self.mw.process_spider_output(resp, result, self.spider))
    self.assertEquals(out2, [])

    rdm = self.stats.get_value('request_depth_max', spider=self.spider)
    self.assertEquals(rdm, 1)
def _getresponse(self, coding):
    if coding not in FORMAT:
        raise ValueError()
    samplefile, contentencoding = FORMAT[coding]

    with open(join(SAMPLEDIR, samplefile), 'rb') as sample:
        body = sample.read()

    headers = {
        'Server': 'Yaws/1.49 Yet Another Web Server',
        'Date': 'Sun, 08 Mar 2009 00:41:03 GMT',
        'Content-Length': len(body),
        'Content-Type': 'text/html',
        'Content-Encoding': contentencoding,
    }
    response = Response('http://scrapytest.org/', body=body, headers=headers)
    response.request = Request('http://scrapytest.org',
                               headers={'Accept-Encoding': 'gzip,deflate'})
    return response
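# Hedged usage sketch for _getresponse: scrapy's HttpCompressionMiddleware
# decodes the body and drops the Content-Encoding header once nothing is
# left encoded. The fixture names (self.mw, self.spider) and the 'gzip' key
# in FORMAT are assumptions about the surrounding test class.
def test_gzip_decoding_sketch(self):
    response = self._getresponse('gzip')
    assert 'Content-Encoding' in response.headers
    newresponse = self.mw.process_response(response.request, response,
                                           self.spider)
    assert 'Content-Encoding' not in newresponse.headers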