class TestHttpErrorMiddlewareHandleAll(TestCase):
    """Exercise HttpErrorMiddleware built with ``HTTPERROR_ALLOW_ALL`` set to True."""

    def setUp(self):
        """Create the spider, the middleware and 200/404/402 responses sharing one request."""
        self.spider = BaseSpider("foo")
        self.mw = HttpErrorMiddleware(Settings({"HTTPERROR_ALLOW_ALL": True}))
        self.req = Request("http://scrapytest.org")
        self.res200 = Response("http://scrapytest.org", status=200)
        self.res200.request = self.req
        self.res404 = Response("http://scrapytest.org", status=404)
        self.res404.request = self.req
        self.res402 = Response("http://scrapytest.org", status=402)
        self.res402.request = self.req

    def test_process_spider_input(self):
        """Both the 200 and the 404 response must be let through (None returned)."""
        # assertIsNone replaces the assertEquals alias, which newer unittest
        # versions no longer provide.
        self.assertIsNone(self.mw.process_spider_input(self.res200, self.spider))
        self.assertIsNone(self.mw.process_spider_input(self.res404, self.spider))

    def test_meta_overrides_settings(self):
        """With ``handle_httpstatus_list: [404]`` in request meta, 404 passes and 402 raises."""
        request = Request("http://scrapytest.org",
                          meta={"handle_httpstatus_list": [404]})
        res404 = self.res404.copy()
        res404.request = request
        res402 = self.res402.copy()
        res402.request = request
        self.assertIsNone(self.mw.process_spider_input(res404, self.spider))
        self.assertRaises(HttpError, self.mw.process_spider_input, res402, self.spider)
def test_request_cacheability(self):
    """Check how request-side ``no-store`` and ``no-cache`` directives affect caching."""
    cacheable = Response(self.request.url, status=200,
                         headers={'Expires': self.tomorrow})
    plain_req = Request('http://example.com')
    nostore_req = plain_req.replace(headers={'Cache-Control': 'no-store'})
    nocache_req = plain_req.replace(headers={'Cache-Control': 'no-cache'})

    with self._middleware() as mw:
        # A response to a no-store request must not end up in storage.
        got_nostore = self._process_requestresponse(mw, nostore_req, cacheable)
        self.assertEqualResponse(got_nostore, cacheable)
        assert mw.storage.retrieve_response(self.spider, nostore_req) is None

        # Repeating the request without no-store must populate the cache.
        got_plain = self._process_requestresponse(mw, plain_req, cacheable)
        assert 'cached' not in got_plain.flags
        from_cache = mw.process_request(plain_req, self.spider)
        assert 'cached' in from_cache.flags
        self.assertEqualResponse(got_plain, from_cache)

        # A no-cache request must not be answered from the cache, yet the
        # new response it triggers may be stored.
        updated = cacheable.replace(body=b'foo')
        got_nocache = self._process_requestresponse(mw, nocache_req, updated)
        self.assertEqualResponse(got_nocache, updated)
        assert 'cached' not in got_nocache.flags
        got_again = self._process_requestresponse(mw, plain_req, None)
        self.assertEqualResponse(got_again, updated)
        assert 'cached' in got_again.flags
def _responses(request, status_codes):
    """Return a list with one Response per status code, each attached to ``request``.

    Every response uses ``request.url`` as its URL and has its ``request``
    attribute set to the given request.
    """
    def _bound_response(code):
        resp = Response(request.url, status=code)
        resp.request = request
        return resp

    return [_bound_response(code) for code in status_codes]
def test_cached_and_stale(self):
    """Check middleware behaviour for responses that are already stale when stored.

    For each (status, headers) sample a distinct URL is requested several
    times; the assertions below pin when the ``'cached'`` flag must or must
    not appear on the returned response.
    """
    sampledata = [
        (200, {'Date': self.today, 'Expires': self.yesterday}),
        (200, {'Date': self.today, 'Expires': self.yesterday, 'Last-Modified': self.yesterday}),
        (200, {'Expires': self.yesterday}),
        (200, {'Expires': self.yesterday, 'ETag': 'foo'}),
        (200, {'Expires': self.yesterday, 'Last-Modified': self.yesterday}),
        (200, {'Expires': self.tomorrow, 'Age': '86405'}),
        (200, {'Cache-Control': 'max-age=86400', 'Age': '86405'}),
        # no-cache forces expiration, also revalidation if validators exists
        (200, {'Cache-Control': 'no-cache'}),
        (200, {'Cache-Control': 'no-cache', 'ETag': 'foo'}),
        (200, {'Cache-Control': 'no-cache', 'Last-Modified': self.yesterday}),
        (200, {'Cache-Control': 'no-cache,must-revalidate', 'Last-Modified': self.yesterday}),
        (200, {'Cache-Control': 'must-revalidate', 'Expires': self.yesterday, 'Last-Modified': self.yesterday}),
        (200, {'Cache-Control': 'max-age=86400,must-revalidate', 'Age': '86405'}),
    ]
    with self._middleware() as mw:
        for idx, (status, headers) in enumerate(sampledata):
            # A per-sample hostname keeps the samples from sharing cache entries.
            req0 = Request('http://example-%d.com' % idx)
            res0a = Response(req0.url, status=status, headers=headers)
            # cache expired response
            res1 = self._process_requestresponse(mw, req0, res0a)
            self.assertEqualResponse(res1, res0a)
            assert 'cached' not in res1.flags
            # Same request but as cached response is stale a new response must
            # be returned
            res0b = res0a.replace(body=b'bar')
            res2 = self._process_requestresponse(mw, req0, res0b)
            self.assertEqualResponse(res2, res0b)
            assert 'cached' not in res2.flags
            # Cache-Control of the sample; drives the must-revalidate / no-cache
            # expectations below.
            cc = headers.get('Cache-Control', '')
            # Previous response expired too, subsequent request to same
            # resource must revalidate and succeed on 304 if validators
            # are present
            if 'ETag' in headers or 'Last-Modified' in headers:
                res0c = res0b.replace(status=304)
                res3 = self._process_requestresponse(mw, req0, res0c)
                self.assertEqualResponse(res3, res0b)
                assert 'cached' in res3.flags
                # get cached response on server errors unless must-revalidate
                # in cached response
                res0d = res0b.replace(status=500)
                res4 = self._process_requestresponse(mw, req0, res0d)
                if 'must-revalidate' in cc:
                    assert 'cached' not in res4.flags
                    self.assertEqualResponse(res4, res0d)
                else:
                    assert 'cached' in res4.flags
                    self.assertEqualResponse(res4, res0b)
            # Requests with max-stale can fetch expired cached responses
            # unless cached response has must-revalidate
            req1 = req0.replace(headers={'Cache-Control': 'max-stale'})
            res5 = self._process_requestresponse(mw, req1, res0b)
            self.assertEqualResponse(res5, res0b)
            if 'no-cache' in cc or 'must-revalidate' in cc:
                assert 'cached' not in res5.flags
            else:
                assert 'cached' in res5.flags
def pytest_funcarg__mock_response(request):
    """Return a stand-in Response for the scrape request.

    Only what the middleware under test needs is populated: the URL
    ``http://test.com`` and the ``request`` attribute, which is taken from
    the ``scrape_request`` funcarg.
    """
    originating_request = request.getfuncargvalue("scrape_request")
    fake = Response('http://test.com')
    fake.request = originating_request
    return fake
def test_empty_content_type(self):
    """Parsing a header-less response with the ``ebay4`` spider must not raise.

    The response body is read from ``data/ebay_advanced_search.html`` and the
    response is created without any headers.
    """
    name = "ebay4"
    spider = self.smanager.create(name)
    generic_form_request = list(spider.start_requests())[0]
    # Read the fixture inside a context manager so the handle is closed
    # deterministically instead of being left to the garbage collector.
    with open(join(_PATH, "data", "ebay_advanced_search.html")) as fixture:
        body = fixture.read()
    response = Response(url="http://www.ebay.com/sch/ebayadvsearch/?rt=nc",
                        body=body)
    response.request = generic_form_request
    # must not raise an error
    for result in spider.parse(response):
        pass
def setUp(self):
    """Create the spider, an allow-all HttpErrorMiddleware and 200/404/402 responses.

    All three responses share ``self.req`` as their originating request.
    """
    def _response(status):
        resp = Response("http://scrapytest.org", status=status)
        resp.request = self.req
        return resp

    self.spider = BaseSpider("foo")
    self.mw = HttpErrorMiddleware(Settings({"HTTPERROR_ALLOW_ALL": True}))
    self.req = Request("http://scrapytest.org")
    self.res200 = _response(200)
    self.res404 = _response(404)
    self.res402 = _response(402)
def setUp(self):
    """Create the spider, an allow-all HttpErrorMiddleware and 200/404/402 responses.

    All three responses share ``self.req`` as their originating request.
    """
    site = 'http://scrapytest.org'

    def _response(status):
        resp = Response(site, status=status)
        resp.request = self.req
        return resp

    self.spider = Spider('foo')
    self.mw = HttpErrorMiddleware(Settings({'HTTPERROR_ALLOW_ALL': True}))
    self.req = Request(site)
    self.res200 = _response(200)
    self.res404 = _response(404)
    self.res402 = _response(402)
def test_hs_mware_process_spider_output_filter_request(hs_mware):
    """Only Request objects in the spider output get the parent id written to meta."""
    parent = Response('http://resp-url')
    # The spider output mixes a response and a new request.
    yielded_response = Response('http://resp-url-child')
    yielded_response.request = Request('http://resp-url-child-req')
    yielded_request = Request('http://req-url-child')
    hs_mware._seen = WeakKeyDictionary({parent: 'riq'})

    spider_output = [yielded_response, yielded_request]
    processed = list(
        hs_mware.process_spider_output(parent, spider_output, Spider('test')))

    assert len(processed) == 2
    first, second = processed
    # The hsparent meta key must be set for requests only.
    assert first.meta.get(HS_PARENT_ID_KEY) is None
    assert second.meta[HS_PARENT_ID_KEY] == 'riq'
def test_hs_middlewares(hs_downloader_middleware, hs_spider_middleware):
    """Follow one parent request and two children through both HS middlewares.

    Verifies the request ids and parent ids written to ``Request.meta`` and
    that both middlewares use the very same ``_seen_requests`` mapping.
    """
    dl_mw = hs_downloader_middleware
    sp_mw = hs_spider_middleware

    # Both start out empty and share one mapping object.
    assert sp_mw._seen_requests == WeakKeyDictionary()
    assert dl_mw._seen_requests == WeakKeyDictionary()
    assert sp_mw._seen_requests is dl_mw._seen_requests

    spider = Spider('test')
    url = 'http://resp-url'
    root_request = Request(url)
    root_response = Response(url)

    # process_request on its own leaves meta and the mapping untouched.
    dl_mw.process_request(root_request, spider)
    assert HS_REQUEST_ID_KEY not in root_request.meta
    assert HS_PARENT_ID_KEY not in root_request.meta
    assert len(sp_mw._seen_requests) == 0
    assert len(dl_mw._seen_requests) == 0

    # process_response assigns id 0 and no parent to the first request.
    dl_mw.process_response(root_request, root_response, spider)
    assert root_request.meta[HS_REQUEST_ID_KEY] == 0
    assert root_request.meta[HS_PARENT_ID_KEY] is None
    assert sp_mw._seen_requests[root_request] == 0

    root_response.request = root_request
    child_a = Request(url)
    child_b = Request(url)
    item1 = {}
    item2 = Item()
    output = [child_a, child_b, item1, item2]
    processed = list(sp_mw.process_spider_output(root_response, output, spider))

    # Output order and identity are preserved; requests gain the parent id.
    assert processed[0] is child_a
    assert child_a.meta[HS_PARENT_ID_KEY] == 0
    assert processed[1] is child_b
    assert child_b.meta[HS_PARENT_ID_KEY] == 0
    assert processed[2] is item1
    assert processed[3] is item2

    # Children receive ids 1 and 2 in download order and keep parent 0.
    for expected_id, child in enumerate((child_a, child_b), start=1):
        child_response = Response(url)
        dl_mw.process_request(child, spider)
        dl_mw.process_response(child, child_response, spider)
        assert child.meta[HS_REQUEST_ID_KEY] == expected_id
        assert child.meta[HS_PARENT_ID_KEY] == 0
class TestHttpErrorMiddleware(TestCase):
    """Exercise HttpErrorMiddleware built from empty settings."""

    def setUp(self):
        """Create the spider, the middleware and 200/404 responses sharing one request."""
        self.spider = BaseSpider("foo")
        self.mw = HttpErrorMiddleware(Settings({}))
        self.req = Request("http://scrapytest.org")
        self.res200 = Response("http://scrapytest.org", status=200)
        self.res200.request = self.req
        self.res404 = Response("http://scrapytest.org", status=404)
        self.res404.request = self.req

    def test_process_spider_input(self):
        """A 200 response passes (None); a 404 response raises HttpError."""
        # assertIsNone/assertEqual replace the assertEquals alias, which newer
        # unittest versions no longer provide.
        self.assertIsNone(self.mw.process_spider_input(self.res200, self.spider))
        self.assertRaises(HttpError, self.mw.process_spider_input,
                          self.res404, self.spider)

    def test_process_spider_exception(self):
        """HttpError is turned into an empty result; other exceptions yield None."""
        self.assertEqual(
            [],
            self.mw.process_spider_exception(
                self.res404, HttpError(self.res404), self.spider))
        self.assertIsNone(
            self.mw.process_spider_exception(self.res404, Exception(), self.spider))

    def test_handle_httpstatus_list(self):
        """404 passes when allowed through request meta or through the spider attribute."""
        res = self.res404.copy()
        res.request = Request("http://scrapytest.org",
                              meta={"handle_httpstatus_list": [404]})
        self.assertIsNone(self.mw.process_spider_input(res, self.spider))
        self.spider.handle_httpstatus_list = [404]
        self.assertIsNone(self.mw.process_spider_input(self.res404, self.spider))
class TestHttpErrorMiddleware(TestCase):
    """Exercise HttpErrorMiddleware constructed without arguments."""

    def setUp(self):
        """Create the spider, the middleware and 200/404 responses sharing one request."""
        self.spider = BaseSpider()
        self.mw = HttpErrorMiddleware()
        self.req = Request('http://scrapytest.org')
        self.res200 = Response('http://scrapytest.org', status=200)
        self.res200.request = self.req
        self.res404 = Response('http://scrapytest.org', status=404)
        self.res404.request = self.req

    def test_process_spider_input(self):
        """A 200 response gives None; a 404 response gives an empty list."""
        # assertIsNone/assertEqual replace the assertEquals alias, which newer
        # unittest versions no longer provide.
        self.assertIsNone(self.mw.process_spider_input(self.res200, self.spider))
        self.assertEqual(self.mw.process_spider_input(self.res404, self.spider), [])

    def test_handle_httpstatus_list(self):
        """404 gives None when allowed through request meta or the spider attribute."""
        res = self.res404.copy()
        res.request = Request('http://scrapytest.org',
                              meta={'handle_httpstatus_list': [404]})
        self.assertIsNone(self.mw.process_spider_input(res, self.spider))
        self.spider.handle_httpstatus_list = [404]
        self.assertIsNone(self.mw.process_spider_input(self.res404, self.spider))
def test_hs_mware_process_spider_input(hs_mware):
    """process_spider_input writes exactly one request record and remembers the response."""
    response = Response('http://resp-url')
    response.request = Request('http://req-url')

    hs_mware.process_spider_input(response, Spider('test'))

    write_request = hs_mware.pipe_writer.write_request
    assert write_request.call_count == 1
    # call_args[1] holds the keyword arguments of the recorded call.
    recorded_kwargs = write_request.call_args[1]
    expected_kwargs = {
        'duration': 0,
        'fp': request_fingerprint(response.request),
        'method': 'GET',
        'parent': None,
        'rs': 0,
        'status': 200,
        'url': 'http://resp-url',
    }
    assert recorded_kwargs == expected_kwargs
    assert hs_mware._seen == WeakKeyDictionary({response: 0})
def test_response_cacheability(self):
    """Check which (status, headers) combinations the middleware stores.

    Each case is ``(shouldcache, status, headers)``. The response is pushed
    through the middleware twice; for cacheable cases the second pass uses a
    304 variant and must be answered from the cache.
    """
    cases = [
        # 304 is not cacheable no matter what servers sends
        (False, 304, {}),
        (False, 304, {'Last-Modified': self.yesterday}),
        (False, 304, {'Expires': self.tomorrow}),
        (False, 304, {'Etag': 'bar'}),
        (False, 304, {'Cache-Control': 'max-age=3600'}),
        # Always obey no-store cache control
        (False, 200, {'Cache-Control': 'no-store'}),
        # invalid
        (False, 200, {'Cache-Control': 'no-store, max-age=300'}),
        # invalid
        (False, 200, {'Cache-Control': 'no-store', 'Expires': self.tomorrow}),
        # Ignore responses missing expiration and/or validation headers
        (False, 200, {}),
        (False, 302, {}),
        (False, 307, {}),
        (False, 404, {}),
        # Cache responses with expiration and/or validation headers
        (True, 200, {'Last-Modified': self.yesterday}),
        (True, 203, {'Last-Modified': self.yesterday}),
        (True, 300, {'Last-Modified': self.yesterday}),
        (True, 301, {'Last-Modified': self.yesterday}),
        (True, 401, {'Last-Modified': self.yesterday}),
        (True, 404, {'Cache-Control': 'public, max-age=600'}),
        (True, 302, {'Expires': self.tomorrow}),
        (True, 200, {'Etag': 'foo'}),
    ]
    with self._middleware() as mw:
        for idx, (shouldcache, status, headers) in enumerate(cases):
            request = Request('http://example-%d.com' % idx)
            origin = Response(request.url, status=status, headers=headers)
            first = self._process_requestresponse(mw, request, origin)
            not_modified = origin.replace(status=304)
            second_input = not_modified if shouldcache else origin
            second = self._process_requestresponse(mw, request, second_input)
            self.assertEqualResponse(first, origin)
            self.assertEqualResponse(second, origin)
            stored = mw.storage.retrieve_response(self.spider, request)
            if not shouldcache:
                self.assertFalse(stored)
                assert 'cached' not in second.flags
                continue
            self.assertEqualResponse(stored, first)
            assert 'cached' in second.flags and second.status != 304
def test_parse_declaration_doc(self):
    """parse_declaration_doc converts the body via doc2xml and forwards an XmlResponse.

    ``doc2xml`` and ``parse_declaration_xml`` are both patched, so only the
    hand-off between them is checked.
    """
    doc_response = Response('http://old.vtek.lt/vtek/.../deklaracija2012.doc',
                            body='msword msword msword')
    doc_response.request = scrapy.Request(doc_response.url)
    doc_response.request.meta['year'] = '2012'

    def mock_doc2xml(msword):
        # The converter must receive the untouched response body.
        assert msword == 'msword msword msword'
        return 'xml xml xml'

    doc2xml_patch = mock.patch(
        'manoseimas.scrapy.spiders.lobbyist_declarations.doc2xml', mock_doc2xml)
    xml_parser_patch = mock.patch.object(self.spider, 'parse_declaration_xml')
    with doc2xml_patch:
        with xml_parser_patch as p_d_x:
            list(self.spider.parse_declaration_doc(doc_response))
            assert p_d_x.call_count == 1
            forwarded = p_d_x.call_args[0][0]
            assert forwarded.meta['year'] == '2012'
            assert forwarded.body == 'xml xml xml'
            assert isinstance(forwarded, XmlResponse)
def test_cached_and_fresh(self):
    """Check that fresh responses are served from the cache.

    For each (status, headers) sample the response is stored, fetched again
    without a network response, and finally revalidated with a
    ``max-age=0`` request answered by a 304.
    """
    sampledata = [
        (200, {'Date': self.yesterday, 'Expires': self.tomorrow}),
        (200, {'Date': self.yesterday, 'Cache-Control': 'max-age=86405'}),
        (200, {'Age': '299', 'Cache-Control': 'max-age=300'}),
        # Obey max-age if present over any others
        (200, {'Date': self.today,
               'Age': '86405',
               'Cache-Control': 'max-age=' + str(86400 * 3),
               'Expires': self.yesterday,
               'Last-Modified': self.yesterday,
               }),
        # obey Expires if max-age is not present
        (200, {'Date': self.yesterday,
               'Age': '86400',
               'Cache-Control': 'public',
               'Expires': self.tomorrow,
               'Last-Modified': self.yesterday,
               }),
        # Default missing Date header to right now
        (200, {'Expires': self.tomorrow}),
        # Firefox - Expires if age is greater than 10% of (Date - Last-Modified)
        # Floor division keeps the Age value an integer string ('8639') on
        # Python 3 as well; true division would produce '8639.0'.
        (200, {'Date': self.today,
               'Last-Modified': self.yesterday,
               'Age': str(86400 // 10 - 1)}),
        # Firefox - Set one year maxage to permanent redirects missing expiration info
        (300, {}),
        (301, {}),
        (308, {}),
    ]
    with self._middleware() as mw:
        for idx, (status, headers) in enumerate(sampledata):
            req0 = Request('http://example-%d.com' % idx)
            res0 = Response(req0.url, status=status, headers=headers)
            # cache fresh response
            res1 = self._process_requestresponse(mw, req0, res0)
            self.assertEqualResponse(res1, res0)
            assert 'cached' not in res1.flags
            # return fresh cached response without network interaction
            res2 = self._process_requestresponse(mw, req0, None)
            self.assertEqualResponse(res1, res2)
            assert 'cached' in res2.flags
            # validate cached response if request max-age set as 0
            req1 = req0.replace(headers={'Cache-Control': 'max-age=0'})
            res304 = res0.replace(status=304)
            assert mw.process_request(req1, self.spider) is None
            res3 = self._process_requestresponse(mw, req1, res304)
            self.assertEqualResponse(res1, res3)
            assert 'cached' in res3.flags
def test_spider_crawls_links(spider, scrape_request, html_headers,
                             mock_html_twolinks):
    """Ensure spider always picks up relevant links to HTML pages"""
    # A single user agent keeps the number of generated requests easy to count.
    ua = factories.BatchUserAgentFactory.build(ua_string='Firefox / 11.0')
    spider.batch_user_agents = [ua]

    # Build the mock response around HTML that contains two links.
    mock_response = Response('http://test:12345', body=mock_html_twolinks)
    mock_response.request = scrape_request
    mock_response.headers = html_headers
    mock_response.meta['user_agent'] = ua
    mock_response.status = 200
    mock_response.encoding = u'utf-8'
    mock_response.flags = []

    # Run the spider on the mock response.
    pipeline_generator = spider.parse(mock_response)

    # Exactly these request URLs are expected; non-Request output is ignored.
    sites_expected = {
        mock_response.url + '/link1.html',
        mock_response.url + '/link2.html',
    }
    sites_collected = {
        produced.url
        for produced in pipeline_generator
        if isinstance(produced, Request)
    }
    assert sites_expected == sites_collected
def test_hs_mware_process_spider_input(hs_mware):
    """process_spider_input adds one request record and remembers its returned id."""
    response = Response('http://resp-url')
    response.request = Request('http://req-url')
    add_request = hs_mware.hsref.job.requests.add
    add_request.return_value = 'riq'

    hs_mware.process_spider_input(response, Spider('test'))

    assert add_request.call_count == 1
    # call_args[1] holds the keyword arguments of the recorded call.
    recorded_kwargs = add_request.call_args[1]
    # The timestamp varies per run, so only its type is checked.
    ts = recorded_kwargs.pop('ts', None)
    assert isinstance(ts, float)
    expected_kwargs = {
        'duration': 0,
        'fp': request_fingerprint(response.request),
        'method': 'GET',
        'parent': None,
        'rs': 0,
        'status': 200,
        'url': 'http://resp-url',
    }
    assert recorded_kwargs == expected_kwargs
    assert hs_mware._seen == WeakKeyDictionary({response: 'riq'})
def setUp(self):
    """Create the spider, a default HttpErrorMiddleware and 200/404 responses.

    Both responses share ``self.req`` as their originating request.
    """
    def _response(status):
        resp = Response('http://scrapytest.org', status=status)
        resp.request = self.req
        return resp

    self.spider = BaseSpider('foo')
    self.mw = HttpErrorMiddleware()
    self.req = Request('http://scrapytest.org')
    self.res200 = _response(200)
    self.res404 = _response(404)
def test_process_spider_output(self):
    """Requests pass at depth 0 and are dropped once the parent is at depth 1.

    Also checks the ``request_depth_count/1`` and ``request_depth_max``
    stats values after each pass.
    """
    req = Request('http://scrapytest.org')
    resp = Response('http://scrapytest.org')
    resp.request = req
    result = [Request('http://scrapytest.org')]

    # assertEqual replaces the assertEquals alias, which newer unittest
    # versions no longer provide.
    out = list(self.mw.process_spider_output(resp, result, self.spider))
    self.assertEqual(out, result)

    rdc = self.stats.get_value('request_depth_count/1', spider=self.spider)
    self.assertEqual(rdc, 1)

    # Marking the parent request as depth 1 must filter its children out.
    req.meta['depth'] = 1

    out2 = list(self.mw.process_spider_output(resp, result, self.spider))
    self.assertEqual(out2, [])

    rdm = self.stats.get_value('request_depth_max', spider=self.spider)
    self.assertEqual(rdm, 1)
class TestHttpErrorMiddlewareSettings(TestCase):
    """Similar test, but with settings"""

    def setUp(self):
        """Create the middleware with ``HTTPERROR_ALLOWED_CODES = (402,)`` plus test responses."""
        self.spider = Spider('foo')
        self.mw = HttpErrorMiddleware(Settings({'HTTPERROR_ALLOWED_CODES': (402,)}))
        self.req = Request('http://scrapytest.org')
        self.res200 = Response('http://scrapytest.org', status=200)
        self.res200.request = self.req
        self.res404 = Response('http://scrapytest.org', status=404)
        self.res404.request = self.req
        self.res402 = Response('http://scrapytest.org', status=402)
        self.res402.request = self.req

    def test_process_spider_input(self):
        """200 and 402 pass (None); 404 raises HttpError."""
        # assertIsNone replaces the assertEquals alias, which newer unittest
        # versions no longer provide.
        self.assertIsNone(self.mw.process_spider_input(self.res200, self.spider))
        self.assertRaises(HttpError, self.mw.process_spider_input,
                          self.res404, self.spider)
        self.assertIsNone(self.mw.process_spider_input(self.res402, self.spider))

    def test_meta_overrides_settings(self):
        """With ``handle_httpstatus_list: [404]`` in request meta, 404 passes and 402 raises."""
        request = Request('http://scrapytest.org',
                          meta={'handle_httpstatus_list': [404]})
        res404 = self.res404.copy()
        res404.request = request
        res402 = self.res402.copy()
        res402.request = request
        self.assertIsNone(self.mw.process_spider_input(res404, self.spider))
        self.assertRaises(HttpError, self.mw.process_spider_input,
                          res402, self.spider)

    def test_spider_override_settings(self):
        """With ``handle_httpstatus_list = [404]`` on the spider, 404 passes and 402 raises."""
        self.spider.handle_httpstatus_list = [404]
        self.assertIsNone(self.mw.process_spider_input(self.res404, self.spider))
        self.assertRaises(HttpError, self.mw.process_spider_input,
                          self.res402, self.spider)
def _getresponse(self, coding):
    """Build a Response whose body is the sample file registered for ``coding``.

    :param coding: key into ``FORMAT``, which maps to a
        ``(sample file name, Content-Encoding value)`` pair.
    :returns: a Response for ``http://scrapytest.org/`` carrying the raw file
        bytes, matching headers and a request that accepts gzip/deflate.
    :raises ValueError: if ``coding`` is not a key of ``FORMAT``.
    """
    if coding not in FORMAT:
        raise ValueError()

    samplefile, contentencoding = FORMAT[coding]
    sample_path = join(SAMPLEDIR, samplefile)
    with open(sample_path, "rb") as sample:
        body = sample.read()

    response = Response(
        "http://scrapytest.org/",
        body=body,
        headers={
            "Server": "Yaws/1.49 Yet Another Web Server",
            "Date": "Sun, 08 Mar 2009 00:41:03 GMT",
            "Content-Length": len(body),
            "Content-Type": "text/html",
            "Content-Encoding": contentencoding,
        },
    )
    response.request = Request(
        "http://scrapytest.org",
        headers={"Accept-Encoding": "gzip,deflate"},
    )
    return response
def _getresponse(self, coding):
    """Build a Response whose body is the sample file registered for ``coding``.

    :param coding: key into ``FORMAT``, which maps to a
        ``(sample file name, Content-Encoding value)`` pair.
    :returns: a Response for ``http://scrapytest.org/`` carrying the raw file
        bytes, matching headers and a request that accepts gzip/deflate.
    :raises ValueError: if ``coding`` is not a key of ``FORMAT``.
    """
    if coding not in FORMAT:
        raise ValueError()

    samplefile, contentencoding = FORMAT[coding]
    with open(join(SAMPLEDIR, samplefile), 'rb') as handle:
        payload = handle.read()

    response_headers = {
        'Server': 'Yaws/1.49 Yet Another Web Server',
        'Date': 'Sun, 08 Mar 2009 00:41:03 GMT',
        'Content-Length': len(payload),
        'Content-Type': 'text/html',
        'Content-Encoding': contentencoding,
    }
    request_headers = {'Accept-Encoding': 'gzip,deflate'}

    response = Response('http://scrapytest.org/', body=payload,
                        headers=response_headers)
    response.request = Request('http://scrapytest.org', headers=request_headers)
    return response
def _test_404_middleware(self):
    """Drive the Handle404 spider middleware with several response shapes.

    The leading underscore keeps the name from matching the usual ``test_``
    prefix. NOTE(review): presumably this disables the test on purpose —
    confirm.
    """
    from twcrawler.middleware.handle_404 import Handle404
    mw = Handle404.from_crawler(self.crawler)
    url = 'http://example.com/404'
    req = Request(url)
    req.meta['origin_url'] = url
    req.meta['proxy'] = 'xx.xx.xx.xx:404'
    # A 404 whose body is built from the configured HTML_404_STRING entries
    # must produce no output.
    for string_404_list in mw.settings.get('HTML_404_STRING'):
        body_normal_404 = '\n'.join(['<p>%s</p>'%s for s in string_404_list])
        resp = Response(url, body=body_normal_404, status=404, request=req)
        ret = mw.process_spider_output(resp, [], self.spider)
        ret = list(ret)
        assert not ret
    # A 404 with an unrelated body: the output is consumed but not asserted.
    resp = Response(url, body='bad_string', status=404, request=req)
    ret = mw.process_spider_output(resp, [], self.spider)
    ret = list(ret)
    # TODO, after add the new request to redis, no item return
    #self.assertEqual(ret[0].url, url)
    # A 200 response must let a yielded item through unchanged.
    resp = Response(url, body='bad_string', status=200, request=req)
    from scrapy import Item, Field
    class TestItem(Item):
        uid = Field()
    item = TestItem()
    item['uid'] = 'uid_test'
    ret = mw.process_spider_output(resp, [item], self.spider)
    ret = list(ret)
    self.assertEqual(item, ret[0])
    # Same response with an 'exception' entry in meta; the output is consumed
    # but not asserted.
    resp.meta['exception'] = 'test exception'
    ret = mw.process_spider_output(resp, [resp], self.spider)
    ret = list(ret)
    # TODO, after add the new request to redis, no item return
    #self.assertEqual(ret[0].url, url)
    # Other status codes: only checks that processing completes.
    for status in [503, 204, 500]:
        resp = Response(url, body='bad_string', status=status, request=req)
        ret = mw.process_spider_output(resp, [resp], self.spider)
        ret = list(ret)
def test_useragents_spider(spider, scrape_request, html_headers,
                           mock_html_nolinks):
    """Ensure multiple requests with different user agent strings emitted"""
    firefox = factories.BatchUserAgentFactory.build(ua_string='Firefox / 11.0')
    chrome = factories.BatchUserAgentFactory.build(ua_string='Chrome / 20.0')
    spider.batch_user_agents = [firefox, chrome]

    # Build the mock response for a page without links.
    mock_response = Response('http://test:12345', body=mock_html_nolinks)
    mock_response.request = scrape_request
    mock_response.headers = html_headers
    mock_response.status = 200
    mock_response.encoding = u'utf-8'
    mock_response.flags = []

    # The link-less page must yield one request per configured user agent.
    seen_ua_strings = []
    for produced in spider.parse(mock_response):
        if isinstance(produced, Request):
            seen_ua_strings.append(produced.meta['user_agent'].ua_string)
        else:
            # Nothing other than Requests is expected here.
            assert False

    assert set(seen_ua_strings) == {u'Firefox / 11.0', u'Chrome / 20.0'}
def fake_response_from_file(file_name, url=None):
    """
    Create a Scrapy fake HTTP response from a HTML file

    @param file_name: The relative filename from the responses directory,
                      but absolute paths are also accepted.
    @param url: The URL of the response. Defaults to
                "http://www.example.com" when empty or None.
    returns: A scrapy HTTP response which can be used for unittesting.
    """
    if not url:
        url = "http://www.example.com"
    request = Request(url=url)

    # startswith() keeps the original "/"-prefix rule without raising
    # IndexError on an empty name; isabs() additionally accepts absolute
    # paths in the platform's own notation.
    if file_name.startswith("/") or os.path.isabs(file_name):
        file_path = file_name
    else:
        # Relative names are resolved against the directory of this module.
        responses_dir = os.path.dirname(os.path.realpath(__file__))
        file_path = os.path.join(responses_dir, file_name)

    # Read inside a context manager so the file handle is always closed.
    with open(file_path, "r") as html_file:
        file_content = html_file.read()

    response = Response(url=url, request=request, body=file_content)
    response.encoding = "utf-8"
    return response
def test_get_cached_beautifulsoup(self):
    """get_cached_beautifulsoup returns one soup per response object and reuses it."""
    empty_body = Response('http://www.example.com', body='')
    first_soup = get_cached_beautifulsoup(empty_body)
    second_soup = get_cached_beautifulsoup(empty_body)
    assert isinstance(first_soup, BeautifulSoup)
    assert isinstance(second_soup, BeautifulSoup)
    # The second call must hand back the very same object.
    assert first_soup is second_soup

    # A response created without a body still yields a soup.
    no_body = Response('http://www.example.com')
    assert no_body.body == ""
    assert isinstance(get_cached_beautifulsoup(no_body), BeautifulSoup)

    # A copy of a response gets its own soup, not the original's.
    original = Response('http://www.example.com', body='')
    soup_before_copy = get_cached_beautifulsoup(original)
    duplicate = original.copy()
    soup_after_copy = get_cached_beautifulsoup(original)
    soup_of_copy = get_cached_beautifulsoup(duplicate)
    assert soup_before_copy is soup_after_copy
    assert soup_before_copy is not soup_of_copy
def test_cached_and_stale(self):
    """Check middleware behaviour for responses that are already stale when stored.

    Each sample is requested up to three times on its own URL; the
    ``"cached"`` flag must appear only on a successful 304 revalidation.
    """
    stale_samples = [
        (200, {"Date": self.today, "Expires": self.yesterday}),
        (200, {"Date": self.today, "Expires": self.yesterday, "Last-Modified": self.yesterday}),
        (200, {"Expires": self.yesterday}),
        (200, {"Expires": self.yesterday, "ETag": "foo"}),
        (200, {"Expires": self.yesterday, "Last-Modified": self.yesterday}),
        (200, {"Expires": self.tomorrow, "Age": "86405"}),
        (200, {"Cache-Control": "max-age=86400", "Age": "86405"}),
        # no-cache forces expiration, also revalidation if validators exists
        (200, {"Cache-Control": "no-cache"}),
        (200, {"Cache-Control": "no-cache", "ETag": "foo"}),
        (200, {"Cache-Control": "no-cache", "Last-Modified": self.yesterday}),
    ]
    with self._middleware() as mw:
        for idx, (status, headers) in enumerate(stale_samples):
            request = Request("http://example-%d.com" % idx)
            expired = Response(request.url, status=status, headers=headers)

            # First pass: the expired response is stored.
            first = self._process_requestresponse(mw, request, expired)
            self.assertEqualResponse(first, expired)
            assert "cached" not in first.flags

            # Second pass: the stored copy is stale, so the new response wins.
            refreshed = expired.replace(body="bar")
            second = self._process_requestresponse(mw, request, refreshed)
            self.assertEqualResponse(second, refreshed)
            assert "cached" not in second.flags

            # Third pass only applies when validators are present: a 304
            # must be answered with the previously stored response.
            has_validator = "ETag" in headers or "Last-Modified" in headers
            if not has_validator:
                continue
            not_modified = refreshed.replace(status=304)
            third = self._process_requestresponse(mw, request, not_modified)
            self.assertEqualResponse(third, refreshed)
            assert "cached" in third.flags
def test(self):
    """Check the policy class RefererMiddleware selects for each parameter set.

    Iterates ``self.params`` from index 3 onwards; when ``check_warning`` is
    set, exactly one RuntimeWarning must have been recorded.
    """
    origin = 'http://www.scrapy.org'
    target = 'http://www.example.com'

    for case in self.params[3:]:
        settings, response_headers, request_meta, policy_class, check_warning = case
        spider = Spider('foo')
        mw = RefererMiddleware(Settings(settings))

        response = Response(origin, headers=response_headers)
        request = Request(target, meta=request_meta)

        with warnings.catch_warnings(record=True) as caught:
            policy = mw.policy(response, request)
            self.assertIsInstance(policy, policy_class)

            if not check_warning:
                continue
            self.assertEqual(len(caught), 1)
            self.assertEqual(caught[0].category, RuntimeWarning, caught[0].message)
def test(self):
    """Check the Referer header before and after a chain of redirects.

    For each scenario the referrer middleware sets the initial header, then
    every redirect is run through the redirect middleware and re-scheduled
    through the referrer middleware.
    """
    for scenario in self.scenarii:
        parent, target, redirections, init_referrer, final_referrer = scenario
        response = self.get_response(parent)
        request = self.get_request(target)

        produced = self.referrermw.process_spider_output(
            response, [request], self.spider)
        out = list(produced)
        self.assertEqual(out[0].headers.get('Referer'), init_referrer)

        for status, url in redirections:
            # Each hop replaces `request` with the redirected request.
            response = Response(request.url, headers={'Location': url},
                                status=status)
            request = self.redirectmw.process_response(
                request, response, self.spider)
            self.referrermw.request_scheduled(request, self.spider)

        assert isinstance(request, Request)
        self.assertEqual(request.headers.get('Referer'), final_referrer)
def test_result_succeed(self):
    """A request with a ready response in meta yields a successful result.

    Also pins the order of the pipeline hooks recorded in ``_mockcalled``.
    """
    rsp = Response("http://url1")
    req = Request(
        "http://url1",
        meta={"response": rsp},
        callback=self._callback,
        errback=self._errback,
    )
    item = {"requests": req}

    new_item = yield self.pipe.process_item(item, self.spider)

    self.assertEqual(new_item["results"], [(True, rsp)])
    expected_calls = [
        "get_media_requests",
        "media_to_download",
        "media_downloaded",
        "request_callback",
        "item_completed",
    ]
    self.assertEqual(self.pipe._mockcalled, expected_calls)
def test_thumbnail_name(self):
    """thumb_path maps a request URL and thumb name to a fixed ``thumbs/50/<hash>.jpg`` path."""
    thumb_path = self.pipeline.thumb_path
    name = '50'

    url_to_path = [
        ("file:///tmp/foo.jpg",
         'thumbs/50/38a86208c36e59d4404db9e37ce04be863ef0335.jpg'),
        ("file://foo.png",
         'thumbs/50/e55b765eba0ec7348e50a1df496040449071b96a.jpg'),
        ("file:///tmp/foo",
         'thumbs/50/0329ad83ebb8e93ea7c7906d46e9ed55f7349a50.jpg'),
        ("file:///tmp/some.name/foo",
         'thumbs/50/850233df65a5b83361798f532f1fc549cd13cbe9.jpg'),
    ]
    for url, expected in url_to_path:
        self.assertEqual(thumb_path(Request(url), name), expected)

    # Supplying the optional response/info arguments must give the same path.
    with_extras = thumb_path(
        Request("file:///tmp/some.name/foo"),
        name,
        response=Response("file:///tmp/some.name/foo"),
        info=object(),
    )
    self.assertEqual(
        with_extras,
        'thumbs/50/850233df65a5b83361798f532f1fc549cd13cbe9.jpg')
def parse_info(self, response: Response):
    """Yield a SegItem and a follow-up Request for every chapter link on the page.

    Reads ``book_id`` from ``response.meta``. Each link found under the
    ``volume-wrap`` div produces one item (fresh hex uuid, title, absolute
    URL) followed by a Request for that URL handled by ``self.parse_seg``.
    """
    # chapter names
    book_id = response.meta['book_id']
    seg_as = response.xpath('//div[@class="volume-wrap"]/div').css(
        '.cf li>a')
    for a in seg_as:
        # a -> Selector
        item = SegItem()
        item['seg_id'] = uuid.uuid4().hex
        item['book_id'] = book_id
        item['title'] = a.css('::text').get()
        # the href gets an explicit "https:" scheme prepended
        item['url'] = 'https:' + a.xpath('./@href').get()
        yield item
        # download the chapter content; seg_id is forwarded through meta
        yield Request(item['url'],
                      callback=self.parse_seg,
                      priority=1,
                      meta={'seg_id': item['seg_id']})
def test_process_spider_output(self):
    """Only on-site requests (and ``dont_filter`` ones) survive the middleware."""
    res = Response('http://scrapytest.org')

    onsite_reqs = [
        Request('http://scrapytest.org/1'),
        Request('http://scrapy.org/1'),
        Request('http://sub.scrapy.org/1'),
        # Off-site URL that must still pass because dont_filter is set.
        Request('http://offsite.tld/letmepass', dont_filter=True),
    ]
    offsite_reqs = [
        Request('http://scrapy2.org'),
        Request('http://offsite.tld/'),
        Request('http://offsite.tld/scrapytest.org'),
        Request('http://offsite.tld/rogue.scrapytest.org'),
        Request('http://rogue.scrapytest.org.haha.com'),
        Request('http://roguescrapytest.org'),
    ]
    reqs = onsite_reqs + offsite_reqs

    out = list(self.mw.process_spider_output(res, reqs, self.spider))
    # assertEqual replaces the assertEquals alias, which newer unittest
    # versions no longer provide.
    self.assertEqual(out, onsite_reqs)
def process_response(self, request, response, spider):
    """Filter out responses whose body hash has already been seen too often.

    A body containing none of ``<html``, ``<meta`` or ``<body`` is returned
    untouched and not counted. Otherwise the body is hashed with
    ``self.getHash`` and counted in ``self.treehash``. Once a hash has been
    counted 5 times, later responses with that hash are logged and replaced
    by ``Response("")``.

    :param request: the request that produced ``response`` (only logged).
    :param response: the downloaded response; its ``body`` is inspected.
    :param spider: the running spider (only logged).
    :returns: ``response`` itself, or an empty ``Response("")`` when filtered.
    """
    data = response.body
    if not any(marker in data for marker in ("<html", "<meta", "<body")):
        return response

    h = self.getHash(data)
    # dict.get replaces dict.has_key, which does not exist on Python 3.
    seen = self.treehash.get(h, 0)
    if seen >= 5:
        log.msg(format="Filtered dom tree repeat %(request)s",
                level=log.DEBUG, spider=spider, request=request)
        return Response("")

    self.treehash[h] = seen + 1
    return response
def test_setting_enabled_cookies_debug(self):
    """With COOKIES_DEBUG on, received and sent cookies are logged at DEBUG level."""
    crawler = get_crawler(settings_dict={'COOKIES_DEBUG': True})
    mw = CookiesMiddleware.from_crawler(crawler)
    logger_name = 'scrapy.downloadermiddlewares.cookies'

    with LogCapture(logger_name, propagate=False,
                    level=logging.DEBUG) as captured:
        # The response sets a cookie ...
        first_request = Request('http://scrapytest.org/')
        cookie_response = Response(
            'http://scrapytest.org/',
            headers={'Set-Cookie': 'C1=value1; path=/'})
        mw.process_response(first_request, cookie_response, crawler.spider)
        # ... which a later request to a sub-path must send back.
        follow_up = Request('http://scrapytest.org/sub1/')
        mw.process_request(follow_up, crawler.spider)

        captured.check(
            (logger_name, 'DEBUG',
             'Received cookies from: <200 http://scrapytest.org/>\n'
             'Set-Cookie: C1=value1; path=/\n'),
            (logger_name, 'DEBUG',
             'Sending cookies to: <GET http://scrapytest.org/sub1/>\n'
             'Cookie: C1=value1\n'),
        )
def test_503(self):
    """A 503 is retried twice, then the response is returned and stats are updated."""
    req = Request('http://www.scrapytest.org/503')
    rsp = Response('http://www.scrapytest.org/503', body=b'', status=503)

    # The first two passes each return a new Request with a bumped counter.
    for attempt in (1, 2):
        req = self.mw.process_response(req, rsp, self.spider)
        assert isinstance(req, Request)
        self.assertEqual(req.meta['retry_times'], attempt)

    # The third pass gives up and hands the original response back.
    assert self.mw.process_response(req, rsp, self.spider) is rsp

    stats = self.crawler.stats
    assert stats.get_value('retry/max_reached') == 1
    assert stats.get_value('retry/reason_count/503 Service Unavailable') == 2
    assert stats.get_value('retry/count') == 2
class TestHelper(unittest.TestCase):
    """Checks ``_body_or_str`` against bytes, text and response inputs."""

    # The same payload as bytes, as text, and wrapped in both response types.
    bbody = b'utf8-body'
    ubody = bbody.decode('utf8')
    txtresponse = TextResponse(url='http://example.org/', body=bbody,
                               encoding='utf-8')
    response = Response(url='http://example.org/', body=bbody)

    def test_body_or_str(self):
        """Default and ``unicode=True`` give text; ``unicode=False`` gives bytes."""
        candidates = (self.bbody, self.ubody, self.txtresponse, self.response)
        for obj in candidates:
            default_result = _body_or_str(obj)
            self._assert_type_and_value(default_result, self.ubody, obj)

            text_result = _body_or_str(obj, unicode=True)
            self._assert_type_and_value(text_result, self.ubody, obj)

            bytes_result = _body_or_str(obj, unicode=False)
            self._assert_type_and_value(bytes_result, self.bbody, obj)

            self.assertTrue(type(default_result) is type(text_result))
            self.assertTrue(type(default_result) is not type(bytes_result))

    def _assert_type_and_value(self, a, b, obj):
        """Assert that ``a`` has exactly the type and the value of ``b``."""
        same_type = type(a) is type(b)
        failure_message = 'Got {}, expected {} for {!r}'.format(
            type(a), type(b), obj)
        self.assertTrue(same_type, failure_message)
        self.assertEqual(a, b)
def setUp(self):
    """Build a spider and a PDF ``Response`` fixture backed by ``TEST_PDF``.

    The response carries an ``application/pdf`` content type and a request
    whose ``meta`` holds a ``data_dict`` with a ``title`` entry.
    """
    self.test_file = open(TEST_PDF, 'rb')
    # Guarantee the file handle is released after every test, including
    # when a later line of setUp fails; closing twice is harmless.
    self.addCleanup(self.test_file.close)

    self.spider = BaseSpider()
    self.spider.settings = get_project_settings()
    self.spider.crawler = Crawler()

    meta = {
        'data_dict': {
            'title': 'foo',
        }
    }
    headers = {
        'content-type': b'application/pdf'
    }
    request = Request('http://foo.bar', meta=meta)
    self.pdf_response = Response(
        'http://foo.bar',
        body=self.test_file.read(),
        request=request,
        headers=headers
    )
def get_contact(self, response: Response) -> dict:
    """
    Gets the contact information.

    The phone number is looked up in the text of the last paragraph of the
    page body field; ``email`` and ``meet`` are always left empty and
    ``website`` is the response URL.

    :param response: the response object
    :return: a dict with the keys ``email``, ``phone``, ``website`` and ``meet``
    """
    contact = {
        'email': '',
        'phone': '',
        'website': response.url,
        'meet': ''
    }
    text = response.xpath(
        "string(//div[contains(@class,'field field-name-body field-type-text-with-summary')]/div/div/p[last()])"
    ).get()
    phone = extract_phone(text)
    # Keep only the first match; with no match the phone stays empty.
    if phone:
        contact['phone'] = phone[0]
    return contact
def get_meta(self, response: Response) -> dict:
    """
    Group the paragraphs of the body field under their preceding heading.

    Paragraphs that appear before any ``h2`` are stored under 'Abstract';
    the very first paragraph is skipped.

    :return dict(str, object)
    """
    sections = {}
    heading = 'Abstract'
    first_paragraph_pending = True

    body_nodes = response.xpath(
        "//div[contains(@class,'field field-name-body field-type-text-with-summary')]/div/div/*"
    )
    for node in body_nodes:
        tag = node.xpath("name()").get()
        if tag == 'h2':
            # A new heading starts a new section.
            heading = node.xpath("text()").get()
            continue
        if tag != 'p':
            continue
        if first_paragraph_pending:
            first_paragraph_pending = False
            continue
        previous = sections.get(heading, '')
        sections[heading] = previous + '\n' + node.xpath("string()").get()
    return sections
def parse_search_page(self, response: Response):
    """Parse the top-level page.

    Every option of the browse form (except the heading entry) becomes a
    chapter appended to ``self.oar["chapters"]`` and a request for that
    chapter's page.
    """
    for option in response.css("#browseForm option"):
        db_id: Any = option.xpath("@value").get()
        if db_id == "-1":
            # The heading entry is not a real chapter.
            continue

        label = option.xpath("text()").get()
        number, name = (part.strip() for part in label.split("-", 1))
        chapter = new_chapter(db_id, number, name)

        # The chapter's position in the list lets the callback find it again.
        new_chapter_index = len(self.oar["chapters"])
        self.oar["chapters"].append(chapter)

        yield Request(
            chapter["url"],
            callback=self.parse_chapter_page,
            meta={"chapter_index": new_chapter_index},
        )
def process_exception(self, request, exception, spider):
    """
    Handle connection exceptions caused by using a proxy.

    For exceptions listed in ``self.DONT_RETRY_ERRORS`` the proxy that served
    the request is either invalidated or rotated, the request's ``ex_count``
    meta counter is incremented and the request is returned for another
    attempt. Once the counter exceeds ``self.max_exception_url_count`` the URL
    is pushed to the ``REDIS_KEY_URL_EXCEPTION`` list and a status-200
    ``Response`` is returned instead, with ``change_url`` set in the request
    meta.

    :param request: the request that raised; must carry ``meta["proxy_index"]``
    :param exception: the exception raised while downloading
    :param spider: the spider the request belongs to
    :return: the request, a ``Response``, or ``None`` for any other exception
    """
    print("%s" % self.proxys[request.meta["proxy_index"]].get_proxy())
    print("%s" % exception)
    # logger.debug("%s exception: %s" % (self.proxys[request.meta["proxy_index"]].get_proxy(), exception))
    request_proxy_index = request.meta["proxy_index"]

    # Only compare when proxy_index > fixed_proxy - 1; this guarantees that at
    # least the local direct connection always remains available.
    if isinstance(exception, self.DONT_RETRY_ERRORS):
        # WARNING: when the direct connection times out, should we switch to
        # another proxy or retry? This is a policy question.
        if request_proxy_index > self.fixed_proxy - 1 and self.invalid_proxy_flag:
            self.invalid_proxy(request_proxy_index)
        else:
            # Simply switch proxies without disabling the current one.
            if request.meta["proxy_index"] == self.proxy_index:
                self.inc_proxy_index()

        if "ex_count" not in request.meta.keys():
            request.meta["ex_count"] = 1
        else:
            request.meta["ex_count"] += 1

        # Skip requests that have raised exceptions too many times.
        if request.meta["ex_count"] > self.max_exception_url_count:
            logger.info(
                "beyond max exception url count, url: %s, request next url" % request.url)
            r = redis_factory.get_instance()
            r.rpush(REDIS_KEY_URL_EXCEPTION, request.url)
            # Reset the counter and flag the request before returning a Response.
            request.meta["ex_count"] = 0
            request.meta["change_url"] = True
            response = Response(status=200, request=request, url=request.url)
            return response

        request.dont_filter = True
        return request
    else:
        logger.error("this middleware can not handle the exception")
        return None
def test_save_response(self):
    """Only text responses within ``maxitemsize`` are written, as page items."""
    writer = mock.MagicMock()
    writer.maxitemsize = 10
    self.instance._writer = writer

    # wrong response type
    plain_response = Response('http://resp', request=Request('http://req'))
    self.instance.save_response(plain_response, self.spider)
    assert not self.instance._writer.write.called

    # get request with large body
    oversized = TextResponse(
        'http://resp1',
        request=Request('http://req1'),
        body='looong loong body',
        encoding='cp1251')
    self.instance.save_response(oversized, self.spider)
    assert not self.instance._writer.write.called

    # get request with ok-body
    self.instance.hsref = mock.Mock()
    self.instance.hsref.job.key = '123/45/67'
    cookie_headers = {
        'Set-Cookie': [b'coo1=test;abc=1', b'coo2=tes1;cbd=2'],
    }
    acceptable = TextResponse(
        'http://resp2',
        request=Request('http://req2'),
        body='body',
        encoding='cp1251',
        headers=cookie_headers)
    self.instance.save_response(acceptable, self.spider)

    expected_item = {
        'body': u'body',
        '_encoding': 'cp1251',
        '_type': '_pageitem',
        '_key': 'bad42100b1d34e29973a79e512aabb4db885b712',
        'cookies': ['coo1=test', 'coo2=tes1'],
        'url': 'http://resp2',
        '_jobid': '123/45/67'
    }
    self.instance._writer.write.assert_called_with(expected_item)
def test_js_item_emission(spider, linked_js_request, js_headers, mock_js):
    """JS items are emitted correctly"""
    # Build the JS response the spider will be asked to parse
    mock_url = 'http://test:12345/default.js'
    mock_response = Response(mock_url, body=mock_js)
    mock_response.request = linked_js_request
    mock_response.headers = js_headers
    mock_response.status = 200
    mock_response.encoding = u'ascii'
    mock_response.flags = []

    # The urlscan record the emitted item is expected to point at
    sitescan = linked_js_request.meta['sitescan']
    mock_urlscan = model.URLScan.objects.create(
        site_scan=sitescan,
        page_url_hash=sha256("http://test:12345/").hexdigest(),
        page_url=mock_response.url,
        timestamp=spider.get_now_time())

    # Hand the response to the spider
    pipeline_generator = spider.parse(mock_response)

    # Describe the item we expect back, field by field
    expected_fields = {
        'content_type': spider.get_content_type(js_headers),
        'filename': os.path.basename(urlparse(mock_url).path),
        'headers': unicode(js_headers),
        'meta': mock_response.meta,
        'raw_content': mock_response.body,
        'sitescan': linked_js_request.meta['sitescan'],
        'urlscan': mock_urlscan,
        'url': mock_response.url,
        'user_agent': mock_response.meta['user_agent'],
    }
    item_expected = MarkupItem()
    for field, value in expected_fields.items():
        item_expected[field] = value

    # Everything the spider yields must be a MarkupItem; the last one is
    # the item compared against the expectation.
    item_collected = None
    for item in pipeline_generator:
        assert isinstance(item, MarkupItem)
        item_collected = item

    assert item_expected == item_collected
def parse_sku(self, response: Response):
    """Build one ``SKU`` from a product page and yield it.

    Name, code, price (CNY), available sizes, description and the image
    URLs embedded in the gallery init script are read from the page.

    :param response: the product page response
    :return: generator yielding a single ``SKU``
    """
    attrs = []

    name_elements = response.css(
        'div.bul-showcase-push-name span::text').get()
    name = name_elements + " " + response.css(
        'div.product-name h1::text').get()

    code = response.css('div[itemprop="sku"]::text').get().strip()

    price = {}
    price_cny = response.css('span.price::text').get()
    if price_cny:
        price_cny = price_cny.strip('¥').replace(',', '').strip()
        price = {
            'cny': float(price_cny),
        }

    sizes = response.css('div.bul-size-select-list ul li::text').getall()
    sizes = [size.strip().strip('尺寸:') for size in sizes]
    if sizes:
        attrs.append({'name': '尺寸', 'value': ', '.join(sizes)})

    # .get() returns None when the selector matches nothing, so strip only
    # after the None check; otherwise the fallback below is never reached.
    description = response.css(
        'div.bul-edito-texts div.data p::text').get()
    if description is not None:
        description = description.strip()
    if not description:
        description = response.css(
            'div[itemprop="description"]::text').get()

    # The gallery images are listed in the JSON payload of the init script.
    page_data_str = response.css(
        'div.product.media > script[type="text/x-magento-init"]::text'
    ).get()
    page_data = json.loads(page_data_str)
    image_data = page_data['[data-gallery-role=gallery-placeholder]'][
        'mage/gallery/gallery']['data']
    image_urls = [img['full'] for img in image_data]

    sku = SKU(self.brand_name, '', '', code, name, response.url, price,
              description, image_urls, attrs)
    yield sku
def get_meta(self, response: Response) -> dict:
    """
    Read the rows of the 'Project Details' table into a dictionary.

    Each row's header text is the key. List cells are rendered as markdown
    bullet lists, other cell children as plain text joined by newlines.
    Rows whose title contains 'Tags' store the link texts instead, and
    rows whose title contains 'Abstract' also fill the 'banner' key.

    :return dict(str, object)
    """
    meta = {}
    # Note: if running with JS, the data can be found in //div[@id='dynamic_content']/table[2]/tbody/tr
    rows = response.xpath(
        "//table[@summary='Project Details']/table[@summary='Project Details']/tr")
    for row in rows:
        try:
            title = row.xpath("string(th)").get()
        except Exception:
            self.log('Fail to find title for meta', level=logging.WARN)
            continue
        if len(title) == 0:
            continue

        meta[title] = ''
        for child in row.xpath('td/*'):
            tag = child.xpath('name()').get()
            is_list = tag.startswith('ul')
            # Separate this child from whatever was collected before it.
            if len(meta[title]) > 0:
                meta[title] += '\n'
            if is_list:
                # it is a list, keep it in markdown format
                meta[title] += ' - '
                meta[title] += '\n - '.join(
                    child.xpath("li").xpath('string()').getall())
            else:
                # anything else, e.g., a paragraph
                meta[title] += child.xpath('string()').get()

        # A cell without child elements only has direct text.
        if not row.xpath('td/*'):
            if len(meta[title]) > 0:
                meta[title] += '\n'
            meta[title] += row.xpath('string(td)').get()

        if 'Tags' in title:
            meta[title] = row.xpath('td/a/text()').getall()
        elif 'Abstract' in title:
            meta['banner'] = self.get_pictures(row.xpath('td'))
    return meta
def test_from_response_formname_exists(self):
    """``formname`` selects the matching form when the page has several."""
    page_html = """
    <form action="post.php" method="POST">
    <input type="hidden" name="one" value="1">
    <input type="hidden" name="two" value="2">
    </form>
    <form name="form2" action="post.php" method="POST">
    <input type="hidden" name="three" value="3">
    <input type="hidden" name="four" value="4">
    </form>
    """
    response = Response("http://www.example.com/formname.html",
                        body=page_html)

    r1 = self.request_class.from_response(
        response, formname="form2", callback=lambda x: x)
    self.assertEqual(r1.method, 'POST')

    # Decode the submitted body and check the fields of the second form.
    fs = cgi.FieldStorage(StringIO(r1.body), r1.headers,
                          environ={"REQUEST_METHOD": "POST"})
    for field_name, expected_value in (('three', "3"), ('four', "4")):
        self.assertEqual(fs[field_name].value, expected_value)
def test_from_response_extra_headers(self):
    """Headers passed to ``from_response`` are kept on the form request."""
    page_html = """
    <form action="post.php" method="POST">
    <input type="hidden" name="test" value="val1">
    <input type="hidden" name="test" value="val2">
    <input type="hidden" name="test2" value="xxx">
    </form>
    """
    extra_headers = {"Accept-Encoding": "gzip,deflate"}
    form_values = {'one': ['two', 'three'], 'six': 'seven'}
    response = Response("http://www.example.com/this/list.html",
                        body=page_html)

    r1 = self.request_class.from_response(
        response,
        formdata=form_values,
        headers=extra_headers,
        callback=lambda x: x)

    self.assertEqual(r1.method, 'POST')
    expected_headers = (
        ('Content-type', 'application/x-www-form-urlencoded'),
        ('Accept-Encoding', 'gzip,deflate'),
    )
    for header_name, expected_value in expected_headers:
        self.assertEqual(r1.headers[header_name], expected_value)
def parse(self, response: Response):
    """
    This is an override of a spider method.

    Submits the items extracted from the page to the loader (unless this
    is a dry run) and then follows the "Next page" link.

    :param response:
    :return: generator yielding the request for the next page
    """
    print("Extracting...")
    items = self.post_transform_avro(response)
    if not self.dry_run:
        for item in items:
            self.loader.submit(item)

    url_base = 'http://forums.somethingawful.com/'
    next_links = response.xpath('//a[@title="Next page"]/@href').extract()
    if not next_links:
        # No link means the end of the thread was reached.
        log.debug(str(next_links))
        raise IndexError("No next page for thread!")

    url = url_base + next_links[0]
    log.debug(str(url))

    sleep(0.2)
    # log.debug("Iterating in parse: " + str(url))
    yield scrapy.Request(url, callback=self.parse)
def parse_info(self, response: Response):
    """Collect the chapter information.

    For every chapter link on the page, yields a request that downloads
    the chapter content followed by the ``SegItem`` describing it.
    """
    book_id = response.meta['book_id']
    chapter_links = response.xpath(
        '//div[@class="volume-wrap"]/div').css('.cf li>a')

    for link in chapter_links:
        item = SegItem()
        item['seg_id'] = uuid.uuid4().hex
        item['book_id'] = book_id
        item['seg_title'] = link.css('::text').get()
        item['url'] = 'https:' + link.css('::attr("href")').get()

        # Download the chapter content
        request_meta = {
            'book_id': book_id,
            'seg_id': item['seg_id']
        }
        yield Request(item['url'],
                      callback=self.parse_seg,
                      priority=10,
                      meta=request_meta)
        yield item
def test_process_response_encoding_inside_body(self):
    """A gzipped HTML body is decompressed and its meta charset is used.

    The processed response must be an ``HtmlResponse`` whose body equals
    the uncompressed markup and whose encoding resolves to gb2312.
    """
    headers = {
        'Content-Type': 'text/html',
        'Content-Encoding': 'gzip',
    }
    plainbody = (
        b'<html><head><title>Some page</title>'
        b'<meta http-equiv="Content-Type" content="text/html; charset=gb2312">'
    )

    # Compress the markup in memory; leaving the ``with`` block closes the
    # GzipFile, which flushes the trailer into the buffer.
    f = BytesIO()
    with GzipFile(fileobj=f, mode='wb') as zf:
        zf.write(plainbody)

    # The original URL read "http;//" (semicolon typo).
    response = Response("http://www.example.com/",
                        headers=headers, body=f.getvalue())
    request = Request("http://www.example.com/")

    newresponse = self.mw.process_response(request, response, self.spider)
    assert isinstance(newresponse, HtmlResponse)
    self.assertEqual(newresponse.body, plainbody)
    self.assertEqual(newresponse.encoding, resolve_encoding('gb2312'))
def test_file_path(self):
    """``file_path`` maps each request URL to the expected ``full/...`` path."""
    file_path = self.pipeline.file_path

    # (request URL, extra keyword arguments, expected path), in check order.
    cases = [
        ("https://dev.mydeco.com/mydeco.pdf",
         {},
         'full/c9b564df929f4bc635bdd19fde4f3d4847c757c5.pdf'),
        ("http://www.maddiebrown.co.uk///catalogue-items//image_54642_12175_95307.txt",
         {},
         'full/4ce274dd83db0368bafd7e406f382ae088e39219.txt'),
        ("https://dev.mydeco.com/two/dirs/with%20spaces%2Bsigns.doc",
         {},
         'full/94ccc495a17b9ac5d40e3eabf3afcb8c2c9b9e1a.doc'),
        ("http://www.dfsonline.co.uk/get_prod_image.php?img=status_0907_mdm.jpg",
         {},
         'full/4507be485f38b0da8a0be9eb2e1dfab8a19223f2.jpg'),
        ("http://www.dorma.co.uk/images/product_details/2532/",
         {},
         'full/97ee6f8a46cbbb418ea91502fd24176865cf39b2'),
        ("http://www.dorma.co.uk/images/product_details/2532",
         {},
         'full/244e0dd7d96a3b7b01f54eded250c9e272577aa1'),
        # Same URL again, this time with the optional arguments supplied.
        ("http://www.dorma.co.uk/images/product_details/2532",
         {'response': Response("http://www.dorma.co.uk/images/product_details/2532"),
          'info': object()},
         'full/244e0dd7d96a3b7b01f54eded250c9e272577aa1'),
        ("http://www.dfsonline.co.uk/get_prod_image.php?img=status_0907_mdm.jpg.bohaha",
         {},
         'full/76c00cef2ef669ae65052661f68d451162829507'),
        # NOTE(review): this URL literal looks truncated (it has no scheme
        # and starts with a backslash) -- confirm against the upstream test.
        ("\ //+F0tzCwMK76ZKQ21AMqr7oAAC96JvD5aWM2kvZ78J0N7fmAAC46Y4Ap7y",
         {},
         'full/178059cbeba2e34120a67f2dc1afc3ecc09b61cb.png'),
    ]

    for url, extra_kwargs, expected_path in cases:
        self.assertEqual(file_path(Request(url), **extra_kwargs),
                         expected_path)
def test_process_spider_output(self):
    """Check the generator returned by ``process_spider_output``.

    First part: the yielded objects are compared by type against the
    input list, and every yielded ``DictItem`` must carry the fingerprint
    of the source request under ``_cached_page_id``.
    Second part: an item whose ``_values`` is ``None`` must raise
    ``TypeError`` and trigger ``process_spider_exception``.
    """
    fake_response = mock.Mock()
    fake_response.request = Request('http://source-request')
    # NOTE(review): sorted() on mixed Request/Response/DictItem objects
    # requires them to be orderable -- confirm on the targeted Python version.
    fake_result = sorted([Request('ftp://req1'), Request('https://req2'),
                          Response('http://source-request'), DictItem()])
    results = self.instance.process_spider_output(
        fake_response, fake_result, self.spider)
    assert isinstance(results, types.GeneratorType)
    for r in results:
        # pop(0) consumes the input list in step with the yielded results;
        # the same list object was handed to the middleware above.
        assert isinstance(r, type(fake_result.pop(0)))
        if isinstance(r, DictItem):
            self.assertEqual(
                r["_cached_page_id"],
                request_fingerprint(fake_response.request))

    # An item with its values removed is expected to break processing.
    bad_fake_request = DictItem()
    bad_fake_request._values = None
    # Replace the exception hook so the call can be observed below.
    self.instance.process_spider_exception = mock.Mock()
    with self.assertRaises(TypeError):
        for _ in self.instance.process_spider_output(
                fake_response, [bad_fake_request], self.spider):
            pass
    assert self.instance.process_spider_exception.called
def _get_crawler(self):
    """Return a mocked crawler whose engine serves a fixed robots.txt.

    Along the way it asserts that ``RobotsTxtMiddleware`` raises
    ``NotConfigured`` while ``ROBOTSTXT_OBEY`` has not been set. The
    crawler's ``engine.download`` returns a ``Deferred`` that is fired
    with the robots.txt response from the reactor thread.
    """
    crawler = mock.MagicMock()
    crawler.settings = Settings()
    crawler.settings.set('USER_AGENT', 'CustomAgent')
    self.assertRaises(NotConfigured, RobotsTxtMiddleware, crawler)
    crawler.settings.set('ROBOTSTXT_OBEY', True)
    crawler.engine.download = mock.MagicMock()

    # Strip the leading whitespace of every line. The inline flag must come
    # first in the pattern: a trailing "(?m)" is deprecated since
    # Python 3.6 and an error since 3.11.
    ROBOTS = re.sub(r'(?m)^\s+', '', '''
    User-Agent: *
    Disallow: /admin/
    Disallow: /static/
    ''')
    response = Response('http://site.local/robots.txt', body=ROBOTS)

    def return_response(request, spider):
        # Fire the deferred with the canned robots.txt response.
        deferred = Deferred()
        reactor.callFromThread(deferred.callback, response)
        return deferred

    crawler.engine.download.side_effect = return_response
    return crawler
def get_response_for_testing(callback: Callable) -> Response:
    """
    Build a fake product-page response bound to ``callback``.

    The response wraps a request for http://example.com whose callback is
    the given one. It is useful for testing providers.
    """
    url = "http://example.com"
    page_source = """
    <html>
        <body>
            <div class="breadcrumbs">
                <a href="/food">Food</a> /
                <a href="/food/sweets">Sweets</a>
            </div>
            <h1 class="name">Chocolate</h1>
            <p>Price: <span class="price">22€</span></p>
            <p class="description">The best chocolate ever</p>
        </body>
    </html>
    """
    html = page_source.encode("utf-8")
    request = Request(url, callback=callback)
    return Response(url, 200, None, html, request=request)
def parse_external_link(
    response: Response,
    link_text: str,
    resource_language_service: IResourceLanguageService,
    spider_name: str,
    language: Optional[Language] = None,
) -> ExternalResource:
    """Yield one ``ExternalResource`` for a successfully fetched link.

    Nothing is yielded unless the status is 200. For HTML responses the
    title comes from the page and the language attributes are resolved
    through ``resource_language_service``; any other response falls back
    to the link text as its title.
    """
    if response.status != 200:
        return

    if language is None:
        iso_code = None
        language_id = None
    else:
        iso_code = language.id
        language_id = language.airtable_id

    # Only HTML pages carry language attributes and a title of their own.
    html_only_fields = {}
    if isinstance(response, HtmlResponse):
        lang_attrs = LangAttributeParser.get_lang_values(response)
        resource_language_ids = resource_language_service.get_resource_language_ids(
            lang_attrs)
        title = response.css(TITLE_SELECTOR).get()
        html_only_fields = {
            'resource_languages': resource_language_ids,
            'resource_languages_raw': lang_attrs,
        }
    else:
        title = link_text

    yield ExternalResource(
        title=title,
        link_text=link_text,
        url=response.url,
        iso_code=iso_code,
        language_id=language_id,
        spider_name=spider_name,
        **html_only_fields
    )
def parse_sku(self, response: Response):
    """Build one ``SKU`` from a product page and yield it.

    Name, price (CNY), colour, composition lines and sizes are read from
    the page; the product code is the last URL segment without its
    ``.html`` suffix, upper-cased; image URLs are the quoted strings
    containing 'Large' in the second body script.

    :param response: the product page response
    :return: generator yielding a single ``SKU``
    """
    attrs = []

    name = response.css(
        'p.product-details-section-name__description::text').get().strip()

    price_cny = response.css(
        'div.product-details-section-price__sell-price::text').get().strip(
        ).strip('¥ ').replace(',', '')
    price = {'cny': float(price_cny)}

    color = response.css(
        'span.product-details-section-color__checked-attribute-value::text'
    ).get()
    attrs.append({
        'name': '颜色',
        'value': color,
    })

    composition = response.css(
        'div.product-details-section-description__text::text').getall()
    for s in composition:
        attrs.append({'name': '参数', 'value': s})

    # str.strip('.html') removes any of the characters ".", "h", "t", "m",
    # "l" from both ends and can eat part of the code itself; cut off the
    # literal suffix instead.
    page_name = response.url.split('/')[-1]
    if page_name.endswith('.html'):
        page_name = page_name[:-len('.html')]
    code = page_name.upper()

    page_data = response.xpath('/html/body/script[2]//text()').get()
    image_urls = [
        item for item in re.findall(r'"(.+?)"', page_data)
        if 'Large' in item
    ]
    image_urls = utils.list_unique(image_urls)
    # The URLs are embedded with backslash escapes; decode them.
    image_urls = [
        url.encode('utf-8').decode('unicode_escape') for url in image_urls
    ]

    sizes = response.css('div.component-size-option::text').getall()
    sizes = [size.strip() for size in sizes]
    if sizes:
        attrs.append({'name': '尺寸', 'value': ', '.join(sizes)})

    sku = SKU(self.brand_name, '', '', code, name, response.url, price, '',
              image_urls, attrs)
    yield sku
def test_css_item_emission(spider, linked_css_request, css_headers, mock_css):
    """CSS items are emitted correctly"""
    # A single user agent keeps the number of emitted items easy to count
    only_agent = factories.BatchUserAgentFactory(ua_string='Firefox / 11.0')
    spider.user_agents = [only_agent]

    # Build the CSS response the spider will be asked to parse
    mock_url = 'http://test:12345/default.css'
    mock_response = Response(mock_url, body=mock_css)
    mock_response.request = linked_css_request
    mock_response.headers = css_headers
    mock_response.status = 200
    mock_response.encoding = u'ascii'
    mock_response.flags = []

    # The urlscan record the emitted item is expected to point at
    sitescan = linked_css_request.meta['sitescan']
    mock_urlscan = model.URLScan.objects.create(
        site_scan=sitescan,
        page_url_hash=sha256("http://test:12345/").hexdigest(),
        page_url=mock_response.url,
        timestamp=spider.get_now_time())

    # Hand the response to the spider
    pipeline_generator = spider.parse(mock_response)

    # Describe the item we expect back, field by field
    expected_fields = {
        'content_type': spider.get_content_type(css_headers),
        'filename': os.path.basename(urlparse(mock_url).path),
        'headers': unicode(css_headers),
        'meta': mock_response.meta,
        'raw_content': mock_response.body,
        'sitescan': linked_css_request.meta['sitescan'],
        'urlscan': mock_urlscan,
        'url': mock_response.url,
        'user_agent': mock_response.meta['user_agent'],
    }
    item_expected = MarkupItem()
    for field, value in expected_fields.items():
        item_expected[field] = value

    # Everything the spider yields must be a MarkupItem; the last one is
    # the item compared against the expectation.
    item_collected = None
    for item in pipeline_generator:
        assert isinstance(item, MarkupItem)
        item_collected = item

    assert item_expected == item_collected
def test_js_item_emission(spider, linked_js_request, js_headers, mock_js):
    """JS items are emitted correctly"""
    # Build the JS response the spider will be asked to parse
    mock_url = 'http://test:12345/default.js'
    mock_response = Response(mock_url, body=mock_js)
    mock_response.request = linked_js_request
    mock_response.headers = js_headers
    mock_response.status = 200
    mock_response.encoding = u'ascii'
    mock_response.flags = []

    # The urlscan record the emitted item is expected to point at
    sitescan = linked_js_request.meta['sitescan']
    mock_urlscan = model.URLScan.objects.create(
        site_scan=sitescan,
        page_url_hash=sha256("http://test:12345/").hexdigest(),
        page_url=mock_response.url,
        timestamp=spider.get_now_time())

    # Hand the response to the spider
    pipeline_generator = spider.parse(mock_response)

    # Describe the single item we expect back, field by field
    expected_fields = {
        'content_type': spider.get_content_type(js_headers),
        'filename': os.path.basename(urlparse(mock_url).path),
        'headers': unicode(js_headers),
        'meta': mock_response.meta,
        'raw_content': mock_response.body,
        'sitescan': linked_js_request.meta['sitescan'],
        'urlscan': mock_urlscan,
        'url': mock_response.url,
        'user_agent': mock_response.meta['user_agent'],
        'redirected_from': '',
    }
    item_expected = MarkupItem()
    for field, value in expected_fields.items():
        item_expected[field] = value

    emitted_items = list(pipeline_generator)
    assert emitted_items == [item_expected]