def test_skip_elems_by_selector():
    """
    should test for nested elements that should have been removed,
    i.e. if a class='link' element should be skipped when inside a
    parent element that is not skipped
    """
    spider = serialize.SerializeSpider()
    fake_response = TextResponse(encoding='utf-8', url='https://doc.scrapy.org')
    spider.exclude_selectors = ['.skip-elem']
    with open('test/data/skip_by_selector.html', mode='rb') as fd:
        body = fd.read()
    fake_response._set_body(body)
    item = spider.parse(fake_response)
    text = next(item).get('text').strip()
    with open('test/data/skip_by_selector.txt') as fd:
        test_text = fd.read().strip()
    # ignore trailing whitespace
    text = re.sub(r'\n\s', '\n', text).strip()
    test_text = re.sub(r'\n\s', '\n', test_text).strip()
    assert text == test_text
def test_block_elem_with_children():
    """
    tests 2 block elems with children; ignores trailing whitespace
    """
    spider = serialize.SerializeSpider()
    fake_response = TextResponse(encoding='utf-8', url='https://doc.scrapy.org')
    with open('test/data/block_elem.html', mode='rb') as fd:
        body = fd.read()
    fake_response._set_body(body)
    item = spider.parse(fake_response)
    text = next(item).get('text')
    with open('test/data/block_elem.txt') as fd:
        test_text = fd.read()
    # ignore trailing whitespace
    text = re.sub(r'\n\s', '\n', text).strip()
    test_text = re.sub(r'\n\s', '\n', test_text).strip()
    assert text == test_text
def test_skip_elems_by_tag():
    """
    make sure elems are skipped by tag name, e.g. script
    """
    spider = serialize.SerializeSpider()
    fake_response = TextResponse(encoding='utf-8', url='https://doc.scrapy.org')
    spider.exclude_tags.append('footer')
    with open('test/data/skip_by_tag.html', mode='rb') as fd:
        body = fd.read()
    fake_response._set_body(body)
    item = spider.parse(fake_response)
    text = next(item).get('text').strip()
    with open('test/data/skip_by_tag.txt') as fd:
        test_text = fd.read().strip()
    # ignore trailing whitespace
    text = re.sub(r'\n\s', '\n', text).strip()
    test_text = re.sub(r'\n\s', '\n', test_text).strip()
    assert text == test_text
def inspect_spider(s):
    news = s()
    try:
        req1 = list(news.start_requests())[0]
        html1 = requests.get(req1.url).content
        response1 = TextResponse(url=req1.url, body=html1, encoding='utf-8')
        req2 = list(news.parse(response1))[0]
        html2 = requests.get(req2.url).content
        response2 = TextResponse(url=req2.url, body=html2, encoding='utf-8')
        for d in news.parse_descr(response2):
            print("One course description you found is:", d)
            break
    except Exception:
        print("Oh no! Something is wrong with the code. Keep trying!")
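# inspect_spider() expects a spider class that implements start_requests(),
# parse() (yielding follow-up requests) and parse_descr(). A minimal hypothetical
# spider satisfying that interface, for illustration only (name, URL and
# selectors are assumptions, not from the original project):
import scrapy

class CourseSpider(scrapy.Spider):
    name = 'courses'

    def start_requests(self):
        yield scrapy.Request('https://example.org/courses')

    def parse(self, response):
        # follow each course link on the listing page (selector is assumed)
        for href in response.css('a.course::attr(href)').getall():
            yield response.follow(href)

    def parse_descr(self, response):
        # yield the course description text (selector is assumed)
        yield response.css('div.description::text').get()

# inspect_spider(CourseSpider) would then fetch the pages with `requests`
# and print one course description.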
def parse(self, response: TextResponse):
    items = response.css('ul.sellListContent li')
    for li in items:
        item = ScrapyLianjiaErshoufangItem()
        item['title'] = li.css('div.title a::text').get().replace(':', '').replace(',', ' ').replace('\n', '')
        house_infos = li.css('div.address .houseInfo::text').re(
            r'\|\s+(.*)\s+\|\s+(.*)平米\s+\|\s+(.*)\s+\|\s+(.*)\s+\|\s+(.*)')
        item['room'] = house_infos[0]
        item['area'] = house_infos[1]
        item['orientation'] = house_infos[2]
        item['decoration'] = house_infos[3]
        item['elevator'] = house_infos[4]
        item['xiaoqu'] = li.css('div.address a::text').get()
        item['flood'] = li.css('div.flood .positionInfo::text').get().replace('-', '').strip()
        item['location'] = li.css('div.flood .positionInfo a::text').get()
        follow_infos = li.css('div.followInfo::text').re(r'(.*)人关注\s+/\s+共(.*)次带看\s+/\s+(.*)发布')
        item['follow_number'] = follow_infos[0]
        item['look_number'] = follow_infos[1]
        item['pub_duration'] = follow_infos[2]
        item['total_price'] = li.css('div.priceInfo div.totalPrice span::text').get()
        unit_price = li.css('div.priceInfo .unitPrice span::text').re(r'单价(.*)元/平米')
        item['unit_price'] = unit_price[0]
        item['total_unit'] = li.css('div.totalPrice::text').get()
        item['crawl_time'] = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        item['house_id'] = self.genearteMD5(''.join((
            str(item['title']), str(item['room']), str(item['area']),
            str(item['orientation']), str(item['elevator']), str(item['xiaoqu']),
            str(item['flood']), str(item['location']))))
        yield item
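# parse() above relies on a self.genearteMD5() helper that is not shown here.
# A minimal sketch, assuming it simply returns the hex MD5 digest of the joined
# listing fields (in the real spider this would be a method on the spider class;
# the misspelled name is kept as it appears at the call site):
import hashlib

def genearteMD5(self, text):
    # deterministic house_id derived from the concatenated listing fields
    return hashlib.md5(text.encode('utf-8')).hexdigest()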
def test_text_cdr_item():
    response = TextResponse(
        url='http://example.com',
        headers={
            'Content-Type': 'text/plain',
            'another-header': 'text/complain, text/explain',
        },
        body=b'a body',
        encoding='utf8')
    item = text_cdr_item(response, crawler_name='crawler', team_name='team')
    item = dict(item)
    item_id = item.pop('_id')  # type: str
    assert item_id.isupper()
    check_timestamp_crawl(item)
    assert dict(item) == {
        'content_type': 'text/plain',
        'crawler': 'crawler',
        'objects': [],
        'raw_content': 'a body',
        'response_headers': {
            'content-type': 'text/plain',
            'another-header': 'text/complain, text/explain',
        },
        'team': 'team',
        'url': 'http://example.com',
        'version': 3.1,
    }
def fakeResponseFromFile(file_name, url=None):
    if not url:
        url = 'http://www.example.com'
    file_path = getAbsolutePath(file_name)
    with open(file_path, 'r') as fd:
        file_content = fd.read()
    # a str body needs an explicit encoding, otherwise TextResponse raises TypeError
    response = TextResponse(url=url, request=Request(url=url), body=file_content, encoding='utf-8')
    return response
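# Example of how the helper above might be used in a test, assuming an HTML
# fixture named 'samples/example_page.html' exists (the fixture name and the
# selector are hypothetical, not from the original project):
def test_fake_response_from_file():
    response = fakeResponseFromFile('samples/example_page.html')
    assert response.url == 'http://www.example.com'
    # the forged TextResponse supports the normal Scrapy selector API
    assert response.css('title::text').get() is not None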
def test_stats(mocked_time):
    middleware = get_test_middleware()
    spider = Spider("foo")

    count = 100
    nums = list(range(count))
    random.shuffle(nums)
    status_list = [random.randint(1, 15) for _ in range(count)]
    method_list = [random.choice(["GET", "POST", "PUT", "DELETE", "HEAD"]) for _ in range(count)]

    # expected values
    latencies = [2**n - n for n in nums]
    total_latency = sum(latencies)
    avg_latency = total_latency / count
    max_latency = max(latencies)

    for n, status, method in zip(nums, status_list, method_list):
        request = Request("https://example.org", method=method)
        mocked_time.return_value = n  # start_ts
        processed_request = middleware.process_request(request, spider)
        response = TextResponse(
            url="https://example.org",
            request=processed_request,
            body=json.dumps({
                "headers": {},
                "original_status": status,
                "body": "",
                "url": "http://",
            }).encode("utf-8"),
        )
        mocked_time.return_value = 2**n  # end_ts
        middleware.process_response(processed_request, response, spider)

    middleware.spider_closed(spider, "finished")

    assert middleware.stats.get_value("crawlera_fetch/request_count") == count
    assert middleware.stats.get_value("crawlera_fetch/response_count") == count
    assert middleware.stats.get_value("crawlera_fetch/total_latency") == total_latency
    assert middleware.stats.get_value("crawlera_fetch/avg_latency") == avg_latency
    assert middleware.stats.get_value("crawlera_fetch/max_latency") == max_latency

    for status in set(status_list):
        sc = middleware.stats.get_value("crawlera_fetch/response_status_count/{}".format(status))
        assert sc == status_list.count(status)
    for method in set(method_list):
        mc = middleware.stats.get_value("crawlera_fetch/request_method_count/{}".format(method))
        assert mc == method_list.count(method)
def process_request(self, request, spider):
    # This handles one URL at a time, so concurrency is lost; how can it be improved?
    if spider.name == 'suning_phone' and request.meta.get('use_selenium'):
        html = self.fetch_dynamic_html(request.url)
        return TextResponse(request.url, encoding='utf-8', body=html, request=request)
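# fetch_dynamic_html() is not shown above. A minimal sketch, assuming it drives
# a headless Chrome instance with Selenium and returns the rendered page source
# (the driver setup and lack of an explicit wait are assumptions, not the
# original code):
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

def fetch_dynamic_html(self, url):
    options = Options()
    options.add_argument('--headless')
    driver = webdriver.Chrome(options=options)
    try:
        driver.get(url)
        # page_source contains the DOM after JavaScript has run
        return driver.page_source
    finally:
        driver.quit()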
def parse_exercise(self, response: Response):
    # `ItemLoader` will only accept (subclasses of) `TextResponse`, so we forge a
    # `TextResponse` with everything of the actual response except `body`.
    response_copy = TextResponse(
        url=response.url,
        status=response.status,
        headers=response.headers,
        flags=response.flags,
        request=response.request,
    )
    return self.exercise_loader.parse(response_copy)
def test_feed_url(self):
    url = 'http://example.com/feed'
    feed = FeedGenerator(lambda: 0)
    # a str body needs an explicit encoding, otherwise TextResponse raises TypeError
    response = TextResponse(url, encoding='utf-8', body=(
        'http://example.com/1\r'
        'http://example.com/2\r\n'
        'http://example.com/3\n\r'
        'http://example.com/4\n'))
    self.assertEqual([r.url for r in feed.parse_urls(response)], [
        'http://example.com/1',
        'http://example.com/2',
        'http://example.com/3',
        'http://example.com/4',
    ])
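# FeedGenerator.parse_urls() is exercised above but not shown. A minimal sketch,
# assuming it simply yields one Request per non-empty line of the response body;
# this reproduces the behaviour the test expects for \r, \r\n, \n\r and \n line
# endings, but is not the original implementation:
from scrapy import Request

def parse_urls(self, response):
    for line in response.text.splitlines():
        line = line.strip()
        if line:
            yield Request(line)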
def test_process_response_skip():
    response = TextResponse(
        url="https://example.org",
        status=200,
        headers={
            "Content-Encoding": "gzip",
            "Transfer-Encoding": "chunked",
            "Date": "Fri, 24 Apr 2020 18:06:42 GMT",
        },
        request=Request(
            url="https://example.org",
            meta={"crawlera_fetch": {"skip": True}},
        ),
        body=b"""<html></html>""",
    )
    middleware = get_test_middleware()
    processed = middleware.process_response(response.request, response, Spider("foo"))
    assert response is processed
def test_write(self):
    self.response = TextResponse(url=self.url, body="OK".encode("utf-8"))
    resp = self.cfr.process_response(self.request, self.response, self.spider)
    self.assertIsInstance(resp, TextResponse)
def test_process_response_error():
    response_list = [
        TextResponse(
            url="https://crawlera.com/fake/api/endpoint",
            request=Request(
                url="https://crawlera.com/fake/api/endpoint",
                meta={
                    "crawlera_fetch": {
                        "timing": {"start_ts": mocked_time()},
                        "original_request": request_to_dict(
                            Request("https://example.org"),
                            spider=foo_spider,
                        ),
                    }
                },
            ),
            headers={
                "X-Crawlera-Error": "bad_proxy_auth",
                "Proxy-Authenticate": 'Basic realm="Crawlera"',
                "Content-Length": "0",
                "Date": "Mon, 04 May 2020 13:06:15 GMT",
                "Proxy-Connection": "close",
                "Connection": "close",
            },
        ),
        TextResponse(
            url="https://crawlera.com/fake/api/endpoint",
            request=Request(
                url="https://crawlera.com/fake/api/endpoint",
                meta={
                    "crawlera_fetch": {
                        "timing": {"start_ts": mocked_time()},
                        "original_request": request_to_dict(
                            Request("https://example.org"),
                            spider=foo_spider,
                        ),
                    }
                },
            ),
            body=b'{"Bad": "JSON',
        ),
        TextResponse(
            url="https://crawlera.com/fake/api/endpoint",
            request=Request(
                url="https://crawlera.com/fake/api/endpoint",
                meta={
                    "crawlera_fetch": {
                        "timing": {"start_ts": mocked_time()},
                        "original_request": request_to_dict(
                            Request("https://example.org"),
                            spider=foo_spider,
                        ),
                    }
                },
            ),
            body=json.dumps({
                "url": "https://example.org",
                "original_status": 503,
                "headers": {},
                "crawlera_status": "fail",
                "crawlera_error": "serverbusy",
                "body_encoding": "plain",
                "body": "Server busy: too many outstanding requests",
            }),
            encoding="utf8",
        ),
    ]

    middleware_raise = get_test_middleware(settings={"CRAWLERA_FETCH_RAISE_ON_ERROR": True})
    for response in response_list:
        with pytest.raises(CrawleraFetchException):
            middleware_raise.process_response(response.request, response, foo_spider)

    assert middleware_raise.stats.get_value("crawlera_fetch/response_error") == 3
    assert middleware_raise.stats.get_value("crawlera_fetch/response_error/bad_proxy_auth") == 1
    assert middleware_raise.stats.get_value("crawlera_fetch/response_error/JSONDecodeError") == 1
    assert middleware_raise.stats.get_value("crawlera_fetch/response_error/serverbusy") == 1

    middleware_log = get_test_middleware(settings={"CRAWLERA_FETCH_RAISE_ON_ERROR": False})
    with LogCapture() as logs:
        for response in response_list:
            processed = middleware_log.process_response(response.request, response, foo_spider)
            assert response is processed

    logs.check_present(
        (
            "crawlera-fetch-middleware",
            "WARNING",
            "Error downloading <GET https://example.org> (status: 200, X-Crawlera-Error header: bad_proxy_auth)",  # noqa: E501
        ),
        (
            "crawlera-fetch-middleware",
            "WARNING",
            "Error decoding <GET https://example.org> (status: 200, message: Unterminated string starting at, lineno: 1, colno: 9)",  # noqa: E501
        ),
        (
            "crawlera-fetch-middleware",
            "WARNING",
            "Error downloading <GET https://example.org> (Original status: 503, Fetch API error message: Server busy: too many outstanding requests, Request ID: unknown)",  # noqa: E501
        ),
    )
    assert middleware_log.stats.get_value("crawlera_fetch/response_error") == 3
    assert middleware_log.stats.get_value("crawlera_fetch/response_error/bad_proxy_auth") == 1
    assert middleware_log.stats.get_value("crawlera_fetch/response_error/JSONDecodeError") == 1
    assert middleware_log.stats.get_value("crawlera_fetch/response_error/serverbusy") == 1
                },
                "original_request": request_to_dict(
                    Request("https://fake.host.com"),
                    spider=foo_spider,
                ),
            }
        },
    ),
    body=b"""{"url":"https://fake.host.com","original_status":123,"headers":{"fake-header":"true"},"body":"foobar"}""",  # noqa: E501
),
"expected": TextResponse(
    url="https://fake.host.com",
    status=123,
    headers={"Fake-Header": "true"},
    body=b"""foobar""",  # noqa: E501
),
})
test_responses.append({
    "original": HtmlResponse(
        url=SETTINGS["CRAWLERA_FETCH_URL"],
        status=200,
        headers={
            "Content-Type": "application/json",
            "Content-Encoding": "gzip",
            "Transfer-Encoding": "chunked",
            "Date": "Fri, 24 Apr 2020 18:06:42 GMT",
            "Proxy-Connection": "close",
def getResponse(self, url, browser):
    res = TextResponse(url, body=browser.page_source.encode("utf-8"))
    return res
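# Example of how getResponse() might be combined with a Selenium-driven browser
# (the Firefox driver, target URL and selector are assumptions for illustration
# only, not part of the original code):
from selenium import webdriver

def scrape_with_browser(self, url):
    browser = webdriver.Firefox()
    try:
        browser.get(url)
        response = self.getResponse(url, browser)
        # the forged TextResponse can now be queried with Scrapy selectors
        return response.css('title::text').get()
    finally:
        browser.quit()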
def test_process_response_error():
    response_list = [
        TextResponse(
            url="https://crawlera.com/fake/api/endpoint",
            request=Request(
                url="https://crawlera.com/fake/api/endpoint",
                meta={
                    "crawlera_fetch": {
                        "timing": {"start_ts": mocked_time()},
                        "original_request": {"url": "https://example.org", "method": "GET"},
                    }
                },
            ),
            headers={
                "X-Crawlera-Error": "bad_proxy_auth",
                "Proxy-Authenticate": 'Basic realm="Crawlera"',
                "Content-Length": "0",
                "Date": "Mon, 04 May 2020 13:06:15 GMT",
                "Proxy-Connection": "close",
                "Connection": "close",
            },
        ),
        TextResponse(
            url="https://crawlera.com/fake/api/endpoint",
            request=Request(
                url="https://crawlera.com/fake/api/endpoint",
                meta={
                    "crawlera_fetch": {
                        "timing": {"start_ts": mocked_time()},
                        "original_request": {"url": "https://example.org", "method": "GET"},
                    }
                },
            ),
            body=b'{"Bad": "JSON',
        ),
    ]

    middleware_raise = get_test_middleware(settings={"CRAWLERA_FETCH_RAISE_ON_ERROR": True})
    for response in response_list:
        with pytest.raises(CrawleraFetchException):
            middleware_raise.process_response(response.request, response, Spider("foo"))

    assert middleware_raise.stats.get_value("crawlera_fetch/response_error") == 2
    assert middleware_raise.stats.get_value("crawlera_fetch/response_error/bad_proxy_auth") == 1
    assert middleware_raise.stats.get_value("crawlera_fetch/response_error/JSONDecodeError") == 1

    middleware_log = get_test_middleware(settings={"CRAWLERA_FETCH_RAISE_ON_ERROR": False})
    with LogCapture() as logs:
        for response in response_list:
            processed = middleware_log.process_response(response.request, response, Spider("foo"))
            assert response is processed

    logs.check_present(
        (
            "crawlera-fetch-middleware",
            "ERROR",
            "Error downloading <GET https://example.org> (status: 200, X-Crawlera-Error header: bad_proxy_auth)",  # noqa: E501
        ),
        (
            "crawlera-fetch-middleware",
            "ERROR",
            "Error decoding <GET https://example.org> (status: 200, message: Unterminated string starting at, lineno: 1, colno: 9)",  # noqa: E501
        ),
    )
    assert middleware_log.stats.get_value("crawlera_fetch/response_error") == 2
    assert middleware_log.stats.get_value("crawlera_fetch/response_error/bad_proxy_auth") == 1
    assert middleware_log.stats.get_value("crawlera_fetch/response_error/JSONDecodeError") == 1
def test_get_review_items_from_rdfa_using_alphr_syntax(self):
    # This is an extract from http://alphr.com/go/1006047 on 2017/08/08
    html_text = '''
    <body id="pid-htc-1006047-htc-u11-review-htcs-flagship-is-a-squeezy-pleaser" class="html not-front not-logged-in page-node page-node- page-node-1006047 node-type-review one-sidebar sidebar-second narrow-stacked snap" prefix="v: http://rdf.data-vocabulary.org/# schema: http://schema.org/">
      <div id="main" class="page-main-area" typeof="schema:Review">
        <a id="main-content-area"></a>
        <main id="group-content" class="group group-content" >
          <div id="page_title_content">
            <h1 id="page-title" class="page-title title">HTC U11 review: HTC's flagship is a squeezy pleaser</h1>
            <span property="schema:headline" content="HTC U11 review: HTC's flagship is a squeezy pleaser " class="rdf-meta element-hidden"></span>
          </div>
          <div id="content" class="region region-content content">
            <div id="block-system-main" class="block block-system">
              <div class="content">
                <div class="node node-review odd node-full">
                  <div class="content">
                    <span property="schema:itemReviewed" content="HTC U11"></span>
                    <div class="field field-name-kicker field-label-inline">
                      <div class="field-items"><a href="/htc">HTC</a></div>
                    </div>
                    <h2 class="short-teaser" property="schema:description">Shiny but pricey; HTC once again falls into the Samsung comparison trap</h2>
                    <div class="field-group-format group_meta required-fields group-meta">
                      <span class="field field-name-field-author field-type-node-reference field-label-hidden">
                        <span class="field-item even" property="schema:author" typeof="schema:Person">
                          <div class="node node-author node-sticky even node-inline-block" >
                            <div class="content" >
                              <div class="field field-name-field-author-first-name field-type-text field-label-hidden">
                                <div class="field-items">
                                  <div class="field-item even"><span property="schema:name"><a href="http://www.alphr.com/authors/alan-martin" title="Alan Martin" class="author-link" property="schema:url">Alan Martin</a></span></div>
                                </div>
                              </div>
                              <div class="field field-name-field-twitter-username field-type-text field-label-hidden">
                                <div class="field-items">
                                  <div class="field-item even"><a href="http://www.twitter.com/alan_p_martin" class="follow-button-twitter" target="_blank" title="Follow on Twitter" rel="">@alan_p_martin</a></div>
                                </div>
                              </div>
                            </div>
                          </div>
                        </span>
                      </span>
                      <div class="field-name-field-published-date" ><span class="date-display-single" property="schema:datePublished" content="2017-07-12" datatype="xsd:dateTime">12 Jul 2017</span></div>
                    </div>
                    <div class="field field-name-field-review-score-overall field-type-fivestar field-label-hidden" property="schema:reviewRating" typeof="schema:Rating"><div class="field-items">
                      <div class="field-item even" property="schema:ratingValue" content="5"></div>
                    </div>
    </body>
    '''
    # a str body needs an explicit encoding, otherwise TextResponse raises TypeError
    response = TextResponse(url='http://alphr.com/go/1006047', body=html_text, encoding='utf-8')
    rdfa_items = extruct_helper.extract_all_rdfa(response)
    self.assertIsNotNone(rdfa_items)
    review = extruct_helper.get_review_items_from_rdfa(response, rdfa_items)
    self.assertEqual(len(review), 1)
    review = review[0]
    self.assertEqual(review['ProductName'], 'HTC U11')
    self.assertEqual(review['Author'], 'Alan Martin')
    self.assertEqual(review['TestDateText'], '2017-07-12')
    self.assertEqual(review['TestTitle'], "HTC U11 review: HTC's flagship is a squeezy pleaser")
    self.assertEqual(
        review['TestSummary'],
        'Shiny but pricey; HTC once again falls into the Samsung comparison trap')
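# extruct_helper is project-specific and not shown. For reference, the underlying
# RDFa extraction could be done with the `extruct` library roughly like this
# (a sketch under that assumption, not the project's actual helper):
import extruct

def extract_all_rdfa(response):
    # returns the list of RDFa items found in the page
    data = extruct.extract(response.text, base_url=response.url, syntaxes=['rdfa'])
    return data['rdfa']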