def test_header_early_close(self):
    stream = self.new_stream()
    request = Request(self.get_url('/header_early_close'))

    try:
        yield from self.fetch(stream, request)
    except NetworkError:
        pass
    else:
        self.fail()  # pragma: no cover

    # The stream must recover from the aborted exchange: a follow-up
    # request on the same stream should succeed.
    request = Request(self.get_url('/'))
    yield from self.fetch(stream, request)
def test_html_detect(self):
    # File sniffing: detection must work across encodings and on fragments.
    self.assertTrue(HTMLReader.is_file(
        io.BytesIO('<html><body>hi</body></html>'.encode('utf-16le'))
    ))
    self.assertFalse(HTMLReader.is_file(
        io.BytesIO('hello world!'.encode('utf-16le'))
    ))
    self.assertTrue(HTMLReader.is_file(
        io.BytesIO(b'<title>hello</title>hi')
    ))
    self.assertTrue(HTMLReader.is_file(
        io.BytesIO(b'<html><body>hello')
    ))
    self.assertTrue(HTMLReader.is_file(
        io.BytesIO(
            b'The document has moved <a href="somewhere.html">here</a>'
        )
    ))

    # URL heuristics: common HTML file extensions are recognized.
    self.assertTrue(
        HTMLReader.is_url(URLInfo.parse('example.com/index.htm'))
    )
    self.assertTrue(
        HTMLReader.is_url(URLInfo.parse('example.com/index.html'))
    )
    self.assertTrue(
        HTMLReader.is_url(URLInfo.parse('example.com/index.dhtm'))
    )
    self.assertTrue(
        HTMLReader.is_url(URLInfo.parse('example.com/index.xhtml'))
    )
    self.assertTrue(
        HTMLReader.is_url(URLInfo.parse('example.com/index.xht'))
    )
    self.assertFalse(
        HTMLReader.is_url(URLInfo.parse('example.com/image.jpg'))
    )

    self.assertTrue(
        HTMLReader.is_request(Request('example.com/index.html'))
    )
    self.assertFalse(
        HTMLReader.is_request(Request('example.com/image.jpg'))
    )

    # Response detection keys off the Content-Type header.
    response = Response(200, 'OK')
    response.fields['Content-Type'] = 'text/html'
    self.assertTrue(HTMLReader.is_response(response))

    response = Response(200, 'OK')
    response.fields['Content-Type'] = 'image/png'
    self.assertFalse(HTMLReader.is_response(response))
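# A hedged companion check, not part of the original suite: assuming
# is_response() matches on the media type rather than the exact header
# value, a Content-Type carrying a charset parameter should still be
# detected as HTML.
def test_html_detect_content_type_with_charset(self):
    response = Response(200, 'OK')
    response.fields['Content-Type'] = 'text/html; charset=utf-8'
    self.assertTrue(HTMLReader.is_response(response))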
def test_javascript_heavy_inline_monstrosity(self):
    scraper = JavaScriptScraper()
    request = Request('http://example.com/test.js')
    response = Response(200, 'OK')
    response.body = Body()

    with wpull.util.reset_file_offset(response.body):
        html_file_path = os.path.join(ROOT_PATH,
                                      'testing', 'samples',
                                      'twitchplayspokemonfirered.html')
        with open(html_file_path, 'rb') as in_file:
            # Skip the leading bytes of the sample so the response body
            # starts at the inline script content.
            in_file.seek(0x147)
            shutil.copyfileobj(in_file, response.body)

    scrape_result = scraper.scrape(request, response)
    inline_urls = scrape_result.inline_links
    linked_urls = scrape_result.linked_links

    self.assertIn(
        'http://cdn.bulbagarden.net/upload/archive/a/a4/'
        '20090718115357%21195Quagsire.png',
        inline_urls
    )
    self.assertIn(
        'http://www.google.com/url?q=http%3A%2F%2Fwww.reddit.com%2F'
        'user%2FGoldenSandslash15&sa=D&sntz=1&'
        'usg=AFQjCNElFBxZYdNm5mWoRSncf5tbdIJQ-A',
        linked_urls
    )

    print('\n'.join(inline_urls))
    print('\n'.join(linked_urls))
def test_http_request(self):
    request = Request('http://example.com')
    request.fields['hello'] = 'world'
    new_request = convert_http_request(request)

    self.assertEqual('example.com', new_request.host)
    # Header lookup on the converted request is case-insensitive:
    # 'hello' was set, 'Hello' is read back.
    self.assertEqual('world', new_request.get_header('Hello'))
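# A hedged companion sketch, not from the original suite: assuming the
# converted request mirrors urllib.request.Request semantics, where .host
# includes a non-default port. The port and '/path' URL are illustrative
# assumptions only.
def test_http_request_with_port(self):
    request = Request('http://example.com:8080/path')
    new_request = convert_http_request(request)
    self.assertEqual('example.com:8080', new_request.host)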
def test_duration_timeout(self):
    client = WebClient()
    session = client.session(Request(self.get_url('/sleep_long')))

    with self.assertRaises(DurationTimeout):
        yield from session.start()
        yield from session.download(duration_timeout=0.1)
def test_sitemap_scraper_xml(self):
    scraper = SitemapScraper(self.get_html_parser())
    request = Request('http://example.com/sitemap.xml')
    response = Response(200, 'OK')
    response.body = Body()

    with wpull.util.reset_file_offset(response.body):
        response.body.write(b'''<?xml version="1.0" encoding="UTF-8"?>
            <urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
               <url>
                  <loc>http://www.example.com/</loc>
                  <lastmod>2005-01-01</lastmod>
                  <changefreq>monthly</changefreq>
                  <priority>0.8</priority>
               </url>
            </urlset>
        ''')

    scrape_result = scraper.scrape(request, response)
    inline_urls = scrape_result.inline_links
    linked_urls = scrape_result.linked_links

    self.assertEqual({
        'http://www.example.com/',
    }, linked_urls)
    self.assertFalse(inline_urls)
def test_status_line_only(self):
    stream = self.new_stream('127.0.0.1', self._port)
    request = Request(self.get_url('/status_line_only'))
    response, content = yield from self.fetch(stream, request)

    self.assertEqual(200, response.status_code)
    self.assertEqual(b'Hey', content)
def test_utf8_header(self):
    stream = self.new_stream()
    request = Request(self.get_url('/utf8_header'))
    response, dummy = yield from self.fetch(stream, request)

    self.assertEqual(200, response.status_code)
    # Header values are decoded as Latin-1, so the raw UTF-8 bytes of the
    # emoji survive as a Latin-1 mojibake string.
    self.assertEqual('🐱'.encode('utf-8').decode('latin-1'),
                     response.fields['whoa'])
def test_false_gzip(self):
    stream = self.new_stream('127.0.0.1', self._port)
    request = Request(self.get_url('/false_gzip'))
    response, content = yield from self.fetch(stream, request)

    # The server claims gzip but sends plain bytes; the content must pass
    # through unmangled rather than failing decompression.
    self.assertEqual('gzip', response.fields['Content-Encoding'])
    self.assertEqual(b'a' * 100, content)
def test_redirect_loop(self):
    checker = RobotsTxtChecker(web_client=MockWebClient())
    request = Request('http://example.com')
    request.prepare_for_send()
    nonlocal_dict = {'counter': 0}

    def response_callback(request):
        request.prepare_for_send()
        self.assertTrue(request.url_info.url.endswith('robots.txt'))
        response = Response(302, 'See else')
        response.request = request
        response.fields['Location'] = '/robots.txt'

        nonlocal_dict['counter'] += 1

        # Guard against the mock itself spinning forever if the checker
        # fails to give up on the redirect loop.
        if nonlocal_dict['counter'] > 20:
            raise ProtocolError('Mock redirect loop error.')

        return response

    checker.web_client.mock_response_callback = response_callback

    # A robots.txt stuck in a redirect loop should be treated as
    # fetchable rather than aborting the crawl.
    self.assertTrue((yield from checker.can_fetch(request)))
    self.assertTrue(checker.can_fetch_pool(request))
def test_rss_as_html(self):
    element_walker = ElementWalker(
        css_scraper=CSSScraper(), javascript_scraper=JavaScriptScraper())
    scraper = HTMLScraper(HTMLParser(), element_walker)
    request = Request('http://example.com/')
    response = Response(200, '')
    response.body = Body()
    response.fields['content-type'] = 'application/rss+xml'

    with wpull.util.reset_file_offset(response.body):
        html_file_path = os.path.join(ROOT_PATH,
                                      'testing', 'samples', 'rss.xml')
        with open(html_file_path, 'rb') as in_file:
            shutil.copyfileobj(in_file, response.body)

    scrape_result = scraper.scrape(request, response)
    self.assertTrue(scrape_result)
    inline_urls = scrape_result.inline_links
    linked_urls = scrape_result.linked_links
    self.assertFalse(inline_urls)
    self.assertEqual(
        {
            'http://www.someexamplerssdomain.com/main.html',
            'http://www.wikipedia.org/'
        },
        linked_urls
    )
def test_sitemap_scraper_xml_index(self):
    scraper = SitemapScraper(self.get_html_parser())
    request = Request('http://example.com/sitemap.xml')
    response = Response(200, 'OK')
    response.body = Body()

    with wpull.util.reset_file_offset(response.body):
        response.body.write(b'''<?xml version="1.0" encoding="UTF-8"?>
            <sitemapindex xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
               <sitemap>
                  <loc>http://www.example.com/sitemap1.xml.gz</loc>
                  <lastmod>2004-10-01T18:23:17+00:00</lastmod>
               </sitemap>
            </sitemapindex>
        ''')

    scrape_result = scraper.scrape(request, response)
    inline_urls = scrape_result.inline_links
    linked_urls = scrape_result.linked_links

    self.assertEqual({
        'http://www.example.com/sitemap1.xml.gz',
    }, linked_urls)
    self.assertFalse(inline_urls)
def test_html_soup(self):
    element_walker = ElementWalker(
        css_scraper=CSSScraper(), javascript_scraper=JavaScriptScraper())
    scraper = HTMLScraper(HTMLParser(), element_walker)
    request = Request('http://example.com/')
    response = Response(200, '')
    response.body = Body()
    response.fields['Refresh'] = 'yes'

    with wpull.util.reset_file_offset(response.body):
        html_file_path = os.path.join(ROOT_PATH,
                                      'testing', 'samples', 'soup.html')
        with open(html_file_path, 'rb') as in_file:
            shutil.copyfileobj(in_file, response.body)

    scrape_result = scraper.scrape(request, response)
    inline_urls = scrape_result.inline_links
    linked_urls = scrape_result.linked_links

    self.assertEqual({'http://example.com/ABOUTM~1.JPG'}, inline_urls)
    self.assertEqual(
        {
            'http://example.com/BLOG',
            'http://example.com/web ring/Join.htm',
        },
        linked_urls
    )
def test_html_scraper_links_base_href(self):
    element_walker = ElementWalker(
        css_scraper=CSSScraper(), javascript_scraper=JavaScriptScraper())
    scraper = HTMLScraper(HTMLParser(), element_walker)
    request = Request('http://example.com/')
    response = Response(200, 'OK')
    response.body = Body()

    with wpull.util.reset_file_offset(response.body):
        html_file_path = os.path.join(ROOT_PATH,
                                      'testing', 'samples', 'basehref.html')
        with open(html_file_path, 'rb') as in_file:
            shutil.copyfileobj(in_file, response.body)

    scrape_result = scraper.scrape(request, response)
    inline_urls = scrape_result.inline_links
    linked_urls = scrape_result.linked_links

    self.assertEqual('utf-8', scrape_result.encoding)
    # Relative URLs must be resolved against the <base href> where it
    # applies, and against the document URL otherwise.
    self.assertEqual(
        {
            'http://cdn.example.com/stylesheet1.css',
            'http://www.example.com/stylesheet2.css',
            'http://example.com/a/stylesheet3.css',
            'http://example.com/a/dir/image1.png',
            'http://example.com/dir/image2.png',
            'http://example.net/image3.png',
            'http://example.com/dir/image4.png',
        },
        inline_urls
    )
    self.assertEqual({'http://example.com/a/'}, linked_urls)
def test_client_duration_timeout(self):
    client = Client()

    with self.assertRaises(DurationTimeout), client.session() as session:
        request = Request(self.get_url('/sleep_long'))
        yield from session.start(request)
        yield from session.download(duration_timeout=0.1)
def test_content_length_and_chunked(self):
    stream = self.new_stream()
    request = Request(self.get_url('/content_length_and_chunked'))
    response, content = yield from self.fetch(stream, request)

    self.assertEqual(200, response.status_code)
    # When both are present, Transfer-Encoding takes precedence over
    # Content-Length.
    self.assertEqual('chunked', response.fields['Transfer-Encoding'])
    self.assertEqual(b'hello world!', content)
def test_basic_chunked_trailer(self):
    stream = self.new_stream()
    request = Request(self.get_url('/chunked_trailer'))
    response, content = yield from self.fetch(stream, request)

    self.assertEqual(200, response.status_code)
    self.assertEqual('chunked', response.fields['Transfer-Encoding'])
    # Trailer fields sent after the last chunk must be merged into the
    # response headers.
    self.assertEqual('dolphin', response.fields['Animal'])
    self.assertEqual(b'hello world!', content)
def test_connection_reuse(self):
    stream = self.new_stream()
    request = Request(self.get_url('/'))
    request.version = 'HTTP/1.0'

    response, dummy = yield from self.fetch(stream, request)
    self.assertEqual(200, response.status_code)

    response, dummy = yield from self.fetch(stream, request)
    self.assertEqual(200, response.status_code)
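# A hedged companion sketch (assumption, not from the original suite): with
# the default HTTP/1.1 request version the stream should also be reusable
# without forcing 'HTTP/1.0', since HTTP/1.1 connections are persistent
# unless 'Connection: close' is sent.
def test_connection_reuse_http11(self):
    stream = self.new_stream()
    request = Request(self.get_url('/'))

    response, dummy = yield from self.fetch(stream, request)
    self.assertEqual(200, response.status_code)

    response, dummy = yield from self.fetch(stream, request)
    self.assertEqual(200, response.status_code)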
def test_client_exception_recovery(self):
    connection_factory = functools.partial(Connection, timeout=2.0)
    connection_pool = ConnectionPool(connection_factory=connection_factory)
    client = Client(connection_pool=connection_pool)

    for dummy in range(7):
        with self.assertRaises(NetworkError), client.session() as session:
            request = Request(self.get_url('/header_early_close'))
            yield from session.start(request)

    # Repeated failures must not poison the pool: fresh sessions on the
    # same client should still succeed.
    for dummy in range(7):
        with client.session() as session:
            request = Request(self.get_url('/'))
            response = yield from session.start(request)
            self.assertEqual(200, response.status_code)
            yield from session.download()
            self.assertTrue(session.done())
def test_client_exception_throw(self):
    client = Client()

    with client.session() as session:
        request = Request('http://wpull-no-exist.invalid')
        with self.assertRaises(NetworkError):
            yield from session.start(request)
def test_basic_content_length(self):
    stream = self.new_stream()
    request = Request(self.get_url('/content_length'))
    response, content = yield from self.fetch(stream, request)

    self.assertEqual(200, response.status_code)
    self.assertEqual('100', response.fields['Content-Length'])
    self.assertEqual(b'a' * 100, content)
    self.assertEqual(100, len(content))
def test_connection_refused(self):
    stream = self.new_stream('127.0.0.1', 1)

    try:
        yield from self.fetch(stream, Request('http://localhost:1/'))
    except ConnectionRefused:
        pass
    else:
        self.fail()  # pragma: no cover
def test_bad_redirect_ipv6(self):
    client = WebClient()
    session = client.session(Request(self.get_url('/bad_redirect_ipv6')))

    with self.assertRaises(ProtocolError):
        while not session.done():
            yield from session.start()
            yield from session.download()
def test_add_referer_https_to_http(self):
    request = Request()
    url_record = URLRecord()
    url_record.parent_url = 'https://example.com/'
    url_record.url = 'http://example.com/image.png'

    WebProcessorSession._add_referrer(request, url_record)

    # No Referer header on an https -> http downgrade.
    self.assertNotIn('referer', request.fields)
def test_no_such_host(self):
    stream = self.new_stream('wpull-no-exist.invalid', 80)

    try:
        yield from self.fetch(
            stream, Request('http://wpull-no-exist.invalid'))
    except NetworkError:
        pass
    else:
        self.fail()  # pragma: no cover
def test_read_timeout(self):
    stream = self.new_stream(connection_kwargs=dict(timeout=0.1))
    request = Request(self.get_url('/sleep_long'))

    try:
        yield from self.fetch(stream, request)
    except NetworkError:
        pass
    else:
        self.fail()  # pragma: no cover
def test_add_referer(self):
    request = Request()
    url_record = URLRecord()
    url_record.parent_url = 'http://example.com/'
    url_record.url = 'http://example.com/image.png'

    WebProcessorSession._add_referrer(request, url_record)

    self.assertEqual('http://example.com/', request.fields['Referer'])
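# A hedged companion sketch (assumption, not from the original suite): if
# referrer suppression applies only to the https -> http downgrade tested
# earlier, a same-scheme https -> https transition should still set the
# Referer header.
def test_add_referer_https_to_https(self):
    request = Request()
    url_record = URLRecord()
    url_record.parent_url = 'https://example.com/'
    url_record.url = 'https://example.com/image.png'

    WebProcessorSession._add_referrer(request, url_record)

    self.assertEqual('https://example.com/', request.fields['Referer'])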
def test_gzip_corrupt_footer(self):
    stream = self.new_stream()
    request = Request(self.get_url('/gzip_corrupt_footer'))

    try:
        yield from self.fetch(stream, request)
    except ProtocolError:
        pass
    else:
        self.fail()  # pragma: no cover
def test_buffer_overflow_header(self):
    stream = self.new_stream()
    request = Request(self.get_url('/buffer_overflow_header'))

    try:
        yield from self.fetch(stream, request)
    except ProtocolError:
        pass
    else:
        self.fail()  # pragma: no cover
def test_bad_chunk_size(self):
    stream = self.new_stream()
    request = Request(self.get_url('/bad_chunk_size'))

    try:
        yield from self.fetch(stream, request)
    except ProtocolError:
        pass
    else:
        self.fail()  # pragma: no cover