def test_html_layout(self):
    '''Check the order of elements produced by ``HTMLReader.read_tree``.

    The document is a minimal page with a head/title and a body holding a
    single ``<img>``. The first ten items yielded by ``read_tree`` are
    expected to carry the tags listed in ``expected_tags``, in that order
    (each tag appears twice in the sequence).
    '''
    reader = HTMLReader()
    document = io.BytesIO(
        b' <html> <head> <title>hi</title> </head>'
        b' <body> <img> </body> </html>'
    )
    elements = tuple(reader.read_tree(document, encoding='ascii'))

    expected_tags = [
        'html', 'head', 'title', 'title', 'head',
        'body', 'img', 'img', 'body', 'html',
    ]

    # Compare the whole prefix at once so that a failure reports the
    # complete tag sequence instead of only the first mismatching index.
    # Only the first ``len(expected_tags)`` items are inspected, which is
    # the same range the individual index assertions used to cover.
    actual_tags = [
        element.tag for element in elements[:len(expected_tags)]
    ]

    self.assertEqual(expected_tags, actual_tags)
def test_html_encoding(self):
    '''Parse a lone ``<img>`` under every codec listed in ``CODEC_NAMES``.

    For each codec the text is encoded, fed to ``read_links`` with the
    matching ``encoding`` argument, and the first yielded element is
    expected to have the tag ``html``.
    '''
    html_reader = HTMLReader()

    for codec_name in CODEC_NAMES:
        encoded_document = '<img>'.encode(codec_name)
        stream = io.BytesIO(encoded_document)

        parsed = tuple(html_reader.read_links(stream, encoding=codec_name))
        first_element = parsed[0]

        self.assertEqual('html', first_element.tag)
def _process_phantomjs(self, request, response):
    '''Load the page in PhantomJS and scrape the rendered content.

    Nothing is done unless a PhantomJS controller is configured, the
    response status code is 200, and ``HTMLReader.is_supported`` accepts
    the request/response pair.

    This is a generator: each remote operation is yielded.
    NOTE(review): presumably driven by the project's coroutine runner;
    confirm against the decorator, which is not visible here.
    '''
    # The checks short-circuit in this order: controller, status, type.
    should_skip = (
        not self._processor.instances.phantomjs_controller
        or response.status_code != 200
        or not HTMLReader.is_supported(request=request, response=response)
    )

    if should_skip:
        return

    _logger.debug('Starting PhantomJS processing.')

    controller = self._processor.instances.phantomjs_controller

    with controller.client.remote() as remote:
        self._hook_phantomjs_logging(remote)

        # Size the page, open the URL, wait for the load event, then let
        # the controller drive the page.
        yield controller.apply_page_size(remote)
        yield remote.call('page.open', request.url_info.url)
        yield remote.wait_page_event('load_finished')
        yield controller.control(remote)

        # FIXME: not sure where the logic should fit in
        if controller._snapshot:
            yield self._take_phantomjs_snapshot(controller, remote)

        content = yield remote.eval('page.content')

    # Wrap the rendered content in a response object and scrape that.
    self._scrape_document(
        request,
        self._new_phantomjs_response(response, content)
    )

    _logger.debug('Ended PhantomJS processing.')
def _process_phantomjs(self, request, response):
    '''Process PhantomJS.

    Loads ``request.url_info.url`` in a PhantomJS remote, takes a snapshot
    when the controller requests one, and scrapes the rendered page
    content.

    Nothing is done unless a PhantomJS controller is configured, the
    response status code is 200, and ``HTMLReader.is_supported`` accepts
    the request/response pair.

    The page load is tried up to ``WPULL_PHANTOMJS_TRIES`` times (an
    environment variable, default 5). Only ``PhantomJSRPCTimedOut``
    triggers a retry. If no attempt succeeds, including when the
    configured count is zero or negative, a warning is logged and the
    document is not scraped.

    This is a generator: each remote operation is yielded.
    NOTE(review): presumably driven by the project's coroutine runner;
    confirm against the decorator, which is not visible here.
    '''
    if not self._processor.instances.phantomjs_controller:
        return

    if response.status_code != 200:
        return

    if not HTMLReader.is_supported(request=request, response=response):
        return

    _logger.debug('Starting PhantomJS processing.')

    controller = self._processor.instances.phantomjs_controller
    attempts = int(os.environ.get('WPULL_PHANTOMJS_TRIES', 5))

    for dummy in range(attempts):
        # FIXME: this is a quick hack for handling time outs. See #137.
        try:
            with controller.client.remote() as remote:
                self._hook_phantomjs_logging(remote)

                yield controller.apply_page_size(remote)
                yield remote.call('page.open', request.url_info.url)
                yield remote.wait_page_event('load_finished')
                yield controller.control(remote)

                # FIXME: not sure where the logic should fit in
                if controller._snapshot:
                    yield self._take_phantomjs_snapshot(controller, remote)

                content = yield remote.eval('page.content')
        except PhantomJSRPCTimedOut:
            _logger.exception('PhantomJS timed out.')
        else:
            break
    else:
        # Every attempt timed out (or none was made), so ``content`` was
        # never assigned. Bail out instead of failing with an
        # UnboundLocalError below.
        _logger.warning(
            'PhantomJS failed to fetch {url} after {count} attempt(s).'
            .format(url=request.url_info.url, count=attempts)
        )
        return

    mock_response = self._new_phantomjs_response(response, content)

    self._scrape_document(request, mock_response)

    _logger.debug('Ended PhantomJS processing.')
def test_html_parse_doctype(self):
    '''Exercise ``HTMLReader.parse_doctype`` on valid and invalid input.

    A short HTML doctype must give a result containing ``html``, an XHTML
    1.0 Transitional doctype must give a result containing ``XHTML``, and
    input without a doctype must give a falsy result.
    '''
    short_doctype = HTMLReader.parse_doctype(
        io.BytesIO(b'<!DOCTYPE HTML><html></html>')
    )
    self.assertIn('html', short_doctype)

    xhtml_document = (
        b' <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"'
        b' "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">'
        b' <html></html> '
    )
    xhtml_doctype = HTMLReader.parse_doctype(io.BytesIO(xhtml_document))
    self.assertIn('XHTML', xhtml_doctype)

    # Plain text, empty input, a NUL byte and a non-ASCII byte: none of
    # these should be reported as having a doctype.
    for payload in (b'hello world!', b'', b'\x00', b'A\xfe'):
        self.assertFalse(HTMLReader.parse_doctype(io.BytesIO(payload)))
def test_html_early_html(self):
    '''Check that an ``<img>`` outside the ``<html>`` element is still read.

    Every sample document places an ``<img>`` before or after the
    ``<html>...</html>`` pair. For each one, the last element from
    ``read_links`` and the fourth-from-last element from ``read_tree``
    must be tagged ``img``.
    '''
    reader = HTMLReader()

    # Fragments shared by the longer sample documents.
    loose_doctype = (
        b' <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN"'
        b' "http://www.w3.org/TR/html4/loose.dtd">'
    )
    head_before_body = (
        b' <html> <head> <title>Download</title> </head>'
        b' <body> <br /> </body> </html> <pre><img></pre> '
    )
    body_before_head = (
        b' <html> <body> <br /> </body>'
        b' <head> <title>Download</title> </head>'
        b' </html> <pre><img></pre> '
    )

    documents = (
        b'<!DOCTYPE HTML><html></html><img>',
        b'<html></html><img>',
        b'<!DOCTYPE HTML><img><html></html>',
        b'<img><html></html>',
        b'<!DOCTYPE HTML> <html><body></body></html><p><img>',
        b' <html><body></body></html><p><img>',
        loose_doctype + head_before_body,
        head_before_body,
        loose_doctype + body_before_head,
        body_before_head,
    )

    for document in documents:
        link_elements = tuple(
            reader.read_links(io.BytesIO(document), encoding='ascii')
        )
        self.assertEqual('img', link_elements[-1].tag)

        tree_elements = tuple(
            reader.read_tree(io.BytesIO(document), encoding='ascii')
        )
        self.assertEqual('img', tree_elements[-4].tag)
def test_html_detect(self):
    '''Check the HTML detection helpers on ``HTMLReader``.

    Covers ``is_file`` (content sniffing), ``is_url`` and ``is_request``
    (file name extension), and ``is_response`` (``Content-Type`` header).
    Each table row pairs an input with whether it should be detected as
    HTML.
    '''
    def check(expected, actual):
        # Mirror assertTrue/assertFalse so truthiness, not identity, is
        # what gets tested.
        if expected:
            self.assertTrue(actual)
        else:
            self.assertFalse(actual)

    file_cases = (
        ('<html><body>hi</body></html>'.encode('utf-16le'), True),
        ('hello world!'.encode('utf-16le'), False),
        (b'<title>hello</title>hi', True),
        (b'<html><body>hello', True),
        (b'The document has moved <a href="somewhere.html">here</a>', True),
    )
    for payload, is_html in file_cases:
        check(is_html, HTMLReader.is_file(io.BytesIO(payload)))

    url_cases = (
        ('example.com/index.htm', True),
        ('example.com/index.html', True),
        ('example.com/index.dhtm', True),
        ('example.com/index.xhtml', True),
        ('example.com/index.xht', True),
        ('example.com/image.jpg', False),
    )
    for url, is_html in url_cases:
        check(is_html, HTMLReader.is_url(URLInfo.parse(url)))

    request_cases = (
        ('example.com/index.html', True),
        ('example.com/image.jpg', False),
    )
    for url, is_html in request_cases:
        check(is_html, HTMLReader.is_request(Request.new(url)))

    response_cases = (
        ('text/html', True),
        ('image/png', False),
    )
    for content_type, is_html in response_cases:
        response = Response('HTTP/1.0', '200', 'OK')
        response.fields['Content-Type'] = content_type
        check(is_html, HTMLReader.is_response(response))