def test_html_encoding(self):
    """Parse ``<img>`` encoded with each usable codec and check the root tag.

    Codecs listed in ``EBCDIC``, ``utf_8_sig`` and every ``utf_16*`` /
    ``utf_32*`` codec are skipped.  For the remaining codecs the first
    element yielded by the reader must carry the tag ``html``.
    """
    html_parser = HTMLParser()
    reader = HTMLReader(html_parser)
    byte_order_marks = {
        'utf_16_le': codecs.BOM_UTF16_LE,
        'utf_16_be': codecs.BOM_UTF16_BE,
        'utf_32_le': codecs.BOM_UTF32_LE,
        'utf_32_be': codecs.BOM_UTF32_BE,
    }

    for name in CODEC_NAMES:
        # XXX: we're assuming that all codecs are ASCII backward
        # compatible, so the known exceptions are left out.
        not_ascii_compatible = name in EBCDIC or name == 'utf_8_sig'
        # FIXME: libxml/lxml doesn't like it when we pass in a codec
        # name but don't specify the endian but BOM is included.
        wide_unicode = name.startswith(('utf_16', 'utf_32'))
        if not_ascii_compatible or wide_unicode:
            continue

        print('->', name)

        payload = byte_order_marks.get(name, b'') + '<img>'.encode(name)
        parsed = tuple(
            reader.iter_elements(io.BytesIO(payload), encoding=name))

        self.assertEqual('html', parsed[0].tag)
def test_html_layout(self):
    """Check the order of tags yielded for a small complete document.

    The expected sequence contains one extra ``img`` entry when the
    parser under test is an ``LxmlHTMLParser``.  Only the leading
    positions listed in the expected sequence are checked.
    """
    html_parser = self.get_html_parser()
    reader = HTMLReader(html_parser)
    document = io.BytesIO(b''' <html> <head> <title>hi</title> </head> <body> <img> </body> </html>''')
    elements = tuple(reader.iter_elements(document, encoding='ascii'))

    print(elements)

    expected_tags = ['html', 'head', 'title', 'title', 'head', 'body', 'img']
    if isinstance(html_parser, LxmlHTMLParser):
        expected_tags += ['img', 'body', 'html']
    else:
        expected_tags += ['body', 'html']

    # Compare position by position, in ascending order.
    for position, tag in enumerate(expected_tags):
        self.assertEqual(tag, elements[position].tag)
def test_html_encoding(self):
    """Parse ``<img>`` encoded with each usable codec and check the first tag.

    Codecs listed in ``EBCDIC`` and ``utf_8_sig`` are always skipped; the
    ``utf_16*`` / ``utf_32*`` codecs are skipped only for the lxml-based
    parser.  Endian-specific UTF-16/32 input is prefixed with its BOM.
    The first yielded element is expected to be ``html`` for
    ``LxmlHTMLParser`` and ``img`` otherwise.
    """
    html_parser = self.get_html_parser()
    is_lxml = isinstance(html_parser, LxmlHTMLParser)
    reader = HTMLReader(html_parser)
    byte_order_marks = {
        'utf_16_le': codecs.BOM_UTF16_LE,
        'utf_16_be': codecs.BOM_UTF16_BE,
        'utf_32_le': codecs.BOM_UTF32_LE,
        'utf_32_be': codecs.BOM_UTF32_BE,
    }
    expected_tag = 'html' if is_lxml else 'img'

    for name in CODEC_NAMES:
        # XXX: we're assuming that all codecs are ASCII backward
        # compatible, so the known exceptions are left out.
        if name in EBCDIC or name == 'utf_8_sig':
            continue

        # FIXME: libxml/lxml doesn't like it when we pass in a codec
        # name but don't specify the endian but BOM is included.
        if is_lxml and name.startswith(('utf_16', 'utf_32')):
            continue

        print('->', name)

        payload = byte_order_marks.get(name, b'') + '<img>'.encode(name)
        parsed = tuple(
            reader.iter_elements(io.BytesIO(payload), encoding=name))

        self.assertEqual(expected_tag, parsed[0].tag)
def test_html_script_comment(self):
    """Every item yielded for a comment-like script body is an ``Element``.

    The input is a ``<script>`` whose content looks like an HTML comment.
    Each yielded item is checked individually so that a failure reports
    the offending object instead of a bare "False is not true".
    """
    test_string = b'''<script><!-- blah --></script>'''

    reader = HTMLReader(HTMLParser())
    elements = tuple(
        reader.iter_elements(io.BytesIO(test_string), encoding='ascii'))

    for element in elements:
        self.assertIsInstance(element, Element)
def test_html_script_comment(self):
    """Every item yielded for a comment-like script body is an ``Element``.

    The input is a ``<script>`` whose content looks like an HTML comment,
    parsed with the parser returned by ``self.get_html_parser()``.  Each
    yielded item is checked individually so that a failure reports the
    offending object instead of a bare "False is not true".
    """
    test_string = b'''<script><!-- blah --></script>'''

    reader = HTMLReader(self.get_html_parser())
    elements = tuple(
        reader.iter_elements(io.BytesIO(test_string), encoding='ascii'))

    for element in elements:
        self.assertIsInstance(element, Element)
def test_html_encoding(self):
    """Parse ``<img>`` encoded with each usable codec and check the first tag.

    Codecs listed in ``EBCDIC``, ``utf_8_sig`` and any codec whose name
    ends in ``_le`` or ``_be`` are skipped.  The first yielded element is
    expected to be ``html`` for ``LxmlHTMLParser`` and ``img`` otherwise.
    """
    html_parser = self.get_html_parser()
    reader = HTMLReader(html_parser)

    for name in CODEC_NAMES:
        # XXX: we're assuming that all codecs are ASCII backward
        # compatible, so the known exceptions are left out.
        if name in EBCDIC or name == 'utf_8_sig':
            continue

        # XXX: Assume BOM is always included, so the explicit-endian
        # variants are not exercised here.
        if name.endswith(('_le', '_be')):
            continue

        print(name)

        encoded = io.BytesIO('<img>'.encode(name))
        parsed = tuple(reader.iter_elements(encoded, encoding=name))
        first = parsed[0]

        expected_tag = (
            'html' if isinstance(html_parser, LxmlHTMLParser) else 'img')
        self.assertEqual(expected_tag, first.tag)
def test_html_early_html(self):
    """An ``img`` tag must be reported even when ``<html>`` closes early.

    Each sample places ``<img>`` outside, before, or after the
    ``<html>...</html>`` pair.  Only yielded items that are ``Element``
    instances are collected, and ``img`` must be among their tags.
    """
    reader = HTMLReader(HTMLParser())

    samples = (
        b'''<!DOCTYPE HTML><html></html><img>''',
        b'''<html></html><img>''',
        b'''<!DOCTYPE HTML><img><html></html>''',
        b'''<img><html></html>''',
        b'''<!DOCTYPE HTML> <html><body></body></html><p><img>''',
        b''' <html><body></body></html><p><img>''',
        b''' <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd"> <html> <head> <title>Download</title> </head> <body> <br /> </body> </html> <pre><img></pre> ''',
        b''' <html> <head> <title>Download</title> </head> <body> <br /> </body> </html> <pre><img></pre> ''',
        b''' <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd"> <html> <body> <br /> </body> <head> <title>Download</title> </head> </html> <pre><img></pre> ''',
        b''' <html> <body> <br /> </body> <head> <title>Download</title> </head> </html> <pre><img></pre> ''',
    )

    for test_string in samples:
        # Visual separator plus the raw input, to ease reading the output.
        print()
        print('a' * 10)
        print(test_string)

        collected = []
        stream = io.BytesIO(test_string)
        for node in reader.iter_elements(stream, encoding='ascii'):
            if not isinstance(node, Element):
                continue
            print(node)
            collected.append(node)

        seen_tags = tuple(node.tag for node in collected)
        self.assertIn('img', seen_tags)
def test_html_early_html(self):
    """An ``img`` tag must be reported even when ``<html>`` closes early.

    Each sample places ``<img>`` outside, before, or after the
    ``<html>...</html>`` pair and is parsed with the parser returned by
    ``self.get_html_parser()``.  Only yielded items that are ``Element``
    instances are collected, and ``img`` must be among their tags.
    """
    reader = HTMLReader(self.get_html_parser())

    samples = (
        b'''<!DOCTYPE HTML><html></html><img>''',
        b'''<html></html><img>''',
        b'''<!DOCTYPE HTML><img><html></html>''',
        b'''<img><html></html>''',
        b'''<!DOCTYPE HTML> <html><body></body></html><p><img>''',
        b''' <html><body></body></html><p><img>''',
        b''' <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd"> <html> <head> <title>Download</title> </head> <body> <br /> </body> </html> <pre><img></pre> ''',
        b''' <html> <head> <title>Download</title> </head> <body> <br /> </body> </html> <pre><img></pre> ''',
        b''' <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd"> <html> <body> <br /> </body> <head> <title>Download</title> </head> </html> <pre><img></pre> ''',
        b''' <html> <body> <br /> </body> <head> <title>Download</title> </head> </html> <pre><img></pre> ''',
    )

    for test_string in samples:
        # Visual separator plus the raw input, to ease reading the output.
        print()
        print('a' * 10)
        print(test_string)

        collected = []
        stream = io.BytesIO(test_string)
        for node in reader.iter_elements(stream, encoding='ascii'):
            if not isinstance(node, Element):
                continue
            print(node)
            collected.append(node)

        seen_tags = tuple(node.tag for node in collected)
        self.assertIn('img', seen_tags)