Exemple #1
0
    def test_html_encoding(self):
        html_parser = HTMLParser()
        reader = HTMLReader(html_parser)

        bom_map = {
            'utf_16_le': codecs.BOM_UTF16_LE,
            'utf_16_be': codecs.BOM_UTF16_BE,
            'utf_32_le': codecs.BOM_UTF32_LE,
            'utf_32_be': codecs.BOM_UTF32_BE,
        }

        for name in CODEC_NAMES:
            if name in EBCDIC or name == 'utf_8_sig':
                # XXX: we're assuming that all codecs are ASCII backward
                # compatable
                continue

            if name.startswith('utf_16') or name.startswith('utf_32'):
                # FIXME: libxml/lxml doesn't like it when we pass in a codec
                # name but don't specify the endian but BOM is included
                continue

            print('->', name)

            data = io.BytesIO(bom_map.get(name, b'') + '<img>'.encode(name))
            elements = tuple(reader.iter_elements(data, encoding=name))

            html_element = elements[0]
            self.assertEqual('html', html_element.tag)
Exemple #2
0
    def test_html_layout(self):
        html_parser = self.get_html_parser()
        reader = HTMLReader(html_parser)

        elements = tuple(
            reader.iter_elements(io.BytesIO(b'''
            <html>
                <head>
                    <title>hi</title>
                </head>
                <body>
                    <img>
                </body>
            </html>'''), encoding='ascii')
        )

        print(elements)

        self.assertEqual('html', elements[0].tag)
        self.assertEqual('head', elements[1].tag)
        self.assertEqual('title', elements[2].tag)
        self.assertEqual('title', elements[3].tag)
        self.assertEqual('head', elements[4].tag)
        self.assertEqual('body', elements[5].tag)
        self.assertEqual('img', elements[6].tag)

        if isinstance(html_parser, LxmlHTMLParser):
            self.assertEqual('img', elements[7].tag)
            self.assertEqual('body', elements[8].tag)
            self.assertEqual('html', elements[9].tag)
        else:
            self.assertEqual('body', elements[7].tag)
            self.assertEqual('html', elements[8].tag)
Exemple #3
0
    def test_html_encoding(self):
        html_parser = self.get_html_parser()
        is_lxml = isinstance(html_parser, LxmlHTMLParser)
        reader = HTMLReader(html_parser)

        bom_map = {
            'utf_16_le': codecs.BOM_UTF16_LE,
            'utf_16_be': codecs.BOM_UTF16_BE,
            'utf_32_le': codecs.BOM_UTF32_LE,
            'utf_32_be': codecs.BOM_UTF32_BE,
        }

        for name in CODEC_NAMES:
            if name in EBCDIC or name == 'utf_8_sig':
                # XXX: we're assuming that all codecs are ASCII backward
                # compatable
                continue

            if is_lxml and (name.startswith('utf_16') or name.startswith('utf_32')):
                # FIXME: libxml/lxml doesn't like it when we pass in a codec
                # name but don't specify the endian but BOM is included
                continue

            print('->', name)

            data = io.BytesIO( bom_map.get(name, b'') +'<img>'.encode(name))
            elements = tuple(reader.iter_elements(data, encoding=name))

            html_element = elements[0]
            if isinstance(html_parser, LxmlHTMLParser):
                self.assertEqual('html', html_element.tag)
            else:
                self.assertEqual('img', html_element.tag)
Exemple #4
0
    def test_html_script_comment(self):
        test_string = b'''<script><!-- blah --></script>'''

        reader = HTMLReader(HTMLParser())
        elements = reader.iter_elements(io.BytesIO(test_string),
                                        encoding='ascii')
        elements = tuple(elements)

        self.assertTrue(
            all(isinstance(element, Element) for element in elements))
Exemple #5
0
    def test_html_script_comment(self):
        test_string = b'''<script><!-- blah --></script>'''

        reader = HTMLReader(self.get_html_parser())
        elements = reader.iter_elements(io.BytesIO(test_string),
                                        encoding='ascii')
        elements = tuple(elements)

        self.assertTrue(
            all(isinstance(element, Element) for element in elements))
Exemple #6
0
    def test_html_encoding(self):
        html_parser = self.get_html_parser()
        reader = HTMLReader(html_parser)

        for name in CODEC_NAMES:
            if name in EBCDIC or name == 'utf_8_sig':
                # XXX: we're assuming that all codecs are ASCII backward
                # compatable
                continue

            if name.endswith('_le') or name.endswith('_be'):
                # XXX: Assume BOM is always included
                continue

            print(name)
            data = io.BytesIO('<img>'.encode(name))
            elements = tuple(reader.iter_elements(data, encoding=name))

            html_element = elements[0]
            if isinstance(html_parser, LxmlHTMLParser):
                self.assertEqual('html', html_element.tag)
            else:
                self.assertEqual('img', html_element.tag)
Exemple #7
0
    def test_html_early_html(self):
        reader = HTMLReader(HTMLParser())

        for test_string in [
                b'''<!DOCTYPE HTML><html></html><img>''',
                b'''<html></html><img>''',
                b'''<!DOCTYPE HTML><img><html></html>''',
                b'''<img><html></html>''',
                b'''<!DOCTYPE HTML>
                <html><body></body></html><p><img>''',
                b'''
                <html><body></body></html><p><img>''',
                b'''
                <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN"
                "http://www.w3.org/TR/html4/loose.dtd">
                <html>
                <head>
                <title>Download</title>
                </head>
                <body>
                <br />
                </body>
                </html>
                <pre><img></pre>
            ''',
                b'''
                <html>
                <head>
                <title>Download</title>
                </head>
                <body>
                <br />
                </body>
                </html>
                <pre><img></pre>
            ''',
                b'''
                <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN"
                "http://www.w3.org/TR/html4/loose.dtd">
                <html>
                <body>
                <br />
                </body>
                <head>
                <title>Download</title>
                </head>
                </html>
                <pre><img></pre>
            ''',
                b'''
                <html>
                <body>
                <br />
                </body>
                <head>
                <title>Download</title>
                </head>
                </html>
                <pre><img></pre>
            ''',
        ]:
            elements = []
            print()
            print('a' * 10)
            print(test_string)
            for element in reader.iter_elements(io.BytesIO(test_string),
                                                encoding='ascii'):
                if isinstance(element, Element):
                    print(element)
                    elements.append(element)

            element_tags = tuple(element.tag for element in elements)
            self.assertIn('img', element_tags)
Exemple #8
0
    def test_html_early_html(self):
        reader = HTMLReader(self.get_html_parser())

        for test_string in [
            b'''<!DOCTYPE HTML><html></html><img>''',
            b'''<html></html><img>''',
            b'''<!DOCTYPE HTML><img><html></html>''',
            b'''<img><html></html>''',
            b'''<!DOCTYPE HTML>
                <html><body></body></html><p><img>''',
            b'''
                <html><body></body></html><p><img>''',
            b'''
                <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN"
                "http://www.w3.org/TR/html4/loose.dtd">
                <html>
                <head>
                <title>Download</title>
                </head>
                <body>
                <br />
                </body>
                </html>
                <pre><img></pre>
            ''',
            b'''
                <html>
                <head>
                <title>Download</title>
                </head>
                <body>
                <br />
                </body>
                </html>
                <pre><img></pre>
            ''',
            b'''
                <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN"
                "http://www.w3.org/TR/html4/loose.dtd">
                <html>
                <body>
                <br />
                </body>
                <head>
                <title>Download</title>
                </head>
                </html>
                <pre><img></pre>
            ''',
            b'''
                <html>
                <body>
                <br />
                </body>
                <head>
                <title>Download</title>
                </head>
                </html>
                <pre><img></pre>
            ''',
        ]:
            elements = []
            print()
            print('a' * 10)
            print(test_string)
            for element in reader.iter_elements(io.BytesIO(test_string),
                                                encoding='ascii'):
                if isinstance(element, Element):
                    print(element)
                    elements.append(element)

            element_tags = tuple(element.tag for element in elements)
            self.assertIn('img', element_tags)