コード例 #1
0
ファイル: sitemap_test.py プロジェクト: fakegit/ludios_wpull
    def test_sitemap_encoding(self):
        parser = HTMLParser()
        reader = SitemapReader(parser)

        bom_map = {
            'utf_16_le': codecs.BOM_UTF16_LE,
            'utf_16_be': codecs.BOM_UTF16_BE,
            'utf_32_le': codecs.BOM_UTF32_LE,
            'utf_32_be': codecs.BOM_UTF32_BE,
        }

        for name in CODEC_NAMES:
            if name in EBCDIC or name == 'utf_8_sig':
                # XXX: we're assuming that all codecs are ASCII backward
                # compatable
                continue

            if name.startswith('utf_16') or name.startswith('utf_32'):
                # FIXME: libxml/lxml doesn't like it when we pass in a codec
                # name but don't specify the endian but BOM is included
                continue

            data = io.BytesIO(
                bom_map.get(name, b'') +
                '<?xml version="1.0" encoding="UTF-8"?>'
                '<urlset><url><loc>blah</loc></url></urlset>'.encode(name)
            )

            print('->', name)

            links = tuple(reader.iter_links(data, encoding=name))
            link = links[0]
            self.assertEqual('blah', link)
コード例 #2
0
ファイル: sitemap_test.py プロジェクト: Super-Rad/wpull
    def test_sitemap_encoding(self):
        parser = self.get_html_parser()
        is_lxml = isinstance(parser, LxmlHTMLParser)
        reader = SitemapReader(parser)

        bom_map = {
            'utf_16_le': codecs.BOM_UTF16_LE,
            'utf_16_be': codecs.BOM_UTF16_BE,
            'utf_32_le': codecs.BOM_UTF32_LE,
            'utf_32_be': codecs.BOM_UTF32_BE,
        }

        for name in CODEC_NAMES:
            if name in EBCDIC or name == 'utf_8_sig':
                # XXX: we're assuming that all codecs are ASCII backward
                # compatable
                continue

            if is_lxml and (name.startswith('utf_16') or name.startswith('utf_32')):
                # FIXME: libxml/lxml doesn't like it when we pass in a codec
                # name but don't specify the endian but BOM is included
                continue

            data = io.BytesIO(
                bom_map.get(name, b'') +
                '<?xml version="1.0" encoding="UTF-8"?>'
                '<urlset><url><loc>blah</loc></url></urlset>'.encode(name)
            )

            print('->', name)

            links = tuple(reader.iter_links(data, encoding=name))
            link = links[0]
            self.assertEqual('blah', link)
コード例 #3
0
    def test_sitemap_encoding(self):
        reader = SitemapReader(self.get_html_parser())

        for name in CODEC_NAMES:
            if name in EBCDIC or name == 'utf_8_sig':
                # XXX: we're assuming that all codecs are ASCII backward
                # compatable
                continue

            if name.endswith('_le') or name.endswith('_be'):
                # XXX: Assume BOM is always included
                continue

            data = io.BytesIO(
                '<?xml version="1.0" encoding="UTF-8"?>'
                '<urlset><url><loc>blah</loc></url></urlset>'.encode(name)
            )
            print('->', name)
            links = tuple(reader.iter_links(data, encoding=name))
            link = links[0]
            self.assertEqual('blah', link)