コード例 #1
0
ファイル: sitemap_test.py プロジェクト: fakegit/ludios_wpull
    def test_sitemap_encoding(self):
        parser = HTMLParser()
        reader = SitemapReader(parser)

        bom_map = {
            'utf_16_le': codecs.BOM_UTF16_LE,
            'utf_16_be': codecs.BOM_UTF16_BE,
            'utf_32_le': codecs.BOM_UTF32_LE,
            'utf_32_be': codecs.BOM_UTF32_BE,
        }

        for name in CODEC_NAMES:
            if name in EBCDIC or name == 'utf_8_sig':
                # XXX: we're assuming that all codecs are ASCII backward
                # compatable
                continue

            if name.startswith('utf_16') or name.startswith('utf_32'):
                # FIXME: libxml/lxml doesn't like it when we pass in a codec
                # name but don't specify the endian but BOM is included
                continue

            data = io.BytesIO(
                bom_map.get(name, b'') +
                '<?xml version="1.0" encoding="UTF-8"?>'
                '<urlset><url><loc>blah</loc></url></urlset>'.encode(name)
            )

            print('->', name)

            links = tuple(reader.iter_links(data, encoding=name))
            link = links[0]
            self.assertEqual('blah', link)
コード例 #2
0
ファイル: sitemap_test.py プロジェクト: Super-Rad/wpull
    def test_sitemap_encoding(self):
        parser = self.get_html_parser()
        is_lxml = isinstance(parser, LxmlHTMLParser)
        reader = SitemapReader(parser)

        bom_map = {
            'utf_16_le': codecs.BOM_UTF16_LE,
            'utf_16_be': codecs.BOM_UTF16_BE,
            'utf_32_le': codecs.BOM_UTF32_LE,
            'utf_32_be': codecs.BOM_UTF32_BE,
        }

        for name in CODEC_NAMES:
            if name in EBCDIC or name == 'utf_8_sig':
                # XXX: we're assuming that all codecs are ASCII backward
                # compatable
                continue

            if is_lxml and (name.startswith('utf_16') or name.startswith('utf_32')):
                # FIXME: libxml/lxml doesn't like it when we pass in a codec
                # name but don't specify the endian but BOM is included
                continue

            data = io.BytesIO(
                bom_map.get(name, b'') +
                '<?xml version="1.0" encoding="UTF-8"?>'
                '<urlset><url><loc>blah</loc></url></urlset>'.encode(name)
            )

            print('->', name)

            links = tuple(reader.iter_links(data, encoding=name))
            link = links[0]
            self.assertEqual('blah', link)
コード例 #3
0
    def test_sitemap_encoding(self):
        reader = SitemapReader(self.get_html_parser())

        for name in CODEC_NAMES:
            if name in EBCDIC or name == 'utf_8_sig':
                # XXX: we're assuming that all codecs are ASCII backward
                # compatable
                continue

            if name.endswith('_le') or name.endswith('_be'):
                # XXX: Assume BOM is always included
                continue

            data = io.BytesIO(
                '<?xml version="1.0" encoding="UTF-8"?>'
                '<urlset><url><loc>blah</loc></url></urlset>'.encode(name)
            )
            print('->', name)
            links = tuple(reader.iter_links(data, encoding=name))
            link = links[0]
            self.assertEqual('blah', link)
コード例 #4
0
    def test_sitemap_detect(self):
        # It should detect without BOM
        self.assertTrue(SitemapReader.is_file(
            io.BytesIO('<?xml > <urlset >'.encode('utf-16le'))
        ))
        self.assertFalse(SitemapReader.is_file(
            io.BytesIO('<!DOCTYPE html><html><body>'.encode('utf-16le'))
        ))
        self.assertFalse(SitemapReader.is_file(
            io.BytesIO(b'<html><body>hello<urlset>')
        ))
        self.assertTrue(SitemapReader.is_file(
            io.BytesIO(b'<?xml version> <urlset>')
        ))

        data_file = io.BytesIO()
        g_file = gzip.GzipFile(fileobj=data_file, mode='wb')
        g_file.write('<?xml version> <urlset>'.encode('utf-16le'))
        g_file.close()
        data_file.seek(0)
        self.assertTrue(SitemapReader.is_file(
            data_file
        ))

        self.assertTrue(
            SitemapReader.is_url(URLInfo.parse('example.com/sitemaps1.xml'))
        )
        self.assertTrue(
            SitemapReader.is_url(URLInfo.parse('example.com/robots.txt'))
        )
        self.assertFalse(
            SitemapReader.is_url(URLInfo.parse('example.com/image.jpg'))
        )
        self.assertTrue(
            SitemapReader.is_request(Request('example.com/sitemaps34.xml'))
        )
        self.assertFalse(
            SitemapReader.is_request(Request('example.com/image.jpg'))
        )