Beispiel #1
0
def test_sniffer():
    """Check sniff_encoding against accepted and rejected XML declarations.

    Inputs in ``accepted`` are paired with the encoding the test expects
    back. Every input in ``rejected`` is expected to yield ``"UTF-8"``.
    A ValidationError from any case fails the test.
    """
    accepted = [
        (r"", "UTF-8"),
        (r"""<?xml version='1.0' encoding='ISO-8859-1' ?>""", "ISO-8859-1"),
        (r"""<?xml version="1.0" encoding='ISO-8859-1' ?>""", "ISO-8859-1"),
        (r"""<?xml version="1.0" encoding='ISO-8859-1' standalone='no'?>""", "ISO-8859-1"),
        (r"""<?xml version='1.1' encoding="ISO-8859-1" standalone="yes" ?>""", "ISO-8859-1"),
        (r"""<?xml version="1.0" encoding="EBCDIC-some-cp" ?>""".encode("cp037"), "EBCDIC-some-cp"),
    ]
    # and now the really viciously pedantic refusals...
    rejected = [
        r""" <?xml version="1.0" encoding="ISO-8859-1" ?>""",  # bad: space before decl
        r"""<?xml version=1.0 encoding="ISO-8859-1" ?>""",  # bad: no quotes around version value
        r"""<?xml encoding="ISO-8859-1" version="1.0" ?>""",  # bad: wrong order for attributes
        r"""<?xml version="1.0" encoding="ISO-8859-1" standalone=no ?>""",  # bad: no quotes around standalone value
        r"""<?xml version=" 1.0" encoding="ISO-8859-1" ?>""",  # bad: whitespace before version value
        r"""<?xml version="1.0 " encoding="ISO-8859-1" ?>""",  # bad: whitespace after version value
        r"""<?xml version="1.0" encoding=" ISO-8859-1" ?>""",  # bad: whitespace before encoding value
        r"""<?xml version="1.0" encoding="ISO-8859-1 " ?>""",  # bad: whitespace after encoding value
        r"""<?xml version="1.0" encoding=Big5 ?>""",  # bad: no quotes around encoding value
    ]
    # Accepted cases come first so the index reported on failure is stable.
    cases = accepted + [(document, "UTF-8") for document in rejected]
    for index, (document, expected) in enumerate(cases):
        try:
            detected = sniff_encoding(document)
        except ValidationError as error:
            assert False, (error, index)
        assert detected == expected, (detected, index)
Beispiel #2
0
def test_sniffer_exc():
    """Check that a BOM conflicting with the XML declaration is reported.

    The input is encoded with ``utf-8-sig`` while its declaration names
    ``Cp037``. The test expects sniff_encoding to raise ValidationError
    with the exact message below, and fails if a value is returned.
    """
    payload = six.u('<?xml version="1.0" encoding="Cp037" ?>').encode("utf-8-sig")
    expected_message = (
        r"Multiply-specified encoding (BOM: utf_8_sig, XML decl: Cp037) at line 1, column 1 (char 1)"
    )
    try:
        result = sniff_encoding(payload)
    except ValidationError as error:
        assert str(error) == expected_message, error
        return
    # Reaching this point means no error was raised for the conflicting input.
    assert False, result
Beispiel #3
0
def test_fix_xmldecl():
    """Round-trip each usable codec through fix_xmldecl and sniff_encoding.

    For every codec name listed in ``encodings.aliases`` that can encode
    text, a minimal ``"  <?xml>"`` document is encoded, passed to
    fix_xmldecl with ``add_encoding=True``, and the result is expected to
    sniff back to the same codec name. For UTF-16 LE/BE codecs the result
    must also start with the matching byte-order mark.
    """
    # Slow compared to the other tests, but still only a few seconds.
    skipped = frozenset(
        (
            "rot_13",
            "quopri_codec",
            "zlib_codec",
            "base64_codec",
            "uu_codec",
            "tactis",
            "hex_codec",
            "bz2_codec",
        )
    )
    # Several aliases map to the same codec name, so visit each codec once.
    # Sorting keeps the iteration order (and any failure) reproducible.
    for encoding in sorted(set(encodings.aliases.aliases.values())):
        if encoding in skipped:
            continue
        try:
            "".encode(encoding)
        except LookupError:  # not trying to handle unknown encodings yet
            continue
        xmldecl = fix_xmldecl(six.u("  <?xml>").encode(encoding), encoding, add_encoding=True)
        lowered = encoding.lower()
        if lowered.startswith("utf") and "16" in encoding:
            if "le" in lowered:
                assert xmldecl.startswith(codecs.BOM_UTF16_LE)
            if "be" in lowered:
                assert xmldecl.startswith(codecs.BOM_UTF16_BE)
        sniffed = sniff_encoding(xmldecl)
        assert sniffed == encoding, (xmldecl, encoding, sniffed)