Exemple #1
0
    def test_charmap_decode_1(self):
        import codecs
        assert codecs.charmap_encode(u'xxx') == ('xxx', 3)
        assert codecs.charmap_encode(u'xxx', 'strict', {ord('x'): 'XX'}) == ('XXXXXX', 3)

        res = codecs.charmap_decode("\x00\x01\x02", "replace", u"ab")
        assert res == (u"ab\ufffd", 3)
        res = codecs.charmap_decode("\x00\x01\x02", "replace", u"ab\ufffe")
        assert res == (u'ab\ufffd', 3)
Exemple #2
0
    def test_charmap_decode_1(self):
        import codecs

        assert codecs.charmap_encode(u"xxx") == ("xxx", 3)
        assert codecs.charmap_encode(u"xxx", "strict", {ord("x"): "XX"}) == ("XXXXXX", 3)

        res = codecs.charmap_decode("\x00\x01\x02", "replace", u"ab")
        assert res == (u"ab\ufffd", 3)
        res = codecs.charmap_decode("\x00\x01\x02", "replace", u"ab\ufffe")
        assert res == (u"ab\ufffd", 3)
    def test_callbacks(self):
        # Registers two custom codec error handlers under the names
        # "test.handler1" and "test.handler2" and checks the text/bytes they
        # produce for several codecs.  Registration goes through
        # codecs.register_error, so the handlers stay registered after the
        # test returns.

        # handler1: accepts both encode and decode errors.  Every offending
        # position is rendered as "<N>", the pieces are wrapped in one pair of
        # square brackets, and processing resumes at exc.end.
        def handler1(exc):
            r = range(exc.start, exc.end)
            if isinstance(exc, UnicodeEncodeError):
                # Encode errors: the indexed item is passed through ord().
                l = ["<%d>" % ord(exc.object[pos]) for pos in r]
            elif isinstance(exc, UnicodeDecodeError):
                # Decode errors: the indexed item is formatted with %d as-is.
                l = ["<%d>" % exc.object[pos] for pos in r]
            else:
                raise TypeError("don't know how to handle %r" % exc)
            return ("[%s]" % "".join(l), exc.end)

        codecs.register_error("test.handler1", handler1)

        # handler2: decode errors only.  Same rendering as handler1, but the
        # resume position is one past exc.end, so the item that follows each
        # error is dropped from the output.
        def handler2(exc):
            if not isinstance(exc, UnicodeDecodeError):
                raise TypeError("don't know how to handle %r" % exc)
            l = ["<%d>" % exc.object[pos] for pos in range(exc.start, exc.end)]
            return ("[%s]" % "".join(l), exc.end + 1)  # skip one character

        codecs.register_error("test.handler2", handler2)

        # Three of these five bytes (0x81, 0x80, 0xff) are outside ASCII.
        s = b"\x00\x81\x7f\x80\xff"

        self.assertEqual(s.decode("ascii", "test.handler1"), "\x00[<129>]\x7f[<128>][<255>]")
        # With handler2 the 0x7f after the first error is skipped, and the
        # expected text contains nothing for the trailing 0xff.
        self.assertEqual(s.decode("ascii", "test.handler2"), "\x00[<129>][<128>]")

        # Malformed escape at the end of a unicode-escape input; the expected
        # strings pin the exact span reported to the handler.
        # NOTE(review): that span may differ between interpreter versions --
        # confirm against the target runtime.
        self.assertEqual(b"\\u3042\u3xxx".decode("unicode-escape", "test.handler1"), "\u3042[<92><117><51><120>]xx")

        self.assertEqual(b"\\u3042\u3xx".decode("unicode-escape", "test.handler1"), "\u3042[<92><117><51><120><120>]")

        # Only "a" is present in the mapping; "b" and "c" go to the handler.
        self.assertEqual(codecs.charmap_decode(b"abc", "test.handler1", {ord("a"): "z"})[0], "z[<98>][<99>]")

        # Encoding direction: two adjacent unencodable characters are
        # expected to reach the handler as a single error.
        self.assertEqual("g\xfc\xdfrk".encode("ascii", "test.handler1"), b"g[<252><223>]rk")

        self.assertEqual("g\xfc\xdf".encode("ascii", "test.handler1"), b"g[<252><223>]")
Exemple #4
0
def _decode(input, errors='strict'):
    """Decode ``input``, resolving two-character extended sequences locally.

    The input is cut into runs by ``str_splitter``.  A run of exactly two
    items whose first item equals ``extended_indicator`` is looked up in
    ``extended_decode_map`` by its second item; every other run (including a
    two-item run whose second item is not in that map) is passed to
    ``codecs.charmap_decode`` together with ``errors`` and
    ``decoding_table``.

    Returns a ``(text, consumed)`` tuple: the joined output and the running
    total of 2 per extended sequence plus the length reported by the
    standard decoder for each remaining run.
    """
    pieces = []
    total = 0

    for segment in str_splitter.split(input):
        # An empty run means the split point sat at the very start; the
        # marker itself shows up in the following run, so nothing to do.
        if len(segment) == 0:
            continue

        if len(segment) == 2 and segment[0] == extended_indicator:
            try:
                mapped = extended_decode_map[segment[1]]
            except KeyError:
                # Not an extended sequence after all: fall through and let
                # the standard decoder interpret both items via the table.
                pass
            else:
                pieces.append(mapped)
                total += 2
                continue

        # Everything else goes through the standard charmap decoder.
        decoded, used = codecs.charmap_decode(segment, errors, decoding_table)
        pieces.append(decoded)
        total += used

    return (u''.join(pieces), total)
    def test_callbacks(self):
        # Registers two custom codec error handlers ("test.handler1" and
        # "test.handler2") through codecs.register_error and checks what they
        # produce for the ascii, unicode-escape and charmap codecs.
        # NOTE(review): written against Python 2 str/unicode semantics
        # (xrange, str.decode, u"" literals) -- confirm before running on
        # another interpreter.

        # Bail out early when the test-support helper reports that the linked
        # IronPython bug applies; nothing below runs in that case.
        if test_support.due_to_ironpython_bug("http://tkbgitvstfat01:8080/WorkItemTracking/WorkItem.aspx?artifactMoniker=304331"):
            return
        # handler1: accepts encode and decode errors alike.  Each offending
        # position is rendered as "<N>" via ord(), the pieces are wrapped in
        # one pair of brackets, and processing resumes at exc.end.
        def handler1(exc):
            if not isinstance(exc, UnicodeEncodeError) \
               and not isinstance(exc, UnicodeDecodeError):
                raise TypeError("don't know how to handle %r" % exc)
            l = [u"<%d>" % ord(exc.object[pos]) for pos in xrange(exc.start, exc.end)]
            return (u"[%s]" % u"".join(l), exc.end)

        codecs.register_error("test.handler1", handler1)

        # handler2: decode errors only.  Same rendering, but it resumes one
        # position past exc.end, dropping the item that follows each error.
        def handler2(exc):
            if not isinstance(exc, UnicodeDecodeError):
                raise TypeError("don't know how to handle %r" % exc)
            l = [u"<%d>" % ord(exc.object[pos]) for pos in xrange(exc.start, exc.end)]
            return (u"[%s]" % u"".join(l), exc.end+1) # skip one character

        codecs.register_error("test.handler2", handler2)

        # Three of these five bytes (0x81, 0x80, 0xff) are outside ASCII.
        s = "\x00\x81\x7f\x80\xff"

        self.assertEqual(
            s.decode("ascii", "test.handler1"),
            u"\x00[<129>]\x7f[<128>][<255>]"
        )
        # With handler2 the 0x7f after the first error is skipped, and the
        # expected text contains nothing for the trailing 0xff.
        self.assertEqual(
            s.decode("ascii", "test.handler2"),
            u"\x00[<129>][<128>]"
        )

        # Malformed escape at the end of a unicode-escape input; the expected
        # strings pin the exact span handed to the handler.
        self.assertEqual(
            "\\u3042\u3xxx".decode("unicode-escape", "test.handler1"),
            u"\u3042[<92><117><51><120>]xx"
        )

        self.assertEqual(
            "\\u3042\u3xx".decode("unicode-escape", "test.handler1"),
            u"\u3042[<92><117><51><120><120>]"
        )

        # Only "a" is present in the mapping; "b" and "c" go to the handler.
        self.assertEqual(
            codecs.charmap_decode("abc", "test.handler1", {ord("a"): u"z"})[0],
            u"z[<98>][<99>]"
        )

        # Encoding direction: two adjacent unencodable characters are
        # expected to reach the handler as a single error.
        self.assertEqual(
            u"g\xfc\xdfrk".encode("ascii", "test.handler1"),
            u"g[<252><223>]rk"
        )

        self.assertEqual(
            u"g\xfc\xdf".encode("ascii", "test.handler1"),
            u"g[<252><223>]"
        )
Exemple #6
0
    def test_decode_with_string_map(self):
        self.assertEquals(
            codecs.charmap_decode("\x00\x01\x02", "strict", u"abc"),
            (u"abc", 3)
        )

        self.assertEquals(
            codecs.charmap_decode("\x00\x01\x02", "replace", u"ab"),
            (u"ab\ufffd", 3)
        )

        self.assertEquals(
            codecs.charmap_decode("\x00\x01\x02", "replace", u"ab\ufffe"),
            (u"ab\ufffd", 3)
        )

        self.assertEquals(
            codecs.charmap_decode("\x00\x01\x02", "ignore", u"ab"),
            (u"ab", 3)
        )

        self.assertEquals(
            codecs.charmap_decode("\x00\x01\x02", "ignore", u"ab\ufffe"),
            (u"ab", 3)
        )

        allbytes = "".join(chr(i) for i in xrange(256))
        self.assertEquals(
            codecs.charmap_decode(allbytes, "ignore", u""),
            (u"", len(allbytes))
        )
Exemple #7
0
def internet_decode(input, errors='strict', final=False):
    """Decode ``input`` by trying UTF-8, then cp1252, then Latin-1.

    Each fallback is attempted only when the previous decoder raised
    ``UnicodeDecodeError``.  ``final`` is forwarded to the UTF-8 decoder
    only.  If the UTF-8 attempt raises ``UnicodeEncodeError`` and ``input``
    is a ``unicode`` instance, the input is returned as-is with its length;
    otherwise that error propagates.

    Returns the ``(text, length)`` tuple of whichever step succeeded.
    """
    try:
        # UTF-8 first: expected to be the common case by far.
        decoded = codecs.utf_8_decode(input, errors, final)
    except UnicodeDecodeError:
        cp1252_table = encodings.cp1252.decoding_table
        try:
            # cp1252 (windows-1252) defines more characters than Latin-1 but
            # rejects five bytes: 0x81, 0x8D, 0x8F, 0x90 and 0x9D.
            decoded = codecs.charmap_decode(input, errors, cp1252_table)
        except UnicodeDecodeError:
            # Latin-1 never fails, at the price of 27 fewer characters.
            decoded = codecs.latin_1_decode(input, errors)
    except UnicodeEncodeError:
        # Input that is already unicode needs no decoding at all.
        if not isinstance(input, unicode):
            raise
        decoded = (input, len(input))
    return decoded
    def test_charmap_decode(self):
        #Sanity
        new_str, size = codecs.charmap_decode("abc")
        self.assertEqual(new_str, u'abc')
        self.assertEqual(size, 3)
        self.assertEqual(codecs.charmap_decode("a", 'strict', {ord('a') : u'a'})[0], u'a')
        self.assertEqual(codecs.charmap_decode("a", "replace", {})[0], u'\ufffd')
        self.assertEqual(codecs.charmap_decode("a", "replace", {ord('a'): None})[0], u'\ufffd')
        
        self.assertEqual(codecs.charmap_decode(""), (u'', 0))

        # using a string mapping
        self.assertEqual(codecs.charmap_decode(u'\x02\x01\x00', 'strict', u"abc"), (u'cba', 3))

        #Negative
        self.assertRaises(UnicodeDecodeError, codecs.charmap_decode, "a", "strict", {})
        self.assertRaises(UnicodeDecodeError, codecs.charmap_decode, "a", "strict", {'a': None})
        self.assertRaises(UnicodeEncodeError, codecs.charmap_encode, "a", "strict", {'a': None})
        self.assertRaises(UnicodeEncodeError, codecs.charmap_encode, "a", "replace", {'a': None})
        
        self.assertRaises(TypeError, codecs.charmap_decode, "a", "strict", {ord('a'): 2.0})
Exemple #9
0
 def decode(self, _input, errors='strict'):
     """Decode ``_input`` through the module-level ``decoding_table``.

     Returns the result of ``codecs.charmap_decode`` unchanged.
     """
     result = codecs.charmap_decode(_input, errors, decoding_table)
     return result
Exemple #10
0
""" Python Character Mapping Codec generated from '8859-8.TXT' with gencodec.py.
Exemple #11
0
""" Python Character Mapping Codec generated from '8859-9.TXT' with gencodec.py.
Exemple #12
0
""" Python Character Mapping Codec generated from 'CP1256.TXT' with gencodec.py.
Exemple #13
0
""" Python Character Mapping Codec generated from 'CP424.TXT' with gencodec.py.
Exemple #14
0
""" Python Character Mapping Codec generated from 'CP1006.TXT' with gencodec.py.
Exemple #15
0
""" Python Character Mapping Codec generated from 'ICELAND.TXT' with gencodec.py.
#!/usr/bin/python
# -*- coding: utf-8 -*-

#
# Copyright (c) 2008 Doug Hellmann All rights reserved.
#

import codecs
import string

# Map every character to itself
decoding_map = codecs.make_identity_dict(range(256))

# Make a list of pairs of ordinal values for the lower and upper case
# letters.  The pairs are iterated twice below, so they are materialised
# with list(): on Python 3 zip() returns a one-shot iterator and the second
# update would otherwise silently do nothing.
pairs = list(zip([ord(c) for c in string.ascii_lowercase],
                 [ord(c) for c in string.ascii_uppercase]))

# Modify the mapping to convert upper to lower and lower to upper.
decoding_map.update(dict((upper, lower) for (lower, upper) in pairs))
decoding_map.update(dict((lower, upper) for (lower, upper) in pairs))

# Create a separate encoding map.
encoding_map = codecs.make_encoding_map(decoding_map)

if __name__ == '__main__':
    # Single-argument print(...) gives the same output whether print is a
    # statement (Python 2) or a function (Python 3).  The decoder is fed an
    # explicit bytes literal because Python 3 rejects text input there.
    print(codecs.charmap_encode('abc.DEF', 'strict', encoding_map))
    print(codecs.charmap_decode(b'abc.DEF', 'strict', decoding_map))
    print(encoding_map == decoding_map)
Exemple #17
0
 def decode(self, input, errors="strict"):
     """Run ``input`` through ``codecs.charmap_decode`` with ``decoding_table``.

     The decoder's return value is handed back as-is.
     """
     outcome = codecs.charmap_decode(input, errors, decoding_table)
     return outcome
Exemple #18
0
    def decode(self, input, errors='strict'):
        """Decode ``input`` using the module-level ``decoding_map``.

        Returns whatever ``codecs.charmap_decode`` returns for it.
        """
        decoded = codecs.charmap_decode(input, errors, decoding_map)
        return decoded
Exemple #19
0
 def decode(self, input, final=False):
     """Return only the decoded text for ``input``.

     Decoding uses ``self.errors`` and the module-level ``decoding_table``;
     ``final`` is accepted but not used.
     """
     text, _consumed = codecs.charmap_decode(input, self.errors, decoding_table)
     return text
Exemple #20
0
 def decode(self, input, final=False):
     """Return only the decoded text for ``input``.

     Decoding uses the instance's ``errors`` policy and ``mapping``;
     ``final`` is accepted but not used.
     """
     text, _consumed = codecs.charmap_decode(input, self.errors, self.mapping)
     return text
Exemple #21
0
""" Python Character Mapping Codec generated from 'CP500.TXT' with gencodec.py.
 def test_charmap_decode(self):
     import codecs
     res = codecs.charmap_decode("\x00\x01\x02", "replace", u"ab")
     assert res == (u"ab\ufffd", 3)
     res = codecs.charmap_decode("\x00\x01\x02", "replace", u"ab\ufffe")
     assert res == (u'ab\ufffd', 3)
Exemple #23
0
 def test_charmap_decode(self):
     import codecs
     res = codecs.charmap_decode("\x00\x01\x02", "replace", u"ab")
     assert res == (u"ab\ufffd", 3)
     res = codecs.charmap_decode("\x00\x01\x02", "replace", u"ab\ufffe")
     assert res == (u'ab\ufffd', 3)
Exemple #24
0
""" Python Character Mapping Codec generated from 'CP875.TXT' with gencodec.py.
Exemple #25
0
""" Python Character Mapping Codec generated from '8859-10.TXT' with gencodec.py.
 def decode(self, input, errors=error_handling):
     """Decode ``input`` with the module-level ``decoding_map``.

     ``errors`` defaults to the module's ``error_handling`` value.  The
     result of ``codecs.charmap_decode`` is returned unchanged.
     """
     decoded = codecs.charmap_decode(input, errors, decoding_map)
     return decoded
Exemple #27
0
 def decode(self, char, errors='strict'):
     """Decode ``char`` through the module-level ``DECODING_TABLE``.

     Returns the result of ``codecs.charmap_decode`` unchanged.
     """
     result = codecs.charmap_decode(char, errors, DECODING_TABLE)
     return result
Exemple #28
0
""" Python Character Mapping Codec generated from 'CP860.TXT' with gencodec.py.
Exemple #29
0
""" Python Character Mapping Codec generated from 'CP852.TXT' with gencodec.py.
Exemple #30
0
""" Python Character Mapping Codec generated from 'CP1257.TXT' with gencodec.py.
Exemple #31
0
 def decode(self, input, final=False):
     """Return the text decoded from ``input``, without the length.

     Uses ``self.errors`` and the module-level ``decoding_table``; ``final``
     is accepted but not used.
     """
     text, _length = codecs.charmap_decode(input, self.errors, decoding_table)
     return text
Exemple #32
0
""" Python Character Mapping Codec generated from 'KOI8-R.TXT' with gencodec.py.
# Map every character to itself
decoding_map = codecs.make_identity_dict(range(256))

# Make a list of pairs of ordinal values for all lcase and ucase letters.
# list() keeps the pairs reusable: they are iterated twice below and zip()
# is a one-shot iterator on Python 3.
pairs = list(zip([ord(c) for c in string.ascii_lowercase],
                 [ord(c) for c in string.ascii_uppercase]))

# modify the mapping to convert upper to lower and lower to upper
decoding_map.update(dict((upper, lower) for (lower, upper) in pairs))
decoding_map.update(dict((lower, upper) for (lower, upper) in pairs))

# Create a separate encoding map
encoding_map = codecs.make_encoding_map(decoding_map)

# Single-argument print(...) behaves the same as a statement (Python 2) and
# as a function (Python 3); print('') emits the blank separator line on both.
# The decoder gets an explicit bytes literal because Python 3 rejects text.
print(codecs.charmap_encode('abc.DEF', error_handling, encoding_map))
print(codecs.charmap_decode(b'abc.DEF', error_handling, decoding_map))
print(encoding_map == decoding_map)
print('')

# By default, charmap encoders and decoders support the standard error
# methods.  The map built above only has entries for code points 0-255, so
# any character outside that range cannot be encoded.
# NOTE(review): `data` is defined outside this block -- presumably the
# u"pi: ..." sample from earlier; confirm.
for error in ['ignore', 'replace', 'strict']:
    try:
        encoded = codecs.charmap_encode(data, error, encoding_map)
    except UnicodeEncodeError as err:
        encoded = str(err)
    print('{:7} {}'.format(error, encoded))
print('')

# After defining the encoding and decoding maps, a few additional classes have
# to be set up and the encoding should be registered so codecs can locate it.
#
# Copyright (c) 2010 Doug Hellmann.  All rights reserved.
#
"""Character mapping encoder
"""
#end_pymotw_header

import codecs
import string

# Map every character to itself
decoding_map = codecs.make_identity_dict(range(256))

# Make a list of pairs of ordinal values for the lower and upper case
# letters.  list() is required because the pairs are consumed twice below
# and zip() yields a single-use iterator on Python 3.
pairs = list(zip([ord(c) for c in string.ascii_lowercase],
                 [ord(c) for c in string.ascii_uppercase]))

# Modify the mapping to convert upper to lower and lower to upper.
decoding_map.update(dict((upper, lower) for (lower, upper) in pairs))
decoding_map.update(dict((lower, upper) for (lower, upper) in pairs))

# Create a separate encoding map.
encoding_map = codecs.make_encoding_map(decoding_map)

if __name__ == '__main__':
    # print(...) with one argument prints identically on Python 2 and 3.
    # The decoder is given a bytes literal, which Python 3 requires.
    print(codecs.charmap_encode('abc.DEF', 'strict', encoding_map))
    print(codecs.charmap_decode(b'abc.DEF', 'strict', decoding_map))
    print(encoding_map == decoding_map)
    
 def decode(self, input, final=False):
     """Return the decoded text for ``input``, discarding the byte count.

     Uses ``self.errors`` and the module-level ``decoding_map``; ``final``
     is accepted but not used.
     """
     return codecs.charmap_decode(input, self.errors, decoding_map)[0]
Exemple #36
0
 def decode(self, input, final=False):
     """Return the text decoded from ``input``, without the length.

     The instance supplies both the error policy (``self.errors``) and the
     character map (``self.mapping``); ``final`` is accepted but not used.
     """
     decoded_text, _length = codecs.charmap_decode(input, self.errors, self.mapping)
     return decoded_text
decoding_map = codecs.make_identity_dict(range(256))

# Pair the ordinal of every lowercase ASCII letter with the ordinal of its
# uppercase counterpart
pairs = [
    (ord(lower), ord(upper))
    for lower, upper in zip(string.ascii_lowercase, string.ascii_uppercase)
]

# Rewire the mapping so that case is swapped in both directions:
# first upper -> lower, then lower -> upper
decoding_map.update((upper, lower) for (lower, upper) in pairs)
decoding_map.update(pairs)

# Build a separate encoding map
encoding_map = codecs.make_encoding_map(decoding_map)

if __name__ == '__main__':
    print(codecs.charmap_encode('abcDEF', 'strict', encoding_map))
    print(codecs.charmap_decode(b'abcDEF', 'strict', decoding_map))
    print(encoding_map == decoding_map)
Exemple #38
0
 def decode(self, char, final=False):
     """Return only the decoded text for ``char``.

     Uses ``self.errors`` and the module-level ``DECODING_TABLE``; ``final``
     is accepted but not used.
     """
     text, _consumed = codecs.charmap_decode(char, self.errors, DECODING_TABLE)
     return text
Exemple #39
0
""" Python Character Mapping Codec generated from 'CP775.TXT' with gencodec.py.
Exemple #40
0
 def decode(self, input, errors="strict"):
     """Decode ``input`` via the module-level ``decoding_table``.

     Hands back the value produced by ``codecs.charmap_decode`` untouched.
     """
     decoded = codecs.charmap_decode(input, errors, decoding_table)
     return decoded
Exemple #41
0
""" Python Character Mapping Codec generated from '8859-13.TXT' with gencodec.py.
Exemple #42
0
    def decode(self, input, errors='strict'):
        """Run ``input`` through ``codecs.charmap_decode`` with ``decoding_map``.

        The decoder's return value is passed straight back to the caller.
        """
        outcome = codecs.charmap_decode(input, errors, decoding_map)
        return outcome
Exemple #43
0
""" Python Character Mapping Codec generated from 'LATIN2.TXT' with gencodec.py.