def test_charmap_decode_1(self): import codecs assert codecs.charmap_encode(u'xxx') == ('xxx', 3) assert codecs.charmap_encode(u'xxx', 'strict', {ord('x'): 'XX'}) == ('XXXXXX', 3) res = codecs.charmap_decode("\x00\x01\x02", "replace", u"ab") assert res == (u"ab\ufffd", 3) res = codecs.charmap_decode("\x00\x01\x02", "replace", u"ab\ufffe") assert res == (u'ab\ufffd', 3)
def test_charmap_decode_1(self): import codecs assert codecs.charmap_encode(u"xxx") == ("xxx", 3) assert codecs.charmap_encode(u"xxx", "strict", {ord("x"): "XX"}) == ("XXXXXX", 3) res = codecs.charmap_decode("\x00\x01\x02", "replace", u"ab") assert res == (u"ab\ufffd", 3) res = codecs.charmap_decode("\x00\x01\x02", "replace", u"ab\ufffe") assert res == (u"ab\ufffd", 3)
def test_callbacks(self):
    """Check that registered error handlers are used by decoders and encoders.

    Two handlers are registered under the names "test.handler1" and
    "test.handler2" and then driven through the ascii, unicode-escape and
    charmap codecs.
    """

    def handler1(exc):
        """Replace the failing range with "[<n>...]" and resume at ``exc.end``.

        For encode errors each ``<n>`` is ``ord()`` of the offending
        character; for decode errors it is the item of ``exc.object`` itself.
        Any other exception type is rejected with ``TypeError``.
        """
        r = range(exc.start, exc.end)
        if isinstance(exc, UnicodeEncodeError):
            l = ["<%d>" % ord(exc.object[pos]) for pos in r]
        elif isinstance(exc, UnicodeDecodeError):
            l = ["<%d>" % exc.object[pos] for pos in r]
        else:
            raise TypeError("don't know how to handle %r" % exc)
        return ("[%s]" % "".join(l), exc.end)

    codecs.register_error("test.handler1", handler1)

    def handler2(exc):
        """Decode-only handler that resumes one position past ``exc.end``."""
        if not isinstance(exc, UnicodeDecodeError):
            raise TypeError("don't know how to handle %r" % exc)
        l = ["<%d>" % exc.object[pos] for pos in range(exc.start, exc.end)]
        return ("[%s]" % "".join(l), exc.end + 1)  # skip one character

    codecs.register_error("test.handler2", handler2)

    s = b"\x00\x81\x7f\x80\xff"

    # handler1 reports every non-ASCII byte individually.
    self.assertEqual(s.decode("ascii", "test.handler1"), "\x00[<129>]\x7f[<128>][<255>]")
    # handler2 skips the byte following each error, so 0x7f and 0xff are
    # absent from the expected output.
    self.assertEqual(s.decode("ascii", "test.handler2"), "\x00[<129>][<128>]")

    # Truncated \u escapes in unicode-escape input.
    # NOTE(review): "\u" is not an escape in a bytes literal, so the backslash
    # is kept as-is; newer interpreters may warn about it -- consider "\\u".
    # NOTE(review): the exact error range reported for a truncated escape
    # looks interpreter-version dependent -- confirm the expected strings
    # against the targeted version.
    self.assertEqual(b"\\u3042\u3xxx".decode("unicode-escape", "test.handler1"), "\u3042[<92><117><51><120>]xx")
    self.assertEqual(b"\\u3042\u3xx".decode("unicode-escape", "test.handler1"), "\u3042[<92><117><51><120><120>]")

    # Bytes with no entry in the charmap go through the handler.
    self.assertEqual(codecs.charmap_decode(b"abc", "test.handler1", {ord("a"): "z"})[0], "z[<98>][<99>]")

    # Encoding: the two adjacent unencodable characters are reported as a
    # single range, both mid-string and at the end of the string.
    self.assertEqual("g\xfc\xdfrk".encode("ascii", "test.handler1"), b"g[<252><223>]rk")
    self.assertEqual("g\xfc\xdf".encode("ascii", "test.handler1"), b"g[<252><223>]")
def _decode(input, errors='strict'):
    """Decode ``input``, translating 'extended' two-character runs locally.

    ``input`` is split with ``str_splitter``.  A run made of
    ``extended_indicator`` followed by a character present in
    ``extended_decode_map`` is translated through that map; every other
    non-empty run is passed to ``codecs.charmap_decode`` together with
    ``decoding_table`` and ``errors``.

    Returns a ``(text, consumed)`` tuple.  ``consumed`` grows by 2 for each
    extended run and by the count reported by ``codecs.charmap_decode`` for
    each remaining run.
    """
    pieces = []
    consumed = 0

    for chunk in str_splitter.split(input):
        if len(chunk) == 0:
            # The first character was a marker; the marker itself shows up
            # in the next run, so there is nothing to do here.
            continue

        if len(chunk) == 2 and chunk[0] == extended_indicator:
            try:
                extended_char = extended_decode_map[chunk[1]]
            except KeyError:
                # The second character is not an extended one: let the run
                # fall through to the standard decoder, where the table
                # interprets the marker (as a NBSP, per the original note).
                pass
            else:
                pieces.append(extended_char)
                consumed += 2
                continue

        # Everything else goes to the standard charmap decoder.
        text, used = codecs.charmap_decode(chunk, errors, decoding_table)
        pieces.append(text)
        consumed += used

    return (u''.join(pieces), consumed)
def test_callbacks(self):
    """Check that registered error handlers are used by decoders and encoders.

    The whole test is skipped (returns early) while the referenced
    IronPython bug is reported as open by ``test_support``.
    """
    if test_support.due_to_ironpython_bug("http://tkbgitvstfat01:8080/WorkItemTracking/WorkItem.aspx?artifactMoniker=304331"):
        return

    def handler1(exc):
        """Replace the failing range with u"[<n>...]" and resume at ``exc.end``.

        Accepts encode and decode errors; anything else raises ``TypeError``.
        """
        if not isinstance(exc, UnicodeEncodeError) \
           and not isinstance(exc, UnicodeDecodeError):
            raise TypeError("don't know how to handle %r" % exc)
        l = [u"<%d>" % ord(exc.object[pos]) for pos in xrange(exc.start, exc.end)]
        return (u"[%s]" % u"".join(l), exc.end)

    codecs.register_error("test.handler1", handler1)

    def handler2(exc):
        """Decode-only handler that resumes one position past ``exc.end``."""
        if not isinstance(exc, UnicodeDecodeError):
            raise TypeError("don't know how to handle %r" % exc)
        l = [u"<%d>" % ord(exc.object[pos]) for pos in xrange(exc.start, exc.end)]
        return (u"[%s]" % u"".join(l), exc.end+1)  # skip one character

    codecs.register_error("test.handler2", handler2)

    s = "\x00\x81\x7f\x80\xff"

    # handler1 reports every non-ASCII byte individually.
    self.assertEqual(
        s.decode("ascii", "test.handler1"),
        u"\x00[<129>]\x7f[<128>][<255>]"
    )
    # handler2 skips the byte following each error, so 0x7f and 0xff are
    # absent from the expected output.
    self.assertEqual(
        s.decode("ascii", "test.handler2"),
        u"\x00[<129>][<128>]"
    )

    # Truncated \u escapes in unicode-escape input.
    self.assertEqual(
        "\\u3042\u3xxx".decode("unicode-escape", "test.handler1"),
        u"\u3042[<92><117><51><120>]xx"
    )

    self.assertEqual(
        "\\u3042\u3xx".decode("unicode-escape", "test.handler1"),
        u"\u3042[<92><117><51><120><120>]"
    )

    # Bytes with no entry in the charmap go through the handler.
    self.assertEqual(
        codecs.charmap_decode("abc", "test.handler1", {ord("a"): u"z"})[0],
        u"z[<98>][<99>]"
    )

    # Encoding: the two adjacent unencodable characters are reported as a
    # single range, both mid-string and at the end of the string.
    self.assertEqual(
        u"g\xfc\xdfrk".encode("ascii", "test.handler1"),
        u"g[<252><223>]rk"
    )

    self.assertEqual(
        u"g\xfc\xdf".encode("ascii", "test.handler1"),
        u"g[<252><223>]"
    )
def test_decode_with_string_map(self): self.assertEquals( codecs.charmap_decode("\x00\x01\x02", "strict", u"abc"), (u"abc", 3) ) self.assertEquals( codecs.charmap_decode("\x00\x01\x02", "replace", u"ab"), (u"ab\ufffd", 3) ) self.assertEquals( codecs.charmap_decode("\x00\x01\x02", "replace", u"ab\ufffe"), (u"ab\ufffd", 3) ) self.assertEquals( codecs.charmap_decode("\x00\x01\x02", "ignore", u"ab"), (u"ab", 3) ) self.assertEquals( codecs.charmap_decode("\x00\x01\x02", "ignore", u"ab\ufffe"), (u"ab", 3) ) allbytes = "".join(chr(i) for i in xrange(256)) self.assertEquals( codecs.charmap_decode(allbytes, "ignore", u""), (u"", len(allbytes)) )
def internet_decode(input, errors='strict', final=False):
    """Decode ``input`` by trying utf-8, then windows-1252, then latin-1.

    Returns the ``(text, length)`` tuple of the first decoder that succeeds.
    ``final`` is only forwarded to the utf-8 decoder.

    If the utf-8 attempt raises ``UnicodeEncodeError`` and ``input`` is a
    ``unicode`` object, it is treated as already decoded and returned with
    its length; otherwise the error is re-raised.
    """
    # utf-8 first: this should be the usual case by far.
    try:
        return codecs.utf_8_decode(input, errors, final)
    except UnicodeDecodeError:
        pass  # not valid utf-8, continue with the single-byte fallbacks
    except UnicodeEncodeError:
        # Was that thing already unicode? Then it's already decoded.
        if not isinstance(input, unicode):
            raise
        return (input, len(input))

    # windows-1252 (aka cp1252) defines more characters than latin-1 but
    # fails for five particular bytes: 0x81, 0x8D, 0x8F, 0x90, 0x9D.
    try:
        return codecs.charmap_decode(input, errors, encodings.cp1252.decoding_table)
    except UnicodeDecodeError:
        # latin-1 never fails, but defines 27 fewer characters than cp1252.
        return codecs.latin_1_decode(input, errors)
def test_charmap_decode(self):
    """Sanity and negative checks for ``codecs.charmap_decode``.

    The negative section also covers ``codecs.charmap_encode`` with a
    mapping whose only key is the string 'a'.
    """
    # Sanity: decoding without an explicit mapping.
    new_str, size = codecs.charmap_decode("abc")
    self.assertEqual(new_str, u'abc')
    self.assertEqual(size, 3)
    self.assertEqual(codecs.charmap_decode("a", 'strict', {ord('a') : u'a'})[0], u'a')
    # With "replace", a missing entry and a None entry both give U+FFFD.
    self.assertEqual(codecs.charmap_decode("a", "replace", {})[0], u'\ufffd')
    self.assertEqual(codecs.charmap_decode("a", "replace", {ord('a'): None})[0], u'\ufffd')

    self.assertEqual(codecs.charmap_decode(""), (u'', 0))

    # Using a string mapping: each input value indexes into the string.
    # NOTE(review): the input here is a unicode literal, not a byte string --
    # presumably relying on implicit conversion; confirm on the target runtime.
    self.assertEqual(codecs.charmap_decode(u'\x02\x01\x00', 'strict', u"abc"), (u'cba', 3))

    # Negative: in strict mode an unmapped byte raises.
    self.assertRaises(UnicodeDecodeError, codecs.charmap_decode, "a", "strict", {})
    # The mappings below are keyed by the string 'a' rather than ord('a').
    self.assertRaises(UnicodeDecodeError, codecs.charmap_decode, "a", "strict", {'a': None})
    self.assertRaises(UnicodeEncodeError, codecs.charmap_encode, "a", "strict", {'a': None})
    self.assertRaises(UnicodeEncodeError, codecs.charmap_encode, "a", "replace", {'a': None})

    # A float is not an acceptable mapping value.
    self.assertRaises(TypeError, codecs.charmap_decode, "a", "strict", {ord('a'): 2.0})
def decode(self, _input, errors='strict'):
    """Decode ``_input`` through the module-level ``decoding_table``.

    Returns the result of ``codecs.charmap_decode`` unchanged.
    """
    result = codecs.charmap_decode(_input, errors, decoding_table)
    return result
""" Python Character Mapping Codec generated from '8859-8.TXT' with gencodec.py.
""" Python Character Mapping Codec generated from '8859-9.TXT' with gencodec.py.
""" Python Character Mapping Codec generated from 'CP1256.TXT' with gencodec.py.
""" Python Character Mapping Codec generated from 'CP424.TXT' with gencodec.py.
""" Python Character Mapping Codec generated from 'CP1006.TXT' with gencodec.py.
""" Python Character Mapping Codec generated from 'ICELAND.TXT' with gencodec.py.
#!/usr/bin/python
# -*- coding: utf-8 -*-
#
# Copyright (c) 2008 Doug Hellmann All rights reserved.
#
"""Build a character map that swaps the case of ASCII letters.

Defines ``decoding_map`` and ``encoding_map``; when run as a script, prints
a sample encode, a sample decode, and whether the two maps are equal.
"""
import codecs
import string

# Map every character to itself
decoding_map = codecs.make_identity_dict(range(256))

# Make a list of pairs of ordinal values for the lower and upper case
# letters.  The pairs are iterated twice below, so they are materialized as
# a list: a bare zip() is a single-use iterator on Python 3.
pairs = list(zip([ord(c) for c in string.ascii_lowercase],
                 [ord(c) for c in string.ascii_uppercase]))

# Modify the mapping to convert upper to lower and lower to upper.
decoding_map.update(dict((upper, lower) for (lower, upper) in pairs))
decoding_map.update(dict((lower, upper) for (lower, upper) in pairs))

# Create a separate encoding map.
encoding_map = codecs.make_encoding_map(decoding_map)

if __name__ == '__main__':
    # print(<single expression>) produces the same output as the Python 2
    # print statement and is also valid on Python 3.  The decoder input is
    # an explicit byte string (identical to a plain literal on Python 2).
    print(codecs.charmap_encode('abc.DEF', 'strict', encoding_map))
    print(codecs.charmap_decode(b'abc.DEF', 'strict', decoding_map))
    print(encoding_map == decoding_map)
def decode(self, input, errors="strict"):
    """Run ``input`` through ``codecs.charmap_decode`` with ``decoding_table``.

    The value returned by ``codecs.charmap_decode`` is passed back as-is.
    """
    decoded = codecs.charmap_decode(input, errors, decoding_table)
    return decoded
def decode(self, input, errors='strict'):
    """Charmap-decode ``input`` using the module-level ``decoding_map``.

    Returns whatever ``codecs.charmap_decode`` returns.
    """
    outcome = codecs.charmap_decode(input, errors, decoding_map)
    return outcome
def decode(self, input, final=False):
    """Return the text decoded from ``input`` via ``decoding_table``.

    ``self.errors`` selects the error handling.  ``final`` is accepted but
    not used.  Only the first item of the ``codecs.charmap_decode`` result
    is returned.
    """
    text, _ = codecs.charmap_decode(input, self.errors, decoding_table)
    return text
def decode(self, input, final=False):
    """Return the text decoded from ``input`` using ``self.mapping``.

    ``self.errors`` selects the error handling.  ``final`` is accepted but
    not used.  Only the first item of the ``codecs.charmap_decode`` result
    is returned.
    """
    text, _ = codecs.charmap_decode(input, self.errors, self.mapping)
    return text
""" Python Character Mapping Codec generated from 'CP500.TXT' with gencodec.py.
def test_charmap_decode(self): import codecs res = codecs.charmap_decode("\x00\x01\x02", "replace", u"ab") assert res == (u"ab\ufffd", 3) res = codecs.charmap_decode("\x00\x01\x02", "replace", u"ab\ufffe") assert res == (u'ab\ufffd', 3)
""" Python Character Mapping Codec generated from 'CP875.TXT' with gencodec.py.
""" Python Character Mapping Codec generated from '8859-10.TXT' with gencodec.py.
def decode(self, input, errors=error_handling):
    """Charmap-decode ``input`` using the module-level ``decoding_map``.

    ``errors`` defaults to the module-level ``error_handling`` value as it
    is when this function is defined.  Returns whatever
    ``codecs.charmap_decode`` returns.
    """
    decoded = codecs.charmap_decode(input, errors, decoding_map)
    return decoded
def decode(self, char, errors='strict'):
    """Decode ``char`` through the module-level ``DECODING_TABLE``.

    Returns the result of ``codecs.charmap_decode`` unchanged.
    """
    result = codecs.charmap_decode(char, errors, DECODING_TABLE)
    return result
""" Python Character Mapping Codec generated from 'CP860.TXT' with gencodec.py.
""" Python Character Mapping Codec generated from 'CP852.TXT' with gencodec.py.
""" Python Character Mapping Codec generated from 'CP1257.TXT' with gencodec.py.
def decode(self, input, final=False):
    """Return only the decoded text for ``input``, using ``decoding_table``.

    Error handling comes from ``self.errors``; ``final`` is accepted but
    not used.
    """
    decoded, _ = codecs.charmap_decode(input, self.errors, decoding_table)
    return decoded
""" Python Character Mapping Codec generated from 'KOI8-R.TXT' with gencodec.py.
# Map every character to itself
decoding_map = codecs.make_identity_dict(range(256))

# Make a list of pairs of ordinal values for all lowercase and uppercase
# letters.  The pairs are iterated twice below, so they are materialized as
# a list: a bare zip() is a single-use iterator on Python 3.
pairs = list(zip([ord(c) for c in string.ascii_lowercase],
                 [ord(c) for c in string.ascii_uppercase]))

# Modify the mapping to convert upper to lower and lower to upper
decoding_map.update(dict((upper, lower) for (lower, upper) in pairs))
decoding_map.update(dict((lower, upper) for (lower, upper) in pairs))

# Create a separate encoding map
encoding_map = codecs.make_encoding_map(decoding_map)

# print(<single expression>) produces the same output as the Python 2 print
# statement and is also valid on Python 3; print('') emits the blank line.
# The decoder input is an explicit byte string (same object on Python 2).
print(codecs.charmap_encode('abc.DEF', error_handling, encoding_map))
print(codecs.charmap_decode(b'abc.DEF', error_handling, decoding_map))
print(encoding_map == decoding_map)
print('')

# By default, charmap encoders and decoders support the standard error
# methods.  This charmap only covers code points 0-255, so a character
# outside that range cannot be encoded (presumably `data` is the
# u"pi: π" sample from earlier -- confirm against the surrounding script).
for error in ['ignore', 'replace', 'strict']:
    try:
        encoded = codecs.charmap_encode(data, error, encoding_map)
    except UnicodeEncodeError as err:
        encoded = str(err)
    print('{:7} {}'.format(error, encoded))
print('')

# After defining the encoding and decoding maps, a few additional classes
# have to be set up and the encoding should be registered so codecs can
# locate it.
#
# Copyright (c) 2010 Doug Hellmann.  All rights reserved.
#
"""Character mapping encoder

Defines ``decoding_map`` and ``encoding_map``, which swap the case of ASCII
letters and leave every other value in 0-255 unchanged.
"""
#end_pymotw_header

import codecs
import string

# Map every character to itself
decoding_map = codecs.make_identity_dict(range(256))

# Make a list of pairs of ordinal values for the lower and upper case
# letters.  The pairs are iterated twice below, so they are materialized as
# a list: a bare zip() is a single-use iterator on Python 3.
pairs = list(zip([ord(c) for c in string.ascii_lowercase],
                 [ord(c) for c in string.ascii_uppercase]))

# Modify the mapping to convert upper to lower and lower to upper.
decoding_map.update(dict((upper, lower) for (lower, upper) in pairs))
decoding_map.update(dict((lower, upper) for (lower, upper) in pairs))

# Create a separate encoding map.
encoding_map = codecs.make_encoding_map(decoding_map)

if __name__ == '__main__':
    # print(<single expression>) produces the same output as the Python 2
    # print statement and is also valid on Python 3.  The decoder input is
    # an explicit byte string (identical to a plain literal on Python 2).
    print(codecs.charmap_encode('abc.DEF', 'strict', encoding_map))
    print(codecs.charmap_decode(b'abc.DEF', 'strict', decoding_map))
    print(encoding_map == decoding_map)
def decode(self, input, final=False):
    """Return only the decoded text for ``input``, using ``decoding_map``.

    Error handling comes from ``self.errors``; ``final`` is accepted but
    not used.  The consumed-length item of the ``codecs.charmap_decode``
    result is discarded.
    """
    return codecs.charmap_decode(input, self.errors, decoding_map)[0]
def decode(self, input, final=False):
    """Decode ``input`` with ``self.mapping`` and return just the text.

    ``self.errors`` is passed as the error-handling mode; ``final`` is
    accepted but not used.
    """
    decoded, _ = codecs.charmap_decode(input, self.errors, self.mapping)
    return decoded
# Start from a map in which every value 0-255 decodes to itself.
decoding_map = codecs.make_identity_dict(range(256))

# Build a list of (lowercase, uppercase) ordinal pairs for the ASCII letters.
pairs = list(zip(
    map(ord, string.ascii_lowercase),
    map(ord, string.ascii_uppercase),
))

# Change the mapping so that uppercase decodes to lowercase and vice versa.
decoding_map.update((upper, lower) for (lower, upper) in pairs)
decoding_map.update((lower, upper) for (lower, upper) in pairs)

# Create a separate encoding map.
encoding_map = codecs.make_encoding_map(decoding_map)

if __name__ == '__main__':
    print(codecs.charmap_encode('abcDEF', 'strict', encoding_map))
    print(codecs.charmap_decode(b'abcDEF', 'strict', decoding_map))
    print(encoding_map == decoding_map)
def decode(self, char, final=False):
    """Return the text decoded from ``char`` via ``DECODING_TABLE``.

    ``self.errors`` selects the error handling; ``final`` is accepted but
    not used.  Only the first item of the ``codecs.charmap_decode`` result
    is returned.
    """
    text, _ = codecs.charmap_decode(char, self.errors, DECODING_TABLE)
    return text
""" Python Character Mapping Codec generated from 'CP775.TXT' with gencodec.py.
""" Python Character Mapping Codec generated from '8859-13.TXT' with gencodec.py.
""" Python Character Mapping Codec generated from 'LATIN2.TXT' with gencodec.py.