def w_convert(self, space, s):
    """Return ``space.newutf8(s, length)`` after validating ``s``.

    ``s`` is supposed to be valid UTF-8 already, but no caller checks
    that and no caller catches a failure, so it is validated here with
    ``rutf8.check_utf8``.  If validation fails, the string is pushed
    through ``unicodehelper.str_decode_utf8`` purely so that the error
    raised carries the correct message.
    """
    try:
        length = rutf8.check_utf8(s, True)
        return space.newutf8(s, length)
    except rutf8.CheckError:
        from pypy.interpreter import unicodehelper
        # The decode call is made only for the error it raises; its
        # result is never used.
        on_error = unicodehelper.decode_error_handler(space)
        unicodehelper.str_decode_utf8(s, 'string', True, on_error)
        assert False, "always raises"
    # Not reached in normal operation: the try branch returns and the
    # except branch is expected to raise.
    return space.newtext(s)
def utf_8_decode(space, string, errors="strict", w_final=None):
    """Decode ``string`` through ``unicodehelper.str_decode_utf8``.

    ``errors`` falls back to 'strict' when it is None.  The truth value
    of ``w_final`` (via ``space.is_true``) is passed as the decoder's
    third argument, and the error handler comes from the cached
    ``CodecState``.

    Returns a wrapped tuple whose first item is
    ``space.newutf8(result, length)`` and whose second item is the
    wrapped integer position reported by the decoder.
    """
    error_mode = 'strict' if errors is None else errors
    is_final = space.is_true(w_final)
    codec_state = space.fromcache(CodecState)
    decoded, length, position = unicodehelper.str_decode_utf8(
        string, error_mode, is_final, codec_state.decode_error_handler)
    w_decoded = space.newutf8(decoded, length)
    w_position = space.newint(position)
    return space.newtuple([w_decoded, w_position])
def descr_decode(self, space, w_encoding=None, w_errors=None):
    """Decode this object with the requested encoding and error mode.

    The encoding and error mode are extracted with
    ``get_encoding_and_errors``; a missing encoding defaults to 'utf8'.
    For 'utf8' / 'utf-8' the object's char buffer is decoded directly
    with ``str_decode_utf8`` and wrapped with ``space.newtext``.  Every
    other encoding is delegated to ``decode_object``.
    """
    from pypy.objspace.std.unicodeobject import (
        get_encoding_and_errors, decode_object)

    encoding, errors = get_encoding_and_errors(space, w_encoding, w_errors)
    if encoding is None:
        encoding = 'utf8'

    if encoding != 'utf8' and encoding != 'utf-8':
        return decode_object(space, self, encoding, errors)

    # Fast path - do not call into app-level codecs.py.
    from pypy.module._codecs.interp_codecs import CodecState
    handler = space.fromcache(CodecState).decode_error_handler
    raw = space.charbuf_w(self)
    # The third item of the decoder's result is not needed here.
    decoded, length, _unused_pos = str_decode_utf8(raw, errors, True, handler)
    return space.newtext(decoded, length)
def _compute_value(self, space):
    """Build the formatted message string and a running length for it.

    The output interleaves the literal fragments in ``self.xstrings``
    with one formatted value per ``(i, fmt, attr)`` entry: even slots of
    ``lst`` hold ``self.xstrings[i]``, odd slots hold the rendering of
    ``getattr(self, attr)`` according to the format code ``fmt``.

    ``formats`` and ``entries`` are not defined in this method; they
    presumably come from an enclosing scope -- confirm against the
    surrounding code.

    Returns a pair ``(retval, lgt)``: the joined string and the length
    accumulated alongside it.
    """
    # One slot per literal fragment plus one per formatted value:
    # 2 * len(formats) + 1 entries in total.
    lst = [None] * (len(formats) + len(formats) + 1)
    lgt = 0
    for i, fmt, attr in entries:
        # Literal text that precedes the i-th formatted value.
        lst[i + i] = self.xstrings[i]
        lgt += len(self.xstrings[i])
        value = getattr(self, attr)
        if fmt == 'd':
            result = str(value)
            lgt += len(result)
        elif fmt == 'R':
            # repr() taken through the object space; the length is taken
            # from the wrapped object, not from the utf8 string.
            s = space.repr(value)
            result = space.utf8_w(s)
            lgt += space.len_w(s)
        elif fmt == 'S':
            # Same as 'R' but using space.str().
            s = space.str(value)
            result = space.utf8_w(s)
            lgt += space.len_w(s)
        elif fmt == 'T':
            # Name of the value's type; length counted with
            # rutf8.codepoints_in_utf8 rather than len().
            result = space.type(value).name
            lgt += rutf8.codepoints_in_utf8(result)
        elif fmt == 'N':
            result = value.getname(space)
            lgt += len(result)
        elif fmt == '8':
            # u'str\uxxxx' -> 'str\xXX\xXX' -> u"'str\xXX\xXX'"
            from pypy.interpreter import unicodehelper
            # Decoded with 'replace' and the never-raising handler; only
            # the result and its reported length are used, pos is ignored.
            result, _lgt, pos = unicodehelper.str_decode_utf8(
                value, 'replace', True,
                unicodehelper.decode_never_raise, True)
            lgt += _lgt
        elif isinstance(value, unicode):
            # 's'
            # Host-level unicode value: encode to utf-8 for the output,
            # count its length on the unicode object itself.
            result = str(value.encode('utf-8'))
            lgt += len(value)
        else:
            result = str(value)
            try:
                lgt += rutf8.check_utf8(result, True)
            except rutf8.CheckError as e:
                # NOTE(review): on invalid utf8 the error position is
                # *subtracted* from the running length, whereas every
                # other branch adds -- confirm this is intended.
                lgt -= e.pos
        # Formatted value goes in the odd slot following its fragment.
        lst[i + i + 1] = result
    # Trailing literal fragment after the last formatted value.
    lst[-1] = self.xstrings[-1]
    lgt += len(self.xstrings[-1])
    retval = ''.join(lst)
    return retval, lgt
def utf_8_decode(space, string, errors="strict", w_final=None):
    """Decode ``string`` as UTF-8, trying a fast validity check first.

    ``rutf8.check_utf8`` is called first (surrogates allowed).  When it
    succeeds, ``string`` is wrapped unchanged and the whole input length
    is reported as the second item.  When it raises ``CheckError``, the
    input is run through ``unicodehelper.str_decode_utf8`` with the
    requested ``errors`` mode (None means 'strict'), the truth value of
    ``w_final`` and the error handler from the cached ``CodecState``.

    Returns ``space.newtuple2(w_text, w_consumed)``.
    """
    from pypy.interpreter import unicodehelper

    if errors is None:
        errors = 'strict'
    final = space.is_true(w_final)
    state = space.fromcache(CodecState)
    try:
        # Call the fast version for checking.
        length = rutf8.check_utf8(string, allow_surrogates=True)
    except rutf8.CheckError:
        # NOTE(review): the decoder result is unpacked here as
        # (result, consumed, length) -- confirm this order matches the
        # str_decode_utf8 helper actually in use.
        decoded, consumed, length = unicodehelper.str_decode_utf8(
            string, errors, final, state.decode_error_handler)
        w_text = space.newutf8(decoded, length)
        w_consumed = space.newint(consumed)
    else:
        w_text = space.newutf8(string, length)
        w_consumed = space.newint(len(string))
    return space.newtuple2(w_text, w_consumed)
def decode_utf8(u):
    """Return ``str_decode_utf8(u, True, "strict", None)``.

    Convenience wrapper that fixes every argument of ``str_decode_utf8``
    except the input ``u``; the helper's result is passed back unchanged.
    """
    decoded = str_decode_utf8(u, True, "strict", None)
    return decoded