def unknown_fmtchar(self):
    """Raise an app-level ValueError for an unsupported format character.

    The offending character is the one located just before
    ``self.fmtpos`` in ``self.fmt``.  The error message contains the
    character itself, its code in hex and its index: in unicode mode the
    index is the value returned by ``rutf8.codepoints_in_utf8``, otherwise
    it is the plain string index.
    """
    space = self.space
    last = self.fmtpos - 1
    if not do_unicode:
        code = ord(self.fmt[last])
        index = last
        w_char = space.newbytes(chr(code))
    else:
        code = rutf8.codepoint_at_pos(self.fmt, last)
        index = rutf8.codepoints_in_utf8(self.fmt, 0, last)
        utf8 = rutf8.unichr_as_utf8(r_uint(code), allow_surrogates=True)
        w_char = space.newutf8(utf8, 1)
    raise oefmt(space.w_ValueError,
                "unsupported format character %R (%s) at index %d",
                w_char, hex(code), index)
def lookup(self, space, name):
    # Resolve a character name to a wrapped string.  The upper-cased name
    # is handed to self._lookup(); an unknown name is reported as an
    # app-level KeyError.  The result is either a single character or the
    # content of a named sequence.
    try:
        code = self._lookup(name.upper(), with_named_sequence=True)
    except KeyError:
        w_template = space.newtext("undefined character name '%s'")
        w_message = space.mod(w_template, space.newtext(name))
        raise OperationError(space.w_KeyError, w_message)

    # The code may be a named sequence
    named = self._lookup_named_sequence(code)
    if named is None:
        return space.newutf8(unichr_as_utf8(r_uint(code)), 1)
    # named sequences only contain UCS2 codes, no surrogates &co.
    return space.newutf8(named.encode('utf-8'), len(named))
def wrap_value(space, func, add_arg, argdesc, letter):
    """Read a low-level value through ``func`` and wrap it for app-level.

    ``letter`` is matched against the letters of ``ll_typemap_iter``; the
    matching low-level type is passed to ``func(add_arg, argdesc, ll_type)``
    and the result is wrapped according to the letter:

    * pointer letters: read as ``rffi.VOIDP`` and returned as an unsigned int
    * number letters: wrapped int
    * ``'c'``: wrapped bytes
    * ``'u'``: wrapped one-character unicode string
    * ``'f'``, ``'d'``, ``'g'``: wrapped float

    Raises an app-level TypeError when no letter matches, and an app-level
    ValueError when a ``'u'`` value cannot be encoded.
    """
    for c, ll_type in ll_typemap_iter:
        if letter == c:
            if c in TYPEMAP_PTR_LETTERS:
                res = func(add_arg, argdesc, rffi.VOIDP)
                return space.newint(rffi.cast(lltype.Unsigned, res))
            if c in TYPEMAP_NUMBER_LETTERS:
                return space.newint(func(add_arg, argdesc, ll_type))
            elif c == 'c':
                return space.newbytes(func(add_arg, argdesc, ll_type))
            elif c == 'u':
                code = ord(func(add_arg, argdesc, ll_type))
                # The character comes straight from memory: accept lone
                # surrogates and turn an unencodable code into an app-level
                # error instead of letting OutOfRange escape.
                try:
                    return space.newutf8(
                        rutf8.unichr_as_utf8(code, allow_surrogates=True), 1)
                except rutf8.OutOfRange:
                    raise oefmt(space.w_ValueError,
                                "unicode character %d out of range", code)
            elif c == 'f' or c == 'd' or c == 'g':
                return space.newfloat(float(func(add_arg, argdesc, ll_type)))
            else:
                assert 0, "unreachable"
    raise oefmt(space.w_TypeError, "cannot directly read value")
def decode_escape_sequence_unicode(self, i, builder): # at this point we are just after the 'u' of the \u1234 sequence. start = i i += 4 hexdigits = self.getslice(start, i) try: val = int(hexdigits, 16) if sys.maxunicode > 65535 and 0xd800 <= val <= 0xdfff: # surrogate pair if self.ll_chars[i] == '\\' and self.ll_chars[i + 1] == 'u': val = self.decode_surrogate_pair(i, val) i += 6 except ValueError: self._raise("Invalid \uXXXX escape (char %d)", i - 1) return # help the annotator to know that we'll never go beyond # this point # utf8_ch = rutf8.unichr_as_utf8(r_uint(val), allow_surrogates=True) builder.append(utf8_ch) return i
def decode_escape_sequence_unicode(self, i, builder): # at this point we are just after the 'u' of the \u1234 sequence. start = i i += 4 try: val = self._get_int_val_from_hex4(start) if (0xd800 <= val <= 0xdbff and self.ll_chars[i] == '\\' and self.ll_chars[i + 1] == 'u'): lowsurr = self._get_int_val_from_hex4(i + 2) if 0xdc00 <= lowsurr <= 0xdfff: # decode surrogate pair val = 0x10000 + (((val - 0xd800) << 10) | (lowsurr - 0xdc00)) i += 6 except ValueError: raise DecoderError("Invalid \uXXXX escape (char %d)", i - 1) return # help the annotator to know that we'll never go beyond # this point # utf8_ch = rutf8.unichr_as_utf8(r_uint(val), allow_surrogates=True) builder.append(utf8_ch) return i
def wrap_value(space, func, add_arg, argdesc, letter):
    """Read a low-level value through ``func`` and wrap it for app-level.

    The low-level type is the one paired with ``letter`` in
    ``ll_typemap_iter``.  Pointer letters are read as ``rffi.VOIDP`` and
    returned as unsigned ints, number letters as ints, ``'c'`` as bytes,
    ``'u'`` as a one-character unicode string and ``'f'``/``'d'``/``'g'``
    as floats.  An unknown letter raises an app-level TypeError; a ``'u'``
    value that cannot be encoded raises an app-level ValueError.
    """
    for c, ll_type in ll_typemap_iter:
        if letter == c:
            if c in TYPEMAP_PTR_LETTERS:
                address = func(add_arg, argdesc, rffi.VOIDP)
                return space.newint(rffi.cast(lltype.Unsigned, address))
            if c in TYPEMAP_NUMBER_LETTERS:
                return space.newint(func(add_arg, argdesc, ll_type))
            if c == 'c':
                return space.newbytes(func(add_arg, argdesc, ll_type))
            if c == 'u':
                code = ord(func(add_arg, argdesc, ll_type))
                try:
                    utf8 = rutf8.unichr_as_utf8(r_uint(code),
                                                allow_surrogates=True)
                    return space.newutf8(utf8, 1)
                except rutf8.OutOfRange:
                    raise oefmt(space.w_ValueError,
                                "unicode character %d out of range", code)
            if c == 'f' or c == 'd' or c == 'g':
                return space.newfloat(float(func(add_arg, argdesc, ll_type)))
            assert 0, "unreachable"
    raise oefmt(space.w_TypeError, "cannot directly read value")
def fmt_c(self, w_value):
    """Format ``w_value`` for the ``%c`` conversion.

    Accepts a bytes object of length one, a unicode object of length one
    (only in unicode mode, otherwise NeedUnicodeFormattingError is raised)
    or anything ``space.int_w`` accepts, which is taken as a character
    code.  The resulting character is passed to ``self.std_wp``.
    """
    self.prec = -1     # just because
    space = self.space

    if space.isinstance_w(w_value, space.w_bytes):
        if do_unicode:
            w_value = w_value.descr_decode(space, space.newtext('ascii'))
        raw = space.bytes_w(w_value)
        if len(raw) != 1:
            raise oefmt(space.w_TypeError, "%c requires int or char")
        self.std_wp(raw, True)
        return

    if space.isinstance_w(w_value, space.w_unicode):
        if not do_unicode:
            raise NeedUnicodeFormattingError
        utf8 = space.utf8_w(w_value)
        if space.len_w(w_value) != 1:
            raise oefmt(space.w_TypeError, "%c requires int or unichar")
        self.std_wp(utf8, False)
        return

    # Neither bytes nor unicode: treat the value as a character code.
    code = space.int_w(w_value)
    if not do_unicode:
        try:
            char = chr(code)
        except ValueError:
            raise oefmt(space.w_OverflowError,
                        "character code not in range(256)")
        self.std_wp(char, True)
        return

    try:
        encoded = rutf8.unichr_as_utf8(r_uint(code), allow_surrogates=True)
    except rutf8.OutOfRange:
        raise oefmt(space.w_OverflowError,
                    "unicode character code out of range")
    self.std_wp(encoded, False)
def chr(space, code):
    "Return a Unicode string of one character with the given ordinal."
    # Only ordinals in 0..0x10FFFF are accepted; surrogates are allowed.
    if not (0 <= code <= 0x10FFFF):
        raise oefmt(space.w_ValueError, "chr() arg out of range")
    utf8 = rutf8.unichr_as_utf8(code, allow_surrogates=True)
    return space.newutf8(utf8, 1)
def _format_int_or_long(self, w_num, kind):
    """Format an integer according to the parsed format specification.

    ``w_num`` is the wrapped number.  ``kind`` selects how it is unwrapped
    for the numeric presentation types: ``INT_KIND`` uses ``space.int_w``
    and ``_int_to_base``, anything else uses ``space.bigint_w`` and
    ``_long_to_base``.

    Supported presentation types (``self._type``) are ``c`` (character),
    ``b``, ``o``, ``x``/``X`` and ``n``/``d``.

    Raises an app-level ValueError when a precision was given, or when a
    sign was given together with ``c``; raises an app-level OverflowError
    when the ``c`` argument is outside the character range.
    """
    space = self.space
    if self._precision != -1:
        raise oefmt(space.w_ValueError,
                    "precision not allowed in integer type")
    sign_char = "\0"
    tp = self._type
    if tp == "c":
        if self._sign != "\0":
            raise oefmt(space.w_ValueError,
                        "sign not allowed with 'c' presentation type")
        value = space.int_w(w_num)
        max_char = 0x10FFFF if self.is_unicode else 0xFF
        if not (0 <= value <= max_char):
            raise oefmt(space.w_OverflowError,
                        "%%c arg not in range(%s)", hex(max_char))
        if self.is_unicode:
            # The range check above lets surrogate code points through, so
            # they must be encodable here; otherwise the encoder's
            # OutOfRange would escape as an interpreter-level error.
            result = rutf8.unichr_as_utf8(value, allow_surrogates=True)
        else:
            result = chr(value)
        n_digits = 1
        n_remainder = 1
        to_remainder = 0
        n_prefix = 0
        to_prefix = 0
        to_numeric = 0
    else:
        # skip_leading is the length of the base prefix ("0b", "0o", "0x")
        # at the start of the converted string; base 10 has none.
        if tp == "b":
            base = 2
            skip_leading = 2
        elif tp == "o":
            base = 8
            skip_leading = 2
        elif tp == "x" or tp == "X":
            base = 16
            skip_leading = 2
        elif tp == "n" or tp == "d":
            base = 10
            skip_leading = 0
        else:
            raise AssertionError("shouldn't reach")
        if kind == INT_KIND:
            result = self._int_to_base(base, space.int_w(w_num))
        else:
            result = self._long_to_base(base, space.bigint_w(w_num))
        # The prefix is only emitted for the alternate form.
        n_prefix = skip_leading if self._alternate else 0
        to_prefix = 0
        if result[0] == "-":
            # A leading minus sign is handled separately from the digits.
            sign_char = "-"
            skip_leading += 1
            to_prefix += 1
        n_digits = len(result) - skip_leading
        n_remainder = 0
        to_remainder = 0
        to_numeric = skip_leading
    self._get_locale(tp)
    spec = self._calc_num_width(n_prefix, sign_char, to_numeric, n_digits,
                                n_remainder, False, result)
    fill = self._fill_char
    upper = self._type == "X"
    return self.wrap(
        self._fill_number(spec, result, to_numeric, to_prefix, fill,
                          to_remainder, upper))
def _lit(self, s):
    """Return the one-character literal ``s`` in the formatter's string kind.

    In unicode mode the character is re-encoded through
    ``rutf8.unichr_as_utf8``; otherwise ``s`` is returned unchanged.
    """
    assert len(s) == 1
    if not self.is_unicode:
        return s
    return rutf8.unichr_as_utf8(ord(s[0]))
def _get_delimiter(space, dialect):
    """Return ``dialect.delimiter`` wrapped as a one-character string."""
    return space.newutf8(rutf8.unichr_as_utf8(dialect.delimiter), 1)
def _get_quotechar(space, dialect):
    """Return ``dialect.quotechar`` wrapped as a one-character string.

    A quotechar of 0 is reported as ``space.w_None``.
    """
    code = dialect.quotechar
    if code == 0:
        return space.w_None
    return space.newutf8(rutf8.unichr_as_utf8(code), 1)
def append_utf8(self, value):
    """Wrap the character with code ``value`` and append it to ``result_w``."""
    utf8 = rutf8.unichr_as_utf8(r_uint(value))
    w_char = self.space.newutf8(utf8, 1)
    self.result_w.append(w_char)
self.std_wp(value) def fmt_c(self, w_value): self.prec = -1 # just because space = self.space try: w_value = space.index(w_value) except OperationError as e: if e. async (space): raise # otherwise, eats all exceptions, like CPython else: n = space.int_w(w_value) if do_unicode: try: c = rutf8.unichr_as_utf8(r_uint(n), allow_surrogates=True) except rutf8.OutOfRange: raise oefmt(space.w_OverflowError, "unicode character code out of range") self.std_wp(c, False) else: try: s = chr(n) except ValueError: raise oefmt(space.w_OverflowError, "character code not in range(256)") self.std_wp(s, True) return if not do_unicode: if space.isinstance_w(w_value, space.w_bytes): s = space.bytes_w(w_value)
def surrogatepass_errors(space, w_exc):
    # Error callback that lets lone surrogates pass through a codec.
    #
    # For a UnicodeEncodeError, every code point in [start, end) of the
    # failing object must be a surrogate; each one is written out directly
    # in the target encoding and the callback returns (bytes, end).
    # For a UnicodeDecodeError, a single surrogate is decoded at `start`
    # and the callback returns (character, start + bytelength).
    # Whenever the input is not a surrogate, or the encoding is not one of
    # the encodings known to get_standard_encoding(), the original
    # exception is re-raised.  Any other exception type is a TypeError.
    check_exception(space, w_exc)

    if space.isinstance_w(w_exc, space.w_UnicodeEncodeError):
        w_text = space.getattr(w_exc, space.newtext('object'))
        w_text = space.convert_arg_to_w_unicode(w_text)
        start = space.int_w(space.getattr(w_exc, space.newtext('start')))
        w_end = space.getattr(w_exc, space.newtext('end'))
        encoding = space.text_w(
            space.getattr(w_exc, space.newtext('encoding')))
        bytelength, code = get_standard_encoding(encoding)
        if code == ENC_UNKNOWN:
            # Not supported, fail with original exception
            raise OperationError(space.type(w_exc), w_exc)
        end = space.int_w(w_end)
        out = StringBuilder()
        # Convert the character indices into byte offsets in the utf8 data.
        byte_pos = w_text._index_to_byte(start)
        byte_end = w_text._index_to_byte(end)
        utf8 = w_text._utf8
        while byte_pos < byte_end:
            ch = rutf8.codepoint_at_pos(utf8, byte_pos)
            byte_pos = rutf8.next_codepoint_pos(utf8, byte_pos)
            if not (0xd800 <= ch <= 0xdfff):
                # Not a surrogate, fail with original exception
                raise OperationError(space.type(w_exc), w_exc)
            low = ch & 0xff
            high = ch >> 8
            if code == ENC_UTF8:
                out.append(chr(0xe0 | (ch >> 12)))
                out.append(chr(0x80 | ((ch >> 6) & 0x3f)))
                out.append(chr(0x80 | (ch & 0x3f)))
            elif code == ENC_UTF16LE:
                out.append(chr(low))
                out.append(chr(high))
            elif code == ENC_UTF16BE:
                out.append(chr(high))
                out.append(chr(low))
            elif code == ENC_UTF32LE:
                out.append(chr(low))
                out.append(chr(high))
                out.append(chr(0))
                out.append(chr(0))
            elif code == ENC_UTF32BE:
                out.append(chr(0))
                out.append(chr(0))
                out.append(chr(high))
                out.append(chr(low))
        return space.newtuple([space.newbytes(out.build()), w_end])

    if space.isinstance_w(w_exc, space.w_UnicodeDecodeError):
        start = space.int_w(space.getattr(w_exc, space.newtext('start')))
        data = space.bytes_w(space.getattr(w_exc, space.newtext('object')))
        encoding = space.text_w(
            space.getattr(w_exc, space.newtext('encoding')))
        bytelength, code = get_standard_encoding(encoding)
        ch = 0
        # Try decoding a single surrogate character. If there are more,
        # let the codec call us again.  Bytes past the end of the input
        # are represented by -1.
        available = len(data) - start
        b0 = ord(data[start]) if available > 0 else -1
        b1 = ord(data[start + 1]) if available > 1 else -1
        b2 = ord(data[start + 2]) if available > 2 else -1
        b3 = ord(data[start + 3]) if available > 3 else -1
        if code == ENC_UTF8:
            if (b1 != -1 and b2 != -1 and
                    (b0 & 0xf0) == 0xe0 and
                    (b1 & 0xc0) == 0x80 and
                    (b2 & 0xc0) == 0x80):
                # it's a three-byte code
                ch = ((b0 & 0x0f) << 12) + ((b1 & 0x3f) << 6) + (b2 & 0x3f)
        elif code == ENC_UTF16LE:
            ch = (b1 << 8) | b0
        elif code == ENC_UTF16BE:
            ch = (b0 << 8) | b1
        elif code == ENC_UTF32LE:
            ch = (b3 << 24) | (b2 << 16) | (b1 << 8) | b0
        elif code == ENC_UTF32BE:
            ch = (b0 << 24) | (b1 << 16) | (b2 << 8) | b3
        if not (0xd800 <= ch <= 0xdfff):
            # it's not a surrogate (or nothing was decoded) - fail
            raise OperationError(space.type(w_exc), w_exc)
        ch_utf8 = rutf8.unichr_as_utf8(ch, allow_surrogates=True)
        return space.newtuple(
            [space.newtext(ch_utf8, 1), space.newint(start + bytelength)])

    raise oefmt(space.w_TypeError,
                "don't know how to handle %T in error callback", w_exc)