def test_unichr():
    """Exercise runicode.UNICHR at 0xffff, at 0x10000 and with a non-integer."""
    bmp_limit = 0xffff
    first_astral = 0x10000

    assert runicode.UNICHR(bmp_limit) == u'\uffff'

    if runicode.MAXUNICODE <= bmp_limit:
        # 0x10000 lies above runicode.MAXUNICODE: it must be rejected
        py.test.raises(ValueError, runicode.UNICHR, first_astral)
    elif sys.maxunicode < first_astral:
        # the hosting Python is narrow: expect the two UTF-16 surrogates
        assert runicode.UNICHR(first_astral) == u'\ud800\udc00'
    else:
        assert runicode.UNICHR(first_astral) == u'\U00010000'

    py.test.raises(TypeError, runicode.UNICHR, 'abc')
def test_encode_surrogate_pair(self):
    """Check how both unicode-escape encoders render a D800/DC00 pair."""
    pair = runicode.UNICHR(0xD800) + runicode.UNICHR(0xDC00)

    if runicode.MAXUNICODE >= 65536:
        # Wide unicode build, don't merge utf16 surrogate pairs
        expected = r'\ud800\udc00'
    else:
        # Narrow unicode build, consider utf16 surrogate pairs
        expected = r'\U00010000'

    encoders = (
        runicode.unicode_encode_unicode_escape,
        runicode.unicode_encode_raw_unicode_escape,
    )
    for encode in encoders:
        assert encode(pair, len(pair), True) == expected
def test_encode_surrogate_pair_utf8(self):
    """Check UTF-8 encoding of a D800/DC00 pair with and without
    ``allow_surrogates``."""
    pair = runicode.UNICHR(0xD800) + runicode.UNICHR(0xDC00)
    merged = '\xf0\x90\x80\x80'

    # With surrogates allowed, both build flavours produce the merged form.
    assert runicode.unicode_encode_utf_8(
        pair, len(pair), True, allow_surrogates=True) == merged

    if runicode.MAXUNICODE < 65536:
        # Narrow unicode build, consider utf16 surrogate pairs
        assert runicode.unicode_encode_utf_8(
            pair, len(pair), True, allow_surrogates=False) == merged
    else:
        # Wide unicode build: surrogates not merged, encoding fails.
        py.test.raises(
            UnicodeEncodeError, runicode.unicode_encode_utf_8,
            pair, len(pair), True, allow_surrogates=False)
def _format_int_or_long(self, w_num, kind):
    """Format the wrapped integer ``w_num`` according to this format spec.

    ``kind`` selects the conversion helper: ``INT_KIND`` uses
    ``_int_to_base`` on ``space.int_w(w_num)``, anything else uses
    ``_long_to_base`` on ``space.bigint_w(w_num)``.

    Returns the wrapped result of ``_fill_number``.

    Raises (as OperationError):
      * ValueError if a precision was given, or if a sign was given
        together with the 'c' presentation type;
      * OverflowError if the 'c' presentation type is used with a value
        outside ``0..0xFF`` (byte strings) or ``0..runicode.MAXUNICODE``
        (unicode).
    """
    space = self.space
    if self._precision != -1:
        msg = "precision not allowed in integer type"
        raise OperationError(space.w_ValueError, space.wrap(msg))
    sign_char = "\0"
    tp = self._type
    if tp == "c":
        if self._sign != "\0":
            msg = "sign not allowed with 'c' presentation type"
            raise OperationError(space.w_ValueError, space.wrap(msg))
        value = space.int_w(w_num)
        # Validate the ordinal before chr()/UNICHR(): otherwise an
        # out-of-range value escapes as an interpreter-level error instead
        # of an application-level OverflowError.
        if self.is_unicode:
            max_char = runicode.MAXUNICODE
        else:
            max_char = 0xFF
        if not (0 <= value <= max_char):
            msg = "%c arg not in range(" + hex(max_char) + ")"
            raise OperationError(space.w_OverflowError, space.wrap(msg))
        if self.is_unicode:
            result = runicode.UNICHR(value)
        else:
            result = chr(value)
        n_digits = 1
        n_remainder = 1
        to_remainder = 0
        n_prefix = 0
        to_prefix = 0
        to_numeric = 0
    else:
        # skip_leading is presumably the length of the base prefix that the
        # *_to_base helpers emit ("0b", "0o", "0x") -- confirm against them.
        if tp == "b":
            base = 2
            skip_leading = 2
        elif tp == "o":
            base = 8
            skip_leading = 2
        elif tp == "x" or tp == "X":
            base = 16
            skip_leading = 2
        elif tp == "n" or tp == "d":
            base = 10
            skip_leading = 0
        else:
            raise AssertionError("shouldn't reach")
        if kind == INT_KIND:
            result = self._int_to_base(base, space.int_w(w_num))
        else:
            result = self._long_to_base(base, space.bigint_w(w_num))
        n_prefix = skip_leading if self._alternate else 0
        to_prefix = 0
        if result[0] == "-":
            # a leading minus shifts both the prefix and the digits by one
            sign_char = "-"
            skip_leading += 1
            to_prefix += 1
        n_digits = len(result) - skip_leading
        n_remainder = 0
        to_remainder = 0
        to_numeric = skip_leading
    self._get_locale(tp)
    spec = self._calc_num_width(n_prefix, sign_char, to_numeric, n_digits,
                                n_remainder, False, result)
    fill = self._fill_char
    upper = self._type == "X"
    return space.wrap(
        self._fill_number(spec, result, to_numeric, to_prefix, fill,
                          to_remainder, upper))
def str_decode_utf8(s):
    """Decode the UTF-8 encoded byte string ``s`` into a unicode string.

    Returns ``u''`` for empty input.  Raises UnicodeDecodeError when a
    sequence is truncated, has an invalid lead byte, or has an invalid
    continuation byte.  A 4-byte sequence whose code point is above
    ``runicode.MAXUNICODE`` is emitted as a high/low surrogate pair.
    """
    from rpython.rlib.rstring import UnicodeBuilder
    from rpython.rlib import runicode

    size = len(s)
    if size == 0:
        return u''

    result = UnicodeBuilder(size)
    pos = 0
    while pos < size:
        lead = ord(s[pos])

        # fast path for ASCII
        if lead < 0x80:
            result.append(unichr(lead))
            pos += 1
            continue

        # sequence length looked up from the lead byte
        n = ord(runicode._utf8_code_length[lead - 0x80])
        if n == 0 or pos + n > size:
            # invalid lead byte, or the sequence runs past the end of s
            raise UnicodeDecodeError('utf8', s, pos, pos + 1, 'whatever')

        if n == 1:
            assert 0, "ascii should have gone through the fast path"

        elif n == 2:
            cont1 = ord(s[pos + 1])
            if runicode._invalid_byte_2_of_2(cont1):
                raise UnicodeDecodeError('utf8', s, pos, pos + 1, 'whatever')
            # 110yyyyy 10zzzzzz -> 00000000 00000yyy yyzzzzzz
            code = ((lead & 0x1F) << 6) | (cont1 & 0x3F)
            result.append(unichr(code))
            pos += 2

        elif n == 3:
            cont1 = ord(s[pos + 1])
            cont2 = ord(s[pos + 2])
            if (runicode._invalid_byte_2_of_3(lead, cont1, True) or
                    runicode._invalid_byte_3_of_3(cont2)):
                raise UnicodeDecodeError('utf8', s, pos, pos + 1, 'whatever')
            # 1110xxxx 10yyyyyy 10zzzzzz -> 00000000 xxxxyyyy yyzzzzzz
            code = (((lead & 0x0F) << 12) |
                    ((cont1 & 0x3F) << 6) |
                    (cont2 & 0x3F))
            result.append(unichr(code))
            pos += 3

        elif n == 4:
            cont1 = ord(s[pos + 1])
            cont2 = ord(s[pos + 2])
            cont3 = ord(s[pos + 3])
            if (runicode._invalid_byte_2_of_4(lead, cont1) or
                    runicode._invalid_byte_3_of_4(cont2) or
                    runicode._invalid_byte_4_of_4(cont3)):
                raise UnicodeDecodeError('utf8', s, pos, pos + 1, 'whatever')
            # 11110www 10xxxxxx 10yyyyyy 10zzzzzz -> 000wwwxx xxxxyyyy yyzzzzzz
            code = (((lead & 0x07) << 18) |
                    ((cont1 & 0x3F) << 12) |
                    ((cont2 & 0x3F) << 6) |
                    (cont3 & 0x3F))
            if code > runicode.MAXUNICODE:
                # translate from 10000..10FFFF to 0..FFFF, then split into
                # the two surrogates
                code -= 0x10000
                # high surrogate = top 10 bits added to D800
                result.append(unichr(0xD800 + (code >> 10)))
                # low surrogate = bottom 10 bits added to DC00
                result.append(unichr(0xDC00 + (code & 0x03FF)))
            else:
                result.append(runicode.UNICHR(code))
            pos += 4

    return result.build()
def PyUnicode_GetMax(space):
    """Get the maximum ordinal for a Unicode character."""
    highest_ordinal = runicode.MAXUNICODE
    return runicode.UNICHR(highest_ordinal)
def _format_int_or_long(self, w_num, kind):
    """Format the wrapped integer ``w_num`` according to this format spec.

    ``kind`` selects the conversion helper: ``INT_KIND`` uses
    ``_int_to_base`` on ``space.int_w(w_num)``, anything else uses
    ``_long_to_base`` on ``space.bigint_w(w_num)``.

    Returns ``self.wrap(...)`` applied to the output of ``_fill_number``.

    Raises (via oefmt):
      * ValueError if a precision was given, or if a sign was given
        together with the 'c' presentation type;
      * OverflowError if the 'c' presentation type is used with a value
        outside ``0..0xFF`` (byte strings) or ``0..runicode.MAXUNICODE``
        (unicode).
    """
    space = self.space
    if self._precision != -1:
        # message text repaired: it previously contained mis-decoded UTF-8
        raise oefmt(space.w_ValueError,
                    "precisión no permitido en tipo de entero")
    sign_char = "\0"
    tp = self._type
    if tp == "c":
        if self._sign != "\0":
            # message text repaired: it previously contained mis-decoded UTF-8
            raise oefmt(
                space.w_ValueError,
                "signo no permitido con tipo de presentación 'c'")
        value = space.int_w(w_num)
        max_char = runicode.MAXUNICODE if self.is_unicode else 0xFF
        if not (0 <= value <= max_char):
            raise oefmt(space.w_OverflowError,
                        "%%c arg no en rango(%s)", hex(max_char))
        if self.is_unicode:
            result = runicode.UNICHR(value)
        else:
            result = chr(value)
        n_digits = 1
        n_remainder = 1
        to_remainder = 0
        n_prefix = 0
        to_prefix = 0
        to_numeric = 0
    else:
        # skip_leading is presumably the length of the base prefix that the
        # *_to_base helpers emit ("0b", "0o", "0x") -- confirm against them.
        if tp == "b":
            base = 2
            skip_leading = 2
        elif tp == "o":
            base = 8
            skip_leading = 2
        elif tp == "x" or tp == "X":
            base = 16
            skip_leading = 2
        elif tp == "n" or tp == "d":
            base = 10
            skip_leading = 0
        else:
            raise AssertionError("shouldn't reach")
        if kind == INT_KIND:
            result = self._int_to_base(base, space.int_w(w_num))
        else:
            result = self._long_to_base(base, space.bigint_w(w_num))
        n_prefix = skip_leading if self._alternate else 0
        to_prefix = 0
        if result[0] == "-":
            # a leading minus shifts both the prefix and the digits by one
            sign_char = "-"
            skip_leading += 1
            to_prefix += 1
        n_digits = len(result) - skip_leading
        n_remainder = 0
        to_remainder = 0
        to_numeric = skip_leading
    self._get_locale(tp)
    spec = self._calc_num_width(n_prefix, sign_char, to_numeric, n_digits,
                                n_remainder, False, result)
    fill = self._fill_char
    upper = self._type == "X"
    return self.wrap(
        self._fill_number(spec, result, to_numeric, to_prefix, fill,
                          to_remainder, upper))
def PyUnicode_GetMax(space):
    """Get the maximum ordinal for a Unicode character."""
    from rpython.rlib import runicode, rutf8

    highest_ordinal = rutf8.MAXUNICODE
    return runicode.UNICHR(highest_ordinal)
def f(x):
    """Round-trip ``x`` through runicode.UNICHR and runicode.ORD."""
    return runicode.ORD(runicode.UNICHR(x))