Exemple #1
0
def test_unichr():
    """Check runicode.UNICHR on BMP, non-BMP and non-integer arguments."""
    # The last BMP code point is always a single character.
    assert runicode.UNICHR(0xffff) == u'\uffff'

    if runicode.MAXUNICODE <= 0xffff:
        # The emulated build has no room for code points past the BMP.
        py.test.raises(ValueError, runicode.UNICHR, 0x10000)
    elif sys.maxunicode < 0x10000:
        # Narrow host interpreter: the result is spelled as a surrogate pair.
        assert runicode.UNICHR(0x10000) == u'\ud800\udc00'
    else:
        assert runicode.UNICHR(0x10000) == u'\U00010000'

    py.test.raises(TypeError, runicode.UNICHR, 'abc')
Exemple #2
0
 def test_encode_surrogate_pair(self):
     """Escape-encode a high+low surrogate pair on narrow and wide builds."""
     pair = runicode.UNICHR(0xD800) + runicode.UNICHR(0xDC00)
     if runicode.MAXUNICODE < 65536:
         # Narrow unicode build: the pair is treated as one utf16-encoded
         # code point and escaped as such.
         expected = r'\U00010000'
     else:
         # Wide unicode build: the two surrogates stay separate.
         expected = r'\ud800\udc00'

     escaped = runicode.unicode_encode_unicode_escape(pair, len(pair), True)
     assert escaped == expected
     raw_escaped = runicode.unicode_encode_raw_unicode_escape(
         pair, len(pair), True)
     assert raw_escaped == expected
Exemple #3
0
 def test_encode_surrogate_pair_utf8(self):
     """UTF-8 encode a surrogate pair with and without allow_surrogates."""
     pair = runicode.UNICHR(0xD800) + runicode.UNICHR(0xDC00)
     merged = '\xf0\x90\x80\x80'

     # With surrogates allowed, both narrow and wide builds merge the pair.
     encoded = runicode.unicode_encode_utf_8(
         pair, len(pair), True, allow_surrogates=True)
     assert encoded == merged

     if runicode.MAXUNICODE < 65536:
         # Narrow unicode build: the pair is still merged when surrogates
         # are disallowed.
         encoded = runicode.unicode_encode_utf_8(
             pair, len(pair), True, allow_surrogates=False)
         assert encoded == merged
     else:
         # Wide unicode build: surrogates are not merged, encoding fails.
         py.test.raises(
             UnicodeEncodeError, runicode.unicode_encode_utf_8,
             pair, len(pair), True, allow_surrogates=False)
Exemple #4
0
 def _format_int_or_long(self, w_num, kind):
     """Format the wrapped integer ``w_num`` according to this format spec.

     ``kind`` equal to INT_KIND converts through ``space.int_w``; any other
     value converts through ``space.bigint_w``.  The 'c' presentation type
     produces a single character instead of digits.

     Returns the wrapped, padded result.

     Raises an app-level ValueError when a precision was given, or when a
     sign was given together with 'c'.  Raises an app-level OverflowError
     when the 'c' value does not fit the target string type.
     """
     space = self.space
     if self._precision != -1:
         msg = "precision not allowed in integer type"
         raise OperationError(space.w_ValueError, space.wrap(msg))
     sign_char = "\0"
     tp = self._type
     if tp == "c":
         if self._sign != "\0":
             msg = "sign not allowed with 'c' presentation type"
             raise OperationError(space.w_ValueError, space.wrap(msg))
         value = space.int_w(w_num)
         # Validate the range here so that an out-of-range value becomes an
         # app-level OverflowError instead of an interpreter-level failure
         # inside chr() / UNICHR().
         max_char = runicode.MAXUNICODE if self.is_unicode else 0xFF
         if not (0 <= value <= max_char):
             msg = "%c arg not in range(" + hex(max_char) + ")"
             raise OperationError(space.w_OverflowError, space.wrap(msg))
         if self.is_unicode:
             result = runicode.UNICHR(value)
         else:
             result = chr(value)
         # A single character: treated as one "digit" plus one remainder
         # character, with no prefix and no sign.
         n_digits = 1
         n_remainder = 1
         to_remainder = 0
         n_prefix = 0
         to_prefix = 0
         to_numeric = 0
     else:
         # skip_leading counts the characters at the start of the converted
         # text that are not digits (2 for the non-decimal bases).
         if tp == "b":
             base = 2
             skip_leading = 2
         elif tp == "o":
             base = 8
             skip_leading = 2
         elif tp == "x" or tp == "X":
             base = 16
             skip_leading = 2
         elif tp == "n" or tp == "d":
             base = 10
             skip_leading = 0
         else:
             raise AssertionError("shouldn't reach")
         if kind == INT_KIND:
             result = self._int_to_base(base, space.int_w(w_num))
         else:
             result = self._long_to_base(base, space.bigint_w(w_num))
         # The prefix width must be taken before the sign is added to
         # skip_leading below.
         n_prefix = skip_leading if self._alternate else 0
         to_prefix = 0
         if result[0] == "-":
             sign_char = "-"
             skip_leading += 1
             to_prefix += 1
         n_digits = len(result) - skip_leading
         n_remainder = 0
         to_remainder = 0
         to_numeric = skip_leading
     self._get_locale(tp)
     spec = self._calc_num_width(n_prefix, sign_char, to_numeric,
                                 n_digits, n_remainder, False, result)
     fill = self._fill_char
     upper = self._type == "X"
     return self.space.wrap(
         self._fill_number(spec, result, to_numeric, to_prefix, fill,
                           to_remainder, upper))
Exemple #5
0
def str_decode_utf8(s):
    """Decode the UTF-8 encoded string ``s`` and return the built unicode.

    ``s`` is indexed character by character and each item is passed to
    ``ord()``.  An empty input returns ``u''`` directly.

    Raises UnicodeDecodeError (codec name 'utf8', positions ``pos`` to
    ``pos + 1``, reason 'whatever') when the sequence is truncated, when the
    lead byte maps to a length of 0, or when one of the runicode
    ``_invalid_byte_*`` helpers rejects a following byte.

    A 4-byte sequence whose code point is above ``runicode.MAXUNICODE`` is
    appended as a high/low surrogate pair.
    """
    from rpython.rlib.rstring import UnicodeBuilder
    from rpython.rlib import runicode

    size = len(s)
    if size == 0:
        return u''

    result = UnicodeBuilder(size)
    pos = 0
    while pos < size:
        ordch1 = ord(s[pos])
        # fast path for ASCII
        # XXX maybe use a while loop here
        if ordch1 < 0x80:
            result.append(unichr(ordch1))
            pos += 1
            continue

        # Look up the sequence length for this lead byte; the table is
        # indexed from 0x80 because ASCII was handled above.
        n = ord(runicode._utf8_code_length[ordch1 - 0x80])
        # Truncated input: the sequence would run past the end of s.
        if pos + n > size:
            raise UnicodeDecodeError('utf8', s, pos, pos + 1, 'whatever')
        # A length of 0 is rejected as an invalid lead byte.
        if n == 0:
            raise UnicodeDecodeError('utf8', s, pos, pos + 1, 'whatever')
        elif n == 1:
            assert 0, "ascii should have gone through the fast path"

        elif n == 2:
            ordch2 = ord(s[pos + 1])
            if runicode._invalid_byte_2_of_2(ordch2):

                raise UnicodeDecodeError('utf8', s, pos, pos + 1, 'whatever')
            # 110yyyyy 10zzzzzz -> 00000000 00000yyy yyzzzzzz
            result.append(
                unichr(((ordch1 & 0x1F) << 6) +  # 0b00011111
                       (ordch2 & 0x3F)))  # 0b00111111
            pos += 2

        elif n == 3:
            ordch2 = ord(s[pos + 1])
            ordch3 = ord(s[pos + 2])
            # NOTE(review): the third argument (True) presumably allows
            # surrogate code points here -- confirm against runicode.
            if (runicode._invalid_byte_2_of_3(ordch1, ordch2, True)
                    or runicode._invalid_byte_3_of_3(ordch3)):
                raise UnicodeDecodeError('utf8', s, pos, pos + 1, 'whatever')
            # 1110xxxx 10yyyyyy 10zzzzzz -> 00000000 xxxxyyyy yyzzzzzz
            result.append(
                unichr(((ordch1 & 0x0F) << 12) +  # 0b00001111
                       ((ordch2 & 0x3F) << 6) +  # 0b00111111
                       (ordch3 & 0x3F)))  # 0b00111111
            pos += 3

        elif n == 4:
            ordch2 = ord(s[pos + 1])
            ordch3 = ord(s[pos + 2])
            ordch4 = ord(s[pos + 3])
            if (runicode._invalid_byte_2_of_4(ordch1, ordch2)
                    or runicode._invalid_byte_3_of_4(ordch3)
                    or runicode._invalid_byte_4_of_4(ordch4)):

                raise UnicodeDecodeError('utf8', s, pos, pos + 1, 'whatever')
            # 11110www 10xxxxxx 10yyyyyy 10zzzzzz -> 000wwwxx xxxxyyyy yyzzzzzz
            c = (((ordch1 & 0x07) << 18) +  # 0b00000111
                 ((ordch2 & 0x3F) << 12) +  # 0b00111111
                 ((ordch3 & 0x3F) << 6) +  # 0b00111111
                 (ordch4 & 0x3F))  # 0b00111111
            # Emit one character when the build can represent the code
            # point, otherwise fall back to a surrogate pair.
            if c <= runicode.MAXUNICODE:
                result.append(runicode.UNICHR(c))
            else:
                # compute and append the two surrogates:
                # translate from 10000..10FFFF to 0..FFFF
                c -= 0x10000
                # high surrogate = top 10 bits added to D800
                result.append(unichr(0xD800 + (c >> 10)))
                # low surrogate = bottom 10 bits added to DC00
                result.append(unichr(0xDC00 + (c & 0x03FF)))
            pos += 4

    return result.build()
Exemple #6
0
def PyUnicode_GetMax(space):
    """Return the character whose ordinal is runicode.MAXUNICODE."""
    highest_ordinal = runicode.MAXUNICODE
    return runicode.UNICHR(highest_ordinal)
Exemple #7
0
 def _format_int_or_long(self, w_num, kind):
     """Render the wrapped integer ``w_num`` following the parsed spec.

     ``kind`` equal to INT_KIND converts through ``space.int_w``; any other
     value converts through ``space.bigint_w``.  The 'c' presentation type
     yields a single character instead of digits.

     Raises an app-level ValueError for a precision, or for a sign combined
     with 'c'; raises an app-level OverflowError when the 'c' value is
     outside the range of the target string type.
     """
     space = self.space
     if self._precision != -1:
         raise oefmt(space.w_ValueError,
                     "precisión no permitido en tipo de entero")
     presentation = self._type
     sign_char = "\0"
     if presentation != "c":
         # prefix_len: number of leading non-digit characters in the
         # converted text (2 for the non-decimal bases, 0 for decimal).
         if presentation == "b":
             radix = 2
             prefix_len = 2
         elif presentation == "o":
             radix = 8
             prefix_len = 2
         elif presentation == "x" or presentation == "X":
             radix = 16
             prefix_len = 2
         elif presentation == "n" or presentation == "d":
             radix = 10
             prefix_len = 0
         else:
             raise AssertionError("shouldn't reach")

         if kind == INT_KIND:
             result = self._int_to_base(radix, space.int_w(w_num))
         else:
             result = self._long_to_base(radix, space.bigint_w(w_num))

         # The prefix is only counted in alternate form.
         n_prefix = prefix_len if self._alternate else 0
         if result[0] == "-":
             # A leading minus sign shifts both offsets by one.
             sign_char = "-"
             to_prefix = 1
             to_numeric = prefix_len + 1
         else:
             to_prefix = 0
             to_numeric = prefix_len
         n_digits = len(result) - to_numeric
         n_remainder = 0
         to_remainder = 0
     else:
         if self._sign != "\0":
             raise oefmt(
                 space.w_ValueError,
                 "signo no permitido con tipo de presentación 'c'")
         value = space.int_w(w_num)
         if self.is_unicode:
             max_char = runicode.MAXUNICODE
         else:
             max_char = 0xFF
         if value < 0 or value > max_char:
             raise oefmt(space.w_OverflowError,
                         "%%c arg no en rango(%s)", hex(max_char))
         if self.is_unicode:
             result = runicode.UNICHR(value)
         else:
             result = chr(value)
         # A single character: one "digit" plus one remainder character,
         # with no prefix and no sign.
         n_prefix = 0
         to_prefix = 0
         to_numeric = 0
         n_digits = 1
         n_remainder = 1
         to_remainder = 0

     self._get_locale(presentation)
     spec = self._calc_num_width(n_prefix, sign_char, to_numeric,
                                 n_digits, n_remainder, False, result)
     filled = self._fill_number(spec, result, to_numeric, to_prefix,
                                self._fill_char, to_remainder,
                                self._type == "X")
     return self.wrap(filled)
Exemple #8
0
def PyUnicode_GetMax(space):
    """Return the character whose ordinal is rutf8.MAXUNICODE."""
    from rpython.rlib import runicode, rutf8
    highest_ordinal = rutf8.MAXUNICODE
    return runicode.UNICHR(highest_ordinal)
Exemple #9
0
 def f(x):
     # Round-trip: ordinal -> character via UNICHR -> ordinal via ORD.
     return runicode.ORD(runicode.UNICHR(x))