Exemple #1
0
def pack_unichar(fmtiter):
    """Pack the next unicode argument as a single wide character.

    The argument is taken from ``fmtiter.accept_unicode_arg()``; its code
    point is written into ``fmtiter.wbuf`` at ``fmtiter.pos`` and the
    iterator is advanced by ``unichar.UNICODE_SIZE``.

    Raises StructError if the argument is not exactly one character long.
    """
    utf8_value, num_codepoints = fmtiter.accept_unicode_arg()
    if num_codepoints != 1:
        raise StructError("expected a unicode string of length 1")
    codepoint = rutf8.codepoint_at_pos(utf8_value, 0)
    unichar.pack_codepoint(codepoint, fmtiter.wbuf, fmtiter.pos)
    fmtiter.advance(unichar.UNICODE_SIZE)
Exemple #2
0
def decode_unicode_utf8(space, s, ps, q):
    """Rewrite the UTF-8 bytes s[ps:q] as ASCII-only unicode_escape input.

    ASCII bytes are copied unchanged.  Every multibyte UTF-8 sequence is
    replaced by an eight-digit upper-case-U escape of its code point.  When
    a backslash is directly followed by a multibyte sequence, or is the
    very last byte of the range, "u005c" is appended after it so that the
    backslash cannot combine with whatever comes next.

    space -- unused here; kept for the callers' signature.
    s     -- the byte string to read from.
    ps, q -- start (inclusive) and end (exclusive) byte offsets in ``s``.

    Returns the escaped byte string.  Bytes at or after ``q`` are never
    read.
    """
    # ****The Python 2.7 version, producing UTF-32 escapes****
    # String is utf8-encoded, but 'unicode_escape' expects
    # latin-1; So multibyte sequences must be escaped.
    lis = []  # using a list to assemble the value
    end = q
    # Worst case:
    # "<92><195><164>" may become "\u005c\U000000E4" (16 bytes)
    while ps < end:
        if s[ps] == '\\':
            lis.append(s[ps])
            ps += 1
            if ps >= end or ord(s[ps]) & 0x80:
                # Either a multibyte sequence follows (it will be escaped
                # like \u1234), or the backslash was the last byte of the
                # range.  To avoid confusion with the backslash we just
                # wrote, we emit "\u005c" instead.
                lis.append("u005c")
                if ps >= end:
                    # Nothing left to read; looking at s[ps] here would
                    # step outside the requested range.
                    break
        if ord(s[ps]) & 0x80:
            cp = rutf8.codepoint_at_pos(s, ps)
            # Adding 0x10000000 forces a fixed number of hex digits, so
            # the slice below always yields seven zero-padded digits.
            hexa = hex(cp + 0x10000000)
            lis.append('\\U0')
            lis.append(hexa[3:])  # Skip 0x and the leading 1
            ps = rutf8.next_codepoint_pos(s, ps)
        else:
            lis.append(s[ps])
            ps += 1
    return ''.join(lis)
Exemple #3
0
def xmlcharrefreplace_errors(space, w_exc):
    """Codec error handler that emits decimal XML character references.

    For a UnicodeEncodeError, every character in the exception's
    [start, end) range is replaced by "&#" + decimal code point + ";".
    Returns a wrapped tuple (replacement text, end).

    Raises TypeError (via oefmt) for any other kind of exception.
    """
    check_exception(space, w_exc)
    if not space.isinstance_w(w_exc, space.w_UnicodeEncodeError):
        raise oefmt(space.w_TypeError,
                    "don't know how to handle %T in error callback", w_exc)

    w_text = space.getattr(w_exc, space.newtext('object'))
    # Result is discarded: presumably called only for the errors it raises.
    space.realutf8_w(w_text)
    w_text = space.convert_arg_to_w_unicode(w_text)
    start = space.int_w(space.getattr(w_exc, space.newtext('start')))
    w_end = space.getattr(w_exc, space.newtext('end'))
    end = space.int_w(w_end)
    # The exception carries character indices; the loop walks byte offsets.
    byte_pos = w_text._index_to_byte(start)
    byte_end = w_text._index_to_byte(end)
    utf8 = w_text._utf8
    out = StringBuilder()
    while byte_pos < byte_end:
        codepoint = rutf8.codepoint_at_pos(utf8, byte_pos)
        out.append("&#")
        out.append(str(codepoint))
        out.append(";")
        byte_pos = rutf8.next_codepoint_pos(utf8, byte_pos)
    replacement = out.build()
    length = rutf8.check_utf8(replacement, True)
    return space.newtuple([space.newutf8(replacement, length), w_end])
Exemple #4
0
def namereplace_errors(space, w_exc):
    """Codec error handler that substitutes named-character escapes.

    For a UnicodeEncodeError, each character in the exception's
    [start, end) range becomes a backslash-N escape with the character's
    name in braces.  Characters for which ``unicodedb.name`` raises
    KeyError are written with ``unicodehelper.raw_unicode_escape_helper``
    instead.  Returns a wrapped tuple (replacement text, end).

    Raises TypeError (via oefmt) for any other kind of exception.
    """
    check_exception(space, w_exc)
    if not space.isinstance_w(w_exc, space.w_UnicodeEncodeError):
        raise oefmt(space.w_TypeError,
                    "don't know how to handle %T in error callback", w_exc)

    w_text = space.getattr(w_exc, space.newtext('object'))
    space.realutf8_w(w_text)  # for errors
    w_text = space.convert_arg_to_w_unicode(w_text)
    start = space.int_w(space.getattr(w_exc, space.newtext('start')))
    w_end = space.getattr(w_exc, space.newtext('end'))
    end = space.int_w(w_end)
    # Convert the character indices into byte offsets of the UTF-8 data.
    byte_pos = w_text._index_to_byte(start)
    byte_end = w_text._index_to_byte(end)
    out = StringBuilder()
    utf8 = w_text._utf8
    while byte_pos < byte_end:
        codepoint = rutf8.codepoint_at_pos(utf8, byte_pos)
        try:
            char_name = unicodedb.name(codepoint)
        except KeyError:
            # No name available for this code point: use a numeric escape.
            unicodehelper.raw_unicode_escape_helper(out, codepoint)
        else:
            out.append('\\N{')
            out.append(char_name)
            out.append('}')
        byte_pos = rutf8.next_codepoint_pos(utf8, byte_pos)
    replacement = out.build()
    length = rutf8.check_utf8(replacement, True)
    return space.newtuple([space.newutf8(replacement, length), w_end])
Exemple #5
0
 def cast_unicode(self, w_ob):
     """Return the code point of a one-character unicode object.

     Raises TypeError (via oefmt) when ``w_ob`` converts to a unicode
     string whose length is not exactly 1.
     """
     space = self.space
     w_unicode = space.convert_arg_to_w_unicode(w_ob)
     length = w_unicode._len()
     if length == 1:
         return rutf8.codepoint_at_pos(w_unicode._utf8, 0)
     raise oefmt(
         space.w_TypeError,
         "cannot cast unicode string of length %d to ctype '%s'",
         length, self.name)
Exemple #6
0
def test_utf8_iterator_pos(arg):
    """Check that Utf8StringPosIterator yields (code point, byte offset).

    Each yielded code point must match ``codepoint_at_pos`` at the yielded
    offset, the offsets must follow ``next_codepoint_pos``, and the
    collected characters must equal ``arg``.
    """
    encoded = arg.encode('utf8')
    seen_chars = []
    expected_pos = 0
    for codepoint, pos in rutf8.Utf8StringPosIterator(encoded):
        seen_chars.append(unichr(codepoint))
        assert codepoint == rutf8.codepoint_at_pos(encoded, pos)
        assert pos == expected_pos
        expected_pos = rutf8.next_codepoint_pos(encoded, expected_pos)
    assert list(arg) == seen_chars
Exemple #7
0
def charmap_build(space, chars):
    """Build a wrapped dict mapping code points of ``chars`` to positions.

    ``chars`` is walked as UTF-8; the N-th code point (counting from 0) is
    stored as key with N as its value.  If a code point occurs more than
    once, the later position overwrites the earlier one.
    """
    # XXX CPython sometimes uses a three-level trie
    w_mapping = space.newdict()
    size = len(chars)
    byte_pos = 0
    index = 0
    while byte_pos < size:
        codepoint = rutf8.codepoint_at_pos(chars, byte_pos)
        space.setitem(w_mapping, space.newint(codepoint), space.newint(index))
        byte_pos = rutf8.next_codepoint_pos(chars, byte_pos)
        index += 1
    return w_mapping
Exemple #8
0
 def unknown_fmtchar(self):
     """Raise ValueError for the unsupported format character just read.

     The offending character is the one at ``self.fmtpos - 1`` in
     ``self.fmt``.  The error message shows the character, its code in
     hex and its index in the format string.

     In unicode mode ``self.fmt`` is UTF-8, so the byte offset is
     converted to a code point index before being reported, and the
     character is rebuilt with ``allow_surrogates=True`` so that a lone
     surrogate in the format string can still be shown.
     """
     space = self.space
     bytepos = self.fmtpos - 1
     if do_unicode:
         cp = rutf8.codepoint_at_pos(self.fmt, bytepos)
         # Report a character index, not a byte offset into the UTF-8.
         index = rutf8.codepoints_in_utf8(self.fmt, 0, bytepos)
         w_s = space.newutf8(
             rutf8.unichr_as_utf8(r_uint(cp), allow_surrogates=True), 1)
     else:
         cp = ord(self.fmt[bytepos])
         index = bytepos
         w_s = space.newbytes(chr(cp))
     raise oefmt(space.w_ValueError,
                 "unsupported format character %R (%s) at index %d",
                 w_s, hex(cp), index)
Exemple #9
0
 def _get_error_info(self, pos):
     """Describe the format character at offset ``pos`` of ``self.fmt``.

     Returns a tuple (w_char, index, code):
     - in unicode mode, ``w_char`` is a one-character wrapped text and
       ``index`` is the number of code points before ``pos``;
     - otherwise ``w_char`` is a one-byte wrapped bytes object and
       ``index`` is ``pos`` itself.
     """
     space = self.space
     fmt = self.fmt
     if not do_unicode:
         code = ord(fmt[pos])
         return space.newbytes(chr(code)), pos, code
     code = rutf8.codepoint_at_pos(fmt, pos)
     char_index = rutf8.codepoints_in_utf8(fmt, 0, pos)
     char_utf8 = rutf8.unichr_as_utf8(r_uint(code), allow_surrogates=True)
     return space.newutf8(char_utf8, 1), char_index, code
Exemple #10
0
def utf8_to_char32(utf8, target_ptr, target_length, add_final_zero):
    """Copy the code points of ``utf8`` into a raw array of 32-bit items.

    'target_ptr' is a raw pointer to 'target_length' 32-bit integers;
    we assume (and check) that target_length == number of unichars in utf8.
    When ``add_final_zero`` is true, one extra zero item is written at
    index ``target_length``.
    """
    dest = rffi.cast(rffi.UINTP, target_ptr)
    byte_pos = 0
    index = 0
    while index < target_length:
        codepoint = rutf8.codepoint_at_pos(utf8, byte_pos)
        dest[index] = rffi.cast(rffi.UINT, codepoint)
        byte_pos = rutf8.next_codepoint_pos(utf8, byte_pos)
        index += 1
    # Exactly the whole string must have been consumed.
    assert byte_pos == len(utf8)
    if add_final_zero:
        dest[target_length] = rffi.cast(rffi.UINT, 0)
Exemple #11
0
def _get_codepoint(space, w_src, default, name):
    if w_src is None:
        return default
    if space.is_w(w_src, space.w_None):
        return 0
    if not space.isinstance_w(w_src, space.w_unicode):
        raise oefmt(space.w_TypeError, '"%s" must be string, not %T', name, w_src)
    src, length = space.utf8_len_w(w_src)
    if length == 1:
        res = rutf8.codepoint_at_pos(src, 0)
        assert res >= 0
        return res
    if len(src) == 0:
        return 0
    raise oefmt(space.w_TypeError, '"%s" must be a 1-character string', name)
Exemple #12
0
def surrogateescape_errors(space, w_exc):
    """Codec error handler mapping bytes 0x80-0xff to U+DC80-U+DCFF.

    UnicodeEncodeError: every character in [start, end) must lie in
    U+DC80..U+DCFF and is turned back into the byte (code - 0xdc00);
    otherwise the original exception is re-raised.  Returns a wrapped
    tuple (bytes, end).

    UnicodeDecodeError: up to four bytes from ``start`` (and not past
    ``end``) are each replaced by the character 0xdc00 + byte, stopping at
    the first byte below 128.  If no byte could be replaced the original
    exception is re-raised.  Returns a wrapped tuple
    (text, start + number of bytes consumed).

    Any other exception type raises TypeError (via oefmt).
    """
    check_exception(space, w_exc)
    if space.isinstance_w(w_exc, space.w_UnicodeEncodeError):
        w_text = space.getattr(w_exc, space.newtext('object'))
        w_text = space.convert_arg_to_w_unicode(w_text)
        start = space.int_w(space.getattr(w_exc, space.newtext('start')))
        w_end = space.getattr(w_exc, space.newtext('end'))
        end = space.int_w(w_end)
        escaped = ''
        # Character indices -> byte offsets into the UTF-8 data.
        byte_pos = w_text._index_to_byte(start)
        byte_end = w_text._index_to_byte(end)
        utf8 = w_text._utf8
        while byte_pos < byte_end:
            code = rutf8.codepoint_at_pos(utf8, byte_pos)
            if not (0xdc80 <= code <= 0xdcff):
                # Not a UTF-8b surrogate, fail with original exception
                raise OperationError(space.type(w_exc), w_exc)
            escaped += chr(code - 0xdc00)
            byte_pos = rutf8.next_codepoint_pos(utf8, byte_pos)
        return space.newtuple([space.newbytes(escaped), w_end])

    if space.isinstance_w(w_exc, space.w_UnicodeDecodeError):
        start = space.int_w(space.getattr(w_exc, space.newtext('start')))
        end = space.int_w(space.getattr(w_exc, space.newtext('end')))
        data = space.bytes_w(space.getattr(w_exc, space.newtext('object')))
        # At most four bytes are escaped per call.
        limit = min(4, end - start)
        replace = u''
        consumed = 0
        while consumed < limit:
            byte = ord(data[start + consumed])
            if byte < 128:
                # Refuse to escape ASCII bytes.
                break
            replace += unichr(0xdc00 + byte)
            consumed += 1
        if consumed == 0:
            # codec complained about ASCII byte.
            raise OperationError(space.type(w_exc), w_exc)
        replace_utf8 = runicode.unicode_encode_utf_8(
            replace, len(replace), 'strict', allow_surrogates=True)
        w_replacement = space.newtext(replace_utf8, len(replace))
        return space.newtuple([w_replacement, space.newint(start + consumed)])

    raise oefmt(space.w_TypeError,
                "don't know how to handle %T in error callback", w_exc)
Exemple #13
0
 def _convert_to_charN_t(self, w_ob):
     """Convert ``w_ob`` to the r_uint value of a single wide character.

     Accepts either a unicode object of length 1, or a W_CData whose
     ctype is a W_CTypePrimitiveUniChar of the same size as ``self``.
     If self.size == 2, the returned value is smaller than 0x10000.

     Anything else raises the error built by ``self._convert_error``.
     """
     space = self.space
     if space.isinstance_w(w_ob, space.w_unicode):
         w_unicode = space.convert_arg_to_w_unicode(w_ob)
         if w_unicode._len() != 1:
             raise self._convert_error("single character", w_ob)
         codepoint = rutf8.codepoint_at_pos(w_unicode._utf8, 0)
         if ordinal_too_big_for_char16(self.size, codepoint):
             raise self._convert_error("single character <= 0xFFFF", w_ob)
         return r_uint(codepoint)
     if (isinstance(w_ob, cdataobj.W_CData)
             and isinstance(w_ob.ctype, W_CTypePrimitiveUniChar)
             and w_ob.ctype.size == self.size):
         # Same-sized wide-char cdata: read its raw value directly.
         with w_ob as ptr:
             return misc.read_raw_ulong_data(ptr, self.size)
     raise self._convert_error("unicode string of length 1", w_ob)
Exemple #14
0
def unwrap_value(space, push_func, add_arg, argdesc, letter, w_arg):
    """Unwrap ``w_arg`` according to type ``letter`` and hand it over.

    The unwrapped value is passed on as
    ``push_func(add_arg, argdesc, value)``.  Handled letters: those in
    TYPEMAP_PTR_LETTERS, "d", "f", "g", "c", "u", and the integer letters
    of ``unroll_letters_for_numbers``.

    Raises TypeError (via oefmt) for a "c"/"u" argument that is not of
    length one, and for any letter that is not handled.
    """
    if letter in TYPEMAP_PTR_LETTERS:
        # check for NULL ptr
        if isinstance(w_arg, W_DataInstance):
            address = w_arg.ll_buffer
        else:
            address = unwrap_truncate_int(rffi.VOIDP, space, w_arg)
        push_func(add_arg, argdesc, address)
        return
    if letter == "d":
        push_func(add_arg, argdesc, space.float_w(w_arg))
        return
    if letter == "f":
        single = rffi.cast(rffi.FLOAT, space.float_w(w_arg))
        push_func(add_arg, argdesc, single)
        return
    if letter == "g":
        extended = rffi.cast(rffi.LONGDOUBLE, space.float_w(w_arg))
        push_func(add_arg, argdesc, extended)
        return
    if letter == "c":
        # Either an integer byte value or a bytes object of length one.
        if space.isinstance_w(w_arg, space.w_int):
            char = space.byte_w(w_arg)
        else:
            raw = space.bytes_w(w_arg)
            if len(raw) != 1:
                raise oefmt(space.w_TypeError,
                            "Expected bytes of length one as character")
            char = raw[0]
        push_func(add_arg, argdesc, char)
        return
    if letter == 'u':
        utf8, num_chars = space.utf8_len_w(w_arg)
        if num_chars != 1:
            raise oefmt(
                space.w_TypeError,
                "Expected unicode string of length one as wide "
                "character")
        codepoint = rutf8.codepoint_at_pos(utf8, 0)
        push_func(add_arg, argdesc, rffi.cast(rffi.WCHAR_T, codepoint))
        return
    for number_letter in unroll_letters_for_numbers:
        if letter == number_letter:
            number = unwrap_truncate_int(LL_TYPEMAP[number_letter], space,
                                         w_arg)
            push_func(add_arg, argdesc, number)
            return
    # No known letter matched.
    raise oefmt(space.w_TypeError, "cannot directly write value")
Exemple #15
0
def utf8_to_char16(utf8, target_ptr, target_length, add_final_zero):
    """Copy ``utf8`` into a raw array of 16-bit items as UTF-16.

    'target_ptr' is a raw pointer to 'target_length' 16-bit integers;
    we assume (and check) that target_length == utf8_size_as_char16(utf8).
    Code points above 0xFFFF are written as a surrogate pair.  When
    ``add_final_zero`` is true, a zero item is written after the data.
    """
    base = rffi.cast(rffi.USHORTP, target_ptr)
    out = base
    size = len(utf8)
    byte_pos = 0
    while byte_pos < size:
        code = rutf8.codepoint_at_pos(utf8, byte_pos)
        byte_pos = rutf8.next_codepoint_pos(utf8, byte_pos)
        if code <= 0xFFFF:
            out[0] = rffi.cast(rffi.USHORT, code)
            out = rffi.ptradd(out, 1)
        else:
            # Split into a high and a low surrogate.
            code -= 0x10000
            out[0] = rffi.cast(rffi.USHORT, 0xD800 | (code >> 10))
            out[1] = rffi.cast(rffi.USHORT, 0xDC00 | (code & 0x3FF))
            out = rffi.ptradd(out, 2)
    # Exactly target_length items must have been written.
    assert out == rffi.ptradd(base, target_length)
    if add_final_zero:
        out[0] = rffi.cast(rffi.USHORT, 0)
Exemple #16
0
 def item_w(self, w_item):
     """Unwrap ``w_item`` into a raw value of this array's item type.

     The object is unwrapped with the ``space`` method named by
     ``mytype.unwrap``.  If that fails with a TypeError and
     ``mytype.method`` is set, the result of calling that method on the
     object is unwrapped instead; a failure there is reported as
     TypeError "array item must be ...".  Floats never get this second
     chance.

     Raises OverflowError for big integers that do not fit, and
     TypeError for bytes/unicode items that are not of length one.
     """
     space = self.space
     unwrap = getattr(space, mytype.unwrap)
     try:
         value = unwrap(w_item)
     except OperationError as e:
         if space.isinstance_w(w_item, space.w_float):
             # Odd special case from cpython
             raise
         if mytype.method == '' or not e.match(space, space.w_TypeError):
             raise
         try:
             value = unwrap(space.call_method(w_item, mytype.method))
         except OperationError:
             raise oefmt(space.w_TypeError,
                         "array item must be " + mytype.errorname)

     if mytype.unwrap == 'bigint_w':
         try:
             value = value.touint()
         except (ValueError, OverflowError):
             raise oefmt(space.w_OverflowError,
                         "unsigned %d-byte integer out of range",
                         mytype.bytes)
         return rffi.cast(mytype.itemtype, value)

     if mytype.unwrap == 'bytes_w':
         if len(value) != 1:
             raise oefmt(space.w_TypeError, "array item must be char")
         return rffi.cast(mytype.itemtype, value[0])

     if mytype.unwrap == 'utf8_len_w':
         utf8, num_chars = value
         if num_chars != 1:
             raise oefmt(space.w_TypeError, "array item must be char")
         return rffi.cast(mytype.itemtype, rutf8.codepoint_at_pos(utf8, 0))

     # "regular" case: it fits in an rpython integer (lltype.Signed)
     # or it is a float
     return self.item_from_int_or_float(value)
Exemple #17
0
def backslashreplace_errors(space, w_exc):
    """Codec error handler that emits backslash-escaped hex code points.

    For a UnicodeEncodeError, each character in [start, end) is written
    as a backslash followed by "x" + 2 hex digits (below 0x100),
    "u" + 4 hex digits (below 0x10000) or "U" + 8 hex digits.
    Returns a wrapped tuple (replacement text, end).

    Raises TypeError (via oefmt) for any other kind of exception.
    """
    check_exception(space, w_exc)
    if not space.isinstance_w(w_exc, space.w_UnicodeEncodeError):
        raise oefmt(space.w_TypeError,
                    "don't know how to handle %T in error callback", w_exc)

    w_text = space.getattr(w_exc, space.newtext('object'))
    space.realutf8_w(w_text)  # for errors
    w_text = space.convert_arg_to_w_unicode(w_text)
    start = space.int_w(space.getattr(w_exc, space.newtext('start')))
    w_end = space.getattr(w_exc, space.newtext('end'))
    end = space.int_w(w_end)
    # Character indices -> byte offsets into the UTF-8 data.
    byte_pos = w_text._index_to_byte(start)
    byte_end = w_text._index_to_byte(end)
    out = StringBuilder()
    utf8 = w_text._utf8
    while byte_pos < byte_end:
        codepoint = rutf8.codepoint_at_pos(utf8, byte_pos)
        hex_text = hex(codepoint)
        if codepoint >= 0x10000:
            prefix = "\\U"
            width = 8
        elif codepoint >= 0x100:
            prefix = "\\u"
            width = 4
        else:
            prefix = "\\x"
            width = 2
        out.append(prefix)
        hex_len = len(hex_text)
        padding = width + 2 - hex_len  # hex_text starts with '0x'
        if padding > 0:
            out.append_multiple_char('0', padding)
        out.append_slice(hex_text, 2, hex_len)
        byte_pos = rutf8.next_codepoint_pos(utf8, byte_pos)
    replacement = out.build()
    length = rutf8.check_utf8(replacement, True)
    return space.newtuple([space.newutf8(replacement, length), w_end])
Exemple #18
0
def backslashreplace_errors(space, w_exc):
    """Codec error handler that replaces the failing range with escapes.

    UnicodeEncodeError / UnicodeTranslateError: every character in
    [start, end) of the exception's unicode object is written with
    ``unicodehelper.raw_unicode_escape_helper``.

    UnicodeDecodeError: every byte in [start, end) of the exception's
    bytes object is written with the same helper.

    Both cases return a wrapped tuple (replacement text, end).  Any other
    exception type raises TypeError (via oefmt).
    """
    check_exception(space, w_exc)
    if (space.isinstance_w(w_exc, space.w_UnicodeEncodeError)
            or space.isinstance_w(w_exc, space.w_UnicodeTranslateError)):
        w_text = space.getattr(w_exc, space.newtext('object'))
        space.realutf8_w(w_text)  # for errors
        w_text = space.convert_arg_to_w_unicode(w_text)
        start = space.int_w(space.getattr(w_exc, space.newtext('start')))
        w_end = space.getattr(w_exc, space.newtext('end'))
        end = space.int_w(w_end)
        # Character indices -> byte offsets into the UTF-8 data.
        byte_pos = w_text._index_to_byte(start)
        byte_end = w_text._index_to_byte(end)
        out = StringBuilder()
        utf8 = w_text._utf8
        while byte_pos < byte_end:
            codepoint = rutf8.codepoint_at_pos(utf8, byte_pos)
            unicodehelper.raw_unicode_escape_helper(out, codepoint)
            byte_pos = rutf8.next_codepoint_pos(utf8, byte_pos)
        return space.newtuple([space.newtext(out.build()), w_end])

    if space.isinstance_w(w_exc, space.w_UnicodeDecodeError):
        data = space.bytes_w(space.getattr(w_exc, space.newtext('object')))
        start = space.int_w(space.getattr(w_exc, space.newtext('start')))
        w_end = space.getattr(w_exc, space.newtext('end'))
        end = space.int_w(w_end)
        out = StringBuilder()
        index = start
        while index < end:
            unicodehelper.raw_unicode_escape_helper(out, ord(data[index]))
            index += 1
        return space.newtuple([space.newtext(out.build()), w_end])

    raise oefmt(space.w_TypeError,
                "don't know how to handle %T in error callback", w_exc)
Exemple #19
0
def surrogatepass_errors(space, w_exc):
    """Codec error handler that lets lone surrogates through.

    UnicodeEncodeError: every character in [start, end) must be a
    surrogate (U+D800..U+DFFF); each one is encoded by hand for the
    UTF-8 / UTF-16 / UTF-32 variant named by the exception's 'encoding'.
    Returns a wrapped tuple (bytes, end).

    UnicodeDecodeError: tries to decode exactly one surrogate from the
    bytes at 'start'.  Returns a wrapped tuple
    (one-character text, start + bytelength).

    In both cases the original exception is re-raised when the encoding
    is not recognised or the data is not a surrogate.  Any other
    exception type raises TypeError (via oefmt).
    """
    check_exception(space, w_exc)
    if space.isinstance_w(w_exc, space.w_UnicodeEncodeError):
        w_obj = space.getattr(w_exc, space.newtext('object'))
        w_obj = space.convert_arg_to_w_unicode(w_obj)
        start = space.int_w(space.getattr(w_exc, space.newtext('start')))
        w_end = space.getattr(w_exc, space.newtext('end'))
        encoding = space.text_w(space.getattr(w_exc,
                                              space.newtext('encoding')))
        # 'bytelength' is not used in this branch.
        bytelength, code = get_standard_encoding(encoding)
        if code == ENC_UNKNOWN:
            # Not supported, fail with original exception
            raise OperationError(space.type(w_exc), w_exc)
        end = space.int_w(w_end)
        builder = StringBuilder()
        # The exception holds character indices; convert them to byte
        # offsets into the UTF-8 data before walking it.
        start = w_obj._index_to_byte(start)
        end = w_obj._index_to_byte(end)
        obj = w_obj._utf8
        pos = start
        while pos < end:
            ch = rutf8.codepoint_at_pos(obj, pos)
            pos = rutf8.next_codepoint_pos(obj, pos)
            if ch < 0xd800 or ch > 0xdfff:
                # Not a surrogate, fail with original exception
                raise OperationError(space.type(w_exc), w_exc)
            if code == ENC_UTF8:
                # Three-byte UTF-8 sequence, written out by hand.
                builder.append(chr(0xe0 | (ch >> 12)))
                builder.append(chr(0x80 | ((ch >> 6) & 0x3f)))
                builder.append(chr(0x80 | (ch & 0x3f)))
            elif code == ENC_UTF16LE:
                builder.append(chr(ch & 0xff))
                builder.append(chr(ch >> 8))
            elif code == ENC_UTF16BE:
                builder.append(chr(ch >> 8))
                builder.append(chr(ch & 0xff))
            elif code == ENC_UTF32LE:
                # ch <= 0xdfff here, so the two high bytes are always zero.
                builder.append(chr(ch & 0xff))
                builder.append(chr(ch >> 8))
                builder.append(chr(0))
                builder.append(chr(0))
            elif code == ENC_UTF32BE:
                builder.append(chr(0))
                builder.append(chr(0))
                builder.append(chr(ch >> 8))
                builder.append(chr(ch & 0xff))
        return space.newtuple([space.newbytes(builder.build()), w_end])
    elif space.isinstance_w(w_exc, space.w_UnicodeDecodeError):
        start = space.int_w(space.getattr(w_exc, space.newtext('start')))
        obj = space.bytes_w(space.getattr(w_exc, space.newtext('object')))
        encoding = space.text_w(space.getattr(w_exc,
                                              space.newtext('encoding')))
        # NOTE(review): 'bytelength' is presumably the number of bytes one
        # surrogate occupies in this encoding -- confirm against
        # get_standard_encoding.
        bytelength, code = get_standard_encoding(encoding)
        # ch == 0 means "nothing decoded"; this is also what remains when
        # 'code' matches none of the branches below.
        ch = 0
        # Try decoding a single surrogate character. If there are more,
        # let the codec call us again
        # Bytes past the end of 'obj' are represented by -1.
        ch0 = ord(obj[start + 0]) if len(obj) > start + 0 else -1
        ch1 = ord(obj[start + 1]) if len(obj) > start + 1 else -1
        ch2 = ord(obj[start + 2]) if len(obj) > start + 2 else -1
        ch3 = ord(obj[start + 3]) if len(obj) > start + 3 else -1
        if code == ENC_UTF8:
            if (ch1 != -1 and ch2 != -1 and ch0 & 0xf0 == 0xe0
                    and ch1 & 0xc0 == 0x80 and ch2 & 0xc0 == 0x80):
                # it's a three-byte code
                ch = ((ch0 & 0x0f) << 12) + ((ch1 & 0x3f) << 6) + (ch2 & 0x3f)
        # The UTF-16/32 branches do not test for -1 explicitly: a missing
        # byte makes the combined value negative, and the range check
        # below then rejects it.
        elif code == ENC_UTF16LE:
            ch = (ch1 << 8) | ch0
        elif code == ENC_UTF16BE:
            ch = (ch0 << 8) | ch1
        elif code == ENC_UTF32LE:
            ch = (ch3 << 24) | (ch2 << 16) | (ch1 << 8) | ch0
        elif code == ENC_UTF32BE:
            ch = (ch0 << 24) | (ch1 << 16) | (ch2 << 8) | ch3
        if ch < 0xd800 or ch > 0xdfff:
            # it's not a surrogate - fail
            ch = 0
        if ch == 0:
            raise OperationError(space.type(w_exc), w_exc)
        ch_utf8 = rutf8.unichr_as_utf8(ch, allow_surrogates=True)
        return space.newtuple(
            [space.newtext(ch_utf8, 1),
             space.newint(start + bytelength)])
    else:
        raise oefmt(space.w_TypeError,
                    "don't know how to handle %T in error callback", w_exc)
Exemple #20
0
 def str(self, index):
     """Return the code point found at offset ``index`` of ``self._utf8``.

     ``index`` is first validated with ``check_nonneg``.
     """
     check_nonneg(index)
     utf8 = self._utf8
     return rutf8.codepoint_at_pos(utf8, index)
Exemple #21
0
    class W_Array(W_ArrayBase):
        itemsize = mytype.bytes
        typecode = mytype.typecode

        _attrs_ = W_ArrayBase._attrs_

        def get_buffer(self):
            """Return ``self._buffer`` cast to ``mytype.arrayptrtype``."""
            raw_buffer = self._buffer
            return rffi.cast(mytype.arrayptrtype, raw_buffer)

        if mytype.unwrap == 'utf8_len_w':

            def check_valid_unicode(self, space, s):
                i = 0
                while i < len(s):
                    if s[i] != '\x00' or ord(s[i + 1]) > 0x10:
                        v = ((ord(s[i]) << 24) + (ord(s[i + 1]) << 16) +
                             (ord(s[i + 2]) << 8) + ord(s[i + 3]))
                        raise oefmt(
                            space.w_ValueError,
                            "Character U+%s is not in range [U+0000, U+10ffff]",
                            hex(v)[2:])
                    i += 4

        def item_w(self, w_item):
            space = self.space
            unwrap = getattr(space, mytype.unwrap)
            try:
                item = unwrap(w_item)
            except OperationError as e:
                if space.isinstance_w(w_item, space.w_float):
                    # Odd special case from cpython
                    raise
                if mytype.method != '' and e.match(space, space.w_TypeError):
                    try:
                        item = unwrap(space.call_method(w_item, mytype.method))
                    except OperationError as e:
                        if e. async (space):
                            raise
                        msg = "array item must be " + mytype.errorname
                        raise OperationError(space.w_TypeError,
                                             space.newtext(msg))
                else:
                    raise
            if mytype.convert:
                try:
                    item = getattr(item, mytype.convert)()
                except (ValueError, OverflowError):
                    raise oefmt(space.w_OverflowError,
                                "unsigned %d-byte integer out of range",
                                mytype.bytes)
                return rffi.cast(mytype.itemtype, item)
            if mytype.unwrap == 'utf8_len_w':
                utf8, lgt = item
                if lgt != 1:
                    raise oefmt(space.w_TypeError, "array item must be char")
                uchar = rutf8.codepoint_at_pos(utf8, 0)
                return rffi.cast(mytype.itemtype, uchar)
            #
            # "regular" case: it fits in an rpython integer (lltype.Signed)
            # or it is a float
            return self.item_from_int_or_float(item)