Example #1
0
    def UnknownEncodingHandler(self, space, name, info):
        # Yes, supports only 8bit encodings
        translationmap, lgt = space.utf8_len_w(
            space.call_method(space.newbytes(self.all_chars), "decode",
                              space.newtext(name), space.newtext("replace")))

        if lgt != 256:
            raise oefmt(space.w_ValueError,
                        "multi-byte encodings are not supported")

        i = 0
        for c in rutf8.Utf8StringIterator(translationmap):
            if c == 0xfffd:
                info.c_map[i] = rffi.cast(rffi.INT, -1)
            else:
                info.c_map[i] = rffi.cast(rffi.INT, c)
            i += 1
        info.c_data = lltype.nullptr(rffi.VOIDP.TO)
        info.c_convert = lltype.nullptr(rffi.VOIDP.TO)
        info.c_release = lltype.nullptr(rffi.VOIDP.TO)
        return True
Example #2
0
def raw_encode_basestring_ascii(space, w_string):
    if space.isinstance_w(w_string, space.w_bytes):
        s = space.bytes_w(w_string)
        for i in range(len(s)):
            c = s[i]
            if c >= ' ' and c <= '~' and c != '"' and c != '\\':
                pass
            else:
                first = i
                break
        else:
            # the input is a string with only non-special ascii chars
            return w_string

        unicodehelper.check_utf8_or_raise(space, s)
        sb = StringBuilder(len(s))
        sb.append_slice(s, 0, first)
    else:
        # We used to check if 'u' contains only safe characters, and return
        # 'w_string' directly.  But this requires an extra pass over all
        # characters, and the expected use case of this function, from
        # json.encoder, will anyway re-encode a unicode result back to
        # a string (with the ascii encoding).  This requires two passes
        # over the characters.  So we may as well directly turn it into a
        # string here --- only one pass.
        s = space.utf8_w(w_string)
        sb = StringBuilder(len(s))
        first = 0

    it = rutf8.Utf8StringIterator(s)
    for i in range(first):
        it.next()
    for c in it:
        if c <= ord('~'):
            if c == ord('"') or c == ord('\\'):
                sb.append('\\')
            elif c < ord(' '):
                sb.append(ESCAPE_BEFORE_SPACE[c])
                continue
            sb.append(chr(c))
        else:
            if c <= ord(u'\uffff'):
                sb.append('\\u')
                sb.append(HEX[c >> 12])
                sb.append(HEX[(c >> 8) & 0x0f])
                sb.append(HEX[(c >> 4) & 0x0f])
                sb.append(HEX[c & 0x0f])
            else:
                # surrogate pair
                n = c - 0x10000
                s1 = 0xd800 | ((n >> 10) & 0x3ff)
                sb.append('\\ud')
                sb.append(HEX[(s1 >> 8) & 0x0f])
                sb.append(HEX[(s1 >> 4) & 0x0f])
                sb.append(HEX[s1 & 0x0f])
                s2 = 0xdc00 | (n & 0x3ff)
                sb.append('\\ud')
                sb.append(HEX[(s2 >> 8) & 0x0f])
                sb.append(HEX[(s2 >> 4) & 0x0f])
                sb.append(HEX[s2 & 0x0f])

    res = sb.build()
    return space.newtext(res)
Example #3
0
def descr_upper(s):
    builder = rutf8.Utf8StringBuilder(len(s))
    for ch in rutf8.Utf8StringIterator(s):
        ch = unicodedb.toupper(ch)
        builder.append_code(ch)
    return builder.build()
Example #4
0
def test_utf8_iterator(arg):
    u = rutf8.Utf8StringIterator(arg.encode('utf8'))
    l = []
    for c in u:
        l.append(unichr(c))
    assert list(arg) == l