Ejemplo n.º 1
0
def hexescape(builder, s, pos, digits, encoding, errorhandler, message, errors):
    from rpython.rlib.rarithmetic import r_uint
    from rpython.rlib.runicode import MAXUNICODE, UNICHR

    chr = 0
    if pos + digits > len(s):
        message = "end of string in escape sequence"
        res, pos = errorhandler(errors, "unicodeescape", message, s, pos - 2, len(s))
        builder.append(res)
    else:
        try:
            chr = r_uint(int(str(s[pos:pos + digits]), 16))
        except ValueError:
            endinpos = pos
            while s[endinpos] in hexdigits:
                endinpos += 1
            res, pos = errorhandler(errors, encoding, message, s, pos - 2, endinpos + 1)
            builder.append(res)
        else:
            # when we get here, chr is a 32-bit unicode character
            if chr <= MAXUNICODE:
                builder.append(UNICHR(chr))
                pos += digits

            elif chr <= 0x10ffff:
                chr -= 0x10000L
                builder.append(unichr(0xD800 + (chr >> 10)))
                builder.append(unichr(0xDC00 + (chr & 0x03FF)))
                pos += digits
            else:
                message = "illegal Unicode character"
                res, pos = errorhandler(errors, encoding, message, s, pos - 2, pos + digits)
                builder.append(res)
    return pos
Ejemplo n.º 2
0
def hex_to_utf8(state, token, s):
    """Return the UTF-8 encoding of the character named by hex string ``s``.

    ``s`` is parsed as a base-16 integer, turned into a one-character
    unicode string with ``UNICHR`` and encoded strictly as UTF-8.  If any
    of those steps raises ``ValueError`` (or ``UnicodeDecodeError``), the
    exception built by ``errorhandler(state, token, msg=...)`` is raised
    instead.
    """
    try:
        code_point = int(s, 16)
        character = UNICHR(code_point)
        encoded = unicode_encode_utf_8(character, len(character), 'strict')
    except (ValueError, UnicodeDecodeError):
        # XXX better error message
        raise errorhandler(state, token, msg="Error encoding %s" % s)
    return encoded
Ejemplo n.º 3
0
def chr(space, code):
    """Return a Unicode string of one character with the given ordinal."""
    # The docstring text is kept verbatim: it may be exposed to
    # application-level code as the builtin's documentation.
    try:
        character = UNICHR(code)
    except ValueError:
        # UNICHR rejected the ordinal; report it through the object space
        # as a ValueError.
        raise oefmt(space.w_ValueError, "chr() arg out of range")
    else:
        return space.newunicode(character)
Ejemplo n.º 4
0
def chr(space, code):
    """Return a Unicode string of one character with the given ordinal."""
    # The docstring text is kept verbatim: it may be exposed to
    # application-level code as the builtin's documentation.
    try:
        character = UNICHR(code)
    except ValueError:
        # Translate the ValueError raised by UNICHR into an OperationError
        # carrying the object space's ValueError type and a wrapped message.
        w_exc_type = space.w_ValueError
        w_message = space.wrap("chr() arg out of range")
        raise OperationError(w_exc_type, w_message)
    return space.wrap(character)
Ejemplo n.º 5
0
def hex_to_utf8(s):
    """Return the UTF-8 encoding of the character named by hex string ``s``.

    ``s`` is parsed as a base-16 integer, converted to a one-character
    unicode string with ``UNICHR`` and encoded with the 'strict' error
    mode.  Exceptions raised by any of these steps propagate to the caller.
    """
    code_point = int(s, 16)
    character = UNICHR(code_point)
    return unicode_encode_utf_8(character, len(character), 'strict')
Ejemplo n.º 6
0
 def f(x):
     """Round-trip ``x`` through UNICHR and ord; -42 if UNICHR rejects it."""
     try:
         ordinal = ord(UNICHR(x))
     except ValueError:
         # Sentinel value signalling that the conversion failed.
         ordinal = -42
     return ordinal
Ejemplo n.º 7
0
def str_decode_utf_32_helper(s,
                             size,
                             errors,
                             final=True,
                             errorhandler=None,
                             byteorder="native",
                             public_encoding_name='utf32',
                             allow_surrogates=True):
    if errorhandler is None:
        errorhandler = default_unicode_error_decode
    bo = 0

    if BYTEORDER == 'little':
        iorder = [0, 1, 2, 3]
    else:
        iorder = [3, 2, 1, 0]

    #  Check for BOM marks (U+FEFF) in the input and adjust current
    #  byte order setting accordingly. In native mode, the leading BOM
    #  mark is skipped, in all other modes, it is copied to the output
    #  stream as-is (giving a ZWNBSP character).
    pos = 0
    if byteorder == 'native':
        if size >= 4:
            bom = intmask((ord(s[iorder[3]]) << 24) | (ord(s[iorder[2]]) << 16)
                          | (ord(s[iorder[1]]) << 8) | ord(s[iorder[0]]))
            if BYTEORDER == 'little':
                if bom == BOM32_DIRECT:
                    pos += 4
                    bo = -1
                elif bom == BOM32_REVERSE:
                    pos += 4
                    bo = 1
            else:
                if bom == BOM32_DIRECT:
                    pos += 4
                    bo = 1
                elif bom == BOM32_REVERSE:
                    pos += 4
                    bo = -1
    elif byteorder == 'little':
        bo = -1
    else:
        bo = 1
    if size == 0:
        return u'', 0, bo
    if bo == -1:
        # force little endian
        iorder = [0, 1, 2, 3]
    elif bo == 1:
        # force big endian
        iorder = [3, 2, 1, 0]

    result = UnicodeBuilder(size // 4)

    while pos < size:
        # remaining bytes at the end? (size should be divisible by 4)
        if len(s) - pos < 4:
            if not final:
                break
            r, pos = errorhandler(errors, public_encoding_name,
                                  "truncated data", s, pos, len(s))
            result.append(r)
            if len(s) - pos < 4:
                break
            continue
        ch = ((ord(s[pos + iorder[3]]) << 24) | (ord(s[pos + iorder[2]]) << 16)
              | (ord(s[pos + iorder[1]]) << 8) | ord(s[pos + iorder[0]]))
        if not allow_surrogates and 0xD800 <= ch <= 0xDFFF:
            r, pos = errorhandler(
                errors, public_encoding_name,
                "code point in surrogate code point "
                "range(0xd800, 0xe000)", s, pos, pos + 4)
            result.append(r)
            continue
        elif ch >= 0x110000:
            r, pos = errorhandler(errors, public_encoding_name,
                                  "codepoint not in range(0x110000)", s, pos,
                                  pos + 4)
            result.append(r)
            continue

        if MAXUNICODE < 65536 and ch >= 0x10000:
            ch -= 0x10000L
            result.append(unichr(0xD800 + (ch >> 10)))
            result.append(unichr(0xDC00 + (ch & 0x03FF)))
        else:
            result.append(UNICHR(ch))
        pos += 4
    return result.build(), pos, bo
Ejemplo n.º 8
0
def unichr(space, code):
    """Return a Unicode string of one character with the given ordinal."""
    # The docstring text is kept verbatim: it may be exposed to
    # application-level code as the builtin's documentation.
    # Only ordinals in the inclusive range 0..0x10FFFF are accepted.
    if not (0 <= code <= 0x10FFFF):
        raise oefmt(space.w_ValueError, "unichr() arg out of range")
    character = UNICHR(code)
    return space.newunicode(character)
Ejemplo n.º 9
0
def str_decode_utf_16_helper(s,
                             size,
                             errors,
                             final=True,
                             errorhandler=None,
                             byteorder="native",
                             public_encoding_name='utf16'):
    """Decode UTF-16 data from ``s`` into a unicode string.

    Parameters:
        s: the encoded input.
        size: number of bytes of ``s`` to decode.
        errors: error-handling mode forwarded to ``errorhandler``.
        final: when false, trailing incomplete data (a lone byte or a
            high surrogate without its partner) is left undecoded
            instead of being reported as an error.
        errorhandler: callable returning ``(replacement, new_pos)``;
            defaults to ``default_unicode_error_decode``.
        byteorder: "native" (detect a BOM, else use BYTEORDER), "little",
            or anything else for big endian.
        public_encoding_name: codec name reported to ``errorhandler``.

    Returns:
        A tuple ``(decoded, pos, bo)``: the decoded text, the position
        reached in ``s``, and the byte order in effect (-1 little endian,
        1 big endian, 0 native with no BOM found).
    """
    if errorhandler is None:
        errorhandler = default_unicode_error_decode
    bo = 0

    # ihi / ilo are the offsets of the high and low byte inside a 2-byte
    # unit.  BYTEORDER is defined outside this block (presumably the host
    # byte order -- confirm).
    if BYTEORDER == 'little':
        ihi = 1
        ilo = 0
    else:
        ihi = 0
        ilo = 1

    #  Check for BOM marks (U+FEFF) in the input and adjust current
    #  byte order setting accordingly. In native mode, the leading BOM
    #  mark is skipped, in all other modes, it is copied to the output
    #  stream as-is (giving a ZWNBSP character).
    pos = 0
    if byteorder == 'native':
        if size >= 2:
            bom = (ord(s[ihi]) << 8) | ord(s[ilo])
            # 0xFEFF means the BOM reads correctly in native order;
            # 0xFFFE means it is byte-swapped, i.e. the opposite order.
            if BYTEORDER == 'little':
                if bom == 0xFEFF:
                    pos += 2
                    bo = -1
                elif bom == 0xFFFE:
                    pos += 2
                    bo = 1
            else:
                if bom == 0xFEFF:
                    pos += 2
                    bo = 1
                elif bom == 0xFFFE:
                    pos += 2
                    bo = -1
    elif byteorder == 'little':
        bo = -1
    else:
        bo = 1
    if size == 0:
        return u'', 0, bo
    if bo == -1:
        # force little endian
        ihi = 1
        ilo = 0

    elif bo == 1:
        # force big endian
        ihi = 0
        ilo = 1

    # Preallocate for the number of 2-byte units in the input.
    result = UnicodeBuilder(size // 2)

    #XXX I think the errors are not correctly handled here
    while pos < size:
        # remaining bytes at the end? (size should be even)
        if len(s) - pos < 2:
            if not final:
                break
            r, pos = errorhandler(errors, public_encoding_name,
                                  "truncated data", s, pos, len(s))
            result.append(r)
            if len(s) - pos < 2:
                break
            # NOTE: no `continue` here -- if the handler moved pos so that
            # two bytes are available, decoding falls through to the read
            # below.
        ch = (ord(s[pos + ihi]) << 8) | ord(s[pos + ilo])
        pos += 2
        # Anything outside the surrogate range is a complete character.
        if ch < 0xD800 or ch > 0xDFFF:
            result.append(unichr(ch))
            continue
        # UTF-16 code pair:
        if len(s) - pos < 2:
            # Surrogate with no following unit: step back onto it so that
            # the reported/returned position covers the surrogate too.
            pos -= 2
            if not final:
                break
            errmsg = "unexpected end of data"
            r, pos = errorhandler(errors, public_encoding_name, errmsg, s, pos,
                                  len(s))
            result.append(r)
            if len(s) - pos < 2:
                break
        elif 0xD800 <= ch <= 0xDBFF:
            # High surrogate: the next unit must be a low surrogate.
            ch2 = (ord(s[pos + ihi]) << 8) | ord(s[pos + ilo])
            pos += 2
            if 0xDC00 <= ch2 <= 0xDFFF:
                if MAXUNICODE < 65536:
                    # Narrow build: keep the pair as two code units.
                    result.append(unichr(ch))
                    result.append(unichr(ch2))
                else:
                    # Wide build: combine the pair into one code point.
                    result.append(
                        UNICHR((((ch & 0x3FF) << 10) | (ch2 & 0x3FF)) +
                               0x10000))
                continue
            else:
                # Report only the high surrogate (pos - 4 .. pos - 2); where
                # decoding resumes is decided by the handler's returned pos.
                r, pos = errorhandler(errors, public_encoding_name,
                                      "illegal UTF-16 surrogate", s, pos - 4,
                                      pos - 2)
                result.append(r)
        else:
            # Low surrogate (0xDC00..0xDFFF) without a preceding high one.
            r, pos = errorhandler(errors, public_encoding_name,
                                  "illegal encoding", s, pos - 2, pos)
            result.append(r)
    return result.build(), pos, bo