Ejemplo n.º 1
0
 def expression_string_expand(state, p):
     """Build a string AST node from the string token at ``p[1]``.

     The token's parts are concatenated and run through a strict UTF-8
     decode; the decoded value is discarded, only validity matters.  On a
     ``UnicodeDecodeError`` the exception built by ``errorhandler`` is
     raised instead.
     """
     token = p[1]
     raw = ''.join(token.get_strparts())
     try:
         str_decode_utf_8(raw, len(raw), 'strict', final=True)
     except UnicodeDecodeError:
         raise errorhandler(state, token, msg="Unicode error")
     return ast.String(raw, srcpos=sr(p))
Ejemplo n.º 2
0
 def expression_string_expand(state, p):
     """Return an ``ast.String`` for the string token found at ``p[1]``.

     The joined token parts must pass a strict UTF-8 decode (the decoded
     value itself is not used); a ``UnicodeDecodeError`` is replaced by the
     exception that ``errorhandler`` builds for that token.
     """
     str_token = p[1]
     joined = ''.join(str_token.get_strparts())
     try:
         str_decode_utf_8(joined, len(joined), 'strict', final=True)
     except UnicodeDecodeError:
         raise errorhandler(state, str_token, msg="Unicode error")
     return ast.String(joined, srcpos=sr(p))
Ejemplo n.º 3
0
        def f(x):
            # Round-trip ``x`` copies of a multi-byte UTF-8 sample through
            # decode + encode, once with allow_surrogates=True and once with
            # allow_surrogates=False.  True when both re-encodings equal the
            # original byte string.
            sample = "\xd7\x90\xd6\x96\xeb\x96\x95\xf0\x90\x91\x93"
            s1 = "".join([sample] * x)
            decoded_lax = runicode.str_decode_utf_8(
                s1, len(s1), 'strict', allow_surrogates=True)[0]
            s2 = runicode.unicode_encode_utf_8(
                decoded_lax, len(decoded_lax), 'strict',
                allow_surrogates=True)
            decoded_strict = runicode.str_decode_utf_8(
                s1, len(s1), 'strict', allow_surrogates=False)[0]
            s3 = runicode.unicode_encode_utf_8(
                decoded_strict, len(decoded_strict), 'strict',
                allow_surrogates=False)
            return s1 == s2 and s2 == s3
Ejemplo n.º 4
0
        def f(x):
            # Build ``x`` repetitions of a multi-byte UTF-8 sample, decode and
            # re-encode it with surrogates allowed and with surrogates
            # disallowed, and report whether both results match the input.
            piece = "\xd7\x90\xd6\x96\xeb\x96\x95\xf0\x90\x91\x93"
            s1 = "".join([piece] * x)
            size = len(s1)
            u_allow = runicode.str_decode_utf_8(
                s1, size, 'strict', allow_surrogates=True)[0]
            s2 = runicode.unicode_encode_utf_8(
                u_allow, len(u_allow), 'strict', allow_surrogates=True)
            u_deny = runicode.str_decode_utf_8(
                s1, size, 'strict', allow_surrogates=False)[0]
            s3 = runicode.unicode_encode_utf_8(
                u_deny, len(u_deny), 'strict', allow_surrogates=False)
            return s1 == s2 and s2 == s3
Ejemplo n.º 5
0
Archivo: ast.py Proyecto: dorris19/soda
 def __init__(self, value, reference, package, line, col):
     """Store the decoded identifier, its reference and source position.

     ``value.getstr()`` is decoded as strict UTF-8 into ``self.value``.
     ``self.reference`` is the decoded ``reference.getstr()`` when a
     reference is given, otherwise ``unicode(package)``.
     """
     raw_value = value.getstr()
     self.value = str_decode_utf_8(raw_value, len(raw_value), "strict",
                                   True)[0]
     if reference is None:
         self.reference = unicode(package)
     else:
         raw_ref = reference.getstr()
         self.reference = str_decode_utf_8(raw_ref, len(raw_ref), "strict",
                                           True)[0]
     self.package = package
     self.line = line
     self.col = col
Ejemplo n.º 6
0
def decode_object(space, w_obj, encoding, errors):
    """Decode ``w_obj`` using ``encoding`` and ``errors``.

    ``encoding`` falls back to ``getdefaultencoding(space)`` when None.
    With strict error handling (``errors`` None or 'strict'), 'ascii' and
    'utf-8' are decoded directly from the object's character buffer; every
    other combination is delegated to the ``decode`` function of the
    ``_codecs`` builtin module.
    """
    if encoding is None:
        encoding = getdefaultencoding(space)
    strict = errors is None or errors == 'strict'
    if strict and encoding == 'ascii':
        # XXX error handling
        data = space.charbuf_w(w_obj)
        try:
            decoded = fast_str_decode_ascii(data)
        except ValueError:
            # try again with the generic decoder, to get the error right
            handler = unicodehelper.decode_error_handler(space)
            decoded = str_decode_ascii(
                data, len(data), None, final=True, errorhandler=handler)[0]
        return space.newunicode(decoded)
    if strict and encoding == 'utf-8':
        data = space.charbuf_w(w_obj)
        handler = unicodehelper.decode_error_handler(space)
        decoded = str_decode_utf_8(
            data, len(data), None, final=True, errorhandler=handler,
            allow_surrogates=True)[0]
        return space.newunicode(decoded)
    w_codecs = space.getbuiltinmodule("_codecs")
    w_decode = space.getattr(w_codecs, space.newtext("decode"))
    w_encoding = space.newtext(encoding)
    if errors is None:
        return space.call_function(w_decode, w_obj, w_encoding)
    return space.call_function(w_decode, w_obj, w_encoding,
                               space.newtext(errors))
Ejemplo n.º 7
0
Archivo: ast.py Proyecto: dorris19/soda
 def __init__(self, value, package, line, col):
     """Store the strict-UTF-8-decoded ``value.getstr()`` and its position."""
     raw = value.getstr()
     self.value = str_decode_utf_8(raw, len(raw), "strict", True)[0]
     self.package = package
     self.line = line
     self.col = col
Ejemplo n.º 8
0
def fsdecode(space, w_string):
    """Decode the bytes object ``w_string`` with the filesystem encoding.

    The decoder is chosen per platform: mbcs (strict) when ``_WIN32`` is
    set, UTF-8 with 'surrogateescape' when ``_MACOSX`` is set, the locale
    codec while ``state.codec_need_encodings`` is true, and otherwise the
    object's own ``decode`` method with ``getfilesystemencoding(space)``.
    """
    state = space.fromcache(interp_codecs.CodecState)
    if _WIN32:
        raw = space.bytes_w(w_string)
        decoded = str_decode_mbcs(raw, len(raw), 'strict',
                                  errorhandler=decode_error_handler(space),
                                  force_ignore=False)[0]
        return space.wrap(decoded)
    if _MACOSX:
        raw = space.bytes_w(w_string)
        decoded = runicode.str_decode_utf_8(
            raw, len(raw), 'surrogateescape',
            errorhandler=state.decode_error_handler)[0]
        return space.wrap(decoded)
    if state.codec_need_encodings:
        # bootstrap check: if the filesystem codec is implemented in
        # Python we cannot use it before the codecs are ready. use the
        # locale codec instead
        from pypy.module._codecs.locale import (
            str_decode_locale_surrogateescape)
        raw = space.bytes_w(w_string)
        decoded = str_decode_locale_surrogateescape(
            raw, errorhandler=decode_error_handler(space))
        return space.wrap(decoded)
    from pypy.module.sys.interp_encoding import getfilesystemencoding
    return space.call_method(w_string, 'decode',
                             getfilesystemencoding(space),
                             space.wrap('surrogateescape'))
Ejemplo n.º 9
0
def decode_object(space, w_obj, encoding, errors):
    """Decode ``w_obj`` using ``encoding`` and ``errors``.

    ``encoding`` defaults to ``getdefaultencoding(space)``.  Strict 'ascii'
    and strict 'utf-8' are decoded directly from the character buffer;
    anything else goes through ``_codecs.decode``.
    """
    if encoding is None:
        encoding = getdefaultencoding(space)
    strict = errors is None or errors == 'strict'
    if strict and encoding == 'ascii':
        # XXX error handling
        data = space.charbuf_w(w_obj)
        handler = unicodehelper.decode_error_handler(space)
        decoded = str_decode_ascii(data, len(data), None, final=True,
                                   errorhandler=handler)[0]
        return space.wrap(decoded)
    if strict and encoding == 'utf-8':
        data = space.charbuf_w(w_obj)
        handler = unicodehelper.decode_error_handler(space)
        decoded = str_decode_utf_8(data, len(data), None, final=True,
                                   errorhandler=handler,
                                   allow_surrogates=True)[0]
        return space.wrap(decoded)
    w_codecs = space.getbuiltinmodule("_codecs")
    w_decode = space.getattr(w_codecs, space.wrap("decode"))
    w_encoding = space.wrap(encoding)
    if errors is None:
        return space.call_function(w_decode, w_obj, w_encoding)
    return space.call_function(w_decode, w_obj, w_encoding,
                               space.wrap(errors))
Ejemplo n.º 10
0
def decode_object(space, w_obj, encoding, errors):
    """Decode ``w_obj`` to a unicode object using ``encoding``/``errors``.

    ``encoding`` defaults to ``getdefaultencoding(space)``.  Strict 'ascii'
    and strict 'utf-8' are decoded directly from the character buffer.
    Everything else goes through ``decode_text``; an app-level TypeError is
    raised when that decoder does not return a unicode object.
    """
    if encoding is None:
        encoding = getdefaultencoding(space)
    strict = errors is None or errors == 'strict'
    if strict and encoding == 'ascii':
        data = space.charbuf_w(w_obj)
        try:
            decoded = fast_str_decode_ascii(data)
        except ValueError:
            # try again with the generic decoder, to get the error right
            handler = unicodehelper.decode_error_handler(space)
            decoded = str_decode_ascii(
                data, len(data), None, final=True, errorhandler=handler)[0]
        return space.newunicode(decoded)
    if strict and encoding == 'utf-8':
        data = space.charbuf_w(w_obj)
        handler = unicodehelper.decode_error_handler(space)
        decoded = str_decode_utf_8(
            data, len(data), None, final=True, errorhandler=handler)[0]
        return space.newunicode(decoded)

    from pypy.module._codecs.interp_codecs import decode_text
    w_retval = decode_text(space, w_obj, encoding, errors)
    if space.isinstance_w(w_retval, space.w_unicode):
        return w_retval
    raise oefmt(
        space.w_TypeError, "'%s' decoder returned '%T' instead of 'str'; "
        "use codecs.decode() to decode to arbitrary types", encoding,
        w_retval)
Ejemplo n.º 11
0
def decode_object(space, w_obj, encoding, errors):
    """Decode ``w_obj`` using ``encoding`` and ``errors``.

    ``encoding`` defaults to ``getdefaultencoding(space)``.  Strict 'ascii'
    (with a fast path) and strict 'utf-8' are decoded directly from the
    character buffer; anything else goes through ``_codecs.decode``.
    """
    if encoding is None:
        encoding = getdefaultencoding(space)
    strict = errors is None or errors == 'strict'
    if strict and encoding == 'ascii':
        # XXX error handling
        data = space.charbuf_w(w_obj)
        try:
            decoded = fast_str_decode_ascii(data)
        except ValueError:
            # try again with the generic decoder, to get the error right
            handler = unicodehelper.decode_error_handler(space)
            decoded = str_decode_ascii(
                data, len(data), None, final=True, errorhandler=handler)[0]
        return space.wrap(decoded)
    if strict and encoding == 'utf-8':
        data = space.charbuf_w(w_obj)
        handler = unicodehelper.decode_error_handler(space)
        decoded = str_decode_utf_8(
            data, len(data), None, final=True, errorhandler=handler)[0]
        return space.wrap(decoded)
    w_codecs = space.getbuiltinmodule("_codecs")
    w_decode = space.getattr(w_codecs, space.wrap("decode"))
    w_encoding = space.wrap(encoding)
    if errors is None:
        return space.call_function(w_decode, w_obj, w_encoding)
    return space.call_function(w_decode, w_obj, w_encoding,
                               space.wrap(errors))
Ejemplo n.º 12
0
 def read_cached_string(self):
     """Read a string, either inline or as a back-reference to the cache.

     An integer is read first.  Values of ``MAX_STRING_SIZE`` or more are
     indices (offset by ``MAX_STRING_SIZE``) into ``self._str_cache``;
     smaller values are the byte length of a string that is read, decoded
     and appended to the cache under the next free index.
     """
     size = read_raw_integer(self)
     if size < MAX_STRING_SIZE:
         decoded = str_decode_utf_8(self.read(size), size, "?")[0]
         self._str_cache[len(self._str_cache)] = decoded
         return decoded
     return self._str_cache[size - MAX_STRING_SIZE]
Ejemplo n.º 13
0
 def read_cached_string(self):
     """Return the next string, reading it inline or from the string cache.

     The leading integer is a cache index when it is at least
     ``MAX_STRING_SIZE`` (after subtracting that constant); otherwise it
     is the number of bytes to read and decode, and the result is stored
     in ``self._str_cache`` under the next free index.
     """
     marker = read_raw_integer(self)
     if marker >= MAX_STRING_SIZE:
         cache_index = marker - MAX_STRING_SIZE
         return self._str_cache[cache_index]
     text = str_decode_utf_8(self.read(marker), marker, "?")[0]
     next_index = len(self._str_cache)
     self._str_cache[next_index] = text
     return text
Ejemplo n.º 14
0
def utf8_decoder_operate(decoder, newdata, final):
    """Feed ``newdata`` to ``decoder`` and return the text decoded so far.

    The bytes left over from earlier calls (``decoder.buffer``) are
    prepended to ``newdata``; whatever the UTF-8 decoder does not consume
    is stored back in ``decoder.buffer``.  A ``UnicodeDecodeError`` is
    turned into an ``LError`` unwind.
    """
    pending = decoder.buffer + newdata
    try:
        decoded, consumed = str_decode_utf_8(pending, len(pending), '',
                                             final=final)
    except UnicodeDecodeError:
        raise space.unwind(space.LError(u"unicode decode failed"))
    decoder.buffer = pending[consumed:]
    return decoded
Ejemplo n.º 15
0
def _decode_utf8(string):
    """Decode the byte string ``string`` as UTF-8 in "replace" mode."""
    # "replace" is used so that building an error message does not crash
    # when the byte string provided is not valid UTF-8
    assert isinstance(string, str)
    return runicode.str_decode_utf_8(
        string, len(string), "replace", final=True)[0]
Ejemplo n.º 16
0
 def f(n):
     # Decode strings[n] twice: once through str_decode_utf_8 with an error
     # mode that depends on ``n``, once through str.decode, and concatenate.
     errors = 'strict' if n else 'foo'
     x = strings[n]
     # the annotation of y is SomeUnicodeString(can_be_None=False)
     y = str_decode_utf_8(x, len(x), errors, errorhandler)[0]
     return x.decode('utf-8') + y
Ejemplo n.º 17
0
def decode_utf8(space, string):
    """Strictly decode ``string`` as UTF-8, with surrogates allowed.

    Decoding errors are reported through ``decode_error_handler(space)``.
    """
    handler = decode_error_handler(space)
    return runicode.str_decode_utf_8(
        string, len(string), "strict", final=True,
        errorhandler=handler, allow_surrogates=True)[0]
Ejemplo n.º 18
0
 def f(n):
     # Decode strings[n] twice: once through str_decode_utf_8 with an error
     # mode that depends on ``n``, once through str.decode, and concatenate.
     errors = 'strict' if n else 'foo'
     x = strings[n]
     # the annotation of y is SomeUnicodeString(can_be_None=False)
     y = str_decode_utf_8(x, len(x), errors, errorhandler=errorhandler)[0]
     return x.decode('utf-8') + y
Ejemplo n.º 19
0
def decode_utf8(space, string):
    """Strictly decode ``string`` as UTF-8 and return the unicode result.

    Surrogates are accepted and not treated specially at all: if there
    happen to be two 3-byte sequences encoding a pair of surrogates, the
    result still contains two surrogate unicode characters.  These are the
    Python 2 rules; Python 3 differs.
    """
    handler = decode_error_handler(space)
    decoded = runicode.str_decode_utf_8(
        string, len(string), "strict", final=True,
        errorhandler=handler, allow_surrogates=True)
    return decoded[0]
Ejemplo n.º 20
0
def utf8_decoder_operate(decoder, newdata, final):
    """Feed ``newdata`` to ``decoder`` and return the text decoded so far.

    Bytes buffered by earlier calls are prepended to ``newdata``; the part
    the UTF-8 decoder does not consume is kept in ``decoder.buffer`` for
    the next call.  A ``UnicodeDecodeError`` becomes an ``LError`` unwind.
    """
    pending = decoder.buffer + newdata
    try:
        decoded, consumed = str_decode_utf_8(pending, len(pending), '',
                                             final=final)
    except UnicodeDecodeError:
        raise space.unwind(space.LError(u"unicode decode failed"))
    # Added to satisfy PyPy 5.7.
    # The implementation of str_decode_utf_8 perhaps changed?
    assert 0 <= consumed <= len(pending)
    decoder.buffer = pending[consumed:]
    return decoded
Ejemplo n.º 21
0
def utf_8_decode(space, string, errors="strict", w_final=None):
    """Decode ``string`` as UTF-8; return a wrapped (text, consumed) tuple.

    ``errors`` of None means 'strict'.  ``w_final`` is evaluated for truth
    and passed on as the decoder's ``final`` flag.  Surrogates are allowed.
    """
    mode = 'strict' if errors is None else errors
    final = space.is_true(w_final)
    handler = space.fromcache(CodecState).decode_error_handler
    decoded, consumed = runicode.str_decode_utf_8(
        string, len(string), mode, final, handler,
        allow_surrogates=True)
    w_decoded = space.wrap(decoded)
    w_consumed = space.wrap(consumed)
    return space.newtuple([w_decoded, w_consumed])
Ejemplo n.º 22
0
def utf_8_decode(space, string, errors="strict", w_final=None):
    """Return the wrapped pair (decoded text, bytes consumed) for ``string``.

    A None ``errors`` is treated as 'strict'; the truth value of
    ``w_final`` becomes the decoder's ``final`` flag.  Decoding uses the
    cached ``CodecState`` error handler and allows surrogates.
    """
    if errors is None:
        errors = 'strict'
    is_final = space.is_true(w_final)
    codec_state = space.fromcache(CodecState)
    pair = runicode.str_decode_utf_8(
        string, len(string), errors, is_final,
        codec_state.decode_error_handler, allow_surrogates=True)
    return space.newtuple([space.wrap(pair[0]), space.wrap(pair[1])])
Ejemplo n.º 23
0
Archivo: ast.py Proyecto: dorris19/soda
 def __init__(self, name, params, body, returnstatement, package, line,
              col):
     """Store the function's parts and create a fresh bytecode compiler.

     ``name.getstr()`` is decoded as strict UTF-8 into ``self.name``; the
     remaining arguments are stored unchanged.
     """
     raw_name = name.getstr()
     self.name = str_decode_utf_8(raw_name, len(raw_name), "strict",
                                  True)[0]
     self.params = params
     self.body = body
     self.returnstatement = returnstatement
     self.package = package
     self.line = line
     self.col = col
     self.compiler = bytecode.Compiler()
Ejemplo n.º 24
0
 def wrap_info(self, space):
     """Return the wrapped tuple ``(msg, (filename, lineno, offset, text,
     lastlineno))`` describing this error.

     When ``self.text`` is None but a filename is known, the offending
     line is read from that file by app-level code.  When ``self.text`` is
     set, it is decoded as UTF-8 in 'replace' mode and the byte offset is
     converted to an offset into the decoded text.
     """
     w_text = w_filename = space.w_None
     offset = self.offset
     w_lineno = space.newint(self.lineno)
     if self.filename is not None:
         w_filename = space.newfilename(self.filename)
     if self.text is None and self.filename is not None:
         # No source text recorded: fetch line ``lineno`` from the file at
         # app level.  Any failure there yields None instead of raising.
         w_text = space.appexec([w_filename, w_lineno],
                                """(filename, lineno):
                 try:
                     with open(filename) as f:
                         for _ in range(lineno - 1):
                             f.readline()
                         return f.readline()
                 except:  # we can't allow any exceptions here!
                     return None""")
     elif self.text is not None:
         from rpython.rlib.runicode import str_decode_utf_8
         # self.text may not be UTF-8 in case of decoding errors.
         # adjust the encoded text offset to a decoded offset
         # XXX do the right thing about continuation lines, which
         # XXX are their own fun, sometimes giving offset >
         # XXX len(self.text) for example (right now, avoid crashing)
         if offset > len(self.text):
             offset = len(self.text)
         # Decode only the first ``offset`` bytes; the length of that
         # decoded prefix is the offset in decoded characters.
         text, _ = str_decode_utf_8(self.text, offset, 'replace')
         offset = len(text)
         # Decode the full text unless the prefix length already equals
         # the byte length of self.text.
         if len(self.text) != offset:
             text, _ = str_decode_utf_8(self.text, len(self.text),
                                        'replace')
         w_text = space.newunicode(text)
     return space.newtuple([
         space.newtext(self.msg),
         space.newtuple([
             w_filename, w_lineno,
             space.newint(offset), w_text,
             space.newint(self.lastlineno)
         ])
     ])
Ejemplo n.º 25
0
def decode_utf8(space, string):
    """Strictly decode ``string`` as UTF-8 and return the unicode result.

    Surrogates are accepted and not treated specially at all: if there
    happen to be two 3-byte sequences encoding a pair of surrogates, the
    result still contains two surrogate unicode characters.  These are the
    Python 2 rules; Python 3 differs.
    """
    handler = decode_error_handler(space)
    return runicode.str_decode_utf_8(
        string, len(string), "strict", final=True,
        errorhandler=handler, allow_surrogates=True)[0]
Ejemplo n.º 26
0
 def create_arrays(self, text):
     """Split ``text`` into character, word and line arrays.

     ``text`` is decoded as strict UTF-8 and scanned once.  Three flat
     lists of alternating ``SodaInt`` index / ``SodaString`` value entries
     are built and stored, each wrapped in a ``SodaArray``, in
     ``self.textarrays`` in the order characters, words, lines.
     """
     self.textarrays = []
     chars, words, lines = [], [], []
     word_parts, line_parts = [], []
     char_no, word_no, line_no = 0, 0, 0
     a = rbigint()
     decoded = str_decode_utf_8(text, len(text), "strict", True)[0]
     for ch in decoded:
         if ch == " " and word_parts:
             # a space closes the pending word but stays part of the line
             words.append(SodaInt(a.fromint(word_no)))
             words.append(SodaString(u"".join(word_parts)))
             word_parts = []
             word_no += 1
             line_parts.append(ch)
         elif ch == "\n" and line_parts:
             # a newline closes the pending line, then any pending word;
             # the newline itself is added to neither
             lines.append(SodaInt(a.fromint(line_no)))
             lines.append(SodaString(u"".join(line_parts)))
             line_parts = []
             line_no += 1
             if word_parts:
                 words.append(SodaInt(a.fromint(word_no)))
                 words.append(SodaString(u"".join(word_parts)))
                 word_parts = []
                 word_no += 1
         else:
             word_parts.append(ch)
             line_parts.append(ch)
         # every character is recorded, whichever branch handled it
         chars.append(SodaInt(a.fromint(char_no)))
         chars.append(SodaString(ch))
         char_no += 1
     # flush whatever is still pending at the end of the text
     if word_parts:
         words.append(SodaInt(a.fromint(word_no)))
         words.append(SodaString(u"".join(word_parts)))
     if line_parts:
         lines.append(SodaInt(a.fromint(line_no)))
         lines.append(SodaString(u"".join(line_parts)))
     self.textarrays.append(SodaArray(chars))
     self.textarrays.append(SodaArray(words))
     self.textarrays.append(SodaArray(lines))
Ejemplo n.º 27
0
def str_decode_utf8(rope):
    """Decode a byte rope as strict UTF-8 into a unicode rope.

    Returns ``rope`` itself when it is pure ASCII, a new rope holding the
    decoded text when decoding succeeds, and None when the bytes are not
    valid UTF-8.
    """
    from rpython.rlib.runicode import str_decode_utf_8
    if rope.is_ascii():
        return rope
    elif isinstance(rope, BinaryConcatNode):
        lresult = str_decode_utf8(rope.left)
        if lresult is not None:
            rresult = str_decode_utf8(rope.right)
            if rresult is None:
                # The left half decoded completely, so a failure in the
                # right half means this subtree cannot be decoded on its
                # own.  Report that instead of building a node with a
                # None child.
                return None
            return BinaryConcatNode(lresult, rresult)
        # The left half failed on its own (for instance a character split
        # across the two halves): fall through and decode the flattened
        # string as a whole.
    elif isinstance(rope, LiteralStringNode):
        try:
            result, consumed = str_decode_utf_8(rope.s, len(rope.s), "strict",
                                                False)
        except UnicodeDecodeError:
            return None
        # final=False leaves an incomplete trailing sequence unconsumed
        if consumed < len(rope.s):
            return None
        return rope_from_unicode(result)
    s = rope.flatten_string()
    try:
        result, consumed = str_decode_utf_8(s, len(s), "strict", True)
        return rope_from_unicode(result)
    except UnicodeDecodeError:
        return None
Ejemplo n.º 28
0
def str_decode_utf8(rope):
    """Decode a byte rope as strict UTF-8 into a unicode rope.

    Returns ``rope`` itself when it is pure ASCII, a new rope holding the
    decoded text when decoding succeeds, and None when the bytes are not
    valid UTF-8.
    """
    from rpython.rlib.runicode import str_decode_utf_8
    if rope.is_ascii():
        return rope
    elif isinstance(rope, BinaryConcatNode):
        lresult = str_decode_utf8(rope.left)
        if lresult is not None:
            rresult = str_decode_utf8(rope.right)
            if rresult is None:
                # The left half decoded completely, so a failure in the
                # right half means this subtree cannot be decoded on its
                # own.  Report that instead of building a node with a
                # None child.
                return None
            return BinaryConcatNode(lresult, rresult)
        # The left half failed on its own (for instance a character split
        # across the two halves): fall through and decode the flattened
        # string as a whole.
    elif isinstance(rope, LiteralStringNode):
        try:
            result, consumed = str_decode_utf_8(rope.s, len(rope.s), "strict",
                                                False)
        except UnicodeDecodeError:
            return None
        # final=False leaves an incomplete trailing sequence unconsumed
        if consumed < len(rope.s):
            return None
        return rope_from_unicode(result)
    s = rope.flatten_string()
    try:
        result, consumed = str_decode_utf_8(s, len(s), "strict", True)
        return rope_from_unicode(result)
    except UnicodeDecodeError:
        return None
Ejemplo n.º 29
0
def _test_check_utf8(s, allow_surrogates):
    """Check ``rutf8._check_utf8`` against ``runicode.str_decode_utf_8``.

    ``s`` is decoded strictly with runicode to learn whether it is valid
    and, if not, where the error starts.  ``_check_utf8`` must then agree:
    a negative result whose bitwise complement is the error position for
    invalid input, and otherwise a length equal to the number of decoded
    characters (compared only on wide builds or when ``s`` has no
    surrogates).
    """
    # Captured inside the handler: the name bound by ``except ... as`` must
    # not be relied upon after the except block (Python 3 unbinds it).
    error_start = -1
    try:
        u, _ = runicode.str_decode_utf_8(s, len(s), None, final=True,
                                         allow_surrogates=allow_surrogates)
        valid = True
    except UnicodeDecodeError as e:
        valid = False
        error_start = e.start
    length = rutf8._check_utf8(s, allow_surrogates, 0, len(s))
    if length < 0:
        assert not valid
        assert ~(length) == error_start
    else:
        assert valid
        if sys.maxunicode == 0x10FFFF or not _has_surrogates(s):
            assert length == len(u)
Ejemplo n.º 30
0
def decode_utf8(space, string, allow_surrogates=False):
    """Strictly decode the byte string ``string`` as UTF-8.

    Note that Python 3 tends to forbid *all* surrogates in UTF-8.  With
    ``allow_surrogates=True`` the Python 2 behavior is used instead:
    surrogates are accepted and not treated specially at all, so two
    3-byte sequences encoding a surrogate pair still give two surrogate
    unicode characters in the result.
    """
    assert isinstance(string, str)
    handler = decode_error_handler(space)
    return runicode.str_decode_utf_8(
        string, len(string), "strict", final=True,
        errorhandler=handler, allow_surrogates=allow_surrogates)[0]
Ejemplo n.º 31
0
 def wrap_info(self, space):
     """Return the wrapped tuple ``(msg, (filename, lineno, offset, text,
     lastlineno))``; text and filename are None when not set."""
     w_text = space.w_None
     w_filename = space.w_None
     if self.text is not None:
         from rpython.rlib.runicode import str_decode_utf_8
         # self.text may not be UTF-8 in case of decoding errors
         decoded = str_decode_utf_8(self.text, len(self.text), 'replace')[0]
         w_text = space.wrap(decoded)
     if self.filename is not None:
         w_filename = space.fsdecode(space.wrapbytes(self.filename))
     w_msg = space.wrap(self.msg)
     w_details = space.newtuple([
         w_filename,
         space.wrap(self.lineno),
         space.wrap(self.offset),
         w_text,
         space.wrap(self.lastlineno),
     ])
     return space.newtuple([w_msg, w_details])
Ejemplo n.º 32
0
 def wrap_info(self, space):
     """Return the wrapped tuple ``(msg, (filename, lineno, offset, text,
     lastlineno))``; text and filename are None when not set."""
     w_text = space.w_None
     w_filename = space.w_None
     if self.text is not None:
         from rpython.rlib.runicode import str_decode_utf_8
         # self.text may not be UTF-8 in case of decoding errors
         decoded = str_decode_utf_8(self.text, len(self.text), 'replace')[0]
         w_text = space.wrap(decoded)
     if self.filename is not None:
         w_filename = space.fsdecode(space.wrapbytes(self.filename))
     w_msg = space.wrap(self.msg)
     w_details = space.newtuple([
         w_filename,
         space.wrap(self.lineno),
         space.wrap(self.offset),
         w_text,
         space.wrap(self.lastlineno),
     ])
     return space.newtuple([w_msg, w_details])
def decode_object(space, w_obj, encoding, errors):
    """Decode ``w_obj`` using ``encoding`` and ``errors``.

    ``encoding`` defaults to ``getdefaultencoding(space)``.  Strict "ascii"
    and strict "utf-8" are decoded directly from the character buffer;
    anything else goes through ``_codecs.decode``.
    """
    if encoding is None:
        encoding = getdefaultencoding(space)
    strict = errors is None or errors == "strict"
    if strict and encoding == "ascii":
        # XXX error handling
        data = space.charbuf_w(w_obj)
        handler = unicodehelper.decode_error_handler(space)
        decoded = str_decode_ascii(
            data, len(data), None, final=True, errorhandler=handler)[0]
        return space.wrap(decoded)
    if strict and encoding == "utf-8":
        data = space.charbuf_w(w_obj)
        handler = unicodehelper.decode_error_handler(space)
        decoded = str_decode_utf_8(
            data, len(data), None, final=True, errorhandler=handler,
            allow_surrogates=True)[0]
        return space.wrap(decoded)
    w_codecs = space.getbuiltinmodule("_codecs")
    w_decode = space.getattr(w_codecs, space.wrap("decode"))
    w_encoding = space.wrap(encoding)
    if errors is None:
        return space.call_function(w_decode, w_obj, w_encoding)
    return space.call_function(w_decode, w_obj, w_encoding,
                               space.wrap(errors))
Ejemplo n.º 34
0
        def f(x):
            # Decode and re-encode ``x`` copies of a multi-byte UTF-8 sample
            # and report whether the round trip reproduces the input.
            sample = "\xd7\x90\xd6\x96\xeb\x96\x95\xf0\x90\x91\x93"
            s1 = "".join([sample] * x)
            decoded = runicode.str_decode_utf_8(s1, len(s1), True)[0]
            s2 = runicode.unicode_encode_utf_8(decoded, len(decoded), True)
            return s1 == s2
Ejemplo n.º 35
0
def unicode_from_utf8(s):
    """Return the `unicode` value of the `str` value `s`, decoded as strict UTF-8."""
    return str_decode_utf_8(s, len(s), 'strict')[0]
Ejemplo n.º 36
0
def decode_utf8(space, string, allow_surrogates=False):
    """Strictly decode ``string`` as UTF-8 and return the unicode result.

    Errors go through ``decode_error_handler(space)``; ``allow_surrogates``
    is passed on to the decoder unchanged.
    """
    handler = decode_error_handler(space)
    decoded = runicode.str_decode_utf_8(
        string, len(string), "strict", final=True,
        errorhandler=handler, allow_surrogates=allow_surrogates)
    return decoded[0]
Ejemplo n.º 37
0
 def interpstr_start(state, p):
     """Start interpolated-string contents from the string token ``p[0]``.

     The joined token text must pass a strict UTF-8 decode (the result is
     discarded; a ``UnicodeDecodeError`` propagates).
     """
     text = ''.join(p[0].get_strparts())
     str_decode_utf_8(text, len(text), 'strict', final=True)
     strings = [text]
     exprs = []
     return ast.InterpStringContents(strings, exprs)
Ejemplo n.º 38
0
 def interpstr_part(state, p):
     """Extend the contents in ``p[0]`` with expression ``p[2]`` and the
     string token ``p[4]``.

     The joined token text must pass a strict UTF-8 decode (the result is
     discarded; a ``UnicodeDecodeError`` propagates).
     """
     text = ''.join(p[4].get_strparts())
     str_decode_utf_8(text, len(text), 'strict', final=True)
     previous = p[0]
     strings = previous.get_strings() + [text]
     exprs = previous.get_exprs() + [p[2]]
     return ast.InterpStringContents(strings, exprs)
Ejemplo n.º 39
0
def raw_encode_basestring_ascii(space, w_string):
    """Escape ``w_string`` into an ASCII-only text object.

    ``w_string`` may be a bytes object (decoded as UTF-8) or a unicode
    object.  Double quotes and backslashes are backslash-escaped,
    characters below space are replaced via ``ESCAPE_BEFORE_SPACE``, and
    everything above '~' is written as ``\\uXXXX`` escapes (a surrogate
    pair for code points above U+FFFF).  A bytes object made only of
    non-special printable ASCII is returned unchanged.
    """
    if space.isinstance_w(w_string, space.w_bytes):
        s = space.bytes_w(w_string)
        # Find the index of the first byte that needs escaping or decoding.
        for i in range(len(s)):
            c = s[i]
            if c >= ' ' and c <= '~' and c != '"' and c != '\\':
                pass
            else:
                first = i
                break
        else:
            # the input is a string with only non-special ascii chars
            return w_string

        eh = unicodehelper.decode_error_handler(space)
        u = str_decode_utf_8(s,
                             len(s),
                             None,
                             final=True,
                             errorhandler=eh,
                             allow_surrogates=True)[0]
        sb = StringBuilder(len(u))
        # The first ``first`` bytes passed the check above, so they are
        # copied verbatim; the loop below resumes at index ``first``.
        sb.append_slice(s, 0, first)
    else:
        # We used to check if 'u' contains only safe characters, and return
        # 'w_string' directly.  But this requires an extra pass over all
        # characters, and the expected use case of this function, from
        # json.encoder, will anyway re-encode a unicode result back to
        # a string (with the ascii encoding).  This requires two passes
        # over the characters.  So we may as well directly turn it into a
        # string here --- only one pass.
        u = space.unicode_w(w_string)
        sb = StringBuilder(len(u))
        first = 0

    for i in range(first, len(u)):
        c = ord(u[i])
        if c <= ord('~'):
            if c == ord('"') or c == ord('\\'):
                # emit the escaping backslash; the character itself is
                # appended below
                sb.append('\\')
            elif c < ord(' '):
                sb.append(ESCAPE_BEFORE_SPACE[c])
                continue
            sb.append(chr(c))
        else:
            if c <= ord(u'\uffff'):
                # four hex digits, most significant nibble first
                sb.append('\\u')
                sb.append(HEX[c >> 12])
                sb.append(HEX[(c >> 8) & 0x0f])
                sb.append(HEX[(c >> 4) & 0x0f])
                sb.append(HEX[c & 0x0f])
            else:
                # surrogate pair
                n = c - 0x10000
                s1 = 0xd800 | ((n >> 10) & 0x3ff)
                # the leading 'd' digit of each surrogate is written as
                # part of the literal prefix
                sb.append('\\ud')
                sb.append(HEX[(s1 >> 8) & 0x0f])
                sb.append(HEX[(s1 >> 4) & 0x0f])
                sb.append(HEX[s1 & 0x0f])
                s2 = 0xdc00 | (n & 0x3ff)
                sb.append('\\ud')
                sb.append(HEX[(s2 >> 8) & 0x0f])
                sb.append(HEX[(s2 >> 4) & 0x0f])
                sb.append(HEX[s2 & 0x0f])

    res = sb.build()
    return space.newtext(res)
Ejemplo n.º 40
0
Archivo: nano.py Proyecto: dckc/typhon
    def decodeNextTag(self, stream):
        """Decode the next tagged node from ``stream``.

        Reads one tag byte and, depending on it, the node's fields.  The
        resulting ``MastIR`` node is appended to ``self.patts`` for
        pattern tags ('P') and to ``self.exprs`` for every other tag.
        When ``self.noisy`` is set, the tags and the current top pattern
        and expression are printed.

        Raises ``InvalidMAST`` on an unknown tag, literal tag or pattern
        tag, or when a character literal cannot be decoded as UTF-8.
        """
        tag = stream.nextByte()
        if self.noisy:
            print "Tag:", tag

        if tag == 'L':
            # Literal.
            literalTag = stream.nextByte()
            if self.noisy:
                print "Literal tag:", literalTag

            if literalTag == 'C':
                # Character. Read bytes one-at-a-time until a code point has
                # been decoded successfully.
                buf = stream.nextByte()
                try:
                    rv, count = str_decode_utf_8(buf, len(buf), None)
                    # An empty result means the sequence is still
                    # incomplete; append another byte and retry.
                    while rv == u'':
                        buf += stream.nextByte()
                        rv, count = str_decode_utf_8(buf, len(buf), None)
                except UnicodeDecodeError:
                    raise InvalidMAST("Couldn't decode char %s" % buf)
                self.exprs.append(MastIR.CharExpr(rv))
            elif literalTag == 'D':
                # Double.
                self.exprs.append(MastIR.DoubleExpr(stream.nextDouble()))
            elif literalTag == 'I':
                # Int. Read a varint and un-zz it.
                bi = stream.nextVarInt()
                shifted = bi.rshift(1)
                if bi.int_and_(1).toint():
                    shifted = shifted.int_xor(-1)
                self.exprs.append(MastIR.IntExpr(shifted))
            elif literalTag == 'N':
                # Null.
                self.exprs.append(MastIR.NullExpr())
            elif literalTag == 'S':
                # Str.
                s = stream.nextStr()
                self.exprs.append(MastIR.StrExpr(s))
            else:
                raise InvalidMAST("Didn't know literal tag %s" % literalTag)
        elif tag == 'P':
            # Pattern.
            pattTag = stream.nextByte()
            if self.noisy:
                print "Pattern tag:", pattTag

            if pattTag == 'F':
                # Final.
                name = stream.nextStr()
                guard = self.nextExpr(stream)
                self.patts.append(MastIR.FinalPatt(name, guard))
            elif pattTag == 'I':
                # Ignore.
                guard = self.nextExpr(stream)
                self.patts.append(MastIR.IgnorePatt(guard))
            elif pattTag == 'V':
                # Var.
                name = stream.nextStr()
                guard = self.nextExpr(stream)
                self.patts.append(MastIR.VarPatt(name, guard))
            elif pattTag == 'L':
                # List.
                patts = self.nextPatts(stream)
                self.patts.append(MastIR.ListPatt(patts))
            elif pattTag == 'A':
                # Via.
                expr = self.nextExpr(stream)
                patt = self.nextPatt(stream)
                self.patts.append(MastIR.ViaPatt(expr, patt))
            elif pattTag == 'B':
                # Binding.
                name = stream.nextStr()
                self.patts.append(MastIR.BindingPatt(name))
            else:
                raise InvalidMAST("Didn't know pattern tag %s" % pattTag)
        elif tag == 'N':
            # Noun.
            s = stream.nextStr()
            self.exprs.append(MastIR.NounExpr(s))
        elif tag == 'B':
            # Binding.
            s = stream.nextStr()
            self.exprs.append(MastIR.BindingExpr(s))
        elif tag == 'S':
            # Sequence.
            exprs = self.nextExprs(stream)
            self.exprs.append(MastIR.SeqExpr(exprs))
        elif tag == 'C':
            # Call.
            target = self.nextExpr(stream)
            verb = stream.nextStr()
            args = self.nextExprs(stream)
            namedArgs = self.nextNamedExprs(stream)
            self.exprs.append(MastIR.CallExpr(target, verb, args, namedArgs))
        elif tag == 'D':
            # Def.
            patt = self.nextPatt(stream)
            exit = self.nextExpr(stream)
            expr = self.nextExpr(stream)
            self.exprs.append(MastIR.DefExpr(patt, exit, expr))
        elif tag == 'e':
            # Escape (no catch).
            escapePatt = self.nextPatt(stream)
            escapeExpr = self.nextExpr(stream)
            self.exprs.append(MastIR.EscapeOnlyExpr(escapePatt, escapeExpr))
        elif tag == 'E':
            # Escape (with catch).
            escapePatt = self.nextPatt(stream)
            escapeExpr = self.nextExpr(stream)
            catchPatt = self.nextPatt(stream)
            catchExpr = self.nextExpr(stream)
            self.exprs.append(MastIR.EscapeExpr(escapePatt, escapeExpr,
                                                catchPatt, catchExpr))
        elif tag == 'O':
            # Object with no script, just direct methods and matchers.
            doc = stream.nextStr()
            patt = self.nextPatt(stream)
            asExpr = self.nextExpr(stream)
            implements = self.nextExprs(stream)
            methods = self.nextMethods(stream)
            matchers = self.nextMatchers(stream)
            self.exprs.append(MastIR.ObjectExpr(doc, patt,
                                                [asExpr] + implements,
                                                methods, matchers))
        elif tag == 'M':
            # Method.
            doc = stream.nextStr()
            verb = stream.nextStr()
            patts = self.nextPatts(stream)
            namedPatts = [MastIR.NamedPattern(key, value, default)
                          for (key, value, default)
                          in self.nextNamedPatts(stream)]
            guard = self.nextExpr(stream)
            block = self.nextExpr(stream)
            self.exprs.append(MastIR.MethodExpr(doc, verb, patts, namedPatts,
                                                guard, block))
        elif tag == 'R':
            # Matcher.
            patt = self.nextPatt(stream)
            block = self.nextExpr(stream)
            self.exprs.append(MastIR.MatcherExpr(patt, block))
        elif tag == 'A':
            # Assign.
            target = stream.nextStr()
            expr = self.nextExpr(stream)
            self.exprs.append(MastIR.AssignExpr(target, expr))
        elif tag == 'F':
            # Try/finally.
            tryExpr = self.nextExpr(stream)
            finallyExpr = self.nextExpr(stream)
            self.exprs.append(MastIR.FinallyExpr(tryExpr, finallyExpr))
        elif tag == 'Y':
            # Try/catch.
            tryExpr = self.nextExpr(stream)
            catchPatt = self.nextPatt(stream)
            catchExpr = self.nextExpr(stream)
            self.exprs.append(MastIR.TryExpr(tryExpr, catchPatt, catchExpr))
        elif tag == 'H':
            # Hide.
            expr = self.nextExpr(stream)
            self.exprs.append(MastIR.HideExpr(expr))
        elif tag == 'I':
            # If/then/else.
            cond = self.nextExpr(stream)
            cons = self.nextExpr(stream)
            alt = self.nextExpr(stream)
            self.exprs.append(MastIR.IfExpr(cond, cons, alt))
        elif tag == 'T':
            # Meta state.
            self.exprs.append(MastIR.MetaStateExpr())
        elif tag == 'X':
            # Meta context.
            self.exprs.append(MastIR.MetaContextExpr())
        else:
            raise InvalidMAST("Didn't know tag %s" % tag)

        # Debug output: show what is currently on top of each stack.
        if self.noisy:
            if self.patts:
                print "Top pattern:", self.patts[-1]
            else:
                print "No patterns yet"
            if self.exprs:
                print "Top expression:", self.exprs[-1]
            else:
                print "No expressions yet"
Ejemplo n.º 41
0
 def interpstr_part(state, p):
     """Extend the contents in ``p[0]`` with expression ``p[2]`` and the
     string token ``p[4]``.

     The joined token text must pass a strict UTF-8 decode (the result is
     discarded; a ``UnicodeDecodeError`` propagates).
     """
     text = ''.join(p[4].get_strparts())
     str_decode_utf_8(text, len(text), 'strict', final=True)
     previous = p[0]
     strings = previous.get_strings() + [text]
     exprs = previous.get_exprs() + [p[2]]
     return ast.InterpStringContents(strings, exprs)
Ejemplo n.º 42
0
def unicode_from_utf8(s):
    """Converts a `str` value to a `unicode` value assuming it's encoded in UTF8.

    `s` is treated as the complete input, so decoding is done with
    ``final=True``: a truncated multi-byte sequence at the end of `s` is
    reported by the strict decoder instead of being left unconsumed and
    silently missing from the result.

    Returns only the decoded value; the consumed-byte count reported by the
    decoder is discarded.
    """
    # final=True: there is no follow-up chunk that could complete a dangling
    # sequence, and the consumed count is not handed back to the caller.
    res, _ = str_decode_utf_8(s, len(s), 'strict', final=True)
    return res
Ejemplo n.º 43
0
 def interpstr_start(state, p):
     """Begin interpolated-string contents from the leading literal piece.

     The text is assembled from the string parts of ``p[0]``.  The returned
     ``ast.InterpStringContents`` holds that single string and an empty
     expression list; ``state`` is not used here.
     """
     text = ''.join(p[0].get_strparts())
     # The decoded value is not needed; the strict decode is run purely as a
     # well-formedness check on the UTF-8 bytes.
     str_decode_utf_8(text, len(text), 'strict', final=True)
     strings = [text]
     return ast.InterpStringContents(strings, [])
Ejemplo n.º 44
0
 def expression_string(state, p):
     """Build an ``ast.String`` node from the string parts of ``p[1]``.

     The node carries the joined text and the source position computed by
     ``sr(p)``; ``state`` is not used here.
     """
     text = ''.join(p[1].get_strparts())
     # Run the strict UTF-8 decoder only for validation; its output is
     # discarded and the node keeps the undecoded text.
     str_decode_utf_8(text, len(text), 'strict', final=True)
     position = sr(p)
     return ast.String(text, srcpos=position)
Ejemplo n.º 45
0
def raw_encode_basestring_ascii(space, w_string):
    """Escape the contents of ``w_string`` into an ASCII-only string.

    ``w_string`` is either a wrapped byte string (``space.w_str``) or is
    read through ``space.unicode_w``.  Characters from ``' '`` to ``'~'``
    are copied as they are, except that ``"`` and ``\\`` get a backslash in
    front.  Characters below ``' '`` are replaced through the
    ``ESCAPE_BEFORE_SPACE`` table.  Everything above ``'~'`` becomes a
    ``\\uXXXX`` escape, and code points above U+FFFF become a surrogate
    pair of two such escapes.

    Returns ``w_string`` itself when it is a byte string with nothing to
    escape; otherwise returns the escaped result wrapped with
    ``space.wrap``.
    """
    if space.isinstance_w(w_string, space.w_str):
        raw = space.str_w(w_string)
        # Find the first byte that cannot simply be copied to the output.
        for idx in range(len(raw)):
            byte = raw[idx]
            if byte < ' ' or byte > '~' or byte == '"' or byte == '\\':
                first = idx
                break
        else:
            # Nothing but unescaped printable ASCII: hand the input back.
            return w_string

        handler = unicodehelper.decode_error_handler(space)
        decoded = str_decode_utf_8(
            raw, len(raw), None, final=True, errorhandler=handler,
            allow_surrogates=True)
        u = decoded[0]
        sb = StringBuilder(len(u))
        # Everything before `first` is plain ASCII, so the byte offset is
        # also the character offset into `u`; copy that prefix in one go.
        sb.append_slice(raw, 0, first)
    else:
        # An earlier version scanned `u` for unsafe characters and returned
        # `w_string` unchanged when there were none.  That costs a full extra
        # pass, and the expected caller (json.encoder) re-encodes a unicode
        # result to an ascii string anyway, which is a second pass.
        # Producing the string here directly needs only one pass.
        u = space.unicode_w(w_string)
        sb = StringBuilder(len(u))
        first = 0

    for pos in range(first, len(u)):
        code = ord(u[pos])
        if code > 0x7e:
            if code > 0xffff:
                # surrogate pair: two \udXXX escapes, high half first
                offset = code - 0x10000
                high = 0xd800 | ((offset >> 10) & 0x3ff)
                low = 0xdc00 | (offset & 0x3ff)
                sb.append('\\ud')
                sb.append(HEX[(high >> 8) & 0x0f])
                sb.append(HEX[(high >> 4) & 0x0f])
                sb.append(HEX[high & 0x0f])
                sb.append('\\ud')
                sb.append(HEX[(low >> 8) & 0x0f])
                sb.append(HEX[(low >> 4) & 0x0f])
                sb.append(HEX[low & 0x0f])
            else:
                sb.append('\\u')
                sb.append(HEX[code >> 12])
                sb.append(HEX[(code >> 8) & 0x0f])
                sb.append(HEX[(code >> 4) & 0x0f])
                sb.append(HEX[code & 0x0f])
            continue
        if code < 0x20:
            # Below ' ': the whole escape sequence comes from the table.
            sb.append(ESCAPE_BEFORE_SPACE[code])
            continue
        if code == 0x22 or code == 0x5c:
            # '"' and '\\' keep their character but gain a leading backslash.
            sb.append('\\')
        sb.append(chr(code))

    return space.wrap(sb.build())
Ejemplo n.º 46
0
def decode_str_utf8(string):
    """Decode ``string`` with ``runicode.str_decode_utf_8`` in "strict" mode.

    Only the decoded value is returned; the second element of the decoder's
    result is ignored.
    """
    # The trailing True is passed positionally, as the fourth argument of
    # the decoder (presumably its `final` flag -- confirm against runicode).
    decoded = runicode.str_decode_utf_8(string, len(string), "strict", True)
    return decoded[0]
Ejemplo n.º 47
0
        def f(x):
            """Round-trip a UTF-8 byte string through runicode.

            The input is ``x`` repetitions of a fixed byte sequence made of
            two 2-byte, one 3-byte and one 4-byte UTF-8 sequences.  It is
            decoded, encoded again, and the result compared with the input.
            Returns True when both byte strings are equal.
            """
            unit = "\xd7\x90\xd6\x96\xeb\x96\x95\xf0\x90\x91\x93"
            encoded = "".join([unit] * x)
            # NOTE(review): True is passed in the third position of both
            # calls, where sibling call sites pass an error-mode string such
            # as 'strict' -- confirm this is intended.
            decoded, consumed = runicode.str_decode_utf_8(
                encoded, len(encoded), True)
            reencoded = runicode.unicode_encode_utf_8(
                decoded, len(decoded), True)
            return encoded == reencoded