def expression_string_expand(state, p):
    """Build an ast.String from the string parts of p[1], validating UTF-8 first."""
    text = ''.join(p[1].get_strparts())
    try:
        # Decode purely for validation; the decoded result is discarded.
        str_decode_utf_8(text, len(text), 'strict', final=True)
    except UnicodeDecodeError:
        raise errorhandler(state, p[1], msg="Unicode error")
    return ast.String(text, srcpos=sr(p))
def expression_string_expand(state, p):
    """Build an ast.String from the string parts of p[1], validating UTF-8 first."""
    text = ''.join(p[1].get_strparts())
    try:
        # Decode purely for validation; the decoded result is discarded.
        str_decode_utf_8(text, len(text), 'strict', final=True)
    except UnicodeDecodeError:
        raise errorhandler(state, p[1], msg="Unicode error")
    return ast.String(text, srcpos=sr(p))
def f(x):
    """Round-trip a repeated UTF-8 byte pattern with and without surrogates.

    Returns True when both decode/encode round trips reproduce the input.
    """
    payload = "".join(["\xd7\x90\xd6\x96\xeb\x96\x95\xf0\x90\x91\x93"] * x)
    dec_surr, _ = runicode.str_decode_utf_8(
        payload, len(payload), 'strict', allow_surrogates=True)
    enc_surr = runicode.unicode_encode_utf_8(
        dec_surr, len(dec_surr), 'strict', allow_surrogates=True)
    dec_plain, _ = runicode.str_decode_utf_8(
        payload, len(payload), 'strict', allow_surrogates=False)
    enc_plain = runicode.unicode_encode_utf_8(
        dec_plain, len(dec_plain), 'strict', allow_surrogates=False)
    return payload == enc_surr == enc_plain
def f(x):
    """Round-trip a repeated UTF-8 byte pattern with and without surrogates.

    Returns True when both decode/encode round trips reproduce the input.
    """
    payload = "".join(["\xd7\x90\xd6\x96\xeb\x96\x95\xf0\x90\x91\x93"] * x)
    dec_surr, _ = runicode.str_decode_utf_8(
        payload, len(payload), 'strict', allow_surrogates=True)
    enc_surr = runicode.unicode_encode_utf_8(
        dec_surr, len(dec_surr), 'strict', allow_surrogates=True)
    dec_plain, _ = runicode.str_decode_utf_8(
        payload, len(payload), 'strict', allow_surrogates=False)
    enc_plain = runicode.unicode_encode_utf_8(
        dec_plain, len(dec_plain), 'strict', allow_surrogates=False)
    return payload == enc_surr == enc_plain
def __init__(self, value, reference, package, line, col):
    """Store the decoded identifier, its reference (or package name), and position."""
    raw = value.getstr()
    iden, _ = str_decode_utf_8(raw, len(raw), "strict", True)
    self.value = iden
    if reference is not None:
        raw = reference.getstr()
        ref, _ = str_decode_utf_8(raw, len(raw), "strict", True)
    else:
        # No explicit reference: fall back to the package name.
        ref = unicode(package)
    self.reference = ref
    self.package = package
    self.line = line
    self.col = col
def decode_object(space, w_obj, encoding, errors):
    """Decode w_obj to unicode, fast-pathing strict ascii/utf-8 decoding."""
    if encoding is None:
        encoding = getdefaultencoding(space)
    if errors is None or errors == 'strict':
        if encoding == 'ascii':
            # XXX error handling
            s = space.charbuf_w(w_obj)
            try:
                u = fast_str_decode_ascii(s)
            except ValueError:
                # try again, to get the error right
                eh = unicodehelper.decode_error_handler(space)
                u = str_decode_ascii(s, len(s), None, final=True,
                                     errorhandler=eh)[0]
            return space.newunicode(u)
        if encoding == 'utf-8':
            s = space.charbuf_w(w_obj)
            eh = unicodehelper.decode_error_handler(space)
            u = str_decode_utf_8(s, len(s), None, final=True,
                                 errorhandler=eh, allow_surrogates=True)[0]
            return space.newunicode(u)
    # General path: delegate to the _codecs module.
    w_codecs = space.getbuiltinmodule("_codecs")
    w_decode = space.getattr(w_codecs, space.newtext("decode"))
    if errors is None:
        return space.call_function(w_decode, w_obj, space.newtext(encoding))
    return space.call_function(w_decode, w_obj, space.newtext(encoding),
                               space.newtext(errors))
def __init__(self, value, package, line, col):
    """Store the decoded identifier plus its package and source position."""
    raw = value.getstr()
    iden, _ = str_decode_utf_8(raw, len(raw), "strict", True)
    self.value = iden
    self.package = package
    self.line = line
    self.col = col
def fsdecode(space, w_string):
    """Decode a wrapped bytes object using the platform filesystem encoding."""
    state = space.fromcache(interp_codecs.CodecState)
    if _WIN32:
        raw = space.bytes_w(w_string)
        uni = str_decode_mbcs(raw, len(raw), 'strict',
                              errorhandler=decode_error_handler(space),
                              force_ignore=False)[0]
    elif _MACOSX:
        raw = space.bytes_w(w_string)
        uni = runicode.str_decode_utf_8(
            raw, len(raw), 'surrogateescape',
            errorhandler=state.decode_error_handler)[0]
    elif state.codec_need_encodings:
        # bootstrap check: if the filesystem codec is implemented in
        # Python we cannot use it before the codecs are ready. use the
        # locale codec instead
        from pypy.module._codecs.locale import (
            str_decode_locale_surrogateescape)
        raw = space.bytes_w(w_string)
        uni = str_decode_locale_surrogateescape(
            raw, errorhandler=decode_error_handler(space))
    else:
        from pypy.module.sys.interp_encoding import getfilesystemencoding
        return space.call_method(w_string, 'decode',
                                 getfilesystemencoding(space),
                                 space.wrap('surrogateescape'))
    return space.wrap(uni)
def decode_object(space, w_obj, encoding, errors):
    """Decode w_obj to unicode, with shortcuts for strict ascii/utf-8."""
    if encoding is None:
        encoding = getdefaultencoding(space)
    if errors is None or errors == 'strict':
        if encoding == 'ascii':
            # XXX error handling
            s = space.charbuf_w(w_obj)
            eh = unicodehelper.decode_error_handler(space)
            return space.wrap(str_decode_ascii(s, len(s), None, final=True,
                                               errorhandler=eh)[0])
        if encoding == 'utf-8':
            s = space.charbuf_w(w_obj)
            eh = unicodehelper.decode_error_handler(space)
            return space.wrap(str_decode_utf_8(s, len(s), None, final=True,
                                               errorhandler=eh,
                                               allow_surrogates=True)[0])
    # General path: delegate to the _codecs module.
    w_codecs = space.getbuiltinmodule("_codecs")
    w_decode = space.getattr(w_codecs, space.wrap("decode"))
    if errors is None:
        return space.call_function(w_decode, w_obj, space.wrap(encoding))
    return space.call_function(w_decode, w_obj, space.wrap(encoding),
                               space.wrap(errors))
def decode_object(space, w_obj, encoding, errors):
    """Decode w_obj to unicode; delegate unusual encodings/errors to decode_text."""
    if encoding is None:
        encoding = getdefaultencoding(space)
    if errors is None or errors == 'strict':
        if encoding == 'ascii':
            s = space.charbuf_w(w_obj)
            try:
                u = fast_str_decode_ascii(s)
            except ValueError:
                # try again, to get the error right
                eh = unicodehelper.decode_error_handler(space)
                u = str_decode_ascii(s, len(s), None, final=True,
                                     errorhandler=eh)[0]
            return space.newunicode(u)
        if encoding == 'utf-8':
            s = space.charbuf_w(w_obj)
            eh = unicodehelper.decode_error_handler(space)
            return space.newunicode(
                str_decode_utf_8(s, len(s), None, final=True,
                                 errorhandler=eh)[0])
    from pypy.module._codecs.interp_codecs import decode_text
    w_retval = decode_text(space, w_obj, encoding, errors)
    if not space.isinstance_w(w_retval, space.w_unicode):
        raise oefmt(space.w_TypeError,
                    "'%s' decoder returned '%T' instead of 'str'; "
                    "use codecs.decode() to decode to arbitrary types",
                    encoding, w_retval)
    return w_retval
def decode_object(space, w_obj, encoding, errors):
    """Decode w_obj to unicode, fast-pathing strict ascii/utf-8 decoding."""
    if encoding is None:
        encoding = getdefaultencoding(space)
    if errors is None or errors == 'strict':
        if encoding == 'ascii':
            # XXX error handling
            s = space.charbuf_w(w_obj)
            try:
                u = fast_str_decode_ascii(s)
            except ValueError:
                # try again, to get the error right
                eh = unicodehelper.decode_error_handler(space)
                u = str_decode_ascii(s, len(s), None, final=True,
                                     errorhandler=eh)[0]
            return space.wrap(u)
        if encoding == 'utf-8':
            s = space.charbuf_w(w_obj)
            eh = unicodehelper.decode_error_handler(space)
            return space.wrap(str_decode_utf_8(s, len(s), None, final=True,
                                               errorhandler=eh)[0])
    # General path: delegate to the _codecs module.
    w_codecs = space.getbuiltinmodule("_codecs")
    w_decode = space.getattr(w_codecs, space.wrap("decode"))
    if errors is None:
        return space.call_function(w_decode, w_obj, space.wrap(encoding))
    return space.call_function(w_decode, w_obj, space.wrap(encoding),
                               space.wrap(errors))
def read_cached_string(self):
    """Read a string from the stream, consulting/updating the string cache."""
    sz = read_raw_integer(self)
    if sz >= MAX_STRING_SIZE:
        # Sizes past the limit are back-references into the cache.
        return self._str_cache[sz - MAX_STRING_SIZE]
    decoded, _ = str_decode_utf_8(self.read(sz), sz, "?")
    # Cache under the next sequential index for later back-references.
    self._str_cache[len(self._str_cache)] = decoded
    return decoded
def read_cached_string(self):
    """Read a string from the stream, consulting/updating the string cache."""
    sz = read_raw_integer(self)
    if sz >= MAX_STRING_SIZE:
        # Sizes past the limit are back-references into the cache.
        return self._str_cache[sz - MAX_STRING_SIZE]
    decoded, _ = str_decode_utf_8(self.read(sz), sz, "?")
    # Cache under the next sequential index for later back-references.
    self._str_cache[len(self._str_cache)] = decoded
    return decoded
def utf8_decoder_operate(decoder, newdata, final):
    """Incrementally decode UTF-8, keeping undecoded tail bytes buffered."""
    data = decoder.buffer + newdata
    try:
        string, pos = str_decode_utf_8(data, len(data), '', final=final)
    except UnicodeDecodeError as error:
        raise space.unwind(space.LError(u"unicode decode failed"))
    # Whatever was not consumed stays buffered for the next chunk.
    decoder.buffer = data[pos:]
    return string
def _decode_utf8(string):
    # when building the error message, don't crash if the byte string
    # provided is not valid UTF-8
    assert isinstance(string, str)
    decoded, _ = runicode.str_decode_utf_8(string, len(string), "replace",
                                           final=True)
    return decoded
def f(n):
    """Annotator exercise: decode with an errors mode chosen at runtime."""
    x = strings[n]
    errors = 'strict' if n else 'foo'
    # the annotation of y is SomeUnicodeString(can_be_None=False)
    y, _ = str_decode_utf_8(x, len(x), errors, errorhandler)
    return x.decode('utf-8') + y
def decode_utf8(space, string):
    """Strict-decode string as UTF-8, surrogates allowed (Python 2 rules)."""
    result, _ = runicode.str_decode_utf_8(
        string, len(string), "strict", final=True,
        errorhandler=decode_error_handler(space), allow_surrogates=True)
    return result
def f(n):
    """Annotator exercise: decode with an errors mode chosen at runtime."""
    x = strings[n]
    errors = 'strict' if n else 'foo'
    # the annotation of y is SomeUnicodeString(can_be_None=False)
    y, _ = str_decode_utf_8(x, len(x), errors, errorhandler=errorhandler)
    return x.decode('utf-8') + y
def decode_utf8(space, string):
    # Surrogates are accepted and not treated specially at all.
    # If there happen to be two 3-bytes encoding a pair of surrogates,
    # you still get two surrogate unicode characters in the result.
    # These are the Python2 rules; Python3 differs.
    result, _ = runicode.str_decode_utf_8(
        string, len(string), "strict", final=True,
        errorhandler=decode_error_handler(space), allow_surrogates=True)
    return result
def utf8_decoder_operate(decoder, newdata, final):
    """Incrementally decode UTF-8, buffering any trailing incomplete bytes."""
    data = decoder.buffer + newdata
    try:
        string, pos = str_decode_utf_8(data, len(data), '', final=final)
    except UnicodeDecodeError as error:
        raise space.unwind(space.LError(u"unicode decode failed"))
    # Added to satisfy PyPy 5.7
    # The implementation of str_decode_utf_8 perhaps changed?
    assert 0 <= pos <= len(data)
    decoder.buffer = data[pos:]
    return string
def utf_8_decode(space, string, errors="strict", w_final=None):
    """_codecs.utf_8_decode: return (decoded unicode, bytes consumed)."""
    if errors is None:
        errors = 'strict'
    final = space.is_true(w_final)
    state = space.fromcache(CodecState)
    result, consumed = runicode.str_decode_utf_8(
        string, len(string), errors, final,
        state.decode_error_handler, allow_surrogates=True)
    return space.newtuple([space.wrap(result), space.wrap(consumed)])
def utf_8_decode(space, string, errors="strict", w_final=None):
    """_codecs.utf_8_decode: return (decoded unicode, bytes consumed)."""
    if errors is None:
        errors = 'strict'
    final = space.is_true(w_final)
    state = space.fromcache(CodecState)
    result, consumed = runicode.str_decode_utf_8(
        string, len(string), errors, final,
        state.decode_error_handler, allow_surrogates=True)
    return space.newtuple([space.wrap(result), space.wrap(consumed)])
def __init__(self, name, params, body, returnstatement, package, line, col):
    """Record a function definition: decoded name, components, and position."""
    raw = name.getstr()
    iden, _ = str_decode_utf_8(raw, len(raw), "strict", True)
    self.name = iden
    self.params = params
    self.body = body
    self.returnstatement = returnstatement
    self.package = package
    self.line = line
    self.col = col
    # Each definition owns its own bytecode compiler instance.
    self.compiler = bytecode.Compiler()
def wrap_info(self, space):
    """Wrap (msg, (filename, lineno, offset, text, lastlineno)) for the app level."""
    w_text = w_filename = space.w_None
    offset = self.offset
    w_lineno = space.newint(self.lineno)
    if self.filename is not None:
        w_filename = space.newfilename(self.filename)
    if self.text is None and self.filename is not None:
        # No source text cached: re-read the offending line from the file.
        w_text = space.appexec([w_filename, w_lineno], """(filename, lineno):
            try:
                with open(filename) as f:
                    for _ in range(lineno - 1):
                        f.readline()
                    return f.readline()
            except:  # we can't allow any exceptions here!
                return None""")
    elif self.text is not None:
        from rpython.rlib.runicode import str_decode_utf_8
        # self.text may not be UTF-8 in case of decoding errors.
        # adjust the encoded text offset to a decoded offset
        # XXX do the right thing about continuation lines, which
        # XXX are their own fun, sometimes giving offset >
        # XXX len(self.text) for example (right now, avoid crashing)
        if offset > len(self.text):
            offset = len(self.text)
        text, _ = str_decode_utf_8(self.text, offset, 'replace')
        offset = len(text)
        if len(self.text) != offset:
            text, _ = str_decode_utf_8(self.text, len(self.text), 'replace')
        w_text = space.newunicode(text)
    return space.newtuple([
        space.newtext(self.msg),
        space.newtuple([
            w_filename, w_lineno, space.newint(offset), w_text,
            space.newint(self.lastlineno)
        ])
    ])
def decode_utf8(space, string):
    # Surrogates are accepted and not treated specially at all.
    # If there happen to be two 3-bytes encoding a pair of surrogates,
    # you still get two surrogate unicode characters in the result.
    # These are the Python2 rules; Python3 differs.
    result, _ = runicode.str_decode_utf_8(
        string, len(string), "strict", final=True,
        errorhandler=decode_error_handler(space), allow_surrogates=True)
    return result
def create_arrays(self, text):
    """Split text into per-char, per-word, and per-line Soda arrays.

    Each array interleaves an index (SodaInt) with the item (SodaString).
    """
    self.textarrays = []
    chars, words, lines = [], [], []
    wordbuffer, linebuffer = [], []
    i, j, k = 0, 0, 0
    a = rbigint()
    text, _ = str_decode_utf_8(text, len(text), "strict", True)
    for char in text:
        if char == " " and wordbuffer:
            # Space ends the current word; the space itself joins the line.
            word = u"".join(wordbuffer)
            words.append(SodaInt(a.fromint(j)))
            words.append(SodaString(word))
            wordbuffer = []
            j += 1
            chars.append(SodaInt(a.fromint(i)))
            chars.append(SodaString(char))
            linebuffer.append(char)
            i += 1
        elif char == "\n" and linebuffer:
            # Newline ends both the current line and any pending word.
            line = u"".join(linebuffer)
            lines.append(SodaInt(a.fromint(k)))
            lines.append(SodaString(line))
            linebuffer = []
            k += 1
            if wordbuffer:
                word = u"".join(wordbuffer)
                words.append(SodaInt(a.fromint(j)))
                words.append(SodaString(word))
                wordbuffer = []
                j += 1
            chars.append(SodaInt(a.fromint(i)))
            chars.append(SodaString(char))
            i += 1
        else:
            chars.append(SodaInt(a.fromint(i)))
            chars.append(SodaString(char))
            wordbuffer.append(char)
            linebuffer.append(char)
            i += 1
    # Flush whatever is left at end of input.
    if wordbuffer:
        word = u"".join(wordbuffer)
        words.append(SodaInt(a.fromint(j)))
        words.append(SodaString(word))
    if linebuffer:
        line = u"".join(linebuffer)
        lines.append(SodaInt(a.fromint(k)))
        lines.append(SodaString(line))
    self.textarrays.append(SodaArray(chars))
    self.textarrays.append(SodaArray(words))
    self.textarrays.append(SodaArray(lines))
def str_decode_utf8(rope):
    """Decode a rope of UTF-8 bytes into a unicode rope; None on failure."""
    from rpython.rlib.runicode import str_decode_utf_8
    if rope.is_ascii():
        # ASCII bytes already are valid unicode content.
        return rope
    elif isinstance(rope, BinaryConcatNode):
        lresult = str_decode_utf8(rope.left)
        if lresult is not None:
            return BinaryConcatNode(lresult, str_decode_utf8(rope.right))
        # Left half failed alone (e.g. split multibyte sequence):
        # fall through to the flattened decode below.
    elif isinstance(rope, LiteralStringNode):
        try:
            result, consumed = str_decode_utf_8(rope.s, len(rope.s),
                                                "strict", False)
        except UnicodeDecodeError:
            return None
        if consumed < len(rope.s):
            # Trailing incomplete sequence: this leaf cannot stand alone.
            return None
        return rope_from_unicode(result)
    s = rope.flatten_string()
    try:
        result, consumed = str_decode_utf_8(s, len(s), "strict", True)
        return rope_from_unicode(result)
    except UnicodeDecodeError:
        pass
def str_decode_utf8(rope):
    """Decode a rope of UTF-8 bytes into a unicode rope; None on failure."""
    from rpython.rlib.runicode import str_decode_utf_8
    if rope.is_ascii():
        # ASCII bytes already are valid unicode content.
        return rope
    elif isinstance(rope, BinaryConcatNode):
        lresult = str_decode_utf8(rope.left)
        if lresult is not None:
            return BinaryConcatNode(lresult, str_decode_utf8(rope.right))
        # Left half failed alone (e.g. split multibyte sequence):
        # fall through to the flattened decode below.
    elif isinstance(rope, LiteralStringNode):
        try:
            result, consumed = str_decode_utf_8(rope.s, len(rope.s),
                                                "strict", False)
        except UnicodeDecodeError:
            return None
        if consumed < len(rope.s):
            # Trailing incomplete sequence: this leaf cannot stand alone.
            return None
        return rope_from_unicode(result)
    s = rope.flatten_string()
    try:
        result, consumed = str_decode_utf_8(s, len(s), "strict", True)
        return rope_from_unicode(result)
    except UnicodeDecodeError:
        pass
def _test_check_utf8(s, allow_surrogates):
    """Cross-check rutf8._check_utf8 against runicode's reference decoder."""
    try:
        u, _ = runicode.str_decode_utf_8(s, len(s), None, final=True,
                                         allow_surrogates=allow_surrogates)
        valid = True
    except UnicodeDecodeError as e:
        valid = False
    length = rutf8._check_utf8(s, allow_surrogates, 0, len(s))
    if length < 0:
        # A negative result encodes the error position as ~start.
        assert not valid
        assert ~length == e.start
    else:
        assert valid
        if sys.maxunicode == 0x10FFFF or not _has_surrogates(s):
            assert length == len(u)
def decode_utf8(space, string, allow_surrogates=False):
    # Note that Python3 tends to forbid *all* surrogates in utf-8.
    # If allow_surrogates=True, then revert to the Python 2 behavior,
    # i.e. surrogates are accepted and not treated specially at all.
    # If there happen to be two 3-bytes encoding a pair of surrogates,
    # you still get two surrogate unicode characters in the result.
    assert isinstance(string, str)
    result, _ = runicode.str_decode_utf_8(
        string, len(string), "strict", final=True,
        errorhandler=decode_error_handler(space),
        allow_surrogates=allow_surrogates)
    return result
def wrap_info(self, space):
    """Wrap (msg, (filename, lineno, offset, text, lastlineno)) for the app level."""
    w_text = w_filename = space.w_None
    if self.text is not None:
        from rpython.rlib.runicode import str_decode_utf_8
        # self.text may not be UTF-8 in case of decoding errors
        w_text = space.wrap(str_decode_utf_8(self.text, len(self.text),
                                             'replace')[0])
    if self.filename is not None:
        w_filename = space.fsdecode(space.wrapbytes(self.filename))
    return space.newtuple([
        space.wrap(self.msg),
        space.newtuple([w_filename, space.wrap(self.lineno),
                        space.wrap(self.offset), w_text,
                        space.wrap(self.lastlineno)])])
def wrap_info(self, space):
    """Wrap (msg, (filename, lineno, offset, text, lastlineno)) for the app level."""
    w_text = w_filename = space.w_None
    if self.text is not None:
        from rpython.rlib.runicode import str_decode_utf_8
        # self.text may not be UTF-8 in case of decoding errors
        w_text = space.wrap(str_decode_utf_8(self.text, len(self.text),
                                             'replace')[0])
    if self.filename is not None:
        w_filename = space.fsdecode(space.wrapbytes(self.filename))
    return space.newtuple([
        space.wrap(self.msg),
        space.newtuple([w_filename, space.wrap(self.lineno),
                        space.wrap(self.offset), w_text,
                        space.wrap(self.lastlineno)])])
def decode_object(space, w_obj, encoding, errors):
    """Decode w_obj to unicode, with shortcuts for strict ascii/utf-8."""
    if encoding is None:
        encoding = getdefaultencoding(space)
    if errors is None or errors == "strict":
        if encoding == "ascii":
            # XXX error handling
            s = space.charbuf_w(w_obj)
            eh = unicodehelper.decode_error_handler(space)
            return space.wrap(str_decode_ascii(s, len(s), None, final=True,
                                               errorhandler=eh)[0])
        if encoding == "utf-8":
            s = space.charbuf_w(w_obj)
            eh = unicodehelper.decode_error_handler(space)
            return space.wrap(str_decode_utf_8(s, len(s), None, final=True,
                                               errorhandler=eh,
                                               allow_surrogates=True)[0])
    # General path: delegate to the _codecs module.
    w_codecs = space.getbuiltinmodule("_codecs")
    w_decode = space.getattr(w_codecs, space.wrap("decode"))
    if errors is None:
        return space.call_function(w_decode, w_obj, space.wrap(encoding))
    return space.call_function(w_decode, w_obj, space.wrap(encoding),
                               space.wrap(errors))
def f(x):
    """Round-trip a repeated UTF-8 byte pattern through decode/encode."""
    payload = "".join(["\xd7\x90\xd6\x96\xeb\x96\x95\xf0\x90\x91\x93"] * x)
    # NOTE(review): the third positional argument is `errors` in most
    # runicode revisions; passing True here looks suspicious -- confirm
    # against the runicode signature this snippet targets.
    u, consumed = runicode.str_decode_utf_8(payload, len(payload), True)
    reencoded = runicode.unicode_encode_utf_8(u, len(u), True)
    return payload == reencoded
def unicode_from_utf8(s):
    """Converts a `str` value to a `unicode` value assuming it's encoded in
    UTF8."""
    # NOTE(review): final=True is not passed, so a trailing incomplete
    # sequence is silently ignored instead of raising -- confirm intended.
    res, _ = str_decode_utf_8(s, len(s), 'strict')
    return res
def decode_utf8(space, string, allow_surrogates=False):
    """Strict UTF-8 decode using the space-specific error handler."""
    result, _ = runicode.str_decode_utf_8(
        string, len(string), "strict", final=True,
        errorhandler=decode_error_handler(space),
        allow_surrogates=allow_surrogates)
    return result
def interpstr_start(state, p):
    """Validate the leading literal chunk of an interpolated string."""
    val = ''.join(p[0].get_strparts())
    # Decode for validation only; raises UnicodeDecodeError on bad input.
    str_decode_utf_8(val, len(val), 'strict', final=True)
    return ast.InterpStringContents([val], [])
def interpstr_part(state, p):
    """Append a validated literal chunk and its expression to the contents."""
    val = ''.join(p[4].get_strparts())
    # Decode for validation only; raises UnicodeDecodeError on bad input.
    str_decode_utf_8(val, len(val), 'strict', final=True)
    return ast.InterpStringContents(p[0].get_strings() + [val],
                                    p[0].get_exprs() + [p[2]])
def raw_encode_basestring_ascii(space, w_string):
    """JSON-escape a bytes or unicode object into a pure-ASCII text string."""
    if space.isinstance_w(w_string, space.w_bytes):
        s = space.bytes_w(w_string)
        for i in range(len(s)):
            c = s[i]
            if c >= ' ' and c <= '~' and c != '"' and c != '\\':
                pass
            else:
                first = i
                break
        else:
            # the input is a string with only non-special ascii chars
            return w_string
        eh = unicodehelper.decode_error_handler(space)
        u = str_decode_utf_8(s, len(s), None, final=True, errorhandler=eh,
                             allow_surrogates=True)[0]
        sb = StringBuilder(len(u))
        # The prefix up to the first special char needs no escaping.
        sb.append_slice(s, 0, first)
    else:
        # We used to check if 'u' contains only safe characters, and return
        # 'w_string' directly. But this requires an extra pass over all
        # characters, and the expected use case of this function, from
        # json.encoder, will anyway re-encode a unicode result back to
        # a string (with the ascii encoding). This requires two passes
        # over the characters. So we may as well directly turn it into a
        # string here --- only one pass.
        u = space.unicode_w(w_string)
        sb = StringBuilder(len(u))
        first = 0
    for i in range(first, len(u)):
        c = ord(u[i])
        if c <= ord('~'):
            if c == ord('"') or c == ord('\\'):
                sb.append('\\')
            elif c < ord(' '):
                sb.append(ESCAPE_BEFORE_SPACE[c])
                continue
            sb.append(chr(c))
        else:
            if c <= ord(u'\uffff'):
                sb.append('\\u')
                sb.append(HEX[c >> 12])
                sb.append(HEX[(c >> 8) & 0x0f])
                sb.append(HEX[(c >> 4) & 0x0f])
                sb.append(HEX[c & 0x0f])
            else:
                # surrogate pair
                n = c - 0x10000
                s1 = 0xd800 | ((n >> 10) & 0x3ff)
                sb.append('\\ud')
                sb.append(HEX[(s1 >> 8) & 0x0f])
                sb.append(HEX[(s1 >> 4) & 0x0f])
                sb.append(HEX[s1 & 0x0f])
                s2 = 0xdc00 | (n & 0x3ff)
                sb.append('\\ud')
                sb.append(HEX[(s2 >> 8) & 0x0f])
                sb.append(HEX[(s2 >> 4) & 0x0f])
                sb.append(HEX[s2 & 0x0f])
    res = sb.build()
    return space.newtext(res)
def decodeNextTag(self, stream): tag = stream.nextByte() if self.noisy: print "Tag:", tag if tag == 'L': # Literal. literalTag = stream.nextByte() if self.noisy: print "Literal tag:", literalTag if literalTag == 'C': # Character. Read bytes one-at-a-time until a code point has # been decoded successfully. buf = stream.nextByte() try: rv, count = str_decode_utf_8(buf, len(buf), None) while rv == u'': buf += stream.nextByte() rv, count = str_decode_utf_8(buf, len(buf), None) except UnicodeDecodeError: raise InvalidMAST("Couldn't decode char %s" % buf) self.exprs.append(MastIR.CharExpr(rv)) elif literalTag == 'D': # Double. self.exprs.append(MastIR.DoubleExpr(stream.nextDouble())) elif literalTag == 'I': # Int. Read a varint and un-zz it. bi = stream.nextVarInt() shifted = bi.rshift(1) if bi.int_and_(1).toint(): shifted = shifted.int_xor(-1) self.exprs.append(MastIR.IntExpr(shifted)) elif literalTag == 'N': # Null. self.exprs.append(MastIR.NullExpr()) elif literalTag == 'S': # Str. s = stream.nextStr() self.exprs.append(MastIR.StrExpr(s)) else: raise InvalidMAST("Didn't know literal tag %s" % literalTag) elif tag == 'P': # Pattern. pattTag = stream.nextByte() if self.noisy: print "Pattern tag:", pattTag if pattTag == 'F': # Final. name = stream.nextStr() guard = self.nextExpr(stream) self.patts.append(MastIR.FinalPatt(name, guard)) elif pattTag == 'I': # Ignore. guard = self.nextExpr(stream) self.patts.append(MastIR.IgnorePatt(guard)) elif pattTag == 'V': # Var. name = stream.nextStr() guard = self.nextExpr(stream) self.patts.append(MastIR.VarPatt(name, guard)) elif pattTag == 'L': # List. patts = self.nextPatts(stream) self.patts.append(MastIR.ListPatt(patts)) elif pattTag == 'A': # Via. expr = self.nextExpr(stream) patt = self.nextPatt(stream) self.patts.append(MastIR.ViaPatt(expr, patt)) elif pattTag == 'B': # Binding. 
name = stream.nextStr() self.patts.append(MastIR.BindingPatt(name)) else: raise InvalidMAST("Didn't know pattern tag %s" % pattTag) elif tag == 'N': # Noun. s = stream.nextStr() self.exprs.append(MastIR.NounExpr(s)) elif tag == 'B': # Binding. s = stream.nextStr() self.exprs.append(MastIR.BindingExpr(s)) elif tag == 'S': # Sequence. exprs = self.nextExprs(stream) self.exprs.append(MastIR.SeqExpr(exprs)) elif tag == 'C': # Call. target = self.nextExpr(stream) verb = stream.nextStr() args = self.nextExprs(stream) namedArgs = self.nextNamedExprs(stream) self.exprs.append(MastIR.CallExpr(target, verb, args, namedArgs)) elif tag == 'D': # Def. patt = self.nextPatt(stream) exit = self.nextExpr(stream) expr = self.nextExpr(stream) self.exprs.append(MastIR.DefExpr(patt, exit, expr)) elif tag == 'e': # Escape (no catch). escapePatt = self.nextPatt(stream) escapeExpr = self.nextExpr(stream) self.exprs.append(MastIR.EscapeOnlyExpr(escapePatt, escapeExpr)) elif tag == 'E': # Escape (with catch). escapePatt = self.nextPatt(stream) escapeExpr = self.nextExpr(stream) catchPatt = self.nextPatt(stream) catchExpr = self.nextExpr(stream) self.exprs.append(MastIR.EscapeExpr(escapePatt, escapeExpr, catchPatt, catchExpr)) elif tag == 'O': # Object with no script, just direct methods and matchers. doc = stream.nextStr() patt = self.nextPatt(stream) asExpr = self.nextExpr(stream) implements = self.nextExprs(stream) methods = self.nextMethods(stream) matchers = self.nextMatchers(stream) self.exprs.append(MastIR.ObjectExpr(doc, patt, [asExpr] + implements, methods, matchers)) elif tag == 'M': # Method. doc = stream.nextStr() verb = stream.nextStr() patts = self.nextPatts(stream) namedPatts = [MastIR.NamedPattern(key, value, default) for (key, value, default) in self.nextNamedPatts(stream)] guard = self.nextExpr(stream) block = self.nextExpr(stream) self.exprs.append(MastIR.MethodExpr(doc, verb, patts, namedPatts, guard, block)) elif tag == 'R': # Matcher. 
patt = self.nextPatt(stream) block = self.nextExpr(stream) self.exprs.append(MastIR.MatcherExpr(patt, block)) elif tag == 'A': # Assign. target = stream.nextStr() expr = self.nextExpr(stream) self.exprs.append(MastIR.AssignExpr(target, expr)) elif tag == 'F': # Try/finally. tryExpr = self.nextExpr(stream) finallyExpr = self.nextExpr(stream) self.exprs.append(MastIR.FinallyExpr(tryExpr, finallyExpr)) elif tag == 'Y': # Try/catch. tryExpr = self.nextExpr(stream) catchPatt = self.nextPatt(stream) catchExpr = self.nextExpr(stream) self.exprs.append(MastIR.TryExpr(tryExpr, catchPatt, catchExpr)) elif tag == 'H': # Hide. expr = self.nextExpr(stream) self.exprs.append(MastIR.HideExpr(expr)) elif tag == 'I': # If/then/else. cond = self.nextExpr(stream) cons = self.nextExpr(stream) alt = self.nextExpr(stream) self.exprs.append(MastIR.IfExpr(cond, cons, alt)) elif tag == 'T': # Meta state. self.exprs.append(MastIR.MetaStateExpr()) elif tag == 'X': # Meta context. self.exprs.append(MastIR.MetaContextExpr()) else: raise InvalidMAST("Didn't know tag %s" % tag) if self.noisy: if self.patts: print "Top pattern:", self.patts[-1] else: print "No patterns yet" if self.exprs: print "Top expression:", self.exprs[-1] else: print "No expressions yet"
def interpstr_part(state, p):
    """Append a validated literal chunk and its expression to the contents."""
    val = ''.join(p[4].get_strparts())
    # Decode for validation only; raises UnicodeDecodeError on bad input.
    str_decode_utf_8(val, len(val), 'strict', final=True)
    return ast.InterpStringContents(p[0].get_strings() + [val],
                                    p[0].get_exprs() + [p[2]])
def unicode_from_utf8(s):
    """Converts a `str` value to a `unicode` value assuming it's encoded in
    UTF8."""
    # NOTE(review): final=True is not passed, so a trailing incomplete
    # sequence is silently ignored instead of raising -- confirm intended.
    res, _ = str_decode_utf_8(s, len(s), 'strict')
    return res
def interpstr_start(state, p):
    """Validate the leading literal chunk of an interpolated string."""
    val = ''.join(p[0].get_strparts())
    # Decode for validation only; raises UnicodeDecodeError on bad input.
    str_decode_utf_8(val, len(val), 'strict', final=True)
    return ast.InterpStringContents([val], [])
def expression_string(state, p):
    """Build an ast.String after validating the bytes as UTF-8."""
    val = ''.join(p[1].get_strparts())
    # Decode for validation only; raises UnicodeDecodeError on bad input.
    str_decode_utf_8(val, len(val), 'strict', final=True)
    return ast.String(val, srcpos=sr(p))
def raw_encode_basestring_ascii(space, w_string):
    """JSON-escape a str or unicode object into a pure-ASCII string."""
    if space.isinstance_w(w_string, space.w_str):
        s = space.str_w(w_string)
        for i in range(len(s)):
            c = s[i]
            if c >= ' ' and c <= '~' and c != '"' and c != '\\':
                pass
            else:
                first = i
                break
        else:
            # the input is a string with only non-special ascii chars
            return w_string
        eh = unicodehelper.decode_error_handler(space)
        u = str_decode_utf_8(
            s, len(s), None, final=True, errorhandler=eh,
            allow_surrogates=True)[0]
        sb = StringBuilder(len(u))
        # The prefix up to the first special char needs no escaping.
        sb.append_slice(s, 0, first)
    else:
        # We used to check if 'u' contains only safe characters, and return
        # 'w_string' directly. But this requires an extra pass over all
        # characters, and the expected use case of this function, from
        # json.encoder, will anyway re-encode a unicode result back to
        # a string (with the ascii encoding). This requires two passes
        # over the characters. So we may as well directly turn it into a
        # string here --- only one pass.
        u = space.unicode_w(w_string)
        sb = StringBuilder(len(u))
        first = 0
    for i in range(first, len(u)):
        c = u[i]
        if c <= u'~':
            if c == u'"' or c == u'\\':
                sb.append('\\')
            elif c < u' ':
                sb.append(ESCAPE_BEFORE_SPACE[ord(c)])
                continue
            sb.append(chr(ord(c)))
        else:
            if c <= u'\uffff':
                sb.append('\\u')
                sb.append(HEX[ord(c) >> 12])
                sb.append(HEX[(ord(c) >> 8) & 0x0f])
                sb.append(HEX[(ord(c) >> 4) & 0x0f])
                sb.append(HEX[ord(c) & 0x0f])
            else:
                # surrogate pair
                n = ord(c) - 0x10000
                s1 = 0xd800 | ((n >> 10) & 0x3ff)
                sb.append('\\ud')
                sb.append(HEX[(s1 >> 8) & 0x0f])
                sb.append(HEX[(s1 >> 4) & 0x0f])
                sb.append(HEX[s1 & 0x0f])
                s2 = 0xdc00 | (n & 0x3ff)
                sb.append('\\ud')
                sb.append(HEX[(s2 >> 8) & 0x0f])
                sb.append(HEX[(s2 >> 4) & 0x0f])
                sb.append(HEX[s2 & 0x0f])
    res = sb.build()
    return space.wrap(res)
def decode_str_utf8(string):
    """Strict-decode a byte string as UTF-8 and return the unicode result."""
    decoded, _ = runicode.str_decode_utf_8(string, len(string), "strict",
                                           True)
    return decoded
def f(x):
    """Round-trip a repeated UTF-8 byte pattern through decode/encode."""
    payload = "".join(["\xd7\x90\xd6\x96\xeb\x96\x95\xf0\x90\x91\x93"] * x)
    # NOTE(review): the third positional argument is `errors` in most
    # runicode revisions; passing True here looks suspicious -- confirm
    # against the runicode signature this snippet targets.
    u, consumed = runicode.str_decode_utf_8(payload, len(payload), True)
    reencoded = runicode.unicode_encode_utf_8(u, len(u), True)
    return payload == reencoded