def decode_string_escaped(self, start):
    """Decode the rest of a JSON string, handling backslash escapes.

    The bytes ``self.s[start:self.pos]`` are copied verbatim into a
    builder, then ``self.ll_chars`` is scanned from ``self.pos`` until
    the closing double quote is found.  Each backslash is handed to
    ``self.decode_escape_sequence`` together with the builder; that
    call returns the index at which scanning resumes.

    On success ``self.pos`` is set just past the closing quote and the
    collected bytes are returned through ``self.space.newutf8``, using
    the length computed by ``unicodehelper.check_utf8_or_raise``.

    A NUL byte is reported through ``self._raise`` as an unterminated
    string, and any other byte below 0x20 as an invalid control
    character.
    """
    pos = self.pos
    out = StringBuilder((pos - start) * 2)  # just an estimate
    assert start >= 0
    assert pos >= 0
    # Everything seen so far needs no translation: copy it in one go.
    out.append_slice(self.s, start, pos)
    while True:
        ch = self.ll_chars[pos]
        pos += 1
        if ch == '\\':
            pos = self.decode_escape_sequence(pos, out)
            continue
        if ch == '"':
            # Closing quote: validate the bytes and wrap them.
            content_utf8 = out.build()
            lgt = unicodehelper.check_utf8_or_raise(self.space,
                                                    content_utf8)
            self.pos = pos
            return self.space.newutf8(content_utf8, lgt)
        if ch >= '\x20':
            out.append(ch)
            continue
        # Control characters are not allowed inside the string.
        # NOTE(review): NUL presumably marks the end of the input
        # buffer, hence the "unterminated" message -- confirm.
        if ch == '\0':
            self._raise("Unterminated string starting at char %d",
                        start - 1)
        else:
            self._raise("Invalid control character at char %d", pos - 1)
def _create_string_wrapped(self, start, end, nonascii):
    """Wrap the slice ``[start, end)`` as a utf8 string object.

    The bytes come from ``self.getslice(start, end)``.  When
    ``nonascii`` is false the length passed to ``space.newutf8`` is
    simply ``end - start``.  Otherwise the bytes are validated as
    utf-8 by ``unicodehelper.check_utf8_or_raise`` and the length it
    returns is used.
    """
    content = self.getslice(start, end)
    if not nonascii:
        # Nothing to validate: the length is the byte count.
        return self.space.newutf8(content, end - start)
    # contains non-ascii chars, we need to check that it's valid utf-8
    lgt = unicodehelper.check_utf8_or_raise(self.space, content)
    return self.space.newutf8(content, lgt)
def decode_utf8_recode(space, s, ps, end, recode_encoding):
    """Re-encode the run of high-bit bytes that starts at ``s[ps]``.

    Scans forward from ``ps`` (never past ``end``) while bytes have
    their top bit set.  That run is validated as utf-8, wrapped with
    ``space.newutf8`` and encoded to ``recode_encoding`` via
    ``unicodehelper.encode``.

    Returns a pair ``(encoded_bytes, stop)`` where ``stop`` is the
    index of the first byte that was not consumed.
    """
    stop = ps
    while stop < end:
        if not (ord(s[stop]) & 0x80):
            break
        stop += 1
    lgt = unicodehelper.check_utf8_or_raise(space, s, ps, stop)
    w_unicode = space.newutf8(s[ps:stop], lgt)
    w_encoded = unicodehelper.encode(space, w_unicode, recode_encoding)
    return space.bytes_w(w_encoded), stop
def _create_string(self, start, end, bits):
    """Wrap the slice ``[start, end)`` as a utf8 string object.

    ``bits`` is tested against ``0x80``.  If that bit is set the slice
    is validated by ``unicodehelper.check_utf8_or_raise``, which also
    supplies the length.  Otherwise the slice is treated as ASCII and
    the length is ``end - start``.
    """
    content = self.getslice(start, end)
    if bits & 0x80:
        # the 8th bit is set, it's an utf8 string
        lgt = unicodehelper.check_utf8_or_raise(self.space, content)
    else:
        # ascii only, fast path (ascii is a strict subset of
        # latin1, and we already checked that all the chars are <
        # 128)
        lgt = end - start
    return self.space.newutf8(content, lgt)
def decode_string_escaped(self, start, nonascii):
    """Decode the rest of a JSON string, handling backslash escapes.

    ``self.s[start:self.pos]`` is copied unchanged into a builder and
    ``self.ll_chars`` is then read one character at a time from
    ``self.pos``:

    * ``"`` ends the string: the built bytes are validated with
      ``unicodehelper.check_utf8_or_raise``, ``self.pos`` is moved just
      past the quote and the result of ``self.space.newutf8`` is
      returned;
    * a backslash is passed to ``self.decode_escape_sequence_to_utf8``
      along with the builder, and scanning resumes at the index it
      returns;
    * any character below 0x20 is passed to
      ``self._raise_control_char_in_string``;
    * every other character is appended as is.

    ``nonascii`` is accepted but not used by this method.
    """
    cursor = self.pos
    buf = StringBuilder((cursor - start) * 2)  # just an estimate
    assert start >= 0
    assert cursor >= 0
    buf.append_slice(self.s, start, cursor)
    while True:
        ch = self.ll_chars[cursor]
        cursor += 1
        if ch == '"':
            break
        if ch == '\\':
            cursor = self.decode_escape_sequence_to_utf8(cursor, buf)
        elif ch >= '\x20':
            buf.append(ch)
        else:
            self._raise_control_char_in_string(ch, start, cursor)
    # Closing quote reached: validate and wrap what was collected.
    content_utf8 = buf.build()
    length = unicodehelper.check_utf8_or_raise(self.space, content_utf8)
    self.pos = cursor
    return self.space.newutf8(content_utf8, length)
def raw_encode_basestring_ascii(space, w_string):
    """Escape ``w_string`` so that the result holds only ASCII.

    For a bytes object consisting solely of characters in ``' '..'~'``
    other than ``"`` and ``\\``, ``w_string`` itself is returned.
    Otherwise the bytes are validated as utf-8 and escaped starting at
    the first character that needs it.  Any other object is read with
    ``space.utf8_w`` and escaped from the beginning.

    Escaping rules, applied per code point:

    * ``"`` and ``\\`` are prefixed with a backslash;
    * code points below ``' '`` are replaced by
      ``ESCAPE_BEFORE_SPACE[code]``;
    * code points above ``'~'`` up to U+FFFF become ``\\uXXXX``;
    * code points above U+FFFF become a ``\\uXXXX\\uXXXX`` surrogate
      pair.

    The escaped text is returned through ``space.newtext``; the
    surrounding quotes are not added here.
    """
    if space.isinstance_w(w_string, space.w_bytes):
        s = space.bytes_w(w_string)
        size = len(s)
        # Find the first character that cannot be copied verbatim.
        first = 0
        while first < size:
            ch = s[first]
            if ch < ' ' or ch > '~' or ch == '"' or ch == '\\':
                break
            first += 1
        if first == size:
            # the input is a string with only non-special ascii chars
            return w_string

        unicodehelper.check_utf8_or_raise(space, s)
        sb = StringBuilder(size)
        sb.append_slice(s, 0, first)
    else:
        # We used to check if 'u' contains only safe characters, and return
        # 'w_string' directly.  But this requires an extra pass over all
        # characters, and the expected use case of this function, from
        # json.encoder, will anyway re-encode a unicode result back to
        # a string (with the ascii encoding).  This requires two passes
        # over the characters.  So we may as well directly turn it into a
        # string here --- only one pass.
        s = space.utf8_w(w_string)
        sb = StringBuilder(len(s))
        first = 0

    code_points = rutf8.Utf8StringIterator(s)
    # The prefix already copied is plain ASCII, so skipping ``first``
    # code points skips exactly ``first`` bytes.
    for _ in range(first):
        code_points.next()
    for code in code_points:
        if code > ord('~'):
            if code > ord(u'\uffff'):
                # surrogate pair
                offset = code - 0x10000
                high = 0xd800 | ((offset >> 10) & 0x3ff)
                low = 0xdc00 | (offset & 0x3ff)
                sb.append('\\ud')
                sb.append(HEX[(high >> 8) & 0x0f])
                sb.append(HEX[(high >> 4) & 0x0f])
                sb.append(HEX[high & 0x0f])
                sb.append('\\ud')
                sb.append(HEX[(low >> 8) & 0x0f])
                sb.append(HEX[(low >> 4) & 0x0f])
                sb.append(HEX[low & 0x0f])
            else:
                sb.append('\\u')
                sb.append(HEX[code >> 12])
                sb.append(HEX[(code >> 8) & 0x0f])
                sb.append(HEX[(code >> 4) & 0x0f])
                sb.append(HEX[code & 0x0f])
        elif code < ord(' '):
            sb.append(ESCAPE_BEFORE_SPACE[code])
        else:
            if code == ord('"') or code == ord('\\'):
                sb.append('\\')
            sb.append(chr(code))
    return space.newtext(sb.build())
def parsestr(space, encoding, s, unicode_literal=False):
    """Parse a string or unicode literal and return a wrapped value.

    If encoding=iso8859-1, the source string is also in this encoding.
    If encoding=None, the source string is ascii only.
    In other cases, the source string is in utf-8 encoding.

    When a bytes string is returned, it will be encoded with the
    original encoding.

    ``s`` is the complete literal as written in the source, including
    any ``b``/``u``/``r`` prefix and the surrounding quotes.
    ``unicode_literal`` selects the result kind for literals without a
    ``b`` or ``u`` prefix; an explicit prefix overrides it.

    Unicode literals are returned via ``space.newutf8``.  Byte literals
    are returned via ``space.newbytes``, or as the result of
    ``unicodehelper.encode`` when they have to be recoded.
    ``raise_app_valueerror`` is called if the literal is not properly
    quoted.

    Yes, it's very inefficient.
    Yes, CPython has very similar code.
    """
    # we use ps as "pointer to s"
    # q is the virtual last char index of the string
    ps = 0
    quote = s[ps]
    rawmode = False

    # string decoration handling: an optional b/B or u/U prefix,
    # optionally followed by r/R
    if quote == 'b' or quote == 'B':
        ps += 1
        quote = s[ps]
        unicode_literal = False
    elif quote == 'u' or quote == 'U':
        ps += 1
        quote = s[ps]
        unicode_literal = True
    if quote == 'r' or quote == 'R':
        ps += 1
        quote = s[ps]
        rawmode = True
    if quote != "'" and quote != '"':
        raise_app_valueerror(space,
                             'Internal error: parser passed unquoted literal')
    # step over the opening quote; q indexes the closing quote
    ps += 1
    q = len(s) - 1
    if s[q] != quote:
        raise_app_valueerror(
            space, 'Internal error: parser passed unmatched '
            'quotes in literal')
    if q - ps >= 4 and s[ps] == quote and s[ps + 1] == quote:
        # triple quotes: skip the two extra quotes at each end
        ps += 2
        if s[q - 1] != quote or s[q - 2] != quote:
            raise_app_valueerror(
                space, 'Internal error: parser passed '
                'unmatched triple quotes in literal')
        q -= 2

    # From here on, s[ps:q] is the body of the literal.
    if unicode_literal:
        if encoding is None or encoding == "iso-8859-1":
            # 'unicode_escape' expects latin-1 bytes, string is ready.
            # NOTE(review): the assert presumably lets the RPython
            # annotator prove the slice bounds are valid -- confirm.
            assert 0 <= ps <= q
            substr = s[ps:q]
        else:
            # the body must be valid utf-8 before it is converted
            unicodehelper.check_utf8_or_raise(space, s, ps, q)
            substr = decode_unicode_utf8(space, s, ps, q)
        if rawmode:
            r = unicodehelper.decode_raw_unicode_escape(space, substr)
        else:
            r = unicodehelper.decode_unicode_escape(space, substr)
        v, length = r
        return space.newutf8(v, length)

    # Byte literal.  Recoding is only needed for a declared encoding
    # other than utf-8 and iso-8859-1.
    need_encoding = (encoding is not None and encoding != "utf-8"
                     and encoding != "utf8" and encoding != "iso-8859-1")
    assert 0 <= ps <= q
    substr = s[ps:q]
    if rawmode or '\\' not in s[ps:]:
        # No escape processing required (raw literal, or no backslash).
        if need_encoding:
            lgt = unicodehelper.check_utf8_or_raise(space, substr)
            w_u = space.newutf8(substr, lgt)
            w_v = unicodehelper.encode(space, w_u, encoding)
            return w_v
        else:
            return space.newbytes(substr)

    # Escapes are present: PyString_DecodeEscape processes the body and
    # is given the target encoding, or None when no recoding is needed.
    enc = None
    if need_encoding:
        enc = encoding
    v = PyString_DecodeEscape(space, substr, 'strict', enc)
    return space.newbytes(v)
def unmarshal_unicode(space, u, tc):
    """Read a string from ``u`` and wrap it as a utf8 string object.

    The bytes returned by ``u.get_str()`` are validated with
    ``unicodehelper.check_utf8_or_raise``; the length it returns is
    passed to ``space.newutf8``.  ``tc`` is not used by this function.
    """
    utf8 = u.get_str()
    return space.newutf8(
        utf8, unicodehelper.check_utf8_or_raise(space, utf8))