def _tm_to_tuple(space, t):
    """Convert the C ``struct tm`` *t* into an app-level ``struct_time``.

    The raw C fields are shifted to the Python conventions (full year,
    1-based month, Monday-based weekday, 1-based day of the year).  When
    HAS_TM_ZONE is set, the zone name and the GMT offset are appended to
    the nine standard fields.
    """
    year = rffi.getintfield(t, 'c_tm_year') + 1900
    month = rffi.getintfield(t, 'c_tm_mon') + 1        # want january == 1
    weekday = (rffi.getintfield(t, 'c_tm_wday') + 6) % 7  # want monday == 0
    yearday = rffi.getintfield(t, 'c_tm_yday') + 1     # want january, 1 == 1

    fields_w = [
        space.newint(year),
        space.newint(month),
        space.newint(rffi.getintfield(t, 'c_tm_mday')),
        space.newint(rffi.getintfield(t, 'c_tm_hour')),
        space.newint(rffi.getintfield(t, 'c_tm_min')),
        space.newint(rffi.getintfield(t, 'c_tm_sec')),
        space.newint(weekday),
        space.newint(yearday),
        space.newint(rffi.getintfield(t, 'c_tm_isdst')),
    ]

    if not HAS_TM_ZONE:
        w_fields = space.newtuple(fields_w)
    else:
        # CPython calls PyUnicode_DecodeLocale here should we do the same?
        zone_name = decode_utf8(space, rffi.charp2str(t.c_tm_zone),
                                allow_surrogates=True)
        zone_w = [
            space.newunicode(zone_name),
            space.newint(rffi.getintfield(t, 'c_tm_gmtoff')),
        ]
        # Concatenate instead of appending: the list given to newtuple()
        # is never resized after creation.
        w_fields = space.newtuple(fields_w + zone_w)

    w_struct_time = _get_module_object(space, 'struct_time')
    return space.call_function(w_struct_time, w_fields)
def decode_string_escaped(self, start):
    """Slow path of string decoding, used once the fast path gave up.

    ``start`` is the index of the first content character and
    ``self.pos`` the index at which the fast path stopped.  Everything in
    between is copied verbatim, then the rest is scanned character by
    character, expanding escape sequences, until the closing quote.

    Returns the wrapped unicode object and leaves ``self.pos`` just past
    the closing quote.  Raises through ``self._raise`` on a NUL
    (unterminated string) or on any other raw control character.
    """
    pos = self.pos
    assert start >= 0
    assert pos >= 0
    out = StringBuilder((pos - start) * 2)  # just an estimate
    out.append_slice(self.s, start, pos)
    while True:
        c = self.ll_chars[pos]
        pos += 1
        if c == '\\':
            pos = self.decode_escape_sequence(pos, out)
            continue
        if c == '"':
            utf8_bytes = out.build()
            decoded = unicodehelper.decode_utf8(self.space, utf8_bytes)
            self.last_type = TYPE_STRING
            self.pos = pos
            return self.space.newunicode(decoded)
        if c >= '\x20':
            out.append(c)
            continue
        # Raw control character: a NUL means we ran off the end of the
        # input, anything else is simply not allowed inside a string.
        if c == '\0':
            self._raise("Unterminated string starting at char %d",
                        start - 1)
        else:
            self._raise("Invalid control character at char %d", pos - 1)
def w_convert(self, space, s):
    """Wrap the byte string *s*, decoding it from UTF-8 first when this
    object is configured with ``returns_unicode``."""
    if not self.returns_unicode:
        return space.wrap(s)
    from pypy.interpreter.unicodehelper import decode_utf8
    decoded = decode_utf8(space, s)
    return space.wrap(decoded)
def decode_string(self, i):
    """Decode a JSON string whose first content character is at index *i*.

    This is the fast path for strings without escape sequences.  On a
    backslash it hands over to ``decode_string_escaped`` with the content
    scanned so far.  On the closing quote it returns the wrapped unicode
    object and sets ``self.pos`` just past the quote.

    Raises through ``self._raise`` when a raw control character
    (anything below 0x20) is found; the reported index is that of the
    offending character.
    """
    start = i
    bits = 0
    while True:
        # this loop is a fast path for strings which do not contain escape
        # characters
        ch = self.ll_chars[i]
        i += 1
        bits |= ord(ch)
        if ch == '"':
            if bits & 0x80:
                # the 8th bit is set, it's an utf8 string
                content_utf8 = self.getslice(start, i - 1)
                content_unicode = unicodehelper.decode_utf8(self.space,
                                                            content_utf8)
            else:
                # ascii only, fast path (ascii is a strict subset of
                # latin1, and we already checked that all the chars are <
                # 128)
                content_unicode = strslice2unicode_latin1(self.s, start,
                                                          i - 1)
            self.last_type = TYPE_STRING
            self.pos = i
            return self.space.wrap(content_unicode)
        elif ch == '\\':
            content_so_far = self.getslice(start, i - 1)
            self.pos = i - 1
            return self.decode_string_escaped(start, content_so_far)
        elif ch < '\x20':
            # self.pos is not updated while scanning, so it cannot be used
            # to locate the error; `i` has already been advanced past
            # `ch`, hence the offending character is at i - 1.
            self._raise("Invalid control character at char %d", i - 1)
def decode_string(self, i):
    """Fast-path decoder for a JSON string starting at index *i*.

    Scans up to the closing quote while OR-ing all character codes
    together, so that a pure-ASCII string can skip UTF-8 decoding.  As
    soon as a backslash or a character below 0x20 shows up, ``self.pos``
    is set to that character and ``decode_string_escaped`` takes over.

    Returns the wrapped unicode object; on the fast path ``self.pos`` is
    left just past the closing quote.
    """
    begin = i
    seen_bits = 0
    while True:
        # this loop is a fast path for strings which do not contain escape
        # characters
        c = self.ll_chars[i]
        i += 1
        seen_bits |= ord(c)
        if c == '\\' or c < '\x20':
            self.pos = i - 1
            return self.decode_string_escaped(begin)
        if c != '"':
            continue
        stop = i - 1
        if seen_bits & 0x80:
            # the 8th bit is set, it's an utf8 string
            decoded = unicodehelper.decode_utf8(
                self.space, self.getslice(begin, stop))
        else:
            # ascii only, fast path (ascii is a strict subset of
            # latin1, and we already checked that all the chars are <
            # 128)
            decoded = strslice2unicode_latin1(self.s, begin, stop)
        self.last_type = TYPE_STRING
        self.pos = i
        return self.space.newunicode(decoded)
def decode_utf8(space, s, ps, end):
    """Decode the run of high-bit bytes of *s* that starts at index *ps*.

    The run stops at *end* or at the first byte without the 8th bit set.
    Returns ``(decoded_unicode, index_after_run)``.
    """
    assert ps >= 0
    run_start = ps
    # while (s < end && *s != '\\') s++; */ /* inefficient for u".."
    while ps < end:
        if not (ord(s[ps]) & 0x80):
            break
        ps += 1
    decoded = unicodehelper.decode_utf8(space, s[run_start:ps])
    return decoded, ps
def _create_string(self, start, end, bits):
    """Build the unicode content for the source slice ``[start, end)``.

    ``bits`` is the OR of all character codes in the slice; when its 8th
    bit is clear the slice is pure ASCII and UTF-8 decoding is skipped.
    """
    if not (bits & 0x80):
        # ascii only, fast path (ascii is a strict subset of
        # latin1, and we already checked that all the chars are <
        # 128)
        return strslice2unicode_latin1(self.s, start, end)
    # the 8th bit is set, it's an utf8 string
    raw_utf8 = self.getslice(start, end)
    return unicodehelper.decode_utf8(self.space, raw_utf8)
def decode_utf8(space, s, ps, end, encoding):
    """Re-encode the run of high-bit bytes of *s* starting at index *ps*.

    The run stops at *end* or at the first byte without the 8th bit set.
    It is decoded as UTF-8 and encoded again with *encoding*.
    Returns ``(encoded_bytes, index_after_run)``.
    """
    assert ps >= 0
    run_start = ps
    # while (s < end && *s != '\\') s++; */ /* inefficient for u".."
    while ps < end:
        if not (ord(s[ps]) & 0x80):
            break
        ps += 1
    decoded = unicodehelper.decode_utf8(space, s[run_start:ps])
    w_decoded = space.wrap(decoded)
    w_encoded = unicodehelper.encode(space, w_decoded, encoding)
    return space.str_w(w_encoded), ps
def test_decode_utf8():
    """decode_utf8 handles ASCII, BMP characters, lone surrogates, an
    encoded surrogate pair and a 4-byte sequence."""
    space = FakeSpace()

    # inputs whose result can be compared directly
    direct_cases = [
        ("abc", u"abc"),
        ("\xe1\x88\xb4", u"\u1234"),
        ("\xed\xa0\x80", u"\ud800"),
        ("\xed\xb0\x80", u"\udc00"),
    ]
    for raw, expected in direct_cases:
        assert decode_utf8(space, raw) == expected

    # results compared code unit by code unit
    pair = decode_utf8(space, "\xed\xa0\x80\xed\xb0\x80")
    assert [ord(unit) for unit in pair] == [0xd800, 0xdc00]

    astral = decode_utf8(space, "\xf0\x90\x80\x80")
    assert [ord(unit) for unit in astral] == [0x10000]
def decode_string_escaped(self, start, content_so_far):
    """Slow path of string decoding, entered at the first backslash.

    ``content_so_far`` is the already scanned prefix of the string and
    ``self.pos`` the index of the backslash.  Characters are accumulated
    in a builder, with escape sequences expanded by
    ``decode_escape_sequence``, until the closing quote.

    Returns the wrapped unicode object and leaves ``self.pos`` just past
    the closing quote.  Raises through ``self._raise`` on a NUL
    (unterminated string) or on any other raw control character.
    """
    builder = StringBuilder(len(content_so_far) * 2)  # just an estimate
    builder.append(content_so_far)
    i = self.pos
    while True:
        ch = self.ll_chars[i]
        i += 1
        if ch == '"':
            content_utf8 = builder.build()
            content_unicode = unicodehelper.decode_utf8(self.space,
                                                        content_utf8)
            self.last_type = TYPE_STRING
            self.pos = i
            return self.space.wrap(content_unicode)
        elif ch == '\\':
            i = self.decode_escape_sequence(i, builder)
        elif ch == '\0':
            self._raise("Unterminated string starting at char %d", start)
        elif ch < '\x20':
            # Raw control characters must be escaped in JSON.  Reject
            # them here too, so that a string is not accepted merely
            # because an escape sequence appeared before the control
            # character.
            self._raise("Invalid control character at char %d", i - 1)
        else:
            builder.append_multiple_char(ch, 1)  # we should implement append_char
def decode_string_escaped(self, start, content_so_far):
    """Finish decoding a JSON string that contains escape sequences.

    Called with ``self.pos`` on the first backslash and with
    ``content_so_far`` holding everything scanned before it.  The
    remaining characters are collected one by one (escape sequences go
    through ``decode_escape_sequence``) until the closing quote.

    Returns the wrapped unicode object; ``self.pos`` ends up just past
    the closing quote.  Raises through ``self._raise`` when the input
    ends before the closing quote (NUL) or when a raw control character
    below 0x20 is found.
    """
    builder = StringBuilder(len(content_so_far) * 2)  # just an estimate
    builder.append(content_so_far)
    i = self.pos
    while True:
        ch = self.ll_chars[i]
        i += 1
        if ch == '"':
            content_utf8 = builder.build()
            content_unicode = unicodehelper.decode_utf8(
                self.space, content_utf8)
            self.last_type = TYPE_STRING
            self.pos = i
            return self.space.wrap(content_unicode)
        elif ch == '\\':
            i = self.decode_escape_sequence(i, builder)
        elif ch == '\0':
            self._raise("Unterminated string starting at char %d", start)
        elif ch < '\x20':
            # JSON does not allow unescaped control characters inside a
            # string; without this check they were silently copied into
            # the result once the slow path had been entered.
            self._raise("Invalid control character at char %d", i - 1)
        else:
            builder.append_multiple_char(
                ch, 1)  # we should implement append_char
def newlist_text(self, list_t):
    """Return a new app-level list holding the UTF-8 strings of *list_t*
    decoded to unicode (surrogates allowed)."""
    decoded_items = [decode_utf8(self, text, allow_surrogates=True)
                     for text in list_t]
    return self.newlist_unicode(decoded_items)
def parsestr(space, encoding, s):
    """Parse the source text of a str or bytes literal into a wrapped value.

    ``s`` is the literal exactly as it appears in the source, including
    its prefix and quotes.  If ``encoding`` is None the source is ASCII
    only; otherwise it is UTF-8 encoded.

    Returns a wrapped unicode object for text literals and wrapped bytes
    for ``b``-prefixed literals.

    Raises an app-level ValueError (via ``raise_app_valueerror``) when
    the quoting is malformed, and an app-level SyntaxError when a bytes
    literal contains a non-ASCII character.

    Yes, it's very inefficient.
    Yes, CPython has very similar code.
    """
    # we use ps as "pointer to s"
    # q is the virtual last char index of the string
    ps = 0
    quote = s[ps]
    rawmode = False
    unicode_literal = True
    saw_u = False

    # string decoration handling
    if quote == "b" or quote == "B":
        ps += 1
        quote = s[ps]
        unicode_literal = False
    elif quote == "u" or quote == "U":
        ps += 1
        quote = s[ps]
        saw_u = True
    # The parentheses matter: without them `and` binds tighter than `or`,
    # so an upper-case "R" was accepted after "u" while a lower-case "r"
    # was not.
    if not saw_u and (quote == "r" or quote == "R"):
        ps += 1
        quote = s[ps]
        rawmode = True
    if quote != "'" and quote != '"':
        raise_app_valueerror(space,
                             "Internal error: parser passed unquoted literal")
    ps += 1
    q = len(s) - 1
    if s[q] != quote:
        raise_app_valueerror(space, "Internal error: parser passed unmatched "
                                    "quotes in literal")
    if q - ps >= 4 and s[ps] == quote and s[ps + 1] == quote:
        # triple quotes
        ps += 2
        if s[q - 1] != quote or s[q - 2] != quote:
            raise_app_valueerror(space, "Internal error: parser passed "
                                        "unmatched triple quotes in literal")
        q -= 2

    if unicode_literal and not rawmode:  # XXX Py_UnicodeFlag is ignored for now
        if encoding is None:
            assert 0 <= ps <= q
            substr = s[ps:q]
        else:
            substr = decode_unicode_utf8(space, s, ps, q)
        v = unicodehelper.decode_unicode_escape(space, substr)
        return space.wrap(v)

    assert 0 <= ps <= q
    substr = s[ps:q]

    if not unicode_literal:
        # Disallow non-ascii characters (but not escapes)
        for c in substr:
            # ASCII ends at 0x7f, so 0x80 itself must be rejected too.
            if ord(c) >= 0x80:
                raise OperationError(
                    space.w_SyntaxError,
                    space.wrap("bytes can only contain ASCII literal characters.")
                )

    if rawmode or "\\" not in substr:
        if not unicode_literal:
            return space.wrapbytes(substr)
        else:
            v = unicodehelper.decode_utf8(space, substr)
            return space.wrap(v)

    v = PyString_DecodeEscape(space, substr, "strict", encoding)
    return space.wrapbytes(v)
def parsestr(space, encoding, s, unicode_literal=False):
    """Turn the source text of a string literal into a wrapped value.

    ``s`` is the literal as written, prefix and quotes included.  The
    source bytes are latin-1 when ``encoding`` is "iso-8859-1", pure
    ASCII when it is None, and UTF-8 in every other case.

    A byte string result is encoded back into the original encoding.

    Yes, it's very inefficient.
    Yes, CPython has very similar code.
    """
    # `ps` is the "pointer to s"; `q` ends up as the index just past the
    # last character of the literal's content.
    ps = 0
    quote = s[ps]
    rawmode = False

    # Prefix handling: one of b/B or u/U, optionally followed by r/R.
    if quote in ('b', 'B'):
        ps += 1
        quote = s[ps]
        unicode_literal = False
    elif quote in ('u', 'U'):
        ps += 1
        quote = s[ps]
        unicode_literal = True
    if quote in ('r', 'R'):
        ps += 1
        quote = s[ps]
        rawmode = True

    if quote not in ("'", '"'):
        raise_app_valueerror(
            space, 'Internal error: parser passed unquoted literal')
    ps += 1
    q = len(s) - 1
    if s[q] != quote:
        raise_app_valueerror(
            space,
            'Internal error: parser passed unmatched quotes in literal')
    if q - ps >= 4 and s[ps] == quote and s[ps + 1] == quote:
        # triple quotes
        ps += 2
        if not (s[q - 1] == quote and s[q - 2] == quote):
            raise_app_valueerror(
                space,
                'Internal error: parser passed '
                'unmatched triple quotes in literal')
        q -= 2

    if unicode_literal:  # XXX Py_UnicodeFlag is ignored for now
        if encoding is None or encoding == "iso-8859-1":
            # 'unicode_escape' expects latin-1 bytes, string is ready.
            assert 0 <= ps <= q
            substr = s[ps:q]
        else:
            substr = decode_unicode_utf8(space, s, ps, q)
        if rawmode:
            v = unicodehelper.decode_raw_unicode_escape(space, substr)
        else:
            v = unicodehelper.decode_unicode_escape(space, substr)
        return space.wrap(v)

    need_encoding = (encoding is not None and
                     encoding != "utf-8" and encoding != "utf8" and
                     encoding != "iso-8859-1")
    assert 0 <= ps <= q
    substr = s[ps:q]

    if not rawmode and '\\' in s[ps:]:
        # Escapes have to be processed.
        enc = encoding if need_encoding else None
        v = PyString_DecodeEscape(space, substr, 'strict', enc)
        return space.wrap(v)

    # No escape processing needed.
    if not need_encoding:
        return space.wrap(substr)
    w_u = space.wrap(unicodehelper.decode_utf8(space, substr))
    return unicodehelper.encode(space, w_u, encoding)
def newtext(self, s):
    """Return a new app-level unicode object for the UTF-8 string *s*
    (surrogates allowed)."""
    decoded = decode_utf8(self, s, allow_surrogates=True)
    return self.newunicode(decoded)
def parsestr(space, encoding, s):
    """Parse the source text of a str, bytes or f-string literal.

    ``s`` is the literal exactly as it appears in the source, including
    its prefix and quotes.  If ``encoding`` is None the source is ASCII
    only; otherwise it is UTF-8 encoded.

    Usually returns a wrapped value: unicode for text literals, bytes
    for ``b``-prefixed literals.  For an f-string it returns instead an
    unparsed but unquoted W_FString instance.

    Raises an app-level ValueError (via ``raise_app_valueerror``) when
    the quoting is malformed, and an app-level SyntaxError when a bytes
    literal contains a non-ASCII character.

    Yes, it's very inefficient.
    Yes, CPython has very similar code.
    """
    # we use ps as "pointer to s"
    # q is the virtual last char index of the string
    ps = 0
    quote = s[ps]
    rawmode = False
    unicode_literal = True
    saw_u = False
    saw_f = False

    # string decoration handling: first prefix character
    if quote == 'b' or quote == 'B':
        ps += 1
        quote = s[ps]
        unicode_literal = False
    elif quote == 'u' or quote == 'U':
        ps += 1
        quote = s[ps]
        saw_u = True
    elif quote == 'r' or quote == 'R':
        ps += 1
        quote = s[ps]
        rawmode = True
    elif quote == 'f' or quote == 'F':
        ps += 1
        quote = s[ps]
        saw_f = True
    # optional second prefix character (nothing may follow "u")
    if not saw_u:
        if quote == 'r' or quote == 'R':
            ps += 1
            quote = s[ps]
            rawmode = True
        elif quote == 'b' or quote == 'B':
            ps += 1
            quote = s[ps]
            unicode_literal = False
        elif quote == 'f' or quote == 'F':
            ps += 1
            quote = s[ps]
            saw_f = True

    if quote != "'" and quote != '"':
        raise_app_valueerror(space,
                             'Internal error: parser passed unquoted literal')
    ps += 1
    q = len(s) - 1
    if s[q] != quote:
        raise_app_valueerror(
            space, 'Internal error: parser passed unmatched '
            'quotes in literal')
    if q - ps >= 4 and s[ps] == quote and s[ps + 1] == quote:
        # triple quotes
        ps += 2
        if s[q - 1] != quote or s[q - 2] != quote:
            raise_app_valueerror(
                space, 'Internal error: parser passed '
                'unmatched triple quotes in literal')
        q -= 2

    if unicode_literal and not rawmode:  # XXX Py_UnicodeFlag is ignored for now
        assert 0 <= ps <= q
        if saw_f:
            return W_FString(s[ps:q], rawmode)
        if encoding is None:
            substr = s[ps:q]
        else:
            substr = decode_unicode_utf8(space, s, ps, q)
        v = unicodehelper.decode_unicode_escape(space, substr)
        return space.newunicode(v)

    assert 0 <= ps <= q
    substr = s[ps:q]

    if not unicode_literal:
        # Disallow non-ascii characters (but not escapes)
        for c in substr:
            # ASCII ends at 0x7f, so 0x80 itself must be rejected too.
            if ord(c) >= 0x80:
                raise oefmt(
                    space.w_SyntaxError,
                    "bytes can only contain ASCII literal characters.")

    if rawmode or '\\' not in substr:
        if not unicode_literal:
            return space.newbytes(substr)
        elif saw_f:
            return W_FString(substr, rawmode)
        else:
            v = unicodehelper.decode_utf8(space, substr)
            return space.newunicode(v)

    v = PyString_DecodeEscape(space, substr, 'strict', encoding)
    return space.newbytes(v)
def w_convert(self, space, s):
    """Wrap the byte string *s*: as unicode (decoded from UTF-8) when
    ``returns_unicode`` is set, as text otherwise."""
    if not self.returns_unicode:
        return space.newtext(s)
    from pypy.interpreter.unicodehelper import decode_utf8
    decoded = decode_utf8(space, s)
    return space.newunicode(decoded)
def unmarshal_unicode(space, u, tc):
    """Read a string from the unmarshaller *u*, decode it as UTF-8 and
    return it as a new unicode object.  ``tc`` is not used."""
    raw_utf8 = u.get_str()
    decoded = unicodehelper.decode_utf8(space, raw_utf8)
    return space.newunicode(decoded)
def unmarshal_Unicode(space, u, tc):
    """Read a string from the unmarshaller *u*, decode it as UTF-8 and
    return it wrapped.  ``tc`` is not used."""
    raw_utf8 = u.get_str()
    decoded = unicodehelper.decode_utf8(space, raw_utf8)
    return space.wrap(decoded)
def unmarshal_unicode(space, u, tc):
    """Read a string from the unmarshaller *u*, decode it as UTF-8
    (surrogates allowed) and return it wrapped.  ``tc`` is not used."""
    raw_utf8 = u.get_str()
    decoded = unicodehelper.decode_utf8(space, raw_utf8,
                                        allow_surrogates=True)
    return space.wrap(decoded)
def parsestr(space, encoding, s, unicode_literal=False):
    """Parses a string or unicode literal, and return a wrapped value.

    ``s`` is the literal as written in the source, including its
    optional b/u/r prefix and its quotes.  ``unicode_literal`` selects
    the default kind of literal when no b or u prefix is present.

    If encoding=iso8859-1, the source string is also in this encoding.
    If encoding=None, the source string is ascii only.
    In other cases, the source string is in utf-8 encoding.

    When a bytes string is returned, it will be encoded with the
    original encoding.

    Yes, it's very inefficient.
    Yes, CPython has very similar code.
    """
    # we use ps as "pointer to s"
    # q is the virtual last char index of the string
    ps = 0
    quote = s[ps]
    rawmode = False

    # string decoration handling
    if quote == 'b' or quote == 'B':
        ps += 1
        quote = s[ps]
        unicode_literal = False
    elif quote == 'u' or quote == 'U':
        ps += 1
        quote = s[ps]
        unicode_literal = True
    if quote == 'r' or quote == 'R':
        ps += 1
        quote = s[ps]
        rawmode = True
    if quote != "'" and quote != '"':
        raise_app_valueerror(space,
                             'Internal error: parser passed unquoted literal')
    ps += 1
    q = len(s) - 1
    if s[q] != quote:
        raise_app_valueerror(space, 'Internal error: parser passed unmatched '
                                    'quotes in literal')
    if q-ps >= 4 and s[ps] == quote and s[ps+1] == quote:
        # triple quotes
        ps += 2
        if s[q-1] != quote or s[q-2] != quote:
            raise_app_valueerror(space, 'Internal error: parser passed '
                                        'unmatched triple quotes in literal')
        q -= 2

    if unicode_literal: # XXX Py_UnicodeFlag is ignored for now
        if encoding is None or encoding == "iso-8859-1":
            # 'unicode_escape' expects latin-1 bytes, string is ready.
            buf = s
            bufp = ps
            bufq = q
            u = None
        else:
            # String is utf8-encoded, but 'unicode_escape' expects
            # latin-1; So multibyte sequences must be escaped.
            lis = [] # using a list to assemble the value
            end = q
            # Worst case: "\XX" may become "\u005c\uHHLL" (12 bytes)
            while ps < end:
                if s[ps] == '\\':
                    # copy the backslash and look at what follows it
                    lis.append(s[ps])
                    ps += 1
                    if ord(s[ps]) & 0x80:
                        # A multibyte sequence will follow, it will be
                        # escaped like \u1234. To avoid confusion with
                        # the backslash we just wrote, we emit "\u005c"
                        # instead.
                        lis.append("u005c")
                if ord(s[ps]) & 0x80: # XXX inefficient
                    # Re-encode the whole run of high-bit bytes; the
                    # result is consumed two bytes at a time, each pair
                    # becoming one \uHHLL escape.
                    w, ps = decode_utf8(space, s, ps, end, "utf-16-be")
                    rn = len(w)
                    assert rn % 2 == 0
                    for i in range(0, rn, 2):
                        lis.append('\\u')
                        lis.append(hexbyte(ord(w[i])))
                        lis.append(hexbyte(ord(w[i+1])))
                else:
                    # plain 7-bit character: copy it unchanged
                    lis.append(s[ps])
                    ps += 1
            buf = ''.join(lis)
            bufp = 0
            bufq = len(buf)
        # the assert bounds the slice indices before slicing
        assert 0 <= bufp <= bufq
        substr = buf[bufp:bufq]
        if rawmode:
            v = unicodehelper.decode_raw_unicode_escape(space, substr)
        else:
            v = unicodehelper.decode_unicode_escape(space, substr)
        return space.wrap(v)

    # From here on the literal is a byte string.  Re-encoding is only
    # done when the source encoding is neither utf-8 nor latin-1.
    need_encoding = (encoding is not None and
                     encoding != "utf-8" and encoding != "utf8" and
                     encoding != "iso-8859-1")
    assert 0 <= ps <= q
    substr = s[ps : q]
    if rawmode or '\\' not in s[ps:]:
        # no escape processing needed
        if need_encoding:
            w_u = space.wrap(unicodehelper.decode_utf8(space, substr))
            w_v = unicodehelper.encode(space, w_u, encoding)
            return w_v
        else:
            return space.wrap(substr)

    enc = None
    if need_encoding:
        enc = encoding
    v = PyString_DecodeEscape(space, substr, enc)
    return space.wrap(v)
def parsestr(space, encoding, s, unicode_literal=False):
    """Convert the source text of a string literal into a wrapped value.

    ``s`` holds the literal as written, with prefix and quotes.  The
    source is latin-1 if ``encoding`` is "iso-8859-1", ASCII only if it
    is None, and UTF-8 otherwise.

    Unicode literals come back as a new unicode object, byte strings as
    new bytes encoded with the original encoding.

    Yes, it's very inefficient.
    Yes, CPython has very similar code.
    """
    # ps: "pointer to s"; q: virtual last char index of the string
    ps = 0
    quote = s[ps]
    rawmode = False

    # --- prefix: [bB] or [uU], then optionally [rR] ---
    if quote == 'b' or quote == 'B':
        unicode_literal = False
        ps += 1
        quote = s[ps]
    elif quote == 'u' or quote == 'U':
        unicode_literal = True
        ps += 1
        quote = s[ps]
    if quote == 'r' or quote == 'R':
        rawmode = True
        ps += 1
        quote = s[ps]

    # --- quotes ---
    if not (quote == "'" or quote == '"'):
        raise_app_valueerror(
            space, 'Internal error: parser passed unquoted literal')
    ps += 1
    q = len(s) - 1
    if s[q] != quote:
        raise_app_valueerror(
            space,
            'Internal error: parser passed unmatched quotes in literal')
    is_triple = q - ps >= 4 and s[ps] == quote and s[ps + 1] == quote
    if is_triple:
        ps += 2
        if s[q - 1] != quote or s[q - 2] != quote:
            raise_app_valueerror(
                space,
                'Internal error: parser passed '
                'unmatched triple quotes in literal')
        q -= 2

    # --- unicode literal ---
    if unicode_literal:  # XXX Py_UnicodeFlag is ignored for now
        if encoding is None or encoding == "iso-8859-1":
            # 'unicode_escape' expects latin-1 bytes, string is ready.
            assert 0 <= ps <= q
            substr = s[ps:q]
        else:
            substr = decode_unicode_utf8(space, s, ps, q)
        if rawmode:
            return space.newunicode(
                unicodehelper.decode_raw_unicode_escape(space, substr))
        return space.newunicode(
            unicodehelper.decode_unicode_escape(space, substr))

    # --- byte string literal ---
    need_encoding = (encoding is not None and
                     encoding != "utf-8" and encoding != "utf8" and
                     encoding != "iso-8859-1")
    assert 0 <= ps <= q
    substr = s[ps:q]

    if rawmode or '\\' not in s[ps:]:
        # nothing to unescape
        if not need_encoding:
            return space.newbytes(substr)
        w_u = space.newunicode(unicodehelper.decode_utf8(space, substr))
        return unicodehelper.encode(space, w_u, encoding)

    enc = encoding if need_encoding else None
    v = PyString_DecodeEscape(space, substr, 'strict', enc)
    return space.newbytes(v)