def test_encode_utf8():
    """Check encode_utf8 on ASCII text, a BMP character, lone surrogates
    and a high+low surrogate pair."""
    space = FakeSpace()

    # Inputs that are a single literal each, with their expected bytes.
    literal_cases = [
        (u"abc", "abc"),
        (u"\u1234", "\xe1\x88\xb4"),
        (u"\ud800", "\xed\xa0\x80"),
        (u"\udc00", "\xed\xb0\x80"),
    ]
    for text, expected in literal_cases:
        assert encode_utf8(space, text) == expected

    # The pair must be glued together at runtime: CPython's optimizer and
    # .pyc file storage would otherwise collapse the two surrogates into a
    # single character before the function under test ever sees them.
    low_surrogate = u"\udc00"
    paired = u"\ud800" + low_surrogate
    assert encode_utf8(space, paired) == "\xf0\x90\x80\x80"
def unicode_to_decimal_w(space, w_unistr, allow_surrogates=False):
    """Return the encode_utf8 form of w_unistr after decimal conversion.

    w_unistr must be a W_UnicodeObject, otherwise an app-level TypeError
    is raised.  Its _value is passed through _rpy_unicode_to_decimal_w()
    and the result is handed to unicodehelper.encode_utf8(), forwarding
    allow_surrogates unchanged.
    """
    if isinstance(w_unistr, W_UnicodeObject):
        decimal_text = _rpy_unicode_to_decimal_w(space, w_unistr._value)
        return unicodehelper.encode_utf8(
            space, decimal_text, allow_surrogates=allow_surrogates)
    raise oefmt(space.w_TypeError, "expected unicode, got '%T'", w_unistr)
def unicode_to_decimal_w(space, w_unistr):
    """Normalise spaces and decimal digits of w_unistr, then encode it.

    w_unistr must be a W_UnicodeObject, otherwise an app-level TypeError
    is raised.  For every code point above 127:

    * if unicodedb.isspace() accepts it, it is replaced by u' ';
    * else if unicodedb.decimal() knows it, it is replaced by the
      character ord(u'0') + that decimal value;
    * otherwise it is kept unchanged.

    Code points up to 127 are copied as they are.  The resulting unicode
    string is returned through unicodehelper.encode_utf8().
    """
    if not isinstance(w_unistr, W_UnicodeObject):
        raise oefmt(space.w_TypeError, "expected unicode, got '%T'", w_unistr)
    unistr = w_unistr._value
    # Pre-sized list of unicode characters, filled in by index below.
    result = [u'\0'] * len(unistr)
    for i in xrange(len(unistr)):
        uchr = ord(unistr[i])
        if uchr > 127:
            if unicodedb.isspace(uchr):
                # Store a unicode space (not a byte string) so that the
                # list stays homogeneous and u''.join() needs no implicit
                # str -> unicode coercion.
                result[i] = u' '
                continue
            try:
                uchr = ord(u'0') + unicodedb.decimal(uchr)
            except KeyError:
                # No decimal value for this code point: keep it unchanged.
                pass
        result[i] = unichr(uchr)
    return unicodehelper.encode_utf8(space, u''.join(result))
def Parse(self, space, w_data, isfinal=False):
    """Parse(data[, isfinal])
Parse XML data.  `isfinal' should be true at end of input."""
    if space.isinstance_w(w_data, space.w_unicode):
        # Unwrap the unicode value once and reuse it for the encoding step.
        u = w_data.unicode_w(space)
        data = encode_utf8(space, u)
        # Explicitly set UTF-8 encoding.  Return code ignored.
        XML_SetEncoding(self.itself, "utf-8")
    else:
        data = space.bufferstr_w(w_data)
    res = XML_Parse(self.itself, data, len(data), isfinal)
    if self._exc_info:
        # An exception was stored on the parser object (presumably by a
        # handler invoked during XML_Parse -- confirm against the
        # callbacks): clear the slot and re-raise it.
        e = self._exc_info
        self._exc_info = None
        raise e
    elif res == 0:
        # A zero result is reported as an error built from the parser's
        # current error code.
        exc = self.set_error(space, XML_GetErrorCode(self.itself))
        raise exc
    self.flush_character_buffer(space)
    return space.wrap(res)
def decode_escape_sequence_unicode(self, i, builder):
    """Decode the four hex digits of a unicode escape starting at index i.

    The decoded character is converted with runicode.code_to_unichr(),
    encoded with unicodehelper.encode_utf8() and appended to builder.
    Returns the index just past everything consumed.  On malformed input
    self._raise() is called with an "Invalid ... escape" message.
    """
    # at this point we are just after the 'u' of the \u1234 sequence.
    start = i
    i += 4
    hexdigits = self.getslice(start, i)
    try:
        val = int(hexdigits, 16)
        if val & 0xfc00 == 0xd800:
            # surrogate pair: val is in 0xD800-0xDBFF, so let
            # decode_surrogate_pair() combine it with what follows and
            # skip six more characters (presumably the second \uXXXX).
            val = self.decode_surrogate_pair(i, val)
            i += 6
    except ValueError:
        # The backslash is escaped explicitly so the literal does not
        # depend on "\u" being left alone in a byte-string literal.
        self._raise("Invalid \\uXXXX escape (char %d)", i - 1)
        return  # help the annotator to know that we'll never go beyond
                # this point
    #
    uchr = runicode.code_to_unichr(val)  # may be a surrogate pair again
    utf8_ch = unicodehelper.encode_utf8(self.space, uchr)
    builder.append(utf8_ch)
    return i
def decode_escape_sequence_unicode(self, i, builder):
    """Decode the four hex digits of a unicode escape starting at index i.

    The decoded character is converted with runicode.code_to_unichr(),
    encoded with unicodehelper.encode_utf8() and appended to builder.
    Returns the index just past everything consumed.  On malformed input
    self._raise() is called with an "Invalid ... escape" message.
    """
    # at this point we are just after the 'u' of the \u1234 sequence.
    start = i
    i += 4
    hexdigits = self.getslice(start, i)
    try:
        val = int(hexdigits, 16)
        if val & 0xfc00 == 0xd800:
            # surrogate pair: val is in 0xD800-0xDBFF, so let
            # decode_surrogate_pair() combine it with what follows and
            # skip six more characters (presumably the second \uXXXX).
            val = self.decode_surrogate_pair(i, val)
            i += 6
    except ValueError:
        # The backslash is escaped explicitly so the literal does not
        # depend on "\u" being left alone in a byte-string literal.
        self._raise("Invalid \\uXXXX escape (char %d)", i - 1)
        return  # help the annotator to know that we'll never go beyond
                # this point
    #
    uchr = runicode.code_to_unichr(val)  # may be a surrogate pair again
    utf8_ch = unicodehelper.encode_utf8(self.space, uchr)
    builder.append(utf8_ch)
    return i
def Parse(self, space, w_data, isfinal=False):
    """Parse(data[, isfinal])
Parse XML data.  `isfinal' should be true at end of input."""
    if not space.isinstance_w(w_data, space.w_unicode):
        # Non-unicode input is taken as a character buffer, untouched.
        data = space.charbuf_w(w_data)
    else:
        data = encode_utf8(space, w_data.unicode_w(space))
        # Explicitly set UTF-8 encoding.  Return code ignored.
        XML_SetEncoding(self.itself, "utf-8")

    final_flag = bool(isfinal)
    status = XML_Parse(self.itself, data, len(data), final_flag)

    pending = self._exc_info
    if pending:
        # A stored exception takes priority over the parser's own status.
        self._exc_info = None
        raise pending
    if status == 0:
        raise self.set_error(space, XML_GetErrorCode(self.itself))

    self.flush_character_buffer(space)
    return space.newint(status)
def decode_escape_sequence_unicode(self, i, builder):
    """Decode the hex digits of a unicode escape starting at index i,
    append the encode_utf8 result to builder and return the index just
    past the consumed input.  Raises DecoderError on malformed input."""
    # On entry, i sits just after the 'u' of the \u1234 sequence.
    pos = i + 4
    hexdigits = self.getslice(i, pos)
    try:
        code = int(hexdigits, 16)
        # Only try to combine a surrogate with a following escape on wide
        # builds, and only when another backslash-u really comes next;
        # otherwise the lone surrogate is kept as it is.
        if (sys.maxunicode > 65535 and 0xd800 <= code <= 0xdfff
                and self.ll_chars[pos] == '\\'
                and self.ll_chars[pos + 1] == 'u'):
            code = self.decode_surrogate_pair(pos, code)
            pos += 6
    except ValueError:
        raise DecoderError("Invalid \\uXXXX escape", pos - 1)
    #
    # code_to_unichr() may hand back a surrogate pair again
    utf8_ch = unicodehelper.encode_utf8(
        self.space, runicode.code_to_unichr(code), allow_surrogates=True)
    builder.append(utf8_ch)
    return pos
def marshal_w__Unicode(space, w_unicode, m):
    """Write w_unicode to the marshaller m as a TYPE_UNICODE atom whose
    payload is the encode_utf8 form of its unicode value."""
    unicode_value = space.unicode_w(w_unicode)
    utf8_payload = unicodehelper.encode_utf8(space, unicode_value)
    m.atom_str(TYPE_UNICODE, utf8_payload)
def marshal_unicode(space, w_unicode, m):
    """Write w_unicode to the marshaller m as a TYPE_UNICODE atom.

    Anything that is not a W_UnicodeObject is reported through
    raise_exception() as an "unmarshallable object".
    """
    if not isinstance(w_unicode, W_UnicodeObject):
        raise_exception(space, "unmarshallable object")
    unicode_value = space.unicode_w(w_unicode)
    m.atom_str(TYPE_UNICODE,
               unicodehelper.encode_utf8(space, unicode_value))
def marshal_unicode(space, w_unicode, m):
    """Write w_unicode to the marshaller m as a TYPE_UNICODE atom.

    The unicode value is encoded with allow_surrogates=True before being
    handed to m.atom_str().
    """
    unicode_value = space.unicode_w(w_unicode)
    utf8_payload = unicodehelper.encode_utf8(space, unicode_value,
                                             allow_surrogates=True)
    m.atom_str(TYPE_UNICODE, utf8_payload)
def marshal_unicode(space, w_unicode, m):
    """Write w_unicode to the marshaller m as a TYPE_UNICODE atom holding
    the encode_utf8 form of its unicode value."""
    encoded = unicodehelper.encode_utf8(space,
                                        space.unicode_w(w_unicode))
    m.atom_str(TYPE_UNICODE, encoded)
def _gettmarg(space, w_tup, allowNone=True):
    """Turn an app-level time tuple (or None) into a C tm structure.

    If w_tup is None, the current local time from c_localtime() is used;
    this raises TypeError("tuple expected") when allowNone is false and
    ValueError when c_localtime() returns a null pointer.

    Otherwise w_tup must be a sequence of at least 9 items (TypeError if
    shorter) and the shared glob_buf, defined outside this function, is
    filled in and returned.  Adjustments made while copying:

    * month, day of month and day of year given as 0 are treated as 1;
    * c_tm_mon and c_tm_yday are stored minus one;
    * c_tm_wday is stored as (value + 1) % 7, after rejecting values
      below -1 with ValueError("day of week out of range");
    * c_tm_year holds the year exactly as given in the tuple; in the
      None branch it is the value from c_localtime() plus 1900.

    When HAS_TM_ZONE is set, items 9 and 10 of the tuple, if present,
    fill c_tm_zone and c_tm_gmtoff.
    """
    if space.is_none(w_tup):
        if not allowNone:
            raise oefmt(space.w_TypeError, "tuple expected")
        # default to the current local time
        tt = rffi.cast(rffi.TIME_T, pytime.time())
        t_ref = lltype.malloc(rffi.TIME_TP.TO, 1, flavor='raw')
        t_ref[0] = tt
        pbuf = c_localtime(t_ref)
        lltype.free(t_ref, flavor='raw')
        # Check the result before touching it: reading or writing fields
        # through a null pbuf would be a NULL dereference.
        if not pbuf:
            raise OperationError(space.w_ValueError,
                                 space.newunicode(_get_error_msg()))
        rffi.setintfield(pbuf, "c_tm_year",
                         rffi.getintfield(pbuf, "c_tm_year") + 1900)
        return pbuf

    tup_w = space.fixedview(w_tup)
    if len(tup_w) < 9:
        raise oefmt(space.w_TypeError,
                    "argument must be sequence of at least length 9, not %d",
                    len(tup_w))

    y = space.c_int_w(tup_w[0])
    # Zero month / day / day-of-year are silently treated as 1.
    tm_mon = space.c_int_w(tup_w[1])
    if tm_mon == 0:
        tm_mon = 1
    tm_mday = space.c_int_w(tup_w[2])
    if tm_mday == 0:
        tm_mday = 1
    tm_yday = space.c_int_w(tup_w[7])
    if tm_yday == 0:
        tm_yday = 1

    rffi.setintfield(glob_buf, 'c_tm_mon', tm_mon)
    rffi.setintfield(glob_buf, 'c_tm_mday', tm_mday)
    rffi.setintfield(glob_buf, 'c_tm_hour', space.c_int_w(tup_w[3]))
    rffi.setintfield(glob_buf, 'c_tm_min', space.c_int_w(tup_w[4]))
    rffi.setintfield(glob_buf, 'c_tm_sec', space.c_int_w(tup_w[5]))
    rffi.setintfield(glob_buf, 'c_tm_wday', space.c_int_w(tup_w[6]))
    rffi.setintfield(glob_buf, 'c_tm_yday', tm_yday)
    rffi.setintfield(glob_buf, 'c_tm_isdst', space.c_int_w(tup_w[8]))
    #
    if HAS_TM_ZONE:
        old_tm_zone = glob_buf.c_tm_zone
        glob_buf.c_tm_zone = lltype.nullptr(rffi.CCHARP.TO)
        rffi.setintfield(glob_buf, 'c_tm_gmtoff', 0)
        if len(tup_w) >= 10:
            # NOTE this is not cleanly solved!
            # it saves the string that is later deleted when this
            # function is called again. A refactoring of this module
            # could remove this
            tm_zone = encode_utf8(space, space.unicode_w(tup_w[9]),
                                  allow_surrogates=True)
            malloced_str = rffi.str2charp(tm_zone, track_allocation=False)
            if old_tm_zone != lltype.nullptr(rffi.CCHARP.TO):
                rffi.free_charp(old_tm_zone, track_allocation=False)
            glob_buf.c_tm_zone = malloced_str
        if len(tup_w) >= 11:
            rffi.setintfield(glob_buf, 'c_tm_gmtoff',
                             space.c_int_w(tup_w[10]))

    # tm_wday does not need checking of its upper-bound since taking "%
    # 7" in _gettmarg() automatically restricts the range.
    if rffi.getintfield(glob_buf, 'c_tm_wday') < -1:
        raise oefmt(space.w_ValueError, "day of week out of range")

    rffi.setintfield(glob_buf, 'c_tm_year', y)
    rffi.setintfield(glob_buf, 'c_tm_mon',
                     rffi.getintfield(glob_buf, 'c_tm_mon') - 1)
    rffi.setintfield(glob_buf, 'c_tm_wday',
                     (rffi.getintfield(glob_buf, 'c_tm_wday') + 1) % 7)
    rffi.setintfield(glob_buf, 'c_tm_yday',
                     rffi.getintfield(glob_buf, 'c_tm_yday') - 1)

    return glob_buf