def str_decode_code_page(cp, s, errors, errorhandler, final=False):
    """Decode the byte string `s` from code page `cp` with an error handler.

    Returns a 2-tuple: the text accumulated in the output builder and the
    output size reported by `_decode_helper` (summed over every chunk when
    `errors` is not 'strict').  Empty input yields `('', 0)` immediately.
    """
    total = len(s)
    if not total:
        return '', 0
    flags = _decode_code_page_flags(cp)
    encoding = _code_page_name(cp)
    assert errorhandler is not None
    out = StringBuilder(total)

    if errors == 'strict':
        # strict mode: hand the whole input to the helper in one go
        _, count = _decode_helper(cp, s, flags, encoding, errors,
                                  errorhandler, final, 0, total, out)
        return out.build(), count

    # any other mode: feed the helper one chunk at a time, where a chunk ends
    # at the position next_codepoint_pos() reports for the chunk start
    count = 0
    cursor = 0
    chunk_start = 0
    while cursor < total:
        chunk_end = next_codepoint_pos(s, chunk_start)
        cursor, produced = _decode_helper(cp, s, flags, encoding, errors,
                                          errorhandler, final,
                                          chunk_start, chunk_end, out)
        chunk_start = cursor
        count += produced
    return out.build(), count
def get_chars(self, size):
    """Return up to `size` codepoints of `self.text` from the current position.

    A negative `size`, or one larger than what is left, means "everything
    that remains".  `self.pos` (byte offset into the text) and `self.upos`
    (codepoint offset) are advanced past the returned slice.  Returns ""
    when there is no text or when `size` is 0.
    """
    if self.text is None or size == 0:
        return ""

    lgt = codepoints_in_utf8(self.text)
    available = lgt - self.upos
    if size < 0 or size > available:
        size = available
    assert size >= 0

    if self.pos > 0 or size < available:
        # partial read: walk `size` codepoints forward from the byte offset
        start = self.pos
        pos = start
        for i in range(size):
            pos = next_codepoint_pos(self.text, pos)
            self.upos += 1
        assert start >= 0
        assert pos >= 0
        chars = self.text[start:pos]
        self.pos = pos
    else:
        # nothing consumed yet and everything requested: return the whole
        # text without slicing
        chars = self.text
        self.pos = len(self.text)
        self.upos = lgt

    return chars
def encode_w(self, space, w_object, final=False):
    """Encode `w_object` through the codec buffer and return a bytes object.

    Input that the codec did not consume is stored in `self.pending` (with
    its codepoint count in `self.pending_len`) and is prepended to the input
    of the next call.
    """
    utf8data, length = space.utf8_len_w(w_object)
    space = self.space
    state = space.fromcache(CodecState)

    # glue leftovers from the previous call in front of the new input
    if len(self.pending) > 0:
        utf8data = self.pending + utf8data
        length += self.pending_len

    try:
        output = c_codecs.encodeex(self.encodebuf, utf8data, length,
                                   self.errors,
                                   state.encode_error_handler, self.name,
                                   get_ignore_error(final))
    except c_codecs.EncodeDecodeError as e:
        raise wrap_unicodeencodeerror(space, e, utf8data, length, self.name)
    except RuntimeError:
        raise wrap_runtimeerror(space)

    consumed = c_codecs.pypy_cjk_enc_inbuf_consumed(self.encodebuf)
    assert 0 <= consumed <= length

    leftover = length - consumed
    self.pending_len = leftover
    if leftover > 0:
        # `consumed` counts codepoints: walk that many forward to find the
        # byte offset where the unconsumed tail starts
        byte_pos = 0
        while consumed > 0:
            byte_pos = rutf8.next_codepoint_pos(utf8data, byte_pos)
            consumed -= 1
        self.pending = utf8data[byte_pos:]
    else:
        self.pending = ""
    return space.newbytes(output)
def namereplace_errors(space, w_exc):
    """Error callback replacing unencodable characters with \\N{NAME} escapes.

    Only UnicodeEncodeError is handled; each codepoint in the exception's
    [start, end) range becomes `\\N{name}` when the unicode database knows a
    name for it, otherwise whatever `raw_unicode_escape_helper` emits.
    Returns the tuple (replacement text, end).  Any other exception type
    raises TypeError.
    """
    check_exception(space, w_exc)
    if not space.isinstance_w(w_exc, space.w_UnicodeEncodeError):
        raise oefmt(space.w_TypeError,
                    "don't know how to handle %T in error callback", w_exc)

    w_obj = space.getattr(w_exc, space.newtext('object'))
    space.realutf8_w(w_obj)  # for errors
    w_obj = space.convert_arg_to_w_unicode(w_obj)
    start = space.int_w(space.getattr(w_exc, space.newtext('start')))
    w_end = space.getattr(w_exc, space.newtext('end'))
    end = space.int_w(w_end)

    # translate codepoint indices into byte offsets of the utf8 string
    byte_pos = w_obj._index_to_byte(start)
    byte_end = w_obj._index_to_byte(end)

    out = StringBuilder()
    utf8 = w_obj._utf8
    while byte_pos < byte_end:
        oc = rutf8.codepoint_at_pos(utf8, byte_pos)
        try:
            name = unicodedb.name(oc)
        except KeyError:
            # unnamed codepoint: fall back to a numeric escape
            unicodehelper.raw_unicode_escape_helper(out, oc)
        else:
            out.append('\\N{')
            out.append(name)
            out.append('}')
        byte_pos = rutf8.next_codepoint_pos(utf8, byte_pos)

    r = out.build()
    lgt = rutf8.check_utf8(r, True)
    return space.newtuple([space.newutf8(r, lgt), w_end])
def get_chars(self, size):
    """Read up to `size` codepoints; returns a tuple (utf8, lgt).

    A negative or too large `size` means "all that remains".  `self.pos`
    (byte offset) and `self.upos` (codepoint offset) are moved past the
    returned text.  Returns ("", 0) when there is no text or `size` is 0.
    """
    text = self.text
    if text is None or size == 0:
        return "", 0

    total = self.ulen
    remaining = total - self.upos
    if size < 0 or size > remaining:
        size = remaining
    assert size >= 0

    if self.pos <= 0 and size >= remaining:
        # nothing consumed so far and everything requested: no slicing needed
        self.pos = len(text)
        self.upos = total
        return text, total

    # partial read: step `size` codepoints forward from the current offset
    begin = self.pos
    cursor = begin
    for _ in range(size):
        cursor = next_codepoint_pos(text, cursor)
        self.upos += 1
    assert begin >= 0
    assert cursor >= 0
    self.pos = cursor
    return text[begin:cursor], size
def xmlcharrefreplace_errors(space, w_exc):
    """Error callback replacing unencodable characters with `&#NNN;`.

    Only UnicodeEncodeError is handled; every codepoint in the exception's
    [start, end) range is written as a decimal XML character reference.
    Returns the tuple (replacement text, end).  Any other exception type
    raises TypeError.
    """
    check_exception(space, w_exc)
    if not space.isinstance_w(w_exc, space.w_UnicodeEncodeError):
        raise oefmt(space.w_TypeError,
                    "don't know how to handle %T in error callback", w_exc)

    w_obj = space.getattr(w_exc, space.newtext('object'))
    space.realutf8_w(w_obj)  # for errors
    w_obj = space.convert_arg_to_w_unicode(w_obj)
    start = space.int_w(space.getattr(w_exc, space.newtext('start')))
    w_end = space.getattr(w_exc, space.newtext('end'))
    end = space.int_w(w_end)

    # translate codepoint indices into byte offsets of the utf8 string
    byte_pos = w_obj._index_to_byte(start)
    byte_end = w_obj._index_to_byte(end)

    out = StringBuilder()
    utf8 = w_obj._utf8
    while byte_pos < byte_end:
        code = rutf8.codepoint_at_pos(utf8, byte_pos)
        out.append("&#")
        out.append(str(code))
        out.append(";")
        byte_pos = rutf8.next_codepoint_pos(utf8, byte_pos)

    r = out.build()
    lgt = rutf8.check_utf8(r, True)
    return space.newtuple([space.newutf8(r, lgt), w_end])
def decode_unicode_utf8(space, s, ps, q):
    """Rewrite s[ps:q] so that every non-ASCII sequence is a \\U escape.

    ****The Python 2.7 version, producing UTF-32 escapes****
    The string is utf8-encoded, but 'unicode_escape' expects latin-1, so
    multibyte sequences must be escaped.  ASCII bytes are copied unchanged.
    Worst case: "<92><195><164>" may become "\\u005c\\U000000E4" (16 bytes).
    """
    pieces = []  # assembled with a list, joined at the end
    while ps < q:
        if s[ps] == '\\':
            pieces.append('\\')
            ps += 1
            if ord(s[ps]) & 0x80:
                # A multibyte sequence will follow, it will be escaped like
                # \u1234.  To avoid confusion with the backslash we just
                # wrote, we emit "\u005c" instead.
                pieces.append("u005c")
        if not (ord(s[ps]) & 0x80):
            # plain ASCII byte: copy verbatim
            pieces.append(s[ps])
            ps += 1
            continue
        cp = rutf8.codepoint_at_pos(s, ps)
        # adding 0x10000000 pads the hex form to a fixed width; [3:] skips
        # the "0x" and the leading 1
        pieces.append('\\U0' + hex(cp + 0x10000000)[3:])
        ps = rutf8.next_codepoint_pos(s, ps)
    return ''.join(pieces)
def next_n(self, position, n, end_position):
    """Advance the byte `position` by `n` codepoints and return the result.

    Raises EndOfString if `end_position` is reached before `n` steps have
    been taken.
    """
    # count with a plain while-loop instead of range(n): n can be quite large
    remaining = n
    while remaining > 0:
        if position >= end_position:
            raise EndOfString
        position = rutf8.next_codepoint_pos(self._utf8, position)
        remaining -= 1
    return position
def peek_char(self):
    """Return the codepoint at the current position without advancing.

    Like next_char, but leaves `self.pos` alone.  Raises StopIteration when
    the text is exhausted.
    """
    if self.exhausted():
        raise StopIteration
    begin = self.pos
    stop = next_codepoint_pos(self.text, begin)
    assert begin >= 0
    assert stop >= 0
    return self.text[begin:stop]
def _incr(s, pos, isutf8):
    """Return the position of the item following `pos` in `s`.

    For utf8 input this is the start of the next codepoint; otherwise it is
    simply `pos + 1`.
    """
    if not isutf8:
        return pos + 1
    from rpython.rlib.rutf8 import next_codepoint_pos
    assert pos >= 0
    following = next_codepoint_pos(s, pos)
    assert following >= 0
    return following
def write(self, string):
    """Store `string` into `self.data` one codepoint per slot, at `self.pos`.

    The buffer is resized first when the write would run past its end;
    `self.pos` ends up just after the written codepoints.
    """
    count = get_utf8_length(string)
    if self.pos + count > len(self.data):
        self.resize(self.pos + count)

    base = self.pos
    byte_pos = 0
    for offset in range(count):
        byte_end = next_codepoint_pos(string, byte_pos)
        # each slot holds the utf8 bytes of exactly one codepoint
        self.data[base + offset] = string[byte_pos:byte_end]
        byte_pos = byte_end
    self.pos = base + count
def test_utf8_iterator_pos(arg):
    """Utf8StringPosIterator yields each codepoint with its byte position."""
    utf8s = arg.encode('utf8')
    seen = []
    expected_pos = 0
    for codepoint, pos in rutf8.Utf8StringPosIterator(utf8s):
        seen.append(unichr(codepoint))
        assert codepoint == rutf8.codepoint_at_pos(utf8s, pos)
        # the reported position must match an independent walk of the string
        assert pos == expected_pos
        expected_pos = rutf8.next_codepoint_pos(utf8s, expected_pos)
    assert list(arg) == seen
def charmap_build(space, chars):
    """Build a dict mapping each codepoint of `chars` to its index.

    The index is the codepoint's ordinal position within `chars` (0 for the
    first codepoint, 1 for the second, ...).
    """
    # XXX CPython sometimes uses a three-level trie
    w_charmap = space.newdict()
    byte_pos = 0
    index = 0
    while byte_pos < len(chars):
        w_key = space.newint(rutf8.codepoint_at_pos(chars, byte_pos))
        space.setitem(w_charmap, w_key, space.newint(index))
        byte_pos = rutf8.next_codepoint_pos(chars, byte_pos)
        index += 1
    return w_charmap
def next_char(self):
    """Return the codepoint at the current position and step past it.

    Advances both `self.pos` (byte offset) and `self.upos` (codepoint
    offset).  Raises StopIteration when the text is exhausted.
    """
    if self.exhausted():
        raise StopIteration
    begin = self.pos
    stop = next_codepoint_pos(self.text, begin)
    assert begin >= 0
    assert stop >= 0
    ch = self.text[begin:stop]
    self.pos = stop
    self.upos += 1
    return ch
def test_next_pos(uni):
    """next_codepoint_pos advances by the encoded length of each codepoint."""
    # expected step sizes: the utf8 length of every element, in order
    widths = [len(elem.encode('utf8')) for elem in uni]
    utf8 = uni.encode('utf8')
    pos = 0
    index = 0
    while pos < len(utf8):
        following = rutf8.next_codepoint_pos(utf8, pos)
        assert following - pos == widths[index]
        index += 1
        pos = following
def utf8_to_char32(utf8, target_ptr, target_length, add_final_zero):
    """Write the codepoints of `utf8` as 32-bit integers into `target_ptr`.

    'target_ptr' is a raw pointer to 'target_length' 32-bit integers; we
    assume (and check) that target_length == number of unichars in utf8.
    When `add_final_zero` is true, one extra zero is stored right after the
    last codepoint.
    """
    out = rffi.cast(rffi.UINTP, target_ptr)
    byte_pos = 0
    for index in range(target_length):
        out[index] = rffi.cast(rffi.UINT,
                               rutf8.codepoint_at_pos(utf8, byte_pos))
        byte_pos = rutf8.next_codepoint_pos(utf8, byte_pos)
    # the whole string must have been consumed by exactly target_length steps
    assert byte_pos == len(utf8)
    if add_final_zero:
        out[target_length] = rffi.cast(rffi.UINT, 0)
def __init__(self, data=None, pos=0):
    """Initialise the buffer from `data` and set the position to `pos`.

    `data` defaults to the empty string.  The text is broken into unicode
    codepoints by `self.write()`, after first skipping forward over
    codepoints while the byte offset is still below `pos`.  Finally
    `self.pos` is set to `pos`.
    """
    if data is None:
        data = ''
    self.data = []
    self.pos = 0
    # break the data into unicode codepoints
    _pos = 0
    # Checking the length in the loop condition (rather than only after
    # stepping) keeps next_codepoint_pos() from being called on an offset
    # at or past the end of `data`, e.g. for empty data with pos > 0.
    while _pos < pos and _pos < len(data):
        _pos = next_codepoint_pos(data, _pos)
    self.write(data[_pos:])
    self.pos = pos
def surrogateescape_errors(space, w_exc):
    """Error callback implementing the 'surrogateescape' scheme.

    Encoding: every codepoint in [start, end) must lie in U+DC80..U+DCFF and
    is turned back into the single byte `code - 0xdc00`; anything else
    re-raises the original exception.  Returns (bytes, end).

    Decoding: up to four offending non-ASCII bytes are mapped to
    U+DC00 + byte.  ASCII bytes are never escaped; if the very first byte is
    ASCII the original exception is re-raised.  Returns
    (text, start + number of bytes consumed).

    Any other exception type raises TypeError.
    """
    check_exception(space, w_exc)

    if space.isinstance_w(w_exc, space.w_UnicodeEncodeError):
        w_obj = space.getattr(w_exc, space.newtext('object'))
        w_obj = space.convert_arg_to_w_unicode(w_obj)
        start = space.int_w(space.getattr(w_exc, space.newtext('start')))
        w_end = space.getattr(w_exc, space.newtext('end'))
        end = space.int_w(w_end)
        raw = ''
        # translate codepoint indices into byte offsets of the utf8 string
        byte_pos = w_obj._index_to_byte(start)
        byte_end = w_obj._index_to_byte(end)
        utf8 = w_obj._utf8
        while byte_pos < byte_end:
            code = rutf8.codepoint_at_pos(utf8, byte_pos)
            if not (0xdc80 <= code <= 0xdcff):
                # Not a UTF-8b surrogate, fail with original exception
                raise OperationError(space.type(w_exc), w_exc)
            raw += chr(code - 0xdc00)
            byte_pos = rutf8.next_codepoint_pos(utf8, byte_pos)
        return space.newtuple([space.newbytes(raw), w_end])

    if space.isinstance_w(w_exc, space.w_UnicodeDecodeError):
        consumed = 0
        start = space.int_w(space.getattr(w_exc, space.newtext('start')))
        end = space.int_w(space.getattr(w_exc, space.newtext('end')))
        obj = space.bytes_w(space.getattr(w_exc, space.newtext('object')))
        span = end - start
        replace = u''
        while consumed < 4 and consumed < span:
            byte = ord(obj[start + consumed])
            if byte < 128:
                # Refuse to escape ASCII bytes.
                break
            replace += unichr(0xdc00 + byte)
            consumed += 1
        if consumed == 0:
            # codec complained about ASCII byte.
            raise OperationError(space.type(w_exc), w_exc)
        replace_utf8 = runicode.unicode_encode_utf_8(
            replace, len(replace), 'strict', allow_surrogates=True)
        return space.newtuple([
            space.newtext(replace_utf8, len(replace)),
            space.newint(start + consumed),
        ])

    raise oefmt(space.w_TypeError,
                "don't know how to handle %T in error callback", w_exc)
def descr_next(self, space):
    """Return the next codepoint of the sequence as a one-character string.

    Raises StopIteration once the iterator has been exhausted; at that point
    the reference to the underlying sequence is dropped.
    """
    from pypy.objspace.std.unicodeobject import W_UnicodeObject
    from rpython.rlib import rutf8

    w_seq = self.w_seq
    if w_seq is None:
        raise OperationError(space.w_StopIteration, space.w_None)
    assert isinstance(w_seq, W_UnicodeObject)

    if self.index == w_seq._length:
        # reached the end: release the sequence so later calls stop early
        self.w_seq = None
        raise OperationError(space.w_StopIteration, space.w_None)

    begin = self.byteindex
    stop = rutf8.next_codepoint_pos(w_seq._utf8, begin)
    w_char = W_UnicodeObject(w_seq._utf8[begin:stop], 1)
    self.byteindex = stop
    self.index += 1
    return w_char
def utf8_to_char16(utf8, target_ptr, target_length, add_final_zero):
    """Write `utf8` as 16-bit units into `target_ptr`.

    'target_ptr' is a raw pointer to 'target_length' 16-bit integers; we
    assume (and check) that target_length == utf8_size_as_char16(utf8).
    Codepoints above 0xFFFF are stored as a surrogate pair.  When
    `add_final_zero` is true, a zero unit is stored after the last one.
    """
    base = rffi.cast(rffi.USHORTP, target_ptr)
    cursor = base
    byte_pos = 0
    while byte_pos < len(utf8):
        ordinal = rutf8.codepoint_at_pos(utf8, byte_pos)
        if ordinal <= 0xFFFF:
            cursor[0] = rffi.cast(rffi.USHORT, ordinal)
            cursor = rffi.ptradd(cursor, 1)
        else:
            # split into a high (0xD800..) and a low (0xDC00..) surrogate
            ordinal -= 0x10000
            cursor[0] = rffi.cast(rffi.USHORT, 0xD800 | (ordinal >> 10))
            cursor[1] = rffi.cast(rffi.USHORT, 0xDC00 | (ordinal & 0x3FF))
            cursor = rffi.ptradd(cursor, 2)
        byte_pos = rutf8.next_codepoint_pos(utf8, byte_pos)
    # exactly target_length units must have been written
    assert cursor == rffi.ptradd(base, target_length)
    if add_final_zero:
        cursor[0] = rffi.cast(rffi.USHORT, 0)
def backslashreplace_errors(space, w_exc):
    """Error callback replacing unencodable characters by backslash escapes.

    Only UnicodeEncodeError is handled.  Each codepoint in [start, end) is
    written as \\xNN, \\uNNNN or \\UNNNNNNNN depending on its magnitude,
    zero-padded to 2, 4 or 8 hex digits.  Returns (replacement text, end).
    Any other exception type raises TypeError.
    """
    check_exception(space, w_exc)
    if not space.isinstance_w(w_exc, space.w_UnicodeEncodeError):
        raise oefmt(space.w_TypeError,
                    "don't know how to handle %T in error callback", w_exc)

    w_obj = space.getattr(w_exc, space.newtext('object'))
    space.realutf8_w(w_obj)  # for errors
    w_obj = space.convert_arg_to_w_unicode(w_obj)
    start = space.int_w(space.getattr(w_exc, space.newtext('start')))
    w_end = space.getattr(w_exc, space.newtext('end'))
    end = space.int_w(w_end)

    # translate codepoint indices into byte offsets of the utf8 string
    byte_pos = w_obj._index_to_byte(start)
    byte_end = w_obj._index_to_byte(end)

    out = StringBuilder()
    utf8 = w_obj._utf8
    while byte_pos < byte_end:
        oc = rutf8.codepoint_at_pos(utf8, byte_pos)
        num = hex(oc)
        if oc >= 0x10000:
            prefix, width = "\\U", 8
        elif oc >= 0x100:
            prefix, width = "\\u", 4
        else:
            prefix, width = "\\x", 2
        out.append(prefix)
        total = len(num)
        digits = total - 2  # num starts with '0x'
        if width > digits:
            out.append_multiple_char('0', width - digits)
        out.append_slice(num, 2, total)
        byte_pos = rutf8.next_codepoint_pos(utf8, byte_pos)

    r = out.build()
    lgt = rutf8.check_utf8(r, True)
    return space.newtuple([space.newutf8(r, lgt), w_end])
def backslashreplace_errors(space, w_exc):
    """Error callback replacing offending input by backslash escapes.

    For UnicodeEncodeError and UnicodeTranslateError every codepoint in
    [start, end) is passed to `raw_unicode_escape_helper`; for
    UnicodeDecodeError every byte in [start, end) is.  Returns
    (replacement text, end).  Any other exception type raises TypeError.
    """
    check_exception(space, w_exc)

    is_encode_like = (
        space.isinstance_w(w_exc, space.w_UnicodeEncodeError) or
        space.isinstance_w(w_exc, space.w_UnicodeTranslateError))
    if is_encode_like:
        w_obj = space.getattr(w_exc, space.newtext('object'))
        space.realutf8_w(w_obj)  # for errors
        w_obj = space.convert_arg_to_w_unicode(w_obj)
        start = space.int_w(space.getattr(w_exc, space.newtext('start')))
        w_end = space.getattr(w_exc, space.newtext('end'))
        end = space.int_w(w_end)
        # translate codepoint indices into byte offsets of the utf8 string
        byte_pos = w_obj._index_to_byte(start)
        byte_end = w_obj._index_to_byte(end)
        out = StringBuilder()
        utf8 = w_obj._utf8
        while byte_pos < byte_end:
            code = rutf8.codepoint_at_pos(utf8, byte_pos)
            unicodehelper.raw_unicode_escape_helper(out, code)
            byte_pos = rutf8.next_codepoint_pos(utf8, byte_pos)
        return space.newtuple([space.newtext(out.build()), w_end])

    if space.isinstance_w(w_exc, space.w_UnicodeDecodeError):
        obj = space.bytes_w(space.getattr(w_exc, space.newtext('object')))
        start = space.int_w(space.getattr(w_exc, space.newtext('start')))
        w_end = space.getattr(w_exc, space.newtext('end'))
        end = space.int_w(w_end)
        out = StringBuilder()
        index = start
        while index < end:
            # raw bytes: escape each one individually
            unicodehelper.raw_unicode_escape_helper(out, ord(obj[index]))
            index += 1
        return space.newtuple([space.newtext(out.build()), w_end])

    raise oefmt(space.w_TypeError,
                "don't know how to handle %T in error callback", w_exc)
def _parse_spec(self, default_type, default_align):
    """Parse `self.spec` into the individual format fields.

    Fills in `_fill_char`, `_align`, `_alternate`, `_sign`, `_width`,
    `_thousands_sep`, `_precision` and `_type`, starting from the given
    defaults.

    Returns True when the spec is empty (only the defaults were stored and
    `_width` / `_type` are left untouched) and False after a full parse.
    Raises ValueError (through oefmt) for a missing precision, trailing
    garbage, an invalid presentation type, or ',' combined with a type that
    does not accept it.
    """
    space = self.space
    self._fill_char = self._lit(" ")[0]
    self._align = default_align
    self._alternate = False
    self._sign = "\0"
    self._thousands_sep = False
    self._precision = -1
    the_type = default_type
    spec = self.spec
    if not spec:
        return True
    length = len(spec)
    i = 0
    got_align = True
    got_fill_char = False
    # The single character could be utf8-encoded unicode
    if self.is_unicode:
        after_i = rutf8.next_codepoint_pos(spec, i)
    else:
        after_i = i + 1
    # A fill character is only present if something follows it.  Testing
    # after_i against the length (instead of assuming a one-byte fill
    # character) avoids indexing past the end when the spec consists of a
    # single multibyte codepoint.
    if after_i < length and self._is_alignment(spec[after_i]):
        self._align = spec[after_i]
        self._fill_char = spec[i:after_i]
        got_fill_char = True
        i = after_i + 1
    elif length - i >= 1 and self._is_alignment(spec[i]):
        self._align = spec[i]
        i += 1
    else:
        got_align = False
    if length - i >= 1 and self._is_sign(spec[i]):
        self._sign = spec[i]
        i += 1
    if length - i >= 1 and spec[i] == "#":
        self._alternate = True
        i += 1
    if not got_fill_char and length - i >= 1 and spec[i] == "0":
        # a leading "0" acts as the fill character and, unless an alignment
        # was given explicitly, selects "=" alignment
        self._fill_char = self._lit("0")[0]
        if not got_align:
            self._align = "="
        i += 1
    self._width, i = _parse_int(self.space, spec, i, length)
    if length != i and spec[i] == ",":
        self._thousands_sep = True
        i += 1
    if length != i and spec[i] == ".":
        i += 1
        self._precision, i = _parse_int(self.space, spec, i, length)
        if self._precision == -1:
            raise oefmt(space.w_ValueError, "no precision given")
    if length - i > 1:
        raise oefmt(space.w_ValueError, "invalid format spec")
    if length - i == 1:
        presentation_type = spec[i]
        if self.is_unicode:
            try:
                rutf8.check_utf8(spec[i], True)
                the_type = spec[i][0]
            except rutf8.CheckError:
                raise oefmt(space.w_ValueError,
                            "invalid presentation type")
        else:
            the_type = presentation_type
        i += 1
    self._type = the_type
    if self._thousands_sep:
        tp = self._type
        if (tp == "d" or
            tp == "e" or
            tp == "f" or
            tp == "g" or
            tp == "E" or
            tp == "G" or
            tp == "%" or
            tp == "F" or
            tp == "\0"):
            # ok
            pass
        else:
            raise oefmt(space.w_ValueError, "invalid type with ','")
    return False
def surrogatepass_errors(space, w_exc):
    """Error callback implementing the 'surrogatepass' scheme.

    Encoding (UnicodeEncodeError): every codepoint in [start, end) must be a
    surrogate (0xd800..0xdfff); it is written out as raw bytes in the layout
    of the exception's encoding (UTF-8, UTF-16 LE/BE or UTF-32 LE/BE).
    Returns (bytes, end).

    Decoding (UnicodeDecodeError): tries to read one surrogate at `start`
    in the layout of the exception's encoding.  Returns
    (one-character text, start + bytelength).

    In both directions the original exception is re-raised when the data is
    not a surrogate; when encoding, also when the encoding is not one of the
    supported ones.  Any other exception type raises TypeError.
    """
    check_exception(space, w_exc)
    if space.isinstance_w(w_exc, space.w_UnicodeEncodeError):
        w_obj = space.getattr(w_exc, space.newtext('object'))
        w_obj = space.convert_arg_to_w_unicode(w_obj)
        start = space.int_w(space.getattr(w_exc, space.newtext('start')))
        w_end = space.getattr(w_exc, space.newtext('end'))
        encoding = space.text_w(space.getattr(w_exc, space.newtext('encoding')))
        bytelength, code = get_standard_encoding(encoding)
        if code == ENC_UNKNOWN:
            # Not supported, fail with original exception
            raise OperationError(space.type(w_exc), w_exc)
        end = space.int_w(w_end)
        builder = StringBuilder()
        # from here on start/end are byte offsets into the utf8 string
        start = w_obj._index_to_byte(start)
        end = w_obj._index_to_byte(end)
        obj = w_obj._utf8
        pos = start
        while pos < end:
            ch = rutf8.codepoint_at_pos(obj, pos)
            pos = rutf8.next_codepoint_pos(obj, pos)
            if ch < 0xd800 or ch > 0xdfff:
                # Not a surrogate, fail with original exception
                raise OperationError(space.type(w_exc), w_exc)
            if code == ENC_UTF8:
                # three-byte utf8 layout: 1110xxxx 10xxxxxx 10xxxxxx
                builder.append(chr(0xe0 | (ch >> 12)))
                builder.append(chr(0x80 | ((ch >> 6) & 0x3f)))
                builder.append(chr(0x80 | (ch & 0x3f)))
            elif code == ENC_UTF16LE:
                # low byte first
                builder.append(chr(ch & 0xff))
                builder.append(chr(ch >> 8))
            elif code == ENC_UTF16BE:
                # high byte first
                builder.append(chr(ch >> 8))
                builder.append(chr(ch & 0xff))
            elif code == ENC_UTF32LE:
                # a surrogate fits in 16 bits, so the upper two bytes are 0
                builder.append(chr(ch & 0xff))
                builder.append(chr(ch >> 8))
                builder.append(chr(0))
                builder.append(chr(0))
            elif code == ENC_UTF32BE:
                builder.append(chr(0))
                builder.append(chr(0))
                builder.append(chr(ch >> 8))
                builder.append(chr(ch & 0xff))
        return space.newtuple([space.newbytes(builder.build()), w_end])
    elif space.isinstance_w(w_exc, space.w_UnicodeDecodeError):
        start = space.int_w(space.getattr(w_exc, space.newtext('start')))
        obj = space.bytes_w(space.getattr(w_exc, space.newtext('object')))
        encoding = space.text_w(space.getattr(w_exc, space.newtext('encoding')))
        bytelength, code = get_standard_encoding(encoding)
        ch = 0
        # Try decoding a single surrogate character. If there are more,
        # let the codec call us again
        # (-1 marks a byte that lies beyond the end of the input)
        ch0 = ord(obj[start + 0]) if len(obj) > start + 0 else -1
        ch1 = ord(obj[start + 1]) if len(obj) > start + 1 else -1
        ch2 = ord(obj[start + 2]) if len(obj) > start + 2 else -1
        ch3 = ord(obj[start + 3]) if len(obj) > start + 3 else -1
        if code == ENC_UTF8:
            if (ch1 != -1 and ch2 != -1 and
                ch0 & 0xf0 == 0xe0 and
                ch1 & 0xc0 == 0x80 and
                ch2 & 0xc0 == 0x80):
                # it's a three-byte code
                ch = ((ch0 & 0x0f) << 12) + ((ch1 & 0x3f) << 6) + (ch2 & 0x3f)
        elif code == ENC_UTF16LE:
            ch = (ch1 << 8) | ch0
        elif code == ENC_UTF16BE:
            ch = (ch0 << 8) | ch1
        elif code == ENC_UTF32LE:
            ch = (ch3 << 24) | (ch2 << 16) | (ch1 << 8) | ch0
        elif code == ENC_UTF32BE:
            ch = (ch0 << 24) | (ch1 << 16) | (ch2 << 8) | ch3
        # a missing byte (-1) makes the combined value negative, so it is
        # rejected by the range check below as well
        if ch < 0xd800 or ch > 0xdfff:
            # it's not a surrogate - fail
            ch = 0
        if ch == 0:
            raise OperationError(space.type(w_exc), w_exc)
        ch_utf8 = rutf8.unichr_as_utf8(ch, allow_surrogates=True)
        return space.newtuple(
            [space.newtext(ch_utf8, 1), space.newint(start + bytelength)])
    else:
        raise oefmt(space.w_TypeError,
                    "don't know how to handle %T in error callback", w_exc)
def replace_count(input, sub, by, maxcount=-1, isutf8=False):
    """Replace occurrences of `sub` in `input` by `by`.

    Returns a tuple (result, number of replacements).  `input` may be a str,
    a unicode or a list; a matching builder class is selected for it.

    * maxcount == 0 returns `input` unchanged with a count of 0; a positive
      maxcount caps the number of replacements; a negative one means no cap.
    * An empty `sub` inserts `by` before each item and once more at the end;
      with `isutf8` set the insertion points are codepoint boundaries
      instead of every index.
    * For str input, single-character and general substring replacement are
      delegated to the specialised replace_count_str_* helpers.

    OverflowError from the result-size computation propagates to the caller.
    """
    # pick the builder matching the input type
    if isinstance(input, str):
        Builder = StringBuilder
    elif isinstance(input, unicode):
        Builder = UnicodeBuilder
    else:
        assert isinstance(input, list)
        Builder = ByteListBuilder
    if maxcount == 0:
        return input, 0

    if not sub and not isutf8:
        # empty pattern, plain indexing: `by` goes before each of the first
        # `upper` items and once more after them
        upper = len(input)
        if maxcount > 0 and maxcount < upper + 2:
            upper = maxcount - 1
            assert upper >= 0

        try:
            result_size = ovfcheck(upper * len(by))
            result_size = ovfcheck(result_size + upper)
            result_size = ovfcheck(result_size + len(by))
            remaining_size = len(input) - upper
            result_size = ovfcheck(result_size + remaining_size)
        except OverflowError:
            # re-raised unchanged
            # NOTE(review): presumably the explicit try/except is what the
            # RPython translator needs around ovfcheck() -- confirm
            raise
        builder = Builder(result_size)
        for i in range(upper):
            builder.append(by)
            builder.append(input[i])
        builder.append(by)
        builder.append_slice(input, upper, len(input))
        replacements = upper + 1

    elif isinstance(input, str) and len(sub) == 1:
        # single-character pattern on a str: use the specialised helpers
        if len(by) == 1:
            return replace_count_str_chr_chr(input, sub[0], by[0], maxcount)
        return replace_count_str_chr_str(input, sub[0], by, maxcount)

    else:
        # First compute the exact result size
        if sub:
            cnt = count(input, sub, 0, len(input))
            if isinstance(input, str) and cnt == 0:
                return input, 0
            if isinstance(input, str):
                return replace_count_str_str_str(input, sub, by, cnt, maxcount)
        else:
            assert isutf8
            from rpython.rlib import rutf8
            # one insertion point per codepoint, plus one at the very end
            cnt = rutf8.codepoints_in_utf8(input) + 1

        if cnt > maxcount and maxcount > 0:
            cnt = maxcount
        diff_len = len(by) - len(sub)
        try:
            result_size = ovfcheck(diff_len * cnt)
            result_size = ovfcheck(result_size + len(input))
        except OverflowError:
            raise
        replacements = cnt

        builder = Builder(result_size)
        start = 0
        sublen = len(sub)

        if sublen == 0:
            assert isutf8
            from rpython.rlib import rutf8
            # empty pattern on utf8 input: emit `by`, then copy one whole
            # codepoint, until the input or the budget runs out
            while True:
                builder.append(by)
                maxcount -= 1
                if start == len(input) or maxcount == 0:
                    break
                next = rutf8.next_codepoint_pos(input, start)
                builder.append_slice(input, start, next)
                start = next
        else:
            # general case: copy the text up to each match, then `by`
            while maxcount != 0:
                next = find(input, sub, start, len(input))
                if next < 0:
                    break
                builder.append_slice(input, start, next)
                builder.append(by)
                start = next + sublen
                maxcount -= 1   # NB. if it's already < 0, it stays < 0

        # copy whatever follows the last replacement
        builder.append_slice(input, start, len(input))

    return builder.build(), replacements
def _advance_codepoint(self):
    """Step over one codepoint, updating both the byte and codepoint offsets.

    Must only be called after checking self.exhausted()!
    """
    following = next_codepoint_pos(self.text, self.pos)
    self.pos = following
    self.upos += 1
def next_n(self, position, n, end_position):
    """Advance the byte `position` by `n` codepoints and return the result.

    Raises EndOfString if `end_position` is reached before `n` steps have
    been taken.
    """
    # Count with a plain while-loop rather than range(n): n can be quite
    # large, and under Python 2 range(n) would build a list of that size.
    i = 0
    while i < n:
        if position >= end_position:
            raise EndOfString
        position = rutf8.next_codepoint_pos(self._utf8, position)
        i += 1
    return position
def next(self, position):
    """Return the byte position of the codepoint following `position`."""
    following = rutf8.next_codepoint_pos(self._utf8, position)
    return following