def test_get_utf8_length(unichars):
    """Check that rutf8.get_utf8_length() agrees with the codepoint count
    of the original unicode string."""
    text = u''.join(unichars)
    expected = len(text)
    # Encode codepoint by codepoint and glue the utf8 bytes together.
    encoded = ''.join(c.encode('utf8') for c in text)
    measured = rutf8.get_utf8_length(encoded)
    # On narrow unicode builds surrogate pairs make the two counts
    # legitimately disagree, so only assert when that cannot happen.
    if not _has_surrogates(encoded) or sys.maxunicode > 0xffff:
        assert measured == expected
def multibytecodec_decerror(decodebuf, e, errors, errorcb, namecb,
                            stringdata):
    """Handle a decoding error reported by the C-level multibyte codec.

    'e' is the raw error code from the C decoder: a positive value is the
    size of an illegal sequence, MBERR_TOOFEW means truncated input, and
    MBERR_NOMEMORY is an allocation failure.  Depending on 'errors'
    ('strict'/'ignore'/'replace' or a custom handler via 'errorcb'), this
    either raises EncodeDecodeError or pushes a replacement string into
    'decodebuf' and resumes decoding at position 'end'.
    """
    if e > 0:
        reason = "illegal multibyte sequence"
        esize = e
    elif e == MBERR_TOOFEW:
        reason = "incomplete multibyte sequence"
        # size of the error span = whatever input bytes are left unread
        esize = pypy_cjk_dec_inbuf_remaining(decodebuf)
    elif e == MBERR_NOMEMORY:
        raise MemoryError
    else:
        # unknown error code from the C layer
        raise RuntimeError
    #
    # compute the unicode to use as a replacement -> 'replace', and
    # the current position in the input 'unicodedata' -> 'end'
    start = pypy_cjk_dec_inbuf_consumed(decodebuf)
    end = start + esize
    if errors == "strict":
        raise EncodeDecodeError(start, end, reason)
    elif errors == "ignore":
        replace = ""
    elif errors == "replace":
        replace = UNICODE_REPLACEMENT_CHARACTER
    else:
        # custom app-level error handler; it may also move 'end'
        assert errorcb
        replace, end = errorcb(errors, namecb, reason,
                               stringdata, start, end)
    # 'replace' is RPython unicode here
    lgt = rutf8.get_utf8_length(replace)
    # raw wchar_t buffer: must be freed even if the C call raises
    inbuf = rffi.utf82wcharp(replace, lgt)
    try:
        r = pypy_cjk_dec_replace_on_error(decodebuf, inbuf, lgt, end)
    finally:
        lltype.free(inbuf, flavor='raw')
    if r == MBERR_NOMEMORY:
        raise MemoryError
def read_w(self, space, w_size=None):
    """Read up to 'w_size' characters from the text stream.

    A negative (or missing) size reads everything: the remaining decoded
    buffer plus the decode of all bytes left in the underlying binary
    buffer.  Otherwise characters are accumulated chunk by chunk until
    'size' characters are available or EOF is hit.  Returns an app-level
    unicode string; raises IOError when no decoder is attached.
    """
    self._check_attached(space)
    self._check_closed(space)
    if not self.w_decoder:
        raise oefmt(space.w_IOError, "not readable")

    size = convert_size(space, w_size)
    # pending writes must reach the buffer before we read from it
    self._writeflush(space)

    if size < 0:
        # Read everything
        w_bytes = space.call_method(self.w_buffer, "read")
        # final=True (space.w_True): flush the incremental decoder
        w_decoded = space.call_method(self.w_decoder, "decode",
                                      w_bytes, space.w_True)
        check_decoded(space, w_decoded)
        # prepend whatever was already decoded but not yet returned
        chars = self.decoded.get_chars(-1)
        lgt = get_utf8_length(chars)
        w_result = space.newutf8(chars, lgt)
        w_final = space.add(w_result, w_decoded)
        # reading to EOF invalidates the tell()/seek() snapshot
        self.snapshot = None
        return w_final

    remaining = size
    builder = Utf8StringBuilder(size)

    # Keep reading chunks until we have n characters to return
    while remaining > 0:
        if not self._ensure_data(space):
            break   # EOF
        data = self.decoded.get_chars(remaining)
        builder.append(data)
        remaining -= len(data)

    return space.newutf8(builder.build(), builder.getlength())
def convert_to(self, space, dtype):
    """Convert this unicode box to 'dtype'.

    Only unicode (identity) and object dtypes are supported; anything
    else raises NotImplementedError.
    """
    if dtype.is_unicode():
        # unicode -> unicode is the identity
        return self
    if dtype.is_object():
        # wrap the utf8 payload as an app-level unicode in an object box
        w_uni = space.newutf8(self._value, get_utf8_length(self._value))
        return W_ObjectBox(w_uni)
    raise oefmt(space.w_NotImplementedError,
                "Conversion from unicode not implemented yet")
def readline_w(self, space, w_limit=None):
    """Read one line (up to 'w_limit' characters, if non-negative).

    Scans the decoded buffer for the configured line ending, refilling
    via _ensure_data() as needed.  The tricky part is a '\\r\\n' newline
    straddling two chunks: a trailing '\\r' is carried over in 'remnant'
    and matched against the next chunk's leading '\\n'.
    """
    self._check_attached(space)
    self._check_closed(space)
    self._writeflush(space)

    limit = convert_size(space, w_limit)
    remnant = None
    builder = StringBuilder()
    # XXX maybe use Utf8StringBuilder instead?
    while True:
        # First, get some data if necessary
        has_data = self._ensure_data(space)
        if not has_data:
            # end of file
            if remnant:
                builder.append(remnant)
            break

        if remnant:
            # a remnant is only carried when reading untranslated '\r\n'
            assert not self.readtranslate and self.readnl == '\r\n'
            assert self.decoded.pos == 0
            if remnant == '\r' and self.decoded.text[0] == '\n':
                # the '\r\n' pair was split across chunks: line complete
                builder.append('\r\n')
                self.decoded.pos = 1
                remnant = None
                break
            else:
                # false alarm: the '\r' was not part of our newline
                builder.append(remnant)
                remnant = None
                continue

        if limit >= 0:
            remaining = limit - builder.getlength()
            assert remaining >= 0
        else:
            remaining = -1
        start = self.decoded.pos
        assert start >= 0
        # advances self.decoded.pos; True when a line ending was found
        found = self._scan_line_ending(remaining)
        end_scan = self.decoded.pos
        if end_scan > start:
            s = self.decoded.text[start:end_scan]
            builder.append(s)

        if found or (limit >= 0 and builder.getlength() >= limit):
            break

        # There may be some remaining chars we'll have to prepend to the
        # next chunk of data
        if not self.decoded.exhausted():
            remnant = self.decoded.get_chars(-1)
        # We have consumed the buffer
        self.decoded.reset()

    result = builder.build()
    lgt = get_utf8_length(result)
    return space.newutf8(result, lgt)
def write(self, string):
    """Write the utf8 byte string into the buffer at self.pos, storing
    one codepoint (as its utf8 byte slice) per cell of self.data."""
    n_chars = get_utf8_length(string)
    needed = self.pos + n_chars
    if needed > len(self.data):
        self.resize(needed)
    # Walk the utf8 string codepoint by codepoint, copying each
    # codepoint's bytes into its own slot.
    byte_pos = 0
    cell = self.pos
    for _ in range(n_chars):
        byte_end = next_codepoint_pos(string, byte_pos)
        self.data[cell] = string[byte_pos:byte_end]
        cell += 1
        byte_pos = byte_end
    self.pos += n_chars
def fget_string(self, space):
    """Return the subject string of this match as an app-level object:
    bytes for buffer/str contexts, unicode for utf8 contexts."""
    ctx = self.ctx
    if isinstance(ctx, rsre_core.BufMatchContext):
        return space.newbytes(ctx._buffer.as_str())
    if isinstance(ctx, rsre_core.StrMatchContext):
        return space.newbytes(ctx._string)
    if isinstance(ctx, rsre_utf8.Utf8MatchContext):
        utf8 = ctx._utf8
        return space.newutf8(utf8, rutf8.get_utf8_length(utf8))
    # no other context kinds exist
    raise SystemError
def descr_getstate(self, space):
    """Build the pickling state tuple: (value, readnl, position, dict)."""
    w_initialval = self.getvalue_w(space)
    # copy so later mutation of the live dict doesn't affect the state
    w_dict = space.call_method(self.w_dict, "copy")
    readnl = self.readnl
    if readnl is None:
        w_readnl = space.w_None
    else:
        w_nl = space.newutf8(readnl, get_utf8_length(readnl))
        w_readnl = space.str(w_nl)  # YYY
    state = [w_initialval, w_readnl, space.newint(self.buf.pos), w_dict]
    return space.newtuple(state)
def write_w(self, space, w_text):
    """Write the unicode object 'w_text' to the text stream.

    Optionally translates '\\n' to the configured output newline,
    encodes the text, and appends the bytes to the pending-write list,
    flushing to the underlying buffer when the pending size exceeds
    chunk_size or when line buffering requires it.  Returns the number
    of characters written.  Raises IOError if not writable, TypeError
    for non-unicode input.
    """
    self._check_attached(space)
    self._check_closed(space)
    if not self.w_encoder:
        raise oefmt(space.w_IOError, "not writable")
    if not space.isinstance_w(w_text, space.w_unicode):
        raise oefmt(space.w_TypeError,
                    "unicode argument expected, got '%T'", w_text)

    text, textlen = space.utf8_len_w(w_text)

    # detect '\n' only when somebody cares (translation or line buffering)
    haslf = False
    if (self.writetranslate and self.writenl) or self.line_buffering:
        if text.find('\n') >= 0:
            haslf = True
    if haslf and self.writetranslate and self.writenl:
        # translate '\n' to the configured output newline
        w_text = space.call_method(
            w_text, "replace", space.newutf8('\n', 1),
            space.newutf8(self.writenl, get_utf8_length(self.writenl)))
        text = space.utf8_w(w_text)

    # line buffering flushes on any newline character in the output
    needflush = False
    if self.line_buffering and (haslf or text.find('\r') >= 0):
        needflush = True

    # XXX What if we were just reading?
    if self.encodefunc:
        # fast path: fixed-encoding shortcut function
        w_bytes = self.encodefunc(space, w_text, self.errors)
        self.encoding_start_of_stream = False
    else:
        w_bytes = space.call_method(self.w_encoder, "encode", w_text)

    b = space.bytes_w(w_bytes)
    if not self.pending_bytes:
        self.pending_bytes = []
        self.pending_bytes_count = 0
    self.pending_bytes.append(b)
    self.pending_bytes_count += len(b)

    if self.pending_bytes_count > self.chunk_size or needflush:
        self._writeflush(space)

    if needflush:
        space.call_method(self.w_buffer, "flush")

    # writing invalidates any read-side state
    self.snapshot = None
    if self.w_decoder:
        space.call_method(self.w_decoder, "reset")

    return space.newint(textlen)
def readline_w(self, space, w_limit=None):
    """Read one line from the buffer, honouring the newline mode."""
    self._check_closed(space)
    limit = convert_size(space, w_limit)
    if self.readuniversal:
        line = self.buf.readline_universal(limit)
    else:
        # With translation on, every newline is already '\n'; otherwise
        # search for the configured newline string.
        sep = '\n' if self.readtranslate else self.readnl
        line = self.buf.readline(sep, limit)
    return space.newutf8(line, get_utf8_length(line))
def decode(self, space, input, errors=None):
    """Decode the byte string 'input' with this codec.

    Returns an app-level (unicode, bytes_consumed) tuple; C-level codec
    failures are re-raised as app-level UnicodeDecodeError/RuntimeError.
    """
    if errors is None:
        errors = 'strict'
    state = space.fromcache(CodecState)
    #
    try:
        utf8_output = c_codecs.decode(self.codec, input, errors,
                                      state.decode_error_handler,
                                      self.name)
    except c_codecs.EncodeDecodeError as e:
        raise wrap_unicodedecodeerror(space, e, input, self.name)
    except RuntimeError:
        raise wrap_runtimeerror(space)
    w_decoded = space.newutf8(utf8_output,
                              rutf8.get_utf8_length(utf8_output))
    return space.newtuple([w_decoded, space.newint(len(input))])
def decode_w(self, object, final=False):
    """Incrementally decode 'object'.

    Bytes forming an incomplete multibyte sequence at the end of the
    input are kept in self.pending and prepended to the next call.
    Returns the decoded text as an app-level unicode string.
    """
    space = self.space
    state = space.fromcache(CodecState)
    data = object
    if len(self.pending) > 0:
        # retry the leftover bytes from the previous call first
        data = self.pending + data
    try:
        output = c_codecs.decodeex(self.decodebuf, data, self.errors,
                                   state.decode_error_handler, self.name,
                                   get_ignore_error(final))
    except c_codecs.EncodeDecodeError as e:
        raise wrap_unicodedecodeerror(space, e, data, self.name)
    except RuntimeError:
        raise wrap_runtimeerror(space)
    consumed = c_codecs.pypy_cjk_dec_inbuf_consumed(self.decodebuf)
    assert 0 <= consumed <= len(data)
    # whatever was not consumed becomes the new pending tail
    self.pending = data[consumed:]
    return space.newutf8(output, rutf8.get_utf8_length(output))
def slice_w(space, ctx, start, end, w_default):
    """Return the subject slice [start:end] as an app-level string, or
    'w_default' when the byte-position range is invalid."""
    # 'start' and 'end' are byte positions
    if not (ctx.ZERO <= start <= end):
        return w_default
    if isinstance(ctx, rsre_core.BufMatchContext):
        size = end - start
        return space.newbytes(ctx._buffer.getslice(start, end, 1, size))
    if isinstance(ctx, rsre_core.StrMatchContext):
        # map logical positions back to real string indices
        lo = ctx._real_pos(start)
        hi = ctx._real_pos(end)
        return space.newbytes(ctx._string[lo:hi])
    if isinstance(ctx, rsre_utf8.Utf8MatchContext):
        piece = ctx._utf8[start:end]
        return space.newutf8(piece, rutf8.get_utf8_length(piece))
    # unreachable
    raise SystemError
def write_w(self, space, w_obj):
    """Append the unicode object 'w_obj' to the buffer.

    Runs the text through the newline decoder (if any) and translates
    '\\n' to the configured output newline.  Returns the length of the
    original argument, as CPython does.
    """
    if not space.isinstance_w(w_obj, space.w_unicode):
        raise oefmt(space.w_TypeError,
                    "unicode argument expected, got '%T'", w_obj)
    self._check_closed(space)
    orig_size = space.len_w(w_obj)

    w_decoded = w_obj
    if self.w_decoder is not None:
        # final=True: this chunk is decoded on its own
        w_decoded = space.call_method(self.w_decoder, "decode",
                                      w_obj, space.w_True)
    if self.writenl:
        w_nl = space.newutf8(self.writenl, get_utf8_length(self.writenl))
        w_decoded = space.call_method(w_decoded, "replace",
                                      space.newtext("\n"), w_nl)

    text = space.utf8_w(w_decoded)
    if text:
        self.buf.write(text)
    return space.newint(orig_size)
def multibytecodec_encerror(encodebuf, e, errors, errorcb, namecb,
                            unicodedata):
    """Handle an encoding error reported by the C-level multibyte codec.

    'e' is the raw error code from the C encoder: a positive value is
    the size of the unencodable span, MBERR_TOOFEW means truncated
    input, MBERR_NOMEMORY an allocation failure.  Depending on 'errors'
    this raises EncodeDecodeError or pushes a replacement byte string
    into 'encodebuf' and resumes encoding at position 'end'.
    """
    if e > 0:
        reason = "illegal multibyte sequence"
        esize = e
    elif e == MBERR_TOOFEW:
        reason = "incomplete multibyte sequence"
        # size of the error span = whatever input is left unread
        esize = pypy_cjk_enc_inbuf_remaining(encodebuf)
    elif e == MBERR_NOMEMORY:
        raise MemoryError
    else:
        # unknown error code from the C layer
        raise RuntimeError
    #
    # compute the string to use as a replacement -> 'replace', and
    # the current position in the input 'unicodedata' -> 'end'
    start = pypy_cjk_enc_inbuf_consumed(encodebuf)
    end = start + esize
    if errors == "strict":
        raise EncodeDecodeError(start, end, reason)
    elif errors == "ignore":
        replace = ""
    elif errors == "replace":
        # encode '?' with this same codec; fall back to a raw '?' if
        # even that fails
        codec = pypy_cjk_enc_getcodec(encodebuf)
        try:
            replace = encode(codec, "?", 1)
        except EncodeDecodeError:
            replace = "?"
    else:
        # custom app-level error handler; it may also move 'end'
        assert errorcb
        rets, end = errorcb(errors, namecb, reason,
                            unicodedata, start, end)
        # the handler's replacement text must itself be encoded strictly
        codec = pypy_cjk_enc_getcodec(encodebuf)
        lgt = rutf8.get_utf8_length(rets)
        replace = encode(codec, rets, lgt, "strict", errorcb, namecb)
    # pin the replacement bytes while the C code reads them
    with rffi.scoped_nonmovingbuffer(replace) as inbuf:
        r = pypy_cjk_enc_replace_on_error(encodebuf, inbuf,
                                          len(replace), end)
    if r == MBERR_NOMEMORY:
        raise MemoryError
def getvalue_w(self, space):
    """Return the entire buffered text as an app-level unicode string."""
    self._check_closed(space)
    value = self.buf.getvalue()
    return space.newutf8(value, get_utf8_length(value))
def read_w(self, space, w_size=None):
    """Read up to 'w_size' characters (everything when missing or
    negative) and return them as an app-level unicode string."""
    self._check_closed(space)
    size = convert_size(space, w_size)
    chunk = self.buf.read(size)
    return space.newutf8(chunk, get_utf8_length(chunk))
def normalize(s):
    """Round-trip 's' through the app-level normalize() and back to a
    plain unicode string."""
    utf8 = s.encode('utf8')
    w_input = space.newutf8(utf8, get_utf8_length(utf8))
    w_result = ucd.normalize(space, NF_code, w_input)
    return space.utf8_w(w_result).decode('utf8')
start = ctx.next_indirect(start) ctx.reset(start) if last_pos < ctx.end: _sub_append_slice(ctx, space, use_builder, sublist_w, strbuilder, last_pos, ctx.end) if use_builder != '\x00': assert strbuilder is not None result_bytes = strbuilder.build() if use_builder == 'S': assert not isinstance(ctx, rsre_utf8.Utf8MatchContext) return space.newbytes(result_bytes), n elif use_builder == 'U': assert isinstance(ctx, rsre_utf8.Utf8MatchContext) return space.newutf8(result_bytes, rutf8.get_utf8_length(result_bytes)), n else: raise AssertionError(use_builder) else: if space.isinstance_w(w_string, space.w_unicode): w_emptystr = space.newutf8('', 0) else: w_emptystr = space.newbytes('') w_item = space.call_method(w_emptystr, 'join', space.newlist(sublist_w)) return w_item, n sub_jitdriver = jit.JitDriver( reds="""count n last_pos ctx w_filter