Example #1
def test_get_utf8_length(unichars):
    u = u''.join(unichars)
    exp_lgt = len(u)
    s = ''.join([c.encode('utf8') for c in u])
    lgt = rutf8.get_utf8_length(s)
    if not _has_surrogates(s) or sys.maxunicode > 0xffff:
        assert lgt == exp_lgt
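The test above exercises the contract that every snippet on this page relies on: rutf8.get_utf8_length() takes a byte string that is already valid UTF-8 and returns its length in codepoints, not bytes. A minimal standalone sketch of that contract (the import assumes an RPython/PyPy checkout is on the path; it is not part of the example above):

from rpython.rlib import rutf8

s = u'caf\xe9'.encode('utf-8')         # 4 codepoints, 5 bytes
assert rutf8.get_utf8_length(s) == 4   # counts codepoints
assert len(s) == 5                     # len() counts bytes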
Example #2
def multibytecodec_decerror(decodebuf, e, errors, errorcb, namecb, stringdata):
    if e > 0:
        reason = "illegal multibyte sequence"
        esize = e
    elif e == MBERR_TOOFEW:
        reason = "incomplete multibyte sequence"
        esize = pypy_cjk_dec_inbuf_remaining(decodebuf)
    elif e == MBERR_NOMEMORY:
        raise MemoryError
    else:
        raise RuntimeError
    #
    # compute the unicode to use as a replacement -> 'replace', and
    # the current position in the input 'stringdata' -> 'end'
    start = pypy_cjk_dec_inbuf_consumed(decodebuf)
    end = start + esize
    if errors == "strict":
        raise EncodeDecodeError(start, end, reason)
    elif errors == "ignore":
        replace = ""
    elif errors == "replace":
        replace = UNICODE_REPLACEMENT_CHARACTER
    else:
        assert errorcb
        replace, end = errorcb(errors, namecb, reason, stringdata, start, end)
        # 'replace' is a utf8-encoded RPython str here
    lgt = rutf8.get_utf8_length(replace)
    inbuf = rffi.utf82wcharp(replace, lgt)
    try:
        r = pypy_cjk_dec_replace_on_error(decodebuf, inbuf, lgt, end)
    finally:
        lltype.free(inbuf, flavor='raw')
    if r == MBERR_NOMEMORY:
        raise MemoryError
Example #3
    def read_w(self, space, w_size=None):
        self._check_attached(space)
        self._check_closed(space)
        if not self.w_decoder:
            raise oefmt(space.w_IOError, "not readable")

        size = convert_size(space, w_size)
        self._writeflush(space)

        if size < 0:
            # Read everything
            w_bytes = space.call_method(self.w_buffer, "read")
            w_decoded = space.call_method(self.w_decoder, "decode", w_bytes,
                                          space.w_True)
            check_decoded(space, w_decoded)
            chars = self.decoded.get_chars(-1)
            lgt = get_utf8_length(chars)
            w_result = space.newutf8(chars, lgt)
            w_final = space.add(w_result, w_decoded)
            self.snapshot = None
            return w_final

        remaining = size
        builder = Utf8StringBuilder(size)

        # Keep reading chunks until we have n characters to return
        while remaining > 0:
            if not self._ensure_data(space):
                break
            data = self.decoded.get_chars(remaining)
            builder.append(data)
            remaining -= len(data)

        return space.newutf8(builder.build(), builder.getlength())
Example #4
 def convert_to(self, space, dtype):
     if dtype.is_unicode():
         return self
     elif dtype.is_object():
         return W_ObjectBox(
             space.newutf8(self._value, get_utf8_length(self._value)))
     else:
         raise oefmt(space.w_NotImplementedError,
                     "Conversion from unicode not implemented yet")
Example #5
    def readline_w(self, space, w_limit=None):
        self._check_attached(space)
        self._check_closed(space)
        self._writeflush(space)

        limit = convert_size(space, w_limit)
        remnant = None
        builder = StringBuilder()
        # XXX maybe use Utf8StringBuilder instead?
        while True:
            # First, get some data if necessary
            has_data = self._ensure_data(space)
            if not has_data:
                # end of file
                if remnant:
                    builder.append(remnant)
                break

            if remnant:
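                # a remnant is a lone '\r' held back while matching the two-byte '\r\n' newline with translation disabled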
                assert not self.readtranslate and self.readnl == '\r\n'
                assert self.decoded.pos == 0
                if remnant == '\r' and self.decoded.text[0] == '\n':
                    builder.append('\r\n')
                    self.decoded.pos = 1
                    remnant = None
                    break
                else:
                    builder.append(remnant)
                    remnant = None
                    continue

            if limit >= 0:
                remaining = limit - builder.getlength()
                assert remaining >= 0
            else:
                remaining = -1
            start = self.decoded.pos
            assert start >= 0
            found = self._scan_line_ending(remaining)
            end_scan = self.decoded.pos
            if end_scan > start:
                s = self.decoded.text[start:end_scan]
                builder.append(s)

            if found or (limit >= 0 and builder.getlength() >= limit):
                break

            # There may be some remaining chars we'll have to prepend to the
            # next chunk of data
            if not self.decoded.exhausted():
                remnant = self.decoded.get_chars(-1)
            # We have consumed the buffer
            self.decoded.reset()

        result = builder.build()
        lgt = get_utf8_length(result)
        return space.newutf8(result, lgt)
Example #6
 def write(self, string):
     length = get_utf8_length(string)
     if self.pos + length > len(self.data):
         self.resize(self.pos + length)
     pos = 0
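     # self.data holds one codepoint per slot, so copy each codepoint's utf8 bytes into its own entry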
     for i in range(length):
         nextpos = next_codepoint_pos(string, pos)
         self.data[self.pos + i] = string[pos:nextpos]
         pos = nextpos
     self.pos += length
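The write() above steps through the UTF-8 string one codepoint at a time: next_codepoint_pos() returns the byte offset just past the codepoint that starts at pos. A small standalone sketch of that walk (again assuming rpython.rlib is importable; the variable names are illustrative only):

from rpython.rlib import rutf8

s = u'a\xe9b'.encode('utf-8')                    # 3 codepoints, 4 bytes
pos = 0
codepoints = []
for _ in range(rutf8.get_utf8_length(s)):
    nextpos = rutf8.next_codepoint_pos(s, pos)   # byte offset just past the current codepoint
    codepoints.append(s[pos:nextpos])            # slice out one codepoint's bytes
    pos = nextpos
assert codepoints == ['a', '\xc3\xa9', 'b']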
Example #7
 def fget_string(self, space):
     ctx = self.ctx
     if isinstance(ctx, rsre_core.BufMatchContext):
         return space.newbytes(ctx._buffer.as_str())
     elif isinstance(ctx, rsre_core.StrMatchContext):
         return space.newbytes(ctx._string)
     elif isinstance(ctx, rsre_utf8.Utf8MatchContext):
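         # the utf8 match context stores the subject as utf8 bytes, so recompute its codepoint length before wrapping it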
         lgt = rutf8.get_utf8_length(ctx._utf8)
         return space.newutf8(ctx._utf8, lgt)
     else:
         raise SystemError
Example #8
 def descr_getstate(self, space):
     w_initialval = self.getvalue_w(space)
     w_dict = space.call_method(self.w_dict, "copy")
     if self.readnl is None:
         w_readnl = space.w_None
     else:
         w_readnl = space.str(
             space.newutf8(self.readnl,
                           get_utf8_length(self.readnl)))  # YYY
     return space.newtuple(
         [w_initialval, w_readnl,
          space.newint(self.buf.pos), w_dict])
Example #9
    def write_w(self, space, w_text):
        self._check_attached(space)
        self._check_closed(space)

        if not self.w_encoder:
            raise oefmt(space.w_IOError, "not writable")

        if not space.isinstance_w(w_text, space.w_unicode):
            raise oefmt(space.w_TypeError,
                        "unicode argument expected, got '%T'", w_text)

        text, textlen = space.utf8_len_w(w_text)

        haslf = False
        if (self.writetranslate and self.writenl) or self.line_buffering:
            if text.find('\n') >= 0:
                haslf = True
        if haslf and self.writetranslate and self.writenl:
            w_text = space.call_method(
                w_text, "replace", space.newutf8('\n', 1),
                space.newutf8(self.writenl, get_utf8_length(self.writenl)))
            text = space.utf8_w(w_text)

        needflush = False
        if self.line_buffering and (haslf or text.find('\r') >= 0):
            needflush = True

        # XXX What if we were just reading?
        if self.encodefunc:
            w_bytes = self.encodefunc(space, w_text, self.errors)
            self.encoding_start_of_stream = False
        else:
            w_bytes = space.call_method(self.w_encoder, "encode", w_text)

        b = space.bytes_w(w_bytes)
        if not self.pending_bytes:
            self.pending_bytes = []
            self.pending_bytes_count = 0
        self.pending_bytes.append(b)
        self.pending_bytes_count += len(b)

        if self.pending_bytes_count > self.chunk_size or needflush:
            self._writeflush(space)

        if needflush:
            space.call_method(self.w_buffer, "flush")

        self.snapshot = None

        if self.w_decoder:
            space.call_method(self.w_decoder, "reset")

        return space.newint(textlen)
Example #10
 def readline_w(self, space, w_limit=None):
     self._check_closed(space)
     limit = convert_size(space, w_limit)
     if self.readuniversal:
         result = self.buf.readline_universal(limit)
     else:
         if self.readtranslate:
             # Newlines are already translated, only search for \n
             newline = '\n'
         else:
             newline = self.readnl
         result = self.buf.readline(newline, limit)
     resultlen = get_utf8_length(result)
     return space.newutf8(result, resultlen)
Example #11
 def decode(self, space, input, errors=None):
     if errors is None:
         errors = 'strict'
     state = space.fromcache(CodecState)
     #
     try:
         utf8_output = c_codecs.decode(self.codec, input, errors,
                                       state.decode_error_handler, self.name)
     except c_codecs.EncodeDecodeError as e:
         raise wrap_unicodedecodeerror(space, e, input, self.name)
     except RuntimeError:
         raise wrap_runtimeerror(space)
     lgt = rutf8.get_utf8_length(utf8_output)
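     # codec decode() contract: return (decoded string, number of input bytes consumed), which here is the whole input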
     return space.newtuple([space.newutf8(utf8_output, lgt),
                            space.newint(len(input))])
Example #12
 def decode_w(self, object, final=False):
     space = self.space
     state = space.fromcache(CodecState)
     if len(self.pending) > 0:
         object = self.pending + object
     try:
         output = c_codecs.decodeex(self.decodebuf, object, self.errors,
                                    state.decode_error_handler, self.name,
                                    get_ignore_error(final))
     except c_codecs.EncodeDecodeError as e:
         raise wrap_unicodedecodeerror(space, e, object, self.name)
     except RuntimeError:
         raise wrap_runtimeerror(space)
     pos = c_codecs.pypy_cjk_dec_inbuf_consumed(self.decodebuf)
     assert 0 <= pos <= len(object)
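     # whatever the codec did not consume (e.g. a trailing incomplete multibyte sequence) is kept for the next call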
     self.pending = object[pos:]
     lgt = rutf8.get_utf8_length(output)
     return space.newutf8(output, lgt)
Example #13
def slice_w(space, ctx, start, end, w_default):
    # 'start' and 'end' are byte positions
    if ctx.ZERO <= start <= end:
        if isinstance(ctx, rsre_core.BufMatchContext):
            return space.newbytes(
                ctx._buffer.getslice(start, end, 1, end - start))
        if isinstance(ctx, rsre_core.StrMatchContext):
            start = ctx._real_pos(start)
            end = ctx._real_pos(end)
            return space.newbytes(ctx._string[start:end])
        elif isinstance(ctx, rsre_utf8.Utf8MatchContext):
            s = ctx._utf8[start:end]
            lgt = rutf8.get_utf8_length(s)
            return space.newutf8(s, lgt)
        else:
            # unreachable
            raise SystemError
    return w_default
Example #14
    def write_w(self, space, w_obj):
        if not space.isinstance_w(w_obj, space.w_unicode):
            raise oefmt(space.w_TypeError,
                        "unicode argument expected, got '%T'", w_obj)
        self._check_closed(space)
        orig_size = space.len_w(w_obj)

        if self.w_decoder is not None:
            w_decoded = space.call_method(self.w_decoder, "decode", w_obj,
                                          space.w_True)
        else:
            w_decoded = w_obj
        if self.writenl:
            w_decoded = space.call_method(
                w_decoded, "replace", space.newtext("\n"),
                space.newutf8(self.writenl, get_utf8_length(self.writenl)))
        string = space.utf8_w(w_decoded)
        if string:
            self.buf.write(string)

        return space.newint(orig_size)
Example #15
def multibytecodec_encerror(encodebuf, e, errors, errorcb, namecb,
                            unicodedata):
    if e > 0:
        reason = "illegal multibyte sequence"
        esize = e
    elif e == MBERR_TOOFEW:
        reason = "incomplete multibyte sequence"
        esize = pypy_cjk_enc_inbuf_remaining(encodebuf)
    elif e == MBERR_NOMEMORY:
        raise MemoryError
    else:
        raise RuntimeError
    #
    # compute the string to use as a replacement -> 'replace', and
    # the current position in the input 'unicodedata' -> 'end'
    start = pypy_cjk_enc_inbuf_consumed(encodebuf)
    end = start + esize
    if errors == "strict":
        raise EncodeDecodeError(start, end, reason)
    elif errors == "ignore":
        replace = ""
    elif errors == "replace":
        codec = pypy_cjk_enc_getcodec(encodebuf)
        try:
            replace = encode(codec, "?", 1)
        except EncodeDecodeError:
            replace = "?"
    else:
        assert errorcb
        rets, end = errorcb(errors, namecb, reason, unicodedata, start, end)
        codec = pypy_cjk_enc_getcodec(encodebuf)
        lgt = rutf8.get_utf8_length(rets)
        replace = encode(codec, rets, lgt, "strict", errorcb, namecb)
    with rffi.scoped_nonmovingbuffer(replace) as inbuf:
        r = pypy_cjk_enc_replace_on_error(encodebuf, inbuf, len(replace), end)
    if r == MBERR_NOMEMORY:
        raise MemoryError
Example #16
 def getvalue_w(self, space):
     self._check_closed(space)
     v = self.buf.getvalue()
     lgt = get_utf8_length(v)
     return space.newutf8(v, lgt)
Example #17
 def read_w(self, space, w_size=None):
     self._check_closed(space)
     size = convert_size(space, w_size)
     v = self.buf.read(size)
     lgt = get_utf8_length(v)
     return space.newutf8(v, lgt)
Example #18
 def normalize(s):
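     # 's' is a unicode string; 'space', 'ucd' and 'NF_code' are free variables supplied by the enclosing scope (likely a test)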
     u = s.encode('utf8')
     w_s = space.newutf8(u, get_utf8_length(u))
     w_res = ucd.normalize(space, NF_code, w_s)
     return space.utf8_w(w_res).decode('utf8')
Example #19
                start = ctx.next_indirect(start)
            ctx.reset(start)

        if last_pos < ctx.end:
            _sub_append_slice(ctx, space, use_builder, sublist_w, strbuilder,
                              last_pos, ctx.end)
        if use_builder != '\x00':
            assert strbuilder is not None
            result_bytes = strbuilder.build()
            if use_builder == 'S':
                assert not isinstance(ctx, rsre_utf8.Utf8MatchContext)
                return space.newbytes(result_bytes), n
            elif use_builder == 'U':
                assert isinstance(ctx, rsre_utf8.Utf8MatchContext)
                return space.newutf8(result_bytes,
                                     rutf8.get_utf8_length(result_bytes)), n
            else:
                raise AssertionError(use_builder)
        else:
            if space.isinstance_w(w_string, space.w_unicode):
                w_emptystr = space.newutf8('', 0)
            else:
                w_emptystr = space.newbytes('')
            w_item = space.call_method(w_emptystr, 'join',
                                       space.newlist(sublist_w))
            return w_item, n


sub_jitdriver = jit.JitDriver(
    reds="""count n last_pos
            ctx w_filter