Example #1
0
def format(space, w_fmt, values_w, w_valuedict, fmt_type):
    """Entry point for the '%' formatting operator.

    For bytes/bytearray formats, tries the byte-string formatter first;
    if a unicode argument forces unicode formatting, falls through to
    the unicode formatter below.

    Returns a wrapped bytes, bytearray or unicode object depending on
    `fmt_type`.
    """
    if fmt_type != FORMAT_UNICODE:
        if fmt_type == FORMAT_BYTEARRAY:
            fmt = w_fmt.buffer_w(space, 0).as_str()
        else:
            fmt = space.bytes_w(w_fmt)
        formatter = StringFormatter(space, fmt, values_w, w_valuedict)
        try:
            result = formatter.format()
        except NeedUnicodeFormattingError:
            # fall through to the unicode case
            pass
        else:
            if fmt_type == FORMAT_BYTEARRAY:
                return _bytearray_from_bytes(space, result)
            # FORMAT_BYTES; the explicit `fmt_type == FORMAT_BYTES` check
            # was removed because both it and this fallthrough returned
            # space.newbytes(result)
            return space.newbytes(result)
    fmt = space.utf8_w(w_fmt)
    formatter = UnicodeFormatter(space, fmt, values_w, w_valuedict)
    result = formatter.format()
    # this can force strings, not sure if it's a problem or not
    lgt = rutf8.codepoints_in_utf8(result)
    return space.newutf8(result, lgt)
Example #2
0
    def get_chars(self, size):
        """Return up to `size` codepoints of buffered text as a utf8 string,
        advancing the read position.

        A negative `size` (or one past the end) means "everything left".
        Invariant: `self.pos` is a byte offset into `self.text` and
        `self.upos` the matching codepoint offset.
        """
        if self.text is None or size == 0:
            return ""

        # total codepoints in the buffer, and how many remain unread
        lgt = codepoints_in_utf8(self.text)
        available = lgt - self.upos
        if size < 0 or size > available:
            size = available
        assert size >= 0

        if self.pos > 0 or size < available:
            # partial read: walk `size` codepoints forward from the current
            # byte position to find the slice's end byte offset
            start = self.pos
            ret = []
            pos = start
            for i in range(size):
                pos = next_codepoint_pos(self.text, pos)
                self.upos += 1
            # non-negativity asserts for the RPython annotator (slice bounds)
            assert start >= 0
            assert pos >= 0
            chars = self.text[start:pos]
            self.pos = pos
        else:
            # fast path: consuming the entire buffer from the start
            chars = self.text
            self.pos = len(self.text)
            self.upos = lgt

        return chars
Example #3
0
def multibytecodec_decerror(decodebuf, e, errors, errorcb, namecb, stringdata):
    """Handle a decoding error reported by the C-level CJK decoder.

    `e` is the raw error code: a positive value means an illegal sequence
    of `e` bytes, MBERR_TOOFEW an incomplete sequence at the end of the
    input.  Applies the `errors` policy ('strict'/'ignore'/'replace' or a
    registered error callback) and pushes the replacement text back into
    `decodebuf`.

    Raises EncodeDecodeError under 'strict', MemoryError/RuntimeError on
    low-level failures.
    """
    if e > 0:
        reason = "illegal multibyte sequence"
        esize = e
    elif e == MBERR_TOOFEW:
        reason = "incomplete multibyte sequence"
        esize = pypy_cjk_dec_inbuf_remaining(decodebuf)
    elif e == MBERR_NOMEMORY:
        raise MemoryError
    else:
        raise RuntimeError
    #
    # compute the unicode to use as a replacement -> 'replace', and
    # the current position in the input 'unicodedata' -> 'end'
    start = pypy_cjk_dec_inbuf_consumed(decodebuf)
    end = start + esize
    if errors == "strict":
        raise EncodeDecodeError(start, end, reason)
    elif errors == "ignore":
        replace = ""
    elif errors == "replace":
        replace = UNICODE_REPLACEMENT_CHARACTER
    else:
        assert errorcb
        replace, end, rettype, obj = errorcb(errors, namecb, reason,
                                             stringdata, start, end)
        # 'replace' is UTF8 encoded unicode, rettype is 'u'
    # hand the replacement to the C decoder as a raw wchar_t buffer;
    # freed unconditionally to avoid leaking the rffi allocation
    lgt = rutf8.codepoints_in_utf8(replace)
    inbuf = rffi.utf82wcharp(replace, lgt)
    try:
        r = pypy_cjk_dec_replace_on_error(decodebuf, inbuf, lgt, end)
    finally:
        lltype.free(inbuf, flavor='raw')
    if r == MBERR_NOMEMORY:
        raise MemoryError
Example #4
0
 def std_wp(self, r, is_string=False):
     """Append `r` (utf8-encoded unicode) to the formatting result,
     applying the current spec's precision (truncation) and width
     (space padding, left- or right-justified per f_ljust).

     NOTE(review): `length` is measured before the is_string conversion
     below; this assumes the default-encoding conversion preserves the
     codepoint count -- confirm against callers.
     """
     # r is utf8-encoded unicode
     length = rutf8.codepoints_in_utf8(r)
     if do_unicode and is_string:
         # convert string to unicode using the default encoding
         r = self.space.utf8_w(self.space.newbytes(r))
     prec = self.prec
     if prec == -1 and self.width == 0:
         # fast path
         self.result.append(r)
         return
     if prec >= 0 and prec < length:
         length = prec  # ignore the end of the string if too long
     padding = self.width - length
     if do_unicode:
         # XXX could use W_UnicodeObject.descr_getslice, but that would
         # require a refactor to use the w_val, not r
         # translate the codepoint count into a byte offset for slicing
         length = rutf8._pos_at_index(r, length)
     result = self.result
     if padding < 0:
         padding = 0
     assert padding >= 0
     if not self.f_ljust and padding > 0:
         result.append_multiple_char(' ', padding)
         # add any padding at the left of 'r'
         padding = 0
     result.append_slice(r, 0, length)  # add 'r' itself
     if padding > 0:
         result.append_multiple_char(' ', padding)
Example #5
0
def _decode_helper(cp, s, flags, encoding, errors, errorhandler, final, start,
                   end, res):
    """Decode s[start:end] from Windows code page `cp` using
    MultiByteToWideChar, appending the utf8 result to the `res` builder.

    Returns (new_input_position, number_of_codepoints_appended).  When the
    Win32 call fails, delegates to _decode_cp_error, which applies
    `errorhandler` and decides how far to advance.
    """
    if end > len(s):
        end = len(s)
    piece = s[start:end]
    with rffi.scoped_nonmovingbuffer(piece) as dataptr:
        # first get the size of the result
        outsize = MultiByteToWideChar(cp, flags, dataptr, len(piece),
                                      lltype.nullptr(rffi.CWCHARP.TO), 0)
        if outsize == 0:
            r, pos = _decode_cp_error(s, errorhandler, encoding, errors, final,
                                      start, end)
            res.append(r)
            return pos, check_utf8(r, True)

        with rffi.scoped_alloc_unicodebuffer(outsize) as buf:
            # do the conversion
            if MultiByteToWideChar(cp, flags, dataptr, len(piece), buf.raw,
                                   outsize) == 0:
                r, pos = _decode_cp_error(s, errorhandler, encoding, errors,
                                          final, start, end)
                res.append(r)
                return pos, check_utf8(r, True)
            buf_as_str = buf.str(outsize)
            assert buf_as_str is not None
            # re-expose the wide-char result as a raw buffer and convert
            # it to utf8
            with rffi.scoped_nonmoving_unicodebuffer(buf_as_str) as dataptr:
                conv = _unibuf_to_utf8(dataptr, outsize)
            res.append(conv)
            return end, codepoints_in_utf8(conv)
Example #6
0
    def write_w(self, space, w_text):
        """TextIOWrapper.write(): encode `w_text` and buffer the resulting
        bytes, applying newline translation and honouring line buffering.

        Returns the number of codepoints written (length of the original
        text, before translation).
        """
        self._check_attached(space)
        self._check_closed(space)

        if not self.w_encoder:
            raise oefmt(space.w_IOError, "not writable")

        if not space.isinstance_w(w_text, space.w_unicode):
            raise oefmt(space.w_TypeError,
                        "unicode argument expected, got '%T'", w_text)

        text, textlen = space.utf8_len_w(w_text)

        # look for '\n' only when translation or line buffering may act on it
        haslf = False
        if (self.writetranslate and self.writenl) or self.line_buffering:
            if text.find('\n') >= 0:
                haslf = True
        if haslf and self.writetranslate and self.writenl:
            # translate '\n' to the configured newline before encoding
            w_text = space.call_method(
                w_text, "replace", space.newutf8('\n', 1),
                space.newutf8(self.writenl, codepoints_in_utf8(self.writenl)))
            text = space.utf8_w(w_text)

        needflush = False
        if self.line_buffering and (haslf or text.find('\r') >= 0):
            needflush = True

        # XXX What if we were just reading?
        if self.encodefunc:
            # fast path: direct encoding function cached for common codecs
            w_bytes = self.encodefunc(space, w_text, self.errors)
            self.encoding_start_of_stream = False
        else:
            w_bytes = space.call_method(self.w_encoder, "encode", w_text)

        if not space.isinstance_w(w_bytes, space.w_bytes):
            raise oefmt(space.w_TypeError,
                        "encoder should return a bytes object, not '%T'",
                        w_bytes)

        # accumulate encoded bytes; flushed once over chunk_size or on demand
        b = space.bytes_w(w_bytes)
        if not self.pending_bytes:
            self.pending_bytes = []
            self.pending_bytes_count = 0
        self.pending_bytes.append(b)
        self.pending_bytes_count += len(b)

        if self.pending_bytes_count > self.chunk_size or needflush:
            self._writeflush(space)

        if needflush:
            space.call_method(self.w_buffer, "flush")

        # writing invalidates buffered decoded data and the tell() snapshot
        self.decoded.reset()
        self.snapshot = None

        if self.w_decoder:
            space.call_method(self.w_decoder, "reset")

        return space.newint(textlen)
Example #7
0
 def convert_to(self, space, dtype):
     """Convert this unicode box to `dtype`.

     Unicode targets return self unchanged; object targets wrap the
     value in a W_ObjectBox; any other dtype is unsupported.
     """
     if dtype.is_unicode():
         return self
     if dtype.is_object():
         lgt = codepoints_in_utf8(self._value)
         return W_ObjectBox(space.newutf8(self._value, lgt))
     raise oefmt(space.w_NotImplementedError,
                 "Conversion from unicode not implemented yet")
Example #8
0
 def write(self, string):
     """Write the utf8-encoded `string` into self.data at the current
     codepoint position, one codepoint per list slot, growing the
     buffer first if necessary and advancing self.pos."""
     n = codepoints_in_utf8(string)
     if self.pos + n > len(self.data):
         self.resize(self.pos + n)
     byte_pos = 0
     i = 0
     while i < n:
         end = next_codepoint_pos(string, byte_pos)
         self.data[self.pos + i] = string[byte_pos:end]
         byte_pos = end
         i += 1
     self.pos += n
Example #9
0
def test_codepoints_in_utf8(u, start, len1):
    """Check rutf8.codepoints_in_utf8 against slicing the unicode string
    directly; `extra` pads the byte range when the span runs off the end."""
    end = start + len1
    extra = max(0, end - len(u))
    encoded = u.encode('utf8')
    byte_start = len(u[:start].encode('utf8'))
    byte_end = len(u[:end].encode('utf8')) + extra
    count = rutf8.codepoints_in_utf8(encoded, byte_start, byte_end)
    assert count == len(u[start:end])
Example #10
0
 def _get_error_info(self, pos):
     """Return (w_char, index, codepoint) describing the format-string
     character at byte position `pos`, for building error messages.

     Under do_unicode, `pos` is translated from a byte offset into a
     codepoint index; otherwise byte and codepoint positions coincide.
     """
     space = self.space
     if do_unicode:
         cp = rutf8.codepoint_at_pos(self.fmt, pos)
         pos = rutf8.codepoints_in_utf8(self.fmt, 0, pos)
         w_s = space.newutf8(
             rutf8.unichr_as_utf8(r_uint(cp), allow_surrogates=True), 1)
     else:
         cp = ord(self.fmt[pos])
         w_s = space.newbytes(chr(cp))
     return w_s, pos, cp
Example #11
0
 def fget_string(self, space):
     """Return the subject string of the match, wrapped as the app-level
     type matching the context kind (bytes or unicode)."""
     ctx = self.ctx
     if isinstance(ctx, rsre_core.BufMatchContext):
         return space.newbytes(ctx._buffer.as_str())
     if isinstance(ctx, rsre_core.StrMatchContext):
         return space.newbytes(ctx._string)
     if isinstance(ctx, rsre_utf8.Utf8MatchContext):
         s = ctx._utf8
         return space.newutf8(s, rutf8.codepoints_in_utf8(s))
     raise SystemError
Example #12
0
 def descr_getstate(self, space):
     """Pickle support: returns (initial value, readnl, position, __dict__)."""
     w_initialval = self.getvalue_w(space)
     w_dict = space.call_method(self.w_dict, "copy")
     readnl = self.readnl
     if readnl is None:
         w_readnl = space.w_None
     else:
         w_readnl = space.str(
             space.newutf8(readnl, codepoints_in_utf8(readnl)))  # YYY
     return space.newtuple([w_initialval, w_readnl,
                            space.newint(self.buf.pos), w_dict])
Example #13
0
 def readline_w(self, space, w_limit=None):
     """Read one line, limited to `w_limit` codepoints if given."""
     self._check_closed(space)
     limit = convert_size(space, w_limit)
     if self.readuniversal:
         line = self.buf.readline_universal(limit)
     else:
         # translated input only ever contains '\n'; otherwise search for
         # the configured newline string
         newline = '\n' if self.readtranslate else self.readnl
         line = self.buf.readline(newline, limit)
     return space.newutf8(line, codepoints_in_utf8(line))
Example #14
0
 def unknown_fmtchar(self):
     """Raise ValueError for an unrecognised format character.

     The offending character is the one just before self.fmtpos.  Under
     do_unicode, the byte offset is converted to a codepoint index so the
     reported position matches what the user sees.
     """
     space = self.space
     if do_unicode:
         cp = rutf8.codepoint_at_pos(self.fmt, self.fmtpos - 1)
         pos = rutf8.codepoints_in_utf8(self.fmt, 0, self.fmtpos - 1)
         w_s = space.newutf8(
             rutf8.unichr_as_utf8(r_uint(cp), allow_surrogates=True), 1)
     else:
         cp = ord(self.fmt[self.fmtpos - 1])
         pos = self.fmtpos - 1
         w_s = space.newbytes(chr(cp))
     raise oefmt(space.w_ValueError,
                 "unsupported format character %R (%s) at index %d",
                 w_s, hex(cp), pos)
Example #15
0
 def decode(self, space, input, errors=None):
     """Decode `input` with this CJK codec.

     Returns a wrapped (unicode, bytes_consumed) pair; errors from the
     C level are rewrapped as app-level exceptions.
     """
     if errors is None:
         errors = 'strict'
     state = space.fromcache(CodecState)
     try:
         output = c_codecs.decode(self.codec, input, errors,
                                  state.decode_error_handler, self.name)
     except c_codecs.EncodeDecodeError as e:
         raise wrap_unicodedecodeerror(space, e, input, self.name)
     except RuntimeError:
         raise wrap_runtimeerror(space)
     w_decoded = space.newutf8(output, rutf8.codepoints_in_utf8(output))
     return space.newtuple([w_decoded, space.newint(len(input))])
Example #16
0
 def _compute_value(self, space):
     """Build the final error-message string by interleaving the literal
     pieces (self.xstrings) with the formatted values, following the
     format codes in `entries`.

     Returns (utf8_string, codepoint_length).  `formats` and `entries`
     are captured from the enclosing scope -- presumably fixed at
     class-generation time; verify against the factory.
     """
     lst = [None] * (len(formats) + len(formats) + 1)
     lgt = 0
     for i, fmt, attr in entries:
         # literal text preceding this value
         lst[i + i] = self.xstrings[i]
         lgt += len(self.xstrings[i])
         value = getattr(self, attr)
         if fmt == 'd':
             # decimal integer
             result = str(value)
             lgt += len(result)
         elif fmt == 'R':
             # repr() of a wrapped object
             s = space.repr(value)
             result = space.utf8_w(s)
             lgt += space.len_w(s)
         elif fmt == 'S':
             # str() of a wrapped object
             s = space.str(value)
             result = space.utf8_w(s)
             lgt += space.len_w(s)
         elif fmt == 'T':
             # type name of a wrapped object
             result = space.type(value).name
             lgt += rutf8.codepoints_in_utf8(result)
         elif fmt == 'N':
             result = value.getname(space)
             lgt += len(result)
         elif fmt == '8':
             # u'str\uxxxx' -> 'str\xXX\xXX' -> u"'str\xXX\xXX'"
             from pypy.interpreter import unicodehelper
             result, _lgt, pos = unicodehelper.str_decode_utf8(
                 value, 'replace', True,
                 unicodehelper.decode_never_raise, True)
             lgt += _lgt
         elif isinstance(value, unicode):
             # 's'
             result = str(value.encode('utf-8'))
             lgt += len(value)
         else:
             # plain string: count codepoints, tolerating broken utf8
             result = str(value)
             try:
                 lgt += rutf8.check_utf8(result, True)
             except rutf8.CheckError as e:
                 # NOTE(review): relies on CheckError.pos encoding the
                 # count on failure -- confirm against rutf8
                 lgt -= e.pos
         lst[i + i + 1] = result
     lst[-1] = self.xstrings[-1]
     lgt += len(self.xstrings[-1])
     retval = ''.join(lst)
     return retval, lgt
Example #17
0
def format(space, w_fmt, values_w, w_valuedict, do_unicode):
    """Entry point for '%' formatting.

    Tries the byte-string formatter first unless unicode formatting was
    requested; a NeedUnicodeFormattingError makes it retry with the
    unicode formatter.
    """
    if not do_unicode:
        fmt = space.bytes_w(w_fmt)
        try:
            result = StringFormatter(space, fmt, values_w,
                                     w_valuedict).format()
        except NeedUnicodeFormattingError:
            # retry below with the unicode formatter
            pass
        else:
            return space.newbytes(result)
    fmt = space.utf8_w(w_fmt)
    result = UnicodeFormatter(space, fmt, values_w, w_valuedict).format()
    # this can force strings, not sure if it's a problem or not
    return space.newutf8(result, rutf8.codepoints_in_utf8(result))
Example #18
0
 def decode_w(self, object, final=False):
     """Incrementally decode `object`, keeping any undecoded trailing
     bytes in self.pending for the next call."""
     space = self.space
     state = space.fromcache(CodecState)
     data = object
     if len(self.pending) > 0:
         data = self.pending + object
     try:
         output = c_codecs.decodeex(self.decodebuf, data, self.errors,
                                    state.decode_error_handler, self.name,
                                    get_ignore_error(final))
     except c_codecs.EncodeDecodeError as e:
         raise wrap_unicodedecodeerror(space, e, data, self.name)
     except RuntimeError:
         raise wrap_runtimeerror(space)
     # bytes actually consumed by the C decoder; the rest is buffered
     pos = c_codecs.pypy_cjk_dec_inbuf_consumed(self.decodebuf)
     assert 0 <= pos <= len(data)
     self.pending = data[pos:]
     return space.newutf8(output, rutf8.codepoints_in_utf8(output))
Example #19
0
def slice_w(space, ctx, start, end, w_default):
    """Return the [start:end] byte-slice of the match subject, wrapped as
    the app-level type matching the context kind, or `w_default` when the
    span is invalid (e.g. an unmatched group)."""
    # 'start' and 'end' are byte positions
    if not (ctx.ZERO <= start <= end):
        return w_default
    if isinstance(ctx, rsre_core.BufMatchContext):
        return space.newbytes(
            ctx._buffer.getslice(start, end, 1, end - start))
    if isinstance(ctx, rsre_core.StrMatchContext):
        real_start = ctx._real_pos(start)
        real_end = ctx._real_pos(end)
        return space.newbytes(ctx._string[real_start:real_end])
    if isinstance(ctx, rsre_utf8.Utf8MatchContext):
        piece = ctx._utf8[start:end]
        return space.newutf8(piece, rutf8.codepoints_in_utf8(piece))
    # unreachable
    raise SystemError
Example #20
0
    def write_w(self, space, w_obj):
        """StringIO.write(): append `w_obj` to the buffer, running the
        optional incremental newline decoder and newline replacement
        first.  Returns the length of the original argument."""
        if not space.isinstance_w(w_obj, space.w_unicode):
            raise oefmt(space.w_TypeError,
                        "unicode argument expected, got '%T'", w_obj)
        self._check_closed(space)
        orig_size = space.len_w(w_obj)

        w_data = w_obj
        if self.w_decoder is not None:
            # final=True: decode the whole chunk now
            w_data = space.call_method(self.w_decoder, "decode", w_obj,
                                       space.w_True)
        if self.writenl:
            w_data = space.call_method(
                w_data, "replace", space.newtext("\n"),
                space.newutf8(self.writenl, codepoints_in_utf8(self.writenl)))
        text = space.utf8_w(w_data)
        if text:
            self.buf.write(text)

        return space.newint(orig_size)
Example #21
0
def multibytecodec_encerror(encodebuf, e, errors, errorcb, namecb,
                            unicodedata):
    """Handle an encoding error reported by the C-level CJK encoder.

    `e` is the raw error code: a positive value means an unencodable
    sequence of `e` codepoints, MBERR_TOOFEW an incomplete sequence.
    Applies the `errors` policy ('strict'/'ignore'/'replace' or a
    registered callback) and pushes the encoded replacement back into
    `encodebuf`.

    Raises EncodeDecodeError under 'strict', MemoryError/RuntimeError on
    low-level failures.
    """
    if e > 0:
        reason = "illegal multibyte sequence"
        esize = e
    elif e == MBERR_TOOFEW:
        reason = "incomplete multibyte sequence"
        esize = pypy_cjk_enc_inbuf_remaining(encodebuf)
    elif e == MBERR_NOMEMORY:
        raise MemoryError
    else:
        raise RuntimeError
    #
    # compute the string to use as a replacement -> 'replace', and
    # the current position in the input 'unicodedata' -> 'end'
    start = pypy_cjk_enc_inbuf_consumed(encodebuf)
    end = start + esize
    if errors == "strict":
        raise EncodeDecodeError(start, end, reason)
    elif errors == "ignore":
        replace = ""
    elif errors == "replace":
        # try to encode '?' with this codec; fall back to a literal '?'
        codec = pypy_cjk_enc_getcodec(encodebuf)
        try:
            replace = encode(codec, "?", 1)
        except EncodeDecodeError:
            replace = "?"
    else:
        assert errorcb
        # the callback returns utf8 text, which must then be encoded
        # with this same codec before re-insertion
        rets, end = errorcb(errors, namecb, reason, unicodedata, start, end)
        codec = pypy_cjk_enc_getcodec(encodebuf)
        lgt = rutf8.codepoints_in_utf8(rets)
        replace = encode(codec, rets, lgt, "strict", errorcb, namecb)
    with rffi.scoped_nonmovingbuffer(replace) as inbuf:
        r = pypy_cjk_enc_replace_on_error(encodebuf, inbuf, len(replace), end)
    if r == MBERR_NOMEMORY:
        raise MemoryError
Example #22
0
def replace_count(input, sub, by, maxcount=-1, isutf8=False):
    """Replace occurrences of `sub` by `by` in `input`, at most `maxcount`
    times (unlimited if negative).

    Returns (new_value, number_of_replacements).  `input` may be a str,
    a unicode, or a list of characters; the matching builder type is
    chosen accordingly.  With `isutf8`, an empty `sub` splits at
    codepoint boundaries instead of byte boundaries.
    """
    if isinstance(input, str):
        Builder = StringBuilder
    elif isinstance(input, unicode):
        Builder = UnicodeBuilder
    else:
        assert isinstance(input, list)
        Builder = ByteListBuilder
    if maxcount == 0:
        return input, 0

    # empty separator, byte-oriented: interleave `by` between every item
    if not sub and not isutf8:
        upper = len(input)
        if maxcount > 0 and maxcount < upper + 2:
            upper = maxcount - 1
            assert upper >= 0

        # ovfcheck calls must sit directly inside the try block (RPython
        # requirement); the bare re-raise just propagates OverflowError
        try:
            result_size = ovfcheck(upper * len(by))
            result_size = ovfcheck(result_size + upper)
            result_size = ovfcheck(result_size + len(by))
            remaining_size = len(input) - upper
            result_size = ovfcheck(result_size + remaining_size)
        except OverflowError:
            raise
        builder = Builder(result_size)
        for i in range(upper):
            builder.append(by)
            builder.append(input[i])
        builder.append(by)
        builder.append_slice(input, upper, len(input))
        replacements = upper + 1

    # single-character needle on str: specialized fast paths
    elif isinstance(input, str) and len(sub) == 1:
        if len(by) == 1:
            return replace_count_str_chr_chr(input, sub[0], by[0], maxcount)
        return replace_count_str_chr_str(input, sub[0], by, maxcount)

    else:
        # First compute the exact result size
        if sub:
            cnt = count(input, sub, 0, len(input))
            if isinstance(input, str) and cnt == 0:
                return input, 0
            if isinstance(input, str):
                return replace_count_str_str_str(input, sub, by, cnt, maxcount)
        else:
            # empty needle in utf8 mode: one insertion point per
            # codepoint, plus one at the end
            assert isutf8
            from rpython.rlib import rutf8
            cnt = rutf8.codepoints_in_utf8(input) + 1

        if cnt > maxcount and maxcount > 0:
            cnt = maxcount
        diff_len = len(by) - len(sub)
        try:
            result_size = ovfcheck(diff_len * cnt)
            result_size = ovfcheck(result_size + len(input))
        except OverflowError:
            raise
        replacements = cnt

        builder = Builder(result_size)
        start = 0
        sublen = len(sub)

        if sublen == 0:
            # empty needle, utf8: step codepoint by codepoint
            assert isutf8
            from rpython.rlib import rutf8
            while True:
                builder.append(by)
                maxcount -= 1
                if start == len(input) or maxcount == 0:
                    break
                next = rutf8.next_codepoint_pos(input, start)
                builder.append_slice(input, start, next)
                start = next
        else:
            # general case: find-and-copy loop
            while maxcount != 0:
                next = find(input, sub, start, len(input))
                if next < 0:
                    break
                builder.append_slice(input, start, next)
                builder.append(by)
                start = next + sublen
                maxcount -= 1  # NB. if it's already < 0, it stays < 0

        builder.append_slice(input, start, len(input))

    return builder.build(), replacements
Example #23
0
            ctx.reset(start)

        if last_pos < ctx.end:
            _sub_append_slice(ctx, space, use_builder, sublist_w, strbuilder,
                              last_pos, ctx.end)
        if use_builder != '\x00':
            assert strbuilder is not None
            result_bytes = strbuilder.build()
            if use_builder == 'S':
                assert not isinstance(ctx, rsre_utf8.Utf8MatchContext)
                return space.newbytes(result_bytes), n
            elif use_builder == 'U':
                assert (isinstance(ctx, UnicodeAsciiMatchContext)
                        or isinstance(ctx, rsre_utf8.Utf8MatchContext))
                return space.newutf8(result_bytes,
                                     rutf8.codepoints_in_utf8(result_bytes)), n
            else:
                raise AssertionError(use_builder)
        else:
            if space.isinstance_w(w_string, space.w_unicode):
                w_emptystr = space.newutf8('', 0)
            else:
                w_emptystr = space.newbytes('')
            w_item = space.call_method(w_emptystr, 'join',
                                       space.newlist(sublist_w))
            return w_item, n


sub_jitdriver = jit.JitDriver(
    reds="""count n last_pos
            ctx w_filter
Example #24
0
    def tell_w(self, space):
        """TextIOWrapper.tell(): return an opaque cookie encoding the
        current position.

        Because a decoder may buffer input, the byte position alone is
        not enough: starting from the last snapshot, the decoder is
        replayed one byte at a time to find the nearest "safe start
        point" plus the number of characters to skip from there.  The
        pair is packed into a PositionCookie.  Raises IOError if the
        stream is unseekable, telling is disabled, or the position
        cannot be reconstructed.
        """
        self._check_closed(space)
        if not self.seekable:
            raise oefmt(space.w_IOError, "underlying stream is not seekable")
        if not self.telling:
            raise oefmt(space.w_IOError,
                        "telling position disabled by next() call")

        self._writeflush(space)
        space.call_method(self, "flush")

        w_pos = space.call_method(self.w_buffer, "tell")

        if self.w_decoder is None or self.snapshot is None:
            assert not self.decoded.text
            return w_pos

        cookie = PositionCookie(space.bigint_w(w_pos))

        # Skip backward to the snapshot point (see _read_chunk)
        cookie.dec_flags = self.snapshot.flags
        input = self.snapshot.input
        cookie.start_pos -= len(input)

        # How many decoded characters have been used up since the snapshot?
        if not self.decoded.pos:
            # We haven't moved from the snapshot point.
            return space.newlong_from_rbigint(cookie.pack())

        chars_to_skip = codepoints_in_utf8(self.decoded.text,
                                           end=self.decoded.pos)

        # Starting from the snapshot position, we will walk the decoder
        # forward until it gives us enough decoded characters.
        w_saved_state = space.call_method(self.w_decoder, "getstate")

        try:
            # Note our initial start point
            self._decoder_setstate(space, cookie)

            # Feed the decoder one byte at a time.  As we go, note the nearest
            # "safe start point" before the current location (a point where
            # the decoder has nothing buffered, so seek() can safely start
            # from there and advance to this location).

            chars_decoded = 0
            i = 0
            while i < len(input):
                w_decoded = space.call_method(self.w_decoder, "decode",
                                              space.newbytes(input[i]))
                check_decoded(space, w_decoded)
                chars_decoded += space.len_w(w_decoded)

                cookie.bytes_to_feed += 1

                w_state = space.call_method(self.w_decoder, "getstate")
                w_dec_buffer, w_flags = space.unpackiterable(w_state, 2)
                dec_buffer_len = space.len_w(w_dec_buffer)

                if dec_buffer_len == 0 and chars_decoded <= chars_to_skip:
                    # Decoder buffer is empty, so this is a safe start point.
                    cookie.start_pos += cookie.bytes_to_feed
                    chars_to_skip -= chars_decoded
                    assert chars_to_skip >= 0
                    cookie.dec_flags = space.int_w(w_flags)
                    cookie.bytes_to_feed = 0
                    chars_decoded = 0
                if chars_decoded >= chars_to_skip:
                    break
                i += 1
            else:
                # We didn't get enough decoded data; signal EOF to get more.
                w_decoded = space.call_method(self.w_decoder, "decode",
                                              space.newbytes(""),
                                              space.newint(1))  # final=1
                check_decoded(space, w_decoded)
                chars_decoded += space.len_w(w_decoded)
                cookie.need_eof = 1

                if chars_decoded < chars_to_skip:
                    raise oefmt(space.w_IOError,
                                "can't reconstruct logical file position")
        finally:
            # always restore the decoder: tell() must not disturb reads
            space.call_method(self.w_decoder, "setstate", w_saved_state)

        # The returned cookie corresponds to the last safe start point.
        cookie.chars_to_skip = chars_to_skip
        return space.newlong_from_rbigint(cookie.pack())
Example #25
0
 def read_w(self, space, w_size=None):
     """Read at most `w_size` codepoints from the buffer (everything
     remaining if the size is negative or None)."""
     self._check_closed(space)
     limit = convert_size(space, w_size)
     data = self.buf.read(limit)
     return space.newutf8(data, codepoints_in_utf8(data))
Example #26
0
 def getvalue_w(self, space):
     """Return the entire buffer contents as a unicode object."""
     self._check_closed(space)
     data = self.buf.getvalue()
     return space.newutf8(data, codepoints_in_utf8(data))
Example #27
0
 def normalize(s):
     """Normalize the unicode string `s` with form NF_code via the
     app-level unicodedata module, returning a unicode result."""
     utf8 = s.encode('utf8')
     w_input = space.newutf8(utf8, codepoints_in_utf8(utf8))
     w_result = ucd.normalize(space, NF_code, w_input)
     return space.utf8_w(w_result).decode('utf8')
Example #28
0
def fstring_find_literal(astbuilder, fstr, atom_node, rec):
    """Scan the next literal (non-expression) part of an f-string.

    Advances fstr.current_index past the literal and returns it as an
    app-level text object.  'rec' is the format_spec recursion depth:
    doubled braces are only collapsed at depth 0.
    """
    space = astbuilder.space
    raw = fstr.raw_mode

    # Return the next literal part.  Updates the current index inside 'fstr'.
    # Differs from CPython: this version handles double-braces on its own.
    s = fstr.unparsed
    literal_start = fstr.current_index
    assert literal_start >= 0

    # Get any literal string. It ends when we hit an un-doubled left
    # brace (which isn't part of a unicode name escape such as
    # "\N{EULER CONSTANT}"), or the end of the string.
    i = literal_start
    builder = StringBuilder()
    while i < len(s):
        ch = s[i]
        i += 1
        if not raw and ch == '\\' and i < len(s):
            # Look at the character after the backslash: escape sequences
            # must not be mistaken for brace delimiters.
            ch = s[i]
            i += 1
            if ch == 'N':
                # "\N{NAME}" unicode-name escape: skip over the whole
                # {...} group so its braces are not treated as markers.
                if i < len(s) and s[i] == '{':
                    while i < len(s) and s[i] != '}':
                        i += 1
                    if i < len(s):
                        i += 1
                    continue
                elif i < len(s):
                    i += 1
                break
            if ch == '{':
                # "\{" is not a valid escape; warn (or error if warnings
                # are configured as errors) like CPython does.
                msg = "invalid escape sequence '%s'"
                try:
                    space.warn(space.newtext(msg % ch),
                               space.w_DeprecationWarning)
                except error.OperationError as e:
                    if e.match(space, space.w_DeprecationWarning):
                        astbuilder.error(msg % ch, atom_node)
                    else:
                        raise
        if ch == '{' or ch == '}':
            # Check for doubled braces, but only at the top level. If
            # we checked at every level, then f'{0:{3}}' would fail
            # with the two closing braces.
            if rec == 0 and i < len(s) and s[i] == ch:
                # "{{" or "}}": emit everything up to and including the
                # first brace, then skip the second one.
                assert 0 <= i <= len(s)
                builder.append(s[literal_start:i])
                i += 1  # skip over the second brace
                literal_start = i
            elif rec == 0 and ch == '}':
                i -= 1
                assert i >= 0
                fstr.current_index = i
                # Where a single '{' is the start of a new expression, a
                # single '}' is not allowed.
                astbuilder.error("f-string: single '}' is not allowed",
                                 atom_node)
            else:
                # We're either at a '{', which means we're starting another
                # expression; or a '}', which means we're at the end of this
                # f-string (for a nested format_spec).
                i -= 1
                break
    assert 0 <= i <= len(s)
    assert i == len(s) or s[i] == '{' or s[i] == '}'
    builder.append(s[literal_start:i])

    fstr.current_index = i
    literal = builder.build()
    lgt = codepoints_in_utf8(literal)
    if not raw and '\\' in literal:
        # Process backslash escapes; decode_unicode_escape recomputes the
        # codepoint length since escapes can change it.
        literal = parsestring.decode_unicode_utf8(space, literal, 0,
                                                  len(literal))
        literal, lgt, pos = unicodehelper.decode_unicode_escape(space, literal)
    return space.newtext(literal, lgt)
Example #29
0
    def subx(self, w_ptemplate, w_string, count):
        """Implementation of pattern.sub()/subn(): replace up to 'count'
        matches of this pattern in w_string with w_ptemplate.

        Returns (w_result, n) where n is the number of substitutions made.
        """
        space = self.space
        # use a (much faster) string builder (possibly utf8) if w_ptemplate and
        # w_string are both string or both unicode objects, and if w_ptemplate
        # is a literal
        use_builder = '\x00'  # or 'S'tring or 'U'nicode/UTF8
        is_buffer = False
        filter_as_string = None
        # Enforce str-pattern-on-str / bytes-pattern-on-bytes up front.
        if space.isinstance_w(w_string, space.w_unicode):
            if not self.is_known_unicode():
                raise oefmt(
                    space.w_TypeError,
                    "cannot use a bytes pattern on a string-like object")
        else:
            if self.is_known_unicode():
                raise oefmt(
                    space.w_TypeError,
                    "cannot use a string pattern on a bytes-like object")
        if space.is_true(space.callable(w_ptemplate)):
            # A callable template is invoked once per match.
            w_filter = w_ptemplate
            filter_is_callable = True
        else:
            # A template with no backslash is a pure literal and can be
            # appended directly; otherwise it goes through re._subx.
            if space.isinstance_w(w_ptemplate, space.w_unicode):
                filter_as_string = space.utf8_w(w_ptemplate)
                literal = '\\' not in filter_as_string
                if space.isinstance_w(w_string, space.w_unicode) and literal:
                    use_builder = 'U'
            elif space.isinstance_w(w_ptemplate, space.w_bytes):
                filter_as_string = space.bytes_w(w_ptemplate)
                literal = '\\' not in filter_as_string
                if space.isinstance_w(w_string, space.w_bytes) and literal:
                    use_builder = 'S'
            else:
                if space.isinstance_w(w_ptemplate, space.w_bytes):
                    filter_as_string = space.bytes_w(w_ptemplate)
                else:
                    filter_as_string = space.readbuf_w(w_ptemplate).as_str()
                    is_buffer = True
                literal = '\\' not in filter_as_string
                if space.isinstance_w(w_string, space.w_bytes) and literal:
                    use_builder = 'S'
            if literal:
                w_filter = w_ptemplate
                filter_is_callable = False
            else:
                # not a literal; hand it over to the template compiler
                # FIX for a CPython 3.5 bug: if w_ptemplate is a buffer
                # (e.g. a bytearray), convert it to a byte string here.
                if is_buffer:
                    w_ptemplate = space.newbytes(filter_as_string)
                w_re = import_re(space)
                w_filter = space.call_method(w_re, '_subx', self, w_ptemplate)
                filter_is_callable = space.is_true(space.callable(w_filter))
        #
        # XXX this is a bit of a mess, but it improves performance a lot
        ctx = self.make_ctx(w_string)
        sublist_w = strbuilder = None
        if use_builder != '\x00':
            assert filter_as_string is not None
            strbuilder = StringBuilder(ctx.end)
        else:
            sublist_w = []
        n = 0
        last_pos = ctx.ZERO
        # Main substitution loop; count == 0 means "no limit".
        while not count or n < count:
            pattern = self.code
            sub_jitdriver.jit_merge_point(
                self=self,
                use_builder=use_builder,
                filter_is_callable=filter_is_callable,
                filter_type=type(w_filter),
                ctx=ctx,
                pattern=pattern,
                w_filter=w_filter,
                strbuilder=strbuilder,
                filter_as_string=filter_as_string,
                count=count,
                w_string=w_string,
                n=n,
                last_pos=last_pos,
                sublist_w=sublist_w)
            space = self.space
            if not searchcontext(space, ctx, pattern):
                break
            # Copy the unmatched gap between the previous match and this one.
            if last_pos < ctx.match_start:
                _sub_append_slice(ctx, space, use_builder, sublist_w,
                                  strbuilder, last_pos, ctx.match_start)
            if not (last_pos == ctx.match_start == ctx.match_end and n > 0):
                # the above ignores empty matches on latest position
                last_pos = ctx.match_end
                if filter_is_callable:
                    w_match = self.getmatch(ctx, True)
                    # make a copy of 'ctx'; see test_sub_matches_stay_valid
                    ctx = self.fresh_copy(ctx)
                    w_piece = space.call_function(w_filter, w_match)
                    if not space.is_w(w_piece, space.w_None):
                        assert strbuilder is None
                        assert use_builder == '\x00'
                        sublist_w.append(w_piece)
                else:
                    if use_builder != '\x00':
                        assert filter_as_string is not None
                        assert strbuilder is not None
                        strbuilder.append(filter_as_string)
                    else:
                        sublist_w.append(w_filter)
                n += 1
            elif last_pos >= ctx.end:
                break  # empty match at the end: finished

            # Advance past the match; for a zero-width match step one
            # codepoint forward so the scan always progresses.
            start = ctx.match_end
            if start == ctx.match_start:
                if start == ctx.end:
                    break
                start = ctx.next_indirect(start)
            ctx.reset(start)

        # Append the tail after the last match.
        if last_pos < ctx.end:
            _sub_append_slice(ctx, space, use_builder, sublist_w, strbuilder,
                              last_pos, ctx.end)
        if use_builder != '\x00':
            assert strbuilder is not None
            result_bytes = strbuilder.build()
            if use_builder == 'S':
                assert not isinstance(ctx, rsre_utf8.Utf8MatchContext)
                return space.newbytes(result_bytes), n
            elif use_builder == 'U':
                assert (isinstance(ctx, UnicodeAsciiMatchContext)
                        or isinstance(ctx, rsre_utf8.Utf8MatchContext))
                return space.newutf8(result_bytes,
                                     rutf8.codepoints_in_utf8(result_bytes)), n
            else:
                raise AssertionError(use_builder)
        else:
            # Slow path: join the collected pieces with an empty separator
            # of the right type.
            if space.isinstance_w(w_string, space.w_unicode):
                w_emptystr = space.newutf8('', 0)
            else:
                w_emptystr = space.newbytes('')
            w_item = space.call_method(w_emptystr, 'join',
                                       space.newlist(sublist_w))
            return w_item, n
Example #30
0
        def call_errorhandler(errors, encoding, reason, input, startpos,
                              endpos):
            """Generic wrapper for calling into error handlers.

            Note that error handler receives and returns position into
            the unicode characters, not into the position of utf8 bytes,
            so it needs to be converted by the codec

            Returns (replacement_utf8, newpos, rettype, obj) where rettype
            is 'u' if the handler returned str or 'b' if it returned bytes,
            and obj is the possibly-modified exc.object.  'decode' is a
            closure variable selecting decode vs. encode behavior.
            """
            w_errorhandler = lookup_error(space, errors)
            if decode:
                # Build a UnicodeDecodeError: the object is the raw bytes.
                w_cls = space.w_UnicodeDecodeError
                assert isinstance(input, str)
                w_input = space.newbytes(input)
                length = len(input)
            else:
                # Build a UnicodeEncodeError: the object is the text;
                # positions are measured in codepoints, not utf8 bytes.
                w_cls = space.w_UnicodeEncodeError
                assert isinstance(input, str)
                length = rutf8.codepoints_in_utf8(input)
                w_input = space.newtext(input, length)
            w_exc = space.call_function(w_cls, space.newtext(encoding),
                                        w_input, space.newint(startpos),
                                        space.newint(endpos),
                                        space.newtext(reason))
            w_res = space.call_function(w_errorhandler, w_exc)
            # The handler must return a (replacement, newpos) 2-tuple.
            if (not space.isinstance_w(w_res, space.w_tuple)
                    or space.len_w(w_res) != 2):
                if decode:
                    msg = ("decoding error handler must return "
                           "(str, int) tuple")
                else:
                    msg = ("encoding error handler must return "
                           "(str/bytes, int) tuple")
                raise OperationError(space.w_TypeError, space.newtext(msg))

            w_replace, w_newpos = space.fixedview(w_res, 2)
            # Only encoding handlers may return bytes as the replacement.
            if space.isinstance_w(w_replace, space.w_unicode):
                rettype = 'u'
            elif not decode and space.isinstance_w(w_replace, space.w_bytes):
                rettype = 'b'
            else:
                if decode:
                    msg = ("decoding error handler must return "
                           "(str, int) tuple")
                else:
                    msg = ("encoding error handler must return "
                           "(str/bytes, int) tuple")
                raise OperationError(space.w_TypeError, space.newtext(msg))
            try:
                newpos = space.int_w(w_newpos)
            except OperationError as e:
                if not e.match(space, space.w_OverflowError):
                    raise
                # A position too large for an int is treated as out of
                # bounds below.
                newpos = -1
            else:
                # Negative positions count from the end, like slicing.
                if newpos < 0:
                    newpos = length + newpos
            if newpos < 0 or newpos > length:
                raise oefmt(space.w_IndexError,
                            "position %d from error handler out of bounds",
                            newpos)
            # The handler is allowed to replace exc.object; validate its type.
            w_obj = space.getattr(w_exc, space.newtext('object'))
            if decode:
                if not space.isinstance_w(w_obj, space.w_bytes):
                    raise oefmt(
                        space.w_ValueError,
                        "error handler modified exc.object must be bytes")
            else:
                if not space.isinstance_w(w_obj, space.w_unicode):
                    raise oefmt(
                        space.w_ValueError,
                        "error handler modified exc.object must be str")
            obj = space.utf8_w(w_obj)
            return space.utf8_w(w_replace), newpos, rettype, obj