Exemple #1
0
class W_StringOutputPort(W_OutputPort):
    """Output port that accumulates written text in an in-memory builder."""
    errorname = "output-port"

    def __init__(self):
        self.closed = False
        self.str = StringBuilder()

    def write(self, s):
        """Append ``s`` at the end of the accumulated output."""
        self.str.append(s)

    def contents(self):
        """Return everything written so far as one string."""
        return self.str.build()

    def seek(self, offset, end=False):
        """Move the write position to ``offset``.

        Nothing happens when ``end`` is true or when ``offset`` already
        equals the current length.  Seeking past the end pads the output
        with NUL characters up to ``offset``; seeking backwards discards
        everything from ``offset`` onwards.
        """
        length = self.str.getlength()
        if end or offset == length:
            return
        if offset > length:
            # Pad with exactly the number of missing characters.  The
            # operands used to be swapped, which gave a negative repeat
            # count and therefore no padding at all.
            self.str.append("\0" * (offset - length))
        else:
            # FIXME: this is potentially slow.
            content = self.contents()
            self.str = StringBuilder(offset)
            self.str.append_slice(content, 0, offset)

    def tell(self):
        """Return the current write position (the accumulated length)."""
        return self.str.getlength()
Exemple #2
0
 def read(self, n=-1):
     """Return up to ``n`` characters; a negative ``n`` reads everything.

     The request is served from ``self.buf`` when it holds enough data.
     Otherwise the buffer is refilled through ``self.do_read`` until ``n``
     characters were collected or ``do_read`` returns an empty result.
     """
     assert isinstance(n, int)
     if n < 0:
         return self.readall()
     start = self.pos
     assert start >= 0
     collected = len(self.buf) - start
     if n <= collected:
         # fast path: the current buffer already satisfies the request
         stop = start + n
         assert stop >= 0
         self.pos = stop
         return self.buf[start:stop]

     out = StringBuilder(n)
     out.append_slice(self.buf, start, len(self.buf))
     while True:
         chunk = self.do_read(self.bufsize)
         self.buf = chunk
         if not chunk:
             # source exhausted: return whatever was gathered
             self.pos = 0
             break
         collected += len(chunk)
         if collected >= n:
             # only the head of this chunk is needed; the rest stays buffered
             stop = len(chunk) - (collected - n)
             assert stop >= 0
             self.pos = stop
             out.append_slice(chunk, 0, stop)
             break
         assert chunk is not None
         out.append(chunk)
     return out.build()
Exemple #3
0
 def func():
     """Build a string using each StringBuilder append variant once."""
     builder = StringBuilder()
     builder.append("a")                    # single character
     builder.append("abc")                  # whole string
     builder.append_slice("abc", 1, 2)      # just the "b"
     builder.append_multiple_char('d', 4)   # "dddd"
     built = builder.build()
     return built
 def decode_string_escaped(self, start):
     """Decode the rest of a string literal character by character.

     ``self.s[start:self.pos]`` is copied verbatim first; scanning then
     continues in ``self.ll_chars`` from ``self.pos`` until the closing
     double quote.  On success ``self.pos`` points just past that quote,
     ``self.last_type`` is ``TYPE_STRING`` and the wrapped unicode object
     is returned.
     """
     pos = self.pos
     out = StringBuilder((pos - start) * 2)  # just an estimate
     assert start >= 0
     assert pos >= 0
     out.append_slice(self.s, start, pos)
     while True:
         ch = self.ll_chars[pos]
         pos += 1
         if ch == '"':
             # closing quote: decode what was collected and finish
             utf8 = out.build()
             decoded = unicodehelper.decode_utf8(self.space, utf8)
             self.last_type = TYPE_STRING
             self.pos = pos
             return self.space.newunicode(decoded)
         if ch == '\\':
             pos = self.decode_escape_sequence(pos, out)
         elif ch >= '\x20':
             out.append(ch)
         elif ch != '\0':
             self._raise("Invalid control character at char %d", pos - 1)
         else:
             # NUL is presumably the end-of-input sentinel -- TODO confirm
             self._raise("Unterminated string starting at char %d",
                         start - 1)
Exemple #5
0
 def func():
     """Build a string using each StringBuilder append variant once."""
     builder = StringBuilder()
     builder.append("a")                    # single character
     builder.append("abc")                  # whole string
     builder.append_slice("abc", 1, 2)      # just the "b"
     builder.append_multiple_char('d', 4)   # "dddd"
     built = builder.build()
     return built
Exemple #6
0
class W_BytesBuilder(W_Root):
    """Wrapped object exposing a ``StringBuilder`` for bytes."""

    def __init__(self, space, size):
        # A negative size means that no initial size is passed on.
        self.builder = StringBuilder(size) if size >= 0 else StringBuilder()

    @unwrap_spec(size=int)
    def descr__new__(space, w_subtype, size=-1):
        """Create a new builder, optionally with an initial ``size``."""
        return W_BytesBuilder(space, size)

    @unwrap_spec(s='bytes')
    def descr_append(self, space, s):
        """Append the whole of ``s``."""
        self.builder.append(s)

    @unwrap_spec(s='bytes', start=int, end=int)
    def descr_append_slice(self, space, s, start, end):
        """Append ``s[start:end]``; the bounds must lie inside ``s``."""
        if start < 0 or end < start or len(s) < end:
            raise oefmt(space.w_ValueError, "bad start/stop")
        self.builder.append_slice(s, start, end)

    def descr_build(self, space):
        """Return the accumulated content as a wrapped bytes object."""
        # after build(), we can continue to append more strings
        # to the same builder.  This is supported since
        # 2ff5087aca28 in RPython.
        return space.newbytes(self.builder.build())

    def descr_len(self, space):
        """Return the number of bytes accumulated so far."""
        builder = self.builder
        if builder is None:
            raise oefmt(space.w_ValueError, "no length of built builder")
        return space.newint(builder.getlength())
Exemple #7
0
class W_StringOutputPort(W_OutputPort):
    """Output port that accumulates written text in an in-memory builder."""
    errorname = "output-port"
    _attrs_ = ["closed", "str"]

    def __init__(self):
        self.closed = False
        self.str = StringBuilder()

    def write(self, s):
        """Append ``s`` at the end of the accumulated output."""
        self.str.append(s)

    def contents(self):
        """Return everything written so far as one string."""
        return self.str.build()

    def seek(self, offset, end=False):
        """Move the write position to ``offset``.

        Nothing happens when ``end`` is true or when ``offset`` already
        equals the current length.  Seeking past the end pads the output
        with NUL characters up to ``offset``; seeking backwards discards
        everything from ``offset`` onwards.
        """
        length = self.str.getlength()
        if end or offset == length:
            return
        if offset > length:
            # Pad with exactly the number of missing characters.  The
            # operands used to be swapped, which gave a negative repeat
            # count and therefore no padding at all.
            self.str.append("\0" * (offset - length))
        else:
            # FIXME: this is potentially slow.
            content = self.contents()
            self.str = StringBuilder(offset)
            self.str.append_slice(content, 0, offset)

    def tell(self):
        """Return the current write position (the accumulated length)."""
        return self.str.getlength()
Exemple #8
0
        def format(self):
            """Run the formatting loop over ``self.fmt`` and return the result.

            Literal text is copied to the output builder in slices; each
            ``%`` hands over to the ``fmt_<char>`` method matching the
            conversion character.  The builder is also stored on
            ``self.result`` (presumably so the ``fmt_*`` helpers can append
            to it -- TODO confirm).
            """
            # Size estimate only: format length plus some room per value.
            lgt = len(self.fmt) + 4 * len(self.values_w) + 10
            result = StringBuilder(lgt)
            self.result = result
            while True:
                # fast path: consume as many characters as possible
                fmt = self.fmt
                i = i0 = self.fmtpos
                while i < len(fmt):
                    if fmt[i] == '%':
                        break
                    i += 1
                else:
                    # no further '%': copy the tail and leave the main loop
                    result.append_slice(fmt, i0, len(fmt))
                    break  # end of 'fmt' string
                result.append_slice(fmt, i0, i)
                self.fmtpos = i + 1

                # '%%' directly after each other produces a literal '%'
                c = self.peekchr()
                if c == '%':
                    self.forward()
                    self.result.append('%')
                    continue

                # interpret the next formatter
                w_value = self.parse_fmt()
                c = self.peekchr()
                self.forward()
                if c == '%':
                    # if we get here there were extra characters between the
                    # two %, forbidden now
                    self.two_percent_error(i + 1)
                    continue

                # first check whether it's a invalid char, *then* call
                # nextinputvalue, otherwise the error generated by
                # nextinputvalue can cover that of unknown_fmtchar
                for c1 in FORMATTER_CHARS:
                    if c == c1:
                        break
                else:
                    self.unknown_fmtchar()
                # parse_fmt() returned no value, so take the next input one
                if w_value is None:
                    w_value = self.nextinputvalue()

                # dispatch on the formatter
                # (this turns into a switch after translation)
                for c1 in FORMATTER_CHARS:
                    if c == c1:
                        # 'c1' is an annotation constant here,
                        # so this getattr() is ok
                        do_fmt = getattr(self, 'fmt_' + c1)
                        do_fmt(w_value)
                        break

            self.checkconsumed()
            return result.build()
Exemple #9
0
def test_string_builder():
    """Check append, append_slice, append_multiple_char, getlength, build."""
    builder = StringBuilder()
    for piece in ("a", "abc"):
        builder.append(piece)
    assert builder.getlength() == len('aabc')
    builder.append("a")
    builder.append_slice("abc", 1, 2)
    builder.append_multiple_char('d', 4)
    built = builder.build()
    assert built == "aabcabdddd"
Exemple #10
0
class Utf8StringBuilder(object):
    """A ``StringBuilder`` paired with a running length counter.

    ``_s`` collects the appended strings; ``_lgt`` is incremented by the
    codepoint count of every piece, either computed here or supplied by
    the caller.
    """

    @always_inline
    def __init__(self, size=0):
        self._lgt = 0
        self._s = StringBuilder(size)

    @always_inline
    def append(self, s):
        """Append the string ``s``; its codepoints are counted here."""
        self._s.append(s)
        self._lgt += codepoints_in_utf8(s)

    @always_inline
    def append_slice(self, s, start, end):
        """Append ``s[start:end]``; its codepoints are counted here."""
        self._s.append_slice(s, start, end)
        self._lgt += codepoints_in_utf8(s, start, end)

    @signature(types.self(), char(), returns=none())
    @always_inline
    def append_char(self, s):
        """Append one ascii character, counted as length 1."""
        self._s.append(s)
        self._lgt += 1

    @try_inline
    def append_code(self, code):
        """Append the codepoint ``code``, counted as length 1."""
        unichr_as_utf8_append(self._s, code, True)
        self._lgt += 1

    @always_inline
    def append_utf8(self, utf8, length):
        """Append ``utf8``; the caller supplies its ``length``."""
        self._s.append(utf8)
        self._lgt += length

    @always_inline
    def append_utf8_slice(self, utf8, start, end, slicelength):
        """Append ``utf8[start:end]``; the caller supplies ``slicelength``."""
        self._s.append_slice(utf8, start, end)
        self._lgt += slicelength
        if we_are_translated():
            return
        # untranslated only: cross-check the length given by the caller
        assert len(utf8[start:end].decode("utf-8")) == slicelength

    @always_inline
    def append_multiple_char(self, utf8, times):
        """Append ``utf8`` repeated ``times`` times, counted as ``times``."""
        repeated = utf8 * times
        self._s.append(repeated)
        self._lgt += times

    @always_inline
    def build(self):
        """Return the accumulated string."""
        return self._s.build()

    @always_inline
    def getlength(self):
        """Return the running length counter."""
        return self._lgt
Exemple #11
0
def test_string_builder():
    """Check the append variants and that build() can be called repeatedly."""
    builder = StringBuilder()
    for piece in ("a", "abc"):
        builder.append(piece)
    assert builder.getlength() == len('aabc')
    builder.append("a")
    builder.append_slice("abc", 1, 2)
    builder.append_multiple_char('d', 4)
    first = builder.build()
    assert first == "aabcabdddd"
    # building twice gives the same result
    assert first == builder.build()
    # appending after build() is still allowed
    builder.append("x")
    assert builder.build() == first + "x"
Exemple #12
0
def test_string_builder():
    """Check the append variants and that build() can be called repeatedly."""
    builder = StringBuilder()
    for piece in ("a", "abc"):
        builder.append(piece)
    assert builder.getlength() == len("aabc")
    builder.append("a")
    builder.append_slice("abc", 1, 2)
    builder.append_multiple_char("d", 4)
    first = builder.build()
    assert first == "aabcabdddd"
    # building twice gives the same result
    assert first == builder.build()
    # appending after build() is still allowed
    builder.append("x")
    assert builder.build() == first + "x"
Exemple #13
0
class Utf8StringBuilder(object):
    """A ``StringBuilder`` paired with a running length counter.

    ``_s`` collects the appended strings; ``_lgt`` is incremented by the
    length of every piece, either computed here or supplied by the caller.
    """

    @always_inline
    def __init__(self, size=0):
        self._lgt = 0
        self._s = StringBuilder(size)

    @always_inline
    def append(self, s):
        """Append the string ``s``; its utf8 length is computed here."""
        self._s.append(s)
        self._lgt += get_utf8_length(s)

    @always_inline
    def append_slice(self, s, start, end):
        """Append ``s[start:end]``; its utf8 length is computed here."""
        self._s.append_slice(s, start, end)
        self._lgt += get_utf8_length(s, start, end)

    @signature(types.self(), char(), returns=none())
    @always_inline
    def append_char(self, s):
        """Append one ascii character, counted as length 1."""
        self._s.append(s)
        self._lgt += 1

    @try_inline
    def append_code(self, code):
        """Append the codepoint ``code``, counted as length 1."""
        unichr_as_utf8_append(self._s, code, True)
        self._lgt += 1

    @always_inline
    def append_utf8(self, utf8, length):
        """Append ``utf8``; the caller supplies its ``length``."""
        self._s.append(utf8)
        self._lgt += length

    @always_inline
    def append_multiple_char(self, utf8, times):
        """Append ``utf8`` repeated ``times`` times, counted as ``times``."""
        repeated = utf8 * times
        self._s.append(repeated)
        self._lgt += times

    @always_inline
    def build(self):
        """Return the accumulated string."""
        return self._s.build()

    @always_inline
    def getlength(self):
        """Return the running length counter."""
        return self._lgt
Exemple #14
0
def _decode_latin_1_slowpath(s):
    """Re-encode ``s`` character by character into a new string.

    Characters above 0x7F go through ``unichr_as_utf8_append``; runs of
    characters up to 0x7F are copied unchanged with a single slice.
    """
    size = len(s)
    out = StringBuilder(size)
    pos = 0
    while pos < size:
        code = ord(s[pos])
        if code <= 0x7F:
            # find the end of the run and copy it in one go
            stop = pos + 1
            while stop < size and ord(s[stop]) <= 0x7F:
                stop += 1
            out.append_slice(s, pos, stop)
            pos = stop
        else:
            unichr_as_utf8_append(out, code)
            pos += 1
    return out.build()
Exemple #15
0
def format(form, v):
    """Expand the directives found in ``form.value`` and return the text.

    Each match of ``format_regex`` is replaced by its ``format_dict``
    entry.  An entry of ``None`` consumes the next element of ``v`` and
    inserts its ``tostring()`` result instead.
    """
    template = form.value
    out = StringBuilder()
    cursor = 0
    for match in format_regex.finditer(template):
        begin = match.start()
        assert begin >= 0
        # literal text between the previous directive and this one
        out.append_slice(template, cursor, begin)
        piece = format_dict[match.group()]
        if piece is None:
            piece = v[0].tostring()
            v = v[1:]
        out.append(piece)
        cursor = match.end()
        assert cursor >= 0
    out.append_slice(template, cursor, len(template))
    return out.build()
Exemple #16
0
def str_zfill__String_ANY(space, w_self, w_width):
    """Left-pad the string with '0' up to ``w_width`` characters.

    A leading '+' or '-' stays in front of the inserted zeros.  A string
    that is already wide enough is returned re-wrapped, without padding.
    """
    value = w_self._value
    width = space.int_w(w_width)
    length = len(value)

    padding = width - length
    if padding <= 0:
        # cannot return w_self, in case it is a subclass of str
        return space.wrap(value)

    out = StringBuilder(width)
    body_start = 0
    if length > 0:
        first = value[0]
        if first == '+' or first == '-':
            # the sign goes before the zeros
            out.append(first)
            body_start = 1

    out.append_multiple_char('0', padding)
    out.append_slice(value, body_start, length)
    return space.wrap(out.build())
Exemple #17
0
def str_zfill__String_ANY(space, w_self, w_width):
    """Left-pad the string with '0' up to ``w_width`` characters.

    A leading '+' or '-' stays in front of the inserted zeros.  A string
    that is already wide enough is returned re-wrapped, without padding.
    """
    value = w_self._value
    width = space.int_w(w_width)
    length = len(value)

    padding = width - length
    if padding <= 0:
        # cannot return w_self, in case it is a subclass of str
        return space.wrap(value)

    out = StringBuilder(width)
    body_start = 0
    if length > 0:
        first = value[0]
        if first == '+' or first == '-':
            # the sign goes before the zeros
            out.append(first)
            body_start = 1

    out.append_multiple_char('0', padding)
    out.append_slice(value, body_start, length)
    return space.wrap(out.build())
Exemple #18
0
        def format(self):
            """Run the formatting loop over ``self.fmt`` and return the result.

            Literal text is copied to the output builder in slices; each
            ``%`` hands over to the ``fmt_<char>`` method matching the
            conversion character.  The builder is also stored on
            ``self.result`` (presumably so the helpers can append to it
            -- TODO confirm).
            """
            # Size estimate only: format length plus some room per value.
            lgt = len(self.fmt) + 4 * len(self.values_w) + 10
            result = StringBuilder(lgt)
            self.result = result
            while True:
                # fast path: consume as many characters as possible
                fmt = self.fmt
                i = i0 = self.fmtpos
                while i < len(fmt):
                    if fmt[i] == '%':
                        break
                    i += 1
                else:
                    # no further '%': copy the tail and leave the main loop
                    result.append_slice(fmt, i0, len(fmt))
                    break  # end of 'fmt' string
                result.append_slice(fmt, i0, i)
                self.fmtpos = i + 1

                # interpret the next formatter
                w_value = self.parse_fmt()
                c = self.peekchr()
                self.forward()
                if c == '%':
                    # a '%' conversion character is emitted through std_wp()
                    self.std_wp('%', False)
                    continue
                # parse_fmt() returned no value, so take the next input one
                if w_value is None:
                    w_value = self.nextinputvalue()

                # dispatch on the formatter
                # (this turns into a switch after translation)
                for c1 in FORMATTER_CHARS:
                    if c == c1:
                        # 'c1' is an annotation constant here,
                        # so this getattr() is ok
                        do_fmt = getattr(self, 'fmt_' + c1)
                        do_fmt(w_value)
                        break
                else:
                    # no entry of FORMATTER_CHARS matched 'c'
                    self.unknown_fmtchar()

            self.checkconsumed()
            return result.build()
Exemple #19
0
 def decode_string_escaped(self, start, nonascii):
     """Decode the rest of a string literal character by character.

     ``self.s[start:self.pos]`` is copied verbatim first; scanning then
     continues in ``self.ll_chars`` from ``self.pos`` until the closing
     double quote.  The collected bytes are checked with
     ``check_utf8_or_raise`` and returned wrapped via ``space.newutf8``,
     with ``self.pos`` left just past the quote.  ``nonascii`` is not
     used by this method.
     """
     pos = self.pos
     out = StringBuilder((pos - start) * 2)  # just an estimate
     assert start >= 0
     assert pos >= 0
     out.append_slice(self.s, start, pos)
     while True:
         ch = self.ll_chars[pos]
         pos += 1
         if ch == '"':
             # closing quote: validate the collected bytes and finish
             utf8 = out.build()
             length = unicodehelper.check_utf8_or_raise(self.space, utf8)
             self.pos = pos
             return self.space.newutf8(utf8, length)
         if ch == '\\':
             pos = self.decode_escape_sequence_to_utf8(pos, out)
         elif ch >= '\x20':
             out.append(ch)
         else:
             self._raise_control_char_in_string(ch, start, pos)
Exemple #20
0
 def readline(self):
     """Return the next line, including its trailing newline if present.

     The line is taken from ``self.buf`` when possible; otherwise more
     data is pulled through ``self.do_read`` until a newline shows up or
     ``do_read`` returns an empty result.
     """
     start = self.pos
     assert start >= 0
     newline = self.buf.find("\n", start)
     if newline >= 0:  # new line found
         stop = newline + 1
         self.pos = stop
         return self.buf[start:stop]

     head = self.buf[start:]
     # read one buffer and most of the time a new line will be found
     self.buf = self.do_read(self.bufsize)
     newline = self.buf.find("\n")
     if newline >= 0:  # new line found
         stop = newline + 1
         self.pos = stop
         return head + self.buf[:stop]
     if not self.buf:
         self.pos = 0
         return head

     # need to keep getting data until we find a new line
     out = StringBuilder(len(head) + len(self.buf))  # at least
     out.append(head)
     out.append(self.buf)
     while True:
         chunk = self.do_read(self.bufsize)
         self.buf = chunk
         if not chunk:
             self.pos = 0
             break
         newline = chunk.find("\n")
         if newline >= 0:
             stop = newline + 1
             out.append_slice(chunk, 0, stop)
             self.pos = stop
             break
         out.append(chunk)
     return out.build()
Exemple #21
0
def backslashreplace_errors(space, w_exc):
    """Error callback that replaces characters by backslash escapes.

    For a UnicodeEncodeError, every codepoint between the exception's
    'start' and 'end' becomes ``\\xNN``, ``\\uNNNN`` or ``\\UNNNNNNNN``
    depending on its value.  Returns a wrapped tuple of the replacement
    text and the original 'end'.  Any other exception type raises a
    TypeError.
    """
    check_exception(space, w_exc)
    if not space.isinstance_w(w_exc, space.w_UnicodeEncodeError):
        raise oefmt(space.w_TypeError,
                    "don't know how to handle %T in error callback", w_exc)

    w_obj = space.getattr(w_exc, space.newtext('object'))
    space.realutf8_w(w_obj)  # for errors
    w_obj = space.convert_arg_to_w_unicode(w_obj)
    start = space.int_w(space.getattr(w_exc, space.newtext('start')))
    w_end = space.getattr(w_exc, space.newtext('end'))
    end = space.int_w(w_end)
    # convert the codepoint indexes into byte positions in the utf8 data
    byte_pos = w_obj._index_to_byte(start)
    byte_end = w_obj._index_to_byte(end)
    utf8 = w_obj._utf8
    out = StringBuilder()
    while byte_pos < byte_end:
        code = rutf8.codepoint_at_pos(utf8, byte_pos)
        if code >= 0x10000:
            prefix = "\\U"
            width = 8
        elif code >= 0x100:
            prefix = "\\u"
            width = 4
        else:
            prefix = "\\x"
            width = 2
        out.append(prefix)
        digits = hex(code)  # starts with '0x'
        ndigits = len(digits)
        padding = width + 2 - ndigits
        if padding > 0:
            out.append_multiple_char('0', padding)
        out.append_slice(digits, 2, ndigits)
        byte_pos = rutf8.next_codepoint_pos(utf8, byte_pos)
    replacement = out.build()
    lgt = rutf8.check_utf8(replacement, True)
    return space.newtuple([space.newutf8(replacement, lgt), w_end])
Exemple #22
0
 def readall(self):
     """Return everything left: the buffered rest plus all further reads.

     The internal buffer is emptied.  Data is then requested through
     ``self.do_read`` with a request size that doubles each round, capped
     at ``self.bigsize``, until an empty result comes back.
     """
     start = self.pos
     assert start >= 0
     out = StringBuilder()
     if self.buf:
         out.append_slice(self.buf, start, len(self.buf))
     self.buf = ""
     self.pos = 0
     request = self.bufsize
     while True:
         try:
             chunk = self.do_read(request)
         except OSError:
             # like CPython < 3.4, partial results followed by an error
             # are returned as data
             if out.getlength():
                 break
             raise
         if not chunk:
             break
         out.append(chunk)
         request = min(request * 2, self.bigsize)
     return out.build()
Exemple #23
0
def string_escape_encode(s, quotes):
    """Return ``s`` with backslash escapes, optionally as a b'...' literal.

    Backslash and the quote character are escaped with a backslash; tab,
    carriage return and newline become ``\\t``, ``\\r`` and ``\\n``; any
    other character outside '\\x20'..'\\x7e' becomes ``\\xNN``.  When
    ``quotes`` is true the result is wrapped in ``b'...'``, or ``b"..."``
    if ``s`` contains a single quote and no double quote.
    """
    out = StringBuilder(len(s) + 2)

    quote = "'"
    if quotes:
        if quote in s and '"' not in s:
            quote = '"'
            out.append('b"')
        else:
            out.append("b'")

    size = len(s)
    pending = 0  # start of the run of characters not yet copied

    for i in range(size):
        c = s[i]
        if c == '\\' or c == quote:
            escaped = c
        elif c == '\t':
            escaped = 't'
        elif c == '\r':
            escaped = 'r'
        elif c == '\n':
            escaped = 'n'
        elif '\x20' <= c < '\x7f':
            # printable: stays part of the pending run
            continue
        else:
            # flush the pending run, then emit a two-digit hex escape
            if i != pending:
                out.append_slice(s, pending, i)
            pending = i + 1
            code = ord(c)
            out.append('\\x')
            out.append("0123456789abcdef"[code >> 4])
            out.append("0123456789abcdef"[code & 0xF])
            continue

        # flush the pending run, then emit backslash + escape character
        if i != pending:
            out.append_slice(s, pending, i)
        pending = i + 1
        out.append('\\')
        out.append(escaped)

    if size != pending:
        out.append_slice(s, pending, size)

    if quotes:
        out.append(quote)

    return out.build()
def string_escape_encode(s, quotes):
    """Return ``s`` with backslash escapes, optionally as a b'...' literal.

    Backslash and the quote character are escaped with a backslash; tab,
    carriage return and newline become ``\\t``, ``\\r`` and ``\\n``; any
    other character outside '\\x20'..'\\x7e' becomes ``\\xNN``.  When
    ``quotes`` is true the result is wrapped in ``b'...'``, or ``b"..."``
    if ``s`` contains a single quote and no double quote.
    """
    out = StringBuilder(len(s) + 2)

    quote = "'"
    if quotes:
        if quote in s and '"' not in s:
            quote = '"'
            out.append('b"')
        else:
            out.append("b'")

    size = len(s)
    pending = 0  # start of the run of characters not yet copied

    for i in range(size):
        c = s[i]
        if c == '\\' or c == quote:
            escaped = c
        elif c == '\t':
            escaped = 't'
        elif c == '\r':
            escaped = 'r'
        elif c == '\n':
            escaped = 'n'
        elif '\x20' <= c < '\x7f':
            # printable: stays part of the pending run
            continue
        else:
            # flush the pending run, then emit a two-digit hex escape
            if i != pending:
                out.append_slice(s, pending, i)
            pending = i + 1
            code = ord(c)
            out.append('\\x')
            out.append("0123456789abcdef"[code >> 4])
            out.append("0123456789abcdef"[code & 0xF])
            continue

        # flush the pending run, then emit backslash + escape character
        if i != pending:
            out.append_slice(s, pending, i)
        pending = i + 1
        out.append('\\')
        out.append(escaped)

    if size != pending:
        out.append_slice(s, pending, size)

    if quotes:
        out.append(quote)

    return out.build()
Exemple #25
0
def string_escape_encode(s, quote):
    """Return ``s`` escaped and wrapped in ``quote`` on both sides.

    Backslash and ``quote`` are escaped with a backslash; tab, carriage
    return and newline become ``\\t``, ``\\r`` and ``\\n``; any other
    character outside '\\x20'..'\\x7e' becomes ``\\xNN``.
    """
    out = StringBuilder(len(s) + 2)
    out.append(quote)

    size = len(s)
    pending = 0  # start of the run of characters not yet copied

    for i in range(size):
        c = s[i]
        if c == "\\" or c == quote:
            escaped = c
        elif c == "\t":
            escaped = "t"
        elif c == "\r":
            escaped = "r"
        elif c == "\n":
            escaped = "n"
        elif "\x20" <= c < "\x7f":
            # printable: stays part of the pending run
            continue
        else:
            # flush the pending run, then emit a two-digit hex escape
            if i != pending:
                out.append_slice(s, pending, i)
            pending = i + 1
            code = ord(c)
            out.append("\\x")
            out.append("0123456789abcdef"[code >> 4])
            out.append("0123456789abcdef"[code & 0xF])
            continue

        # flush the pending run, then emit backslash + escape character
        if i != pending:
            out.append_slice(s, pending, i)
        pending = i + 1
        out.append("\\")
        out.append(escaped)

    if size != pending:
        out.append_slice(s, pending, size)

    out.append(quote)
    return out.build()
Exemple #26
0
class Pack(object):
    """Runs a pack()-style format string over a list of wrapped arguments."""

    def __init__(self, space, fmt, arg_w):
        self.space = space
        self.fmt = fmt
        self.arg_w = arg_w
        self.arg_index = 0  # index of the next argument to hand out

    def pop_arg(self):
        """Return the next unused argument; raise when none is left."""
        index = self.arg_index
        if index >= len(self.arg_w):
            raise FormatException("too few arguments")
        w_arg = self.arg_w[index]
        self.arg_index = index + 1
        return w_arg

    def _get_fmtdesc(self, char):
        """Return the table entry whose ``fmtchar`` is ``char``, else None."""
        for fmtdesc in unroll_fmttable:
            if char == fmtdesc.fmtchar:
                return fmtdesc
        return None

    def _shrink(self, new_len):
        """Cut the output built so far down to its first ``new_len`` chars."""
        built = self.result.build()
        assert new_len < len(built)
        replacement = StringBuilder()
        replacement.append_slice(built, 0, new_len)
        self.result = replacement

    @jit.unroll_safe
    def interpret(self):
        """Parse ``self.fmt`` into a list of ``(fmtdesc, repetitions)``.

        A format character may be followed by a decimal count or by '*'
        (recorded as -1); without either the count is 1.
        """
        fmt = self.fmt
        size = len(fmt)
        parsed = []
        pos = 0
        while pos < size:
            char = fmt[pos]
            pos += 1
            rep = 1
            if pos < size:
                following = fmt[pos]
                if following == '*':
                    pos += 1
                    rep = -1
                elif '0' <= following <= '9':
                    digits_start = pos
                    while pos < size and '0' <= fmt[pos] <= '9':
                        pos += 1
                    rep = int(fmt[digits_start:pos])
            parsed.append((self._get_fmtdesc(char), rep))
        return parsed

    @jit.unroll_safe
    def build(self):
        """Pack all arguments according to the format; return the string.

        A FormatException from an individual directive is reported as a
        warning, as are arguments left over at the end.
        """
        self.fmt_interpreted = self.interpret()
        self.result = StringBuilder()

        for fmtdesc, repetitions in self.fmt_interpreted:
            if repetitions == -1 and fmtdesc.many_args:
                # '*' on such a directive: use every remaining argument
                repetitions = len(self.arg_w) - self.arg_index
            try:
                fmtdesc.pack(self, fmtdesc, repetitions)
            except FormatException as e:
                self.space.ec.warn(
                    "pack(): Type %s: %s" % (fmtdesc.fmtchar, e.message))

        unused = len(self.arg_w) - self.arg_index
        if unused > 0:
            self.space.ec.warn("pack(): %s arguments unused" % unused)

        return self.result.build()
Exemple #27
0
    def unicode_escape(s):
        """Return an escaped copy of the utf8 string ``s``.

        Reads ``quotes``, ``prefix``, ``pass_printable`` and
        ``char_escape_helper`` from the enclosing scope.  With ``quotes``
        the result starts with ``prefix`` (if any) and is wrapped in single
        quotes, or double quotes when ``s`` contains a single quote but no
        double quote.
        """
        size = len(s)
        result = StringBuilder(size)

        if quotes:
            if prefix:
                result.append(prefix)
            if s.find('\'') != -1 and s.find('\"') == -1:
                quote = ord('\"')
                result.append('"')
            else:
                quote = ord('\'')
                result.append('\'')
        else:
            quote = 0

            # without quotes an empty input gives an empty result
            if size == 0:
                return ''

        # 'pos' is a byte position in 's'; it moves one codepoint at a time
        pos = 0
        while pos < size:
            oc = codepoint_at_pos(s, pos)
            ch = s[pos]

            # Escape quotes
            if quotes and (oc == quote or ch == '\\'):
                result.append('\\')
                next_pos = next_codepoint_pos(s, pos)
                result.append_slice(s, pos, next_pos)
                pos = next_pos
                continue

            # The following logic is enabled only if MAXUNICODE == 0xffff, or
            # for testing on top of a host Python where sys.maxunicode == 0xffff
            if (not we_are_translated() and sys.maxunicode == 0xFFFF
                    and 0xD800 <= oc < 0xDC00 and pos + 3 < size):
                # Map UTF-16 surrogate pairs to Unicode \UXXXXXXXX escapes
                # (the steps of 3 presumably match the encoded size of one
                # surrogate in 's' -- TODO confirm)
                pos += 3
                oc2 = codepoint_at_pos(s, pos)

                if 0xDC00 <= oc2 <= 0xDFFF:
                    ucs = (((oc & 0x03FF) << 10) | (oc2 & 0x03FF)) + 0x00010000
                    char_escape_helper(result, ucs)
                    pos += 3
                    continue
                # Fall through: isolated surrogates are copied as-is
                pos -= 3

            # Map special whitespace to '\t', \n', '\r'
            if ch == '\t':
                result.append('\\t')
            elif ch == '\n':
                result.append('\\n')
            elif ch == '\r':
                result.append('\\r')
            elif ch == '\\':
                result.append('\\\\')

            # Map non-printable or non-ascii to '\xhh' or '\uhhhh'
            elif pass_printable and not (oc <= 0x10ffff
                                         and unicodedb.isprintable(oc)):
                char_escape_helper(result, oc)
            elif not pass_printable and (oc < 32 or oc >= 0x7F):
                char_escape_helper(result, oc)

            # Copy everything else as-is
            else:
                if oc < 128:
                    result.append(ch)
                else:
                    # multi-byte codepoint: copy all of its bytes
                    next_pos = next_codepoint_pos(s, pos)
                    result.append_slice(s, pos, next_pos)
            pos = next_codepoint_pos(s, pos)

        # close with the same quote character chosen at the start
        if quotes:
            result.append(chr(quote))
        return result.build()
Exemple #28
0
def raw_encode_basestring_ascii(space, w_string):
    """Return ``w_string`` with everything but plain ascii escaped.

    A bytes object made only of characters ' '..'~' other than '"' and
    '\\' is returned unchanged.  Otherwise '"' and '\\' get a backslash,
    characters below ' ' are looked up in ``ESCAPE_BEFORE_SPACE``, and
    codepoints above '~' become ``\\uXXXX`` escapes (two of them, as a
    surrogate pair, above 0xffff).  The result is wrapped with
    ``space.newtext``.
    """
    if space.isinstance_w(w_string, space.w_bytes):
        s = space.bytes_w(w_string)
        # look for the first character that needs escaping
        for i in range(len(s)):
            c = s[i]
            if c >= ' ' and c <= '~' and c != '"' and c != '\\':
                pass
            else:
                first = i
                break
        else:
            # the input is a string with only non-special ascii chars
            return w_string

        unicodehelper.check_utf8_or_raise(space, s)
        sb = StringBuilder(len(s))
        # everything before 'first' is safe and is copied verbatim
        sb.append_slice(s, 0, first)
    else:
        # We used to check if 'u' contains only safe characters, and return
        # 'w_string' directly.  But this requires an extra pass over all
        # characters, and the expected use case of this function, from
        # json.encoder, will anyway re-encode a unicode result back to
        # a string (with the ascii encoding).  This requires two passes
        # over the characters.  So we may as well directly turn it into a
        # string here --- only one pass.
        s = space.utf8_w(w_string)
        sb = StringBuilder(len(s))
        first = 0

    it = rutf8.Utf8StringIterator(s)
    # skip the codepoints already copied; the first 'first' bytes are all
    # ascii, so this is also 'first' codepoints
    for i in range(first):
        it.next()
    for c in it:
        if c <= ord('~'):
            if c == ord('"') or c == ord('\\'):
                # backslash now, the character itself is appended below
                sb.append('\\')
            elif c < ord(' '):
                sb.append(ESCAPE_BEFORE_SPACE[c])
                continue
            sb.append(chr(c))
        else:
            if c <= ord(u'\uffff'):
                # four hex digits, most significant first
                sb.append('\\u')
                sb.append(HEX[c >> 12])
                sb.append(HEX[(c >> 8) & 0x0f])
                sb.append(HEX[(c >> 4) & 0x0f])
                sb.append(HEX[c & 0x0f])
            else:
                # surrogate pair
                # (the leading 'd' digit is part of the '\\ud' literal)
                n = c - 0x10000
                s1 = 0xd800 | ((n >> 10) & 0x3ff)
                sb.append('\\ud')
                sb.append(HEX[(s1 >> 8) & 0x0f])
                sb.append(HEX[(s1 >> 4) & 0x0f])
                sb.append(HEX[s1 & 0x0f])
                s2 = 0xdc00 | (n & 0x3ff)
                sb.append('\\ud')
                sb.append(HEX[(s2 >> 8) & 0x0f])
                sb.append(HEX[(s2 >> 4) & 0x0f])
                sb.append(HEX[s2 & 0x0f])

    res = sb.build()
    return space.newtext(res)
Exemple #29
0
def raw_encode_basestring_ascii(space, w_string):
    """Return ``w_string`` with everything but plain ascii escaped.

    A str object made only of characters ' '..'~' other than '"' and '\\'
    is returned unchanged.  Otherwise '"' and '\\' get a backslash,
    characters below ' ' are looked up in ``ESCAPE_BEFORE_SPACE``, and
    characters above '~' become ``\\uXXXX`` escapes (two of them, as a
    surrogate pair, above u'\\uffff').  The result is wrapped with
    ``space.wrap``.
    """
    if space.isinstance_w(w_string, space.w_str):
        s = space.str_w(w_string)
        # look for the first character that needs escaping
        for i in range(len(s)):
            c = s[i]
            if c >= ' ' and c <= '~' and c != '"' and c != '\\':
                pass
            else:
                first = i
                break
        else:
            # the input is a string with only non-special ascii chars
            return w_string

        eh = unicodehelper.decode_error_handler(space)
        u = str_decode_utf_8(
                s, len(s), None, final=True, errorhandler=eh,
                allow_surrogates=True)[0]
        sb = StringBuilder(len(u))
        # everything before 'first' is safe and is copied verbatim
        sb.append_slice(s, 0, first)
    else:
        # We used to check if 'u' contains only safe characters, and return
        # 'w_string' directly.  But this requires an extra pass over all
        # characters, and the expected use case of this function, from
        # json.encoder, will anyway re-encode a unicode result back to
        # a string (with the ascii encoding).  This requires two passes
        # over the characters.  So we may as well directly turn it into a
        # string here --- only one pass.
        u = space.unicode_w(w_string)
        sb = StringBuilder(len(u))
        first = 0

    # the first 'first' characters are ascii, so the byte index into 's'
    # is also the character index into 'u'
    for i in range(first, len(u)):
        c = u[i]
        if c <= u'~':
            if c == u'"' or c == u'\\':
                # backslash now, the character itself is appended below
                sb.append('\\')
            elif c < u' ':
                sb.append(ESCAPE_BEFORE_SPACE[ord(c)])
                continue
            sb.append(chr(ord(c)))
        else:
            if c <= u'\uffff':
                # four hex digits, most significant first
                sb.append('\\u')
                sb.append(HEX[ord(c) >> 12])
                sb.append(HEX[(ord(c) >> 8) & 0x0f])
                sb.append(HEX[(ord(c) >> 4) & 0x0f])
                sb.append(HEX[ord(c) & 0x0f])
            else:
                # surrogate pair
                # (the leading 'd' digit is part of the '\\ud' literal)
                n = ord(c) - 0x10000
                s1 = 0xd800 | ((n >> 10) & 0x3ff)
                sb.append('\\ud')
                sb.append(HEX[(s1 >> 8) & 0x0f])
                sb.append(HEX[(s1 >> 4) & 0x0f])
                sb.append(HEX[s1 & 0x0f])
                s2 = 0xdc00 | (n & 0x3ff)
                sb.append('\\ud')
                sb.append(HEX[(s2 >> 8) & 0x0f])
                sb.append(HEX[(s2 >> 4) & 0x0f])
                sb.append(HEX[s2 & 0x0f])

    res = sb.build()
    return space.wrap(res)
Exemple #30
0
def raw_encode_basestring_ascii(space, w_string):
    """Escape `w_string` into an ASCII-only string, JSON style.

    A bytes object made up solely of characters between ' ' and '~',
    excluding the double quote and the backslash, is returned unchanged
    (the very same wrapped object).  Any other bytes object is decoded as
    UTF-8 (surrogates allowed) before being escaped; a non-bytes object is
    read with `space.unicode_w`.  No surrounding quotes are added.

    The escaped text is returned wrapped with `space.newtext`.
    """
    if not space.isinstance_w(w_string, space.w_bytes):
        # There used to be a check here returning 'w_string' directly when
        # 'u' only holds safe characters.  That costs an extra pass, and
        # the expected caller (json.encoder) re-encodes a unicode result to
        # an ascii string anyway, i.e. two passes.  Producing the string
        # right away needs only one.
        u = space.unicode_w(w_string)
        sb = StringBuilder(len(u))
        first = 0
    else:
        s = space.bytes_w(w_string)
        for i in range(len(s)):
            ch = s[i]
            if ch < ' ' or ch > '~' or ch == '"' or ch == '\\':
                # first character that cannot be copied verbatim
                first = i
                break
        else:
            # nothing but non-special ascii chars: nothing to do
            return w_string

        eh = unicodehelper.decode_error_handler(space)
        decoded = str_decode_utf_8(
            s, len(s), None, final=True, errorhandler=eh,
            allow_surrogates=True)
        u = decoded[0]
        sb = StringBuilder(len(u))
        # the prefix before 'first' is plain ascii, so positions in 's'
        # and in 'u' coincide there and it can be copied as is
        sb.append_slice(s, 0, first)

    for pos in range(first, len(u)):
        code = ord(u[pos])
        if code > 0xffff:
            # outside the BMP: emit a surrogate pair, \udXXX\udXXX
            n = code - 0x10000
            high = 0xd800 | ((n >> 10) & 0x3ff)
            low = 0xdc00 | (n & 0x3ff)
            sb.append('\\ud')
            sb.append(HEX[(high >> 8) & 0x0f])
            sb.append(HEX[(high >> 4) & 0x0f])
            sb.append(HEX[high & 0x0f])
            sb.append('\\ud')
            sb.append(HEX[(low >> 8) & 0x0f])
            sb.append(HEX[(low >> 4) & 0x0f])
            sb.append(HEX[low & 0x0f])
        elif code > 0x7e:
            # above '~': four hex digits
            sb.append('\\u')
            sb.append(HEX[code >> 12])
            sb.append(HEX[(code >> 8) & 0x0f])
            sb.append(HEX[(code >> 4) & 0x0f])
            sb.append(HEX[code & 0x0f])
        elif code < 0x20:
            # below ' ': the whole escape comes from the lookup table
            sb.append(ESCAPE_BEFORE_SPACE[code])
        else:
            if code == 0x22 or code == 0x5c:
                # '"' and '\' are kept, but preceded by a backslash
                sb.append('\\')
            sb.append(chr(code))

    return space.newtext(sb.build())
Exemple #31
0
def replace_impl(interp, pce, replace_obj, subject, limit=-1):
    """Run the compiled pattern `pce` over `subject`, replacing matches.

    Each match is handed to `replace_obj.next_replace`, together with the
    output builder.  The loop stops once `limit` reaches zero; every match
    decrements it, so a negative `limit` never stops the loop that way.

    Returns `(w_result, n)` where `n` is how many times `limit` was
    decremented, or `(None, -1)` when pcre_exec returns a negative code
    other than PCRE_ERROR_NOMATCH (after calling `handle_exec_error`).
    `interp.regexp_error_code` is reset to PREG_NO_ERROR on entry to the
    loop, and the raw buffers are freed on every exit path.
    """
    replace_obj.setup(interp, pce)
    space = interp.space
    rffi.setintfield(pce.extra, 'c_match_limit', interp.regexp_backtrack_limit)
    rffi.setintfield(pce.extra, 'c_match_limit_recursion',
                     interp.regexp_recursion_limit)

    subject_len = len(subject)

    # Offsets array: three ints for each of the capturecount + 1 patterns
    size_offsets = (pce.capturecount + 1) * 3

    builder = StringBuilder(subject_len)

    # Raw (non-GC) copies handed to pcre_exec; released in the 'finally'
    rawsubject = rffi.str2charp(subject)
    offsets = lltype.malloc(rffi.INTP.TO, size_offsets, flavor='raw')
    try:
        original_limit = limit
        exoptions = 0
        g_notempty = 0
        start_offset = 0
        interp.regexp_error_code = PREG_NO_ERROR

        while True:
            if limit == 0:
                # limit exhausted: copy the end of the string
                builder.append_slice(subject, start_offset, subject_len)
                break

            count = _pcre.pcre_exec(pce.re, pce.extra, rawsubject,
                                    subject_len, start_offset,
                                    exoptions | g_notempty,
                                    offsets, size_offsets)

            # from now on the string is already proved to be valid UTF-8
            exoptions |= _pcre.PCRE_NO_UTF8_CHECK

            if count == 0:
                # too many substrings for the offsets array
                interp.notice("Matched, but too many substrings")
                count = size_offsets // 3

            if count > 0:
                # a match: copy what precedes it, then let replace_obj
                # decide what to emit for the match itself
                match_start = rffi.cast(lltype.Signed, offsets[0])
                builder.append_slice(subject, start_offset, match_start)
                replace_obj.next_replace(builder, subject, count, offsets)
                limit -= 1
            elif count != _pcre.PCRE_ERROR_NOMATCH:
                handle_exec_error(interp, count)
                return None, -1
            elif g_notempty == 0 or start_offset >= subject_len:
                # genuine end of the search: copy the remainder
                builder.append_slice(subject, start_offset, subject_len)
                break
            else:
                # PCRE_NOTEMPTY was set after a null match, so a failure
                # here is not the end.  Step over one character and fudge
                # the offsets so that the code below continues from there.
                next_offset = start_offset + pce.utf8size(subject,
                                                          start_offset)
                builder.append_slice(subject, start_offset, next_offset)
                offsets[0] = rffi.cast(rffi.INT, start_offset)
                offsets[1] = rffi.cast(rffi.INT, next_offset)

            # After an empty match, mimic Perl's /g: retry at the same
            # point with PCRE_NOTEMPTY | PCRE_ANCHORED; if that fails the
            # branch above advances to the next character.
            span_start = rffi.cast(lltype.Signed, offsets[0])
            span_end = rffi.cast(lltype.Signed, offsets[1])
            if span_end == span_start:
                g_notempty = _pcre.PCRE_NOTEMPTY | _pcre.PCRE_ANCHORED
            else:
                g_notempty = 0

            # Continue right after the last full match
            start_offset = span_end

    finally:
        lltype.free(offsets, flavor='raw')
        rffi.free_charp(rawsubject)

    return space.newstr(builder.build()), original_limit - limit
Exemple #32
0
def replace_impl(interp, pce, replace_obj, subject, limit=-1):
    """Substitute the matches of the compiled regexp `pce` in `subject`.

    `replace_obj.next_replace` is called once per match and receives the
    output builder, the subject, the pcre_exec result and the offsets
    array.  Matching goes on until `limit` drops to zero (it is decremented
    per match) or no further match is found.

    Returns a pair: the result wrapped with `space.newstr` and the number
    of matches handled.  When pcre_exec fails with anything other than
    PCRE_ERROR_NOMATCH, `handle_exec_error` is called and `(None, -1)` is
    returned instead.  The raw buffers are released in all cases.
    """
    replace_obj.setup(interp, pce)
    space = interp.space
    rffi.setintfield(pce.extra, 'c_match_limit', interp.regexp_backtrack_limit)
    rffi.setintfield(pce.extra, 'c_match_limit_recursion',
                     interp.regexp_recursion_limit)

    # Three offset slots for each of the capturecount + 1 patterns
    size_offsets = 3 * (pce.capturecount + 1)

    out = StringBuilder(len(subject))

    # Raw buffers for the C call, freed in the 'finally' clause
    rawsubject = rffi.str2charp(subject)
    offsets = lltype.malloc(rffi.INTP.TO, size_offsets, flavor='raw')
    try:
        original_limit = limit
        start_offset = 0
        exoptions = 0
        g_notempty = 0
        interp.regexp_error_code = PREG_NO_ERROR

        while limit != 0:
            flags = exoptions | g_notempty
            count = _pcre.pcre_exec(pce.re, pce.extra, rawsubject,
                                    len(subject), start_offset, flags,
                                    offsets, size_offsets)

            # the string was already proved to be valid UTF-8
            exoptions |= _pcre.PCRE_NO_UTF8_CHECK

            if count == 0:
                # matched, but the offsets array was too small
                interp.notice("Matched, but too many substrings")
                count = size_offsets // 3

            if count > 0:
                # copy the text before the match, then delegate the
                # replacement itself to replace_obj
                out.append_slice(subject, start_offset,
                                 rffi.cast(lltype.Signed, offsets[0]))
                replace_obj.next_replace(out, subject, count, offsets)
                limit -= 1
            else:
                if count != _pcre.PCRE_ERROR_NOMATCH:
                    handle_exec_error(interp, count)
                    return None, -1

                if g_notempty == 0 or start_offset >= len(subject):
                    # really no more matches: copy the tail and stop
                    out.append_slice(subject, start_offset, len(subject))
                    break

                # The previous match was empty and the retry with
                # PCRE_NOTEMPTY failed: skip one character and fake the
                # offsets so the bookkeeping below moves past it.
                next_offset = start_offset
                next_offset += pce.utf8size(subject, start_offset)
                out.append_slice(subject, start_offset, next_offset)
                offsets[0] = rffi.cast(rffi.INT, start_offset)
                offsets[1] = rffi.cast(rffi.INT, next_offset)

            # An empty match is retried at the same position with
            # PCRE_NOTEMPTY | PCRE_ANCHORED (what Perl's /g does); the
            # no-match branch above handles the failure of that retry.
            end_offset = rffi.cast(lltype.Signed, offsets[1])
            g_notempty = 0
            if end_offset == rffi.cast(lltype.Signed, offsets[0]):
                g_notempty = _pcre.PCRE_NOTEMPTY | _pcre.PCRE_ANCHORED

            # Resume right after the last full match
            start_offset = end_offset

        else:
            # reached limit == 0: copy the end of the string
            out.append_slice(subject, start_offset, len(subject))

    finally:
        lltype.free(offsets, flavor='raw')
        rffi.free_charp(rawsubject)

    return space.newstr(out.build()), original_limit - limit