class W_StringOutputPort(W_OutputPort):
    """Output port that accumulates everything written to it in memory.

    The data lives in a ``StringBuilder``; ``contents()`` returns what has
    been written so far.
    """
    errorname = "output-port"

    def __init__(self):
        self.closed = False
        self.str = StringBuilder()

    def write(self, s):
        """Append the string ``s`` to the accumulated output."""
        self.str.append(s)

    def contents(self):
        """Return everything written so far as a single string."""
        return self.str.build()

    def seek(self, offset, end=False):
        """Move the write position to ``offset``.

        The position is always the end of the accumulated data, so:

        * ``end=True`` or ``offset`` equal to the current length is a no-op;
        * a larger ``offset`` pads the data with NUL characters up to it;
        * a smaller ``offset`` truncates the data to ``offset`` characters.
        """
        length = self.str.getlength()
        if end or offset == length:
            return
        if offset > length:
            # The gap to fill is ``offset - length``.  The opposite
            # difference is negative here and would append nothing.
            self.str.append_multiple_char("\0", offset - length)
        else:
            # FIXME: this is potentially slow.
            content = self.contents()
            self.str = StringBuilder(offset)
            self.str.append_slice(content, 0, offset)

    def tell(self):
        """Return the current position, i.e. the length written so far."""
        return self.str.getlength()
def read(self, n=-1):
    """Read up to ``n`` characters, refilling the buffer as needed.

    A negative ``n`` delegates to ``self.readall()``.  Otherwise the
    request is served from ``self.buf`` (starting at ``self.pos``) when it
    holds enough data; if not, ``self.do_read(self.bufsize)`` is called
    repeatedly until ``n`` characters were gathered or it returns an empty
    result.  ``self.buf`` and ``self.pos`` are updated so that the next
    call continues right after the data returned here.
    """
    assert isinstance(n, int)
    if n < 0:
        return self.readall()

    available = len(self.buf) - self.pos
    begin = self.pos
    assert begin >= 0

    if n <= available:
        # Fast path: the current buffer already holds everything we need.
        end = begin + n
        assert end >= 0
        chunk = self.buf[begin:end]
        self.pos += n
        return chunk

    # Slow path: take the rest of the buffer, then keep refilling.
    out = StringBuilder(n)
    out.append_slice(self.buf, begin, len(self.buf))
    while True:
        self.buf = self.do_read(self.bufsize)
        if not self.buf:
            # Nothing more to read: return what we have.
            self.pos = 0
            break
        available += len(self.buf)
        if available >= n:
            # This refill completes the request; keep the surplus buffered.
            self.pos = len(self.buf) - (available - n)
            end = self.pos
            assert end >= 0
            out.append_slice(self.buf, 0, end)
            break
        data = self.buf
        assert data is not None
        out.append(data)
    return out.build()
def func():
    """Build and return a string using each StringBuilder append flavour."""
    builder = StringBuilder()
    builder.append("a")                    # single character
    builder.append("abc")                  # whole string
    builder.append_slice("abc", 1, 2)      # the slice [1:2] of "abc"
    builder.append_multiple_char('d', 4)   # one character repeated 4 times
    built = builder.build()
    return built
def decode_string_escaped(self, start):
    """Decode the rest of a string literal that contains escapes.

    ``self.s[start:self.pos]` is copied verbatim first; scanning then
    continues from ``self.pos`` through ``self.ll_chars`` until the closing
    double quote.  Escape sequences are handed to
    ``self.decode_escape_sequence``; characters below ``'\\x20'`` are
    reported through ``self._raise``.  On success ``self.pos`` is moved
    past the closing quote, ``self.last_type`` is set to ``TYPE_STRING``
    and the wrapped unicode object is returned.
    """
    cursor = self.pos
    out = StringBuilder((cursor - start) * 2)  # just an estimate
    assert start >= 0
    assert cursor >= 0
    out.append_slice(self.s, start, cursor)
    while True:
        ch = self.ll_chars[cursor]
        cursor += 1
        if ch == '"':
            # Closing quote: decode the collected bytes and finish.
            utf8_content = out.build()
            unicode_content = unicodehelper.decode_utf8(
                self.space, utf8_content)
            self.last_type = TYPE_STRING
            self.pos = cursor
            return self.space.newunicode(unicode_content)
        if ch == '\\':
            cursor = self.decode_escape_sequence(cursor, out)
            continue
        if ch < '\x20':
            if ch == '\0':
                self._raise("Unterminated string starting at char %d",
                            start - 1)
            else:
                self._raise("Invalid control character at char %d",
                            cursor - 1)
            continue
        out.append(ch)
class W_BytesBuilder(W_Root):
    """App-level wrapper around a ``StringBuilder`` for byte strings."""

    def __init__(self, space, size):
        # A negative size means "no size hint".
        self.builder = StringBuilder() if size < 0 else StringBuilder(size)

    @unwrap_spec(size=int)
    def descr__new__(space, w_subtype, size=-1):
        """Create a new builder, optionally pre-sized to ``size``."""
        return W_BytesBuilder(space, size)

    @unwrap_spec(s='bytes')
    def descr_append(self, space, s):
        """Append the whole byte string ``s``."""
        self.builder.append(s)

    @unwrap_spec(s='bytes', start=int, end=int)
    def descr_append_slice(self, space, s, start, end):
        """Append ``s[start:end]``.

        Raises an app-level ValueError unless
        ``0 <= start <= end <= len(s)``.
        """
        if start < 0 or end < start or end > len(s):
            raise oefmt(space.w_ValueError, "bad start/stop")
        self.builder.append_slice(s, start, end)

    def descr_build(self, space):
        """Return the accumulated bytes as a wrapped bytes object."""
        # after build(), we can continue to append more strings
        # to the same builder.  This is supported since
        # 2ff5087aca28 in RPython.
        built = self.builder.build()
        w_s = space.newbytes(built)
        return w_s

    def descr_len(self, space):
        """Return the current length of the accumulated data."""
        builder = self.builder
        if builder is None:
            raise oefmt(space.w_ValueError, "no length of built builder")
        return space.newint(builder.getlength())
class W_StringOutputPort(W_OutputPort):
    """Output port that accumulates everything written to it in memory.

    The data lives in a ``StringBuilder``; ``contents()`` returns what has
    been written so far.
    """
    errorname = "output-port"
    _attrs_ = ["closed", "str"]

    def __init__(self):
        self.closed = False
        self.str = StringBuilder()

    def write(self, s):
        """Append the string ``s`` to the accumulated output."""
        self.str.append(s)

    def contents(self):
        """Return everything written so far as a single string."""
        return self.str.build()

    def seek(self, offset, end=False):
        """Move the write position to ``offset``.

        The position is always the end of the accumulated data, so:

        * ``end=True`` or ``offset`` equal to the current length is a no-op;
        * a larger ``offset`` pads the data with NUL characters up to it;
        * a smaller ``offset`` truncates the data to ``offset`` characters.
        """
        length = self.str.getlength()
        if end or offset == length:
            return
        if offset > length:
            # The gap to fill is ``offset - length``.  The opposite
            # difference is negative here and would append nothing.
            self.str.append_multiple_char("\0", offset - length)
        else:
            # FIXME: this is potentially slow.
            content = self.contents()
            self.str = StringBuilder(offset)
            self.str.append_slice(content, 0, offset)

    def tell(self):
        """Return the current position, i.e. the length written so far."""
        return self.str.getlength()
def format(self):
    """Interpret ``self.fmt`` as a %-format string and return the result.

    Literal text is copied straight into the output builder.  Each ``%``
    directive is parsed with ``self.parse_fmt()`` and dispatched to the
    matching ``fmt_<char>`` method, consuming input values through
    ``self.nextinputvalue()`` when the directive did not name one.
    """
    size_hint = len(self.fmt) + 4 * len(self.values_w) + 10
    out = StringBuilder(size_hint)
    self.result = out
    while True:
        # fast path: consume as many characters as possible
        fmt = self.fmt
        literal_start = self.fmtpos
        i = literal_start
        while i < len(fmt) and fmt[i] != '%':
            i += 1
        if i >= len(fmt):
            # end of 'fmt' string: flush the trailing literal text
            out.append_slice(fmt, literal_start, len(fmt))
            break
        out.append_slice(fmt, literal_start, i)
        self.fmtpos = i + 1

        c = self.peekchr()
        if c == '%':
            # '%%' is a literal percent sign
            self.forward()
            self.result.append('%')
            continue

        # interpret the next formatter
        w_value = self.parse_fmt()
        c = self.peekchr()
        self.forward()
        if c == '%':
            # if we get here there were extra characters between the
            # two %, forbidden now
            self.two_percent_error(i + 1)
            continue

        # first check whether it's an invalid char, *then* call
        # nextinputvalue, otherwise the error generated by
        # nextinputvalue can cover that of unknown_fmtchar
        for known in FORMATTER_CHARS:
            if c == known:
                break
        else:
            self.unknown_fmtchar()

        if w_value is None:
            w_value = self.nextinputvalue()

        # dispatch on the formatter
        # (this turns into a switch after translation)
        for known in FORMATTER_CHARS:
            if c == known:
                # 'known' is an annotation constant here,
                # so this getattr() is ok
                handler = getattr(self, 'fmt_' + known)
                handler(w_value)
                break

    self.checkconsumed()
    return out.build()
def test_string_builder():
    """Check append, append_slice, append_multiple_char and getlength."""
    builder = StringBuilder()
    builder.append("a")
    builder.append("abc")
    # getlength() reflects what has been appended so far
    assert builder.getlength() == len('aabc')
    builder.append("a")
    builder.append_slice("abc", 1, 2)
    builder.append_multiple_char('d', 4)
    built = builder.build()
    assert built == "aabcabdddd"
class Utf8StringBuilder(object):
    """StringBuilder wrapper that also tracks a logical length.

    Bytes are collected in ``self._s``; ``self._lgt`` is incremented on
    every append and is what ``getlength()`` reports (not the byte count
    of the underlying builder).
    """

    @always_inline
    def __init__(self, size=0):
        self._s = StringBuilder(size)
        self._lgt = 0

    @always_inline
    def append(self, s):
        """Append the string ``s``; its length comes from codepoints_in_utf8."""
        self._s.append(s)
        self._lgt += codepoints_in_utf8(s)

    @always_inline
    def append_slice(self, s, start, end):
        """Append ``s[start:end]``, counting it with codepoints_in_utf8."""
        self._s.append_slice(s, start, end)
        self._lgt += codepoints_in_utf8(s, start, end)

    @signature(types.self(), char(), returns=none())
    @always_inline
    def append_char(self, s):
        """Append a single (ascii) character, counted as length 1."""
        self._s.append(s)
        self._lgt += 1

    @try_inline
    def append_code(self, code):
        """Append the code ``code`` via unichr_as_utf8_append (length 1)."""
        unichr_as_utf8_append(self._s, code, True)
        self._lgt += 1

    @always_inline
    def append_utf8(self, utf8, length):
        """Append ``utf8`` whose logical length the caller already knows."""
        self._s.append(utf8)
        self._lgt += length

    @always_inline
    def append_utf8_slice(self, utf8, start, end, slicelength):
        """Append ``utf8[start:end]`` with caller-supplied ``slicelength``."""
        self._s.append_slice(utf8, start, end)
        self._lgt += slicelength
        if not we_are_translated():
            # Untranslated sanity check of the length given by the caller.
            decoded = utf8[start:end].decode("utf-8")
            assert len(decoded) == slicelength

    @always_inline
    def append_multiple_char(self, utf8, times):
        """Append ``utf8`` repeated ``times`` times; length grows by ``times``."""
        repeated = utf8 * times
        self._s.append(repeated)
        self._lgt += times

    @always_inline
    def build(self):
        """Return the accumulated string."""
        return self._s.build()

    @always_inline
    def getlength(self):
        """Return the tracked logical length."""
        return self._lgt
def test_string_builder():
    """Check the append methods and that build() can be called repeatedly."""
    builder = StringBuilder()
    builder.append("a")
    builder.append("abc")
    # getlength() reflects what has been appended so far
    assert builder.getlength() == len('aabc')
    builder.append("a")
    builder.append_slice("abc", 1, 2)
    builder.append_multiple_char('d', 4)

    first_build = builder.build()
    assert first_build == "aabcabdddd"
    # build() is repeatable and does not consume the builder...
    assert first_build == builder.build()
    # ...and appending after build() keeps working.
    builder.append("x")
    assert builder.build() == first_build + "x"
def test_string_builder():
    """Check the append methods and that build() can be called repeatedly."""
    builder = StringBuilder()
    builder.append("a")
    builder.append("abc")
    # getlength() reflects what has been appended so far
    assert builder.getlength() == len("aabc")
    builder.append("a")
    builder.append_slice("abc", 1, 2)
    builder.append_multiple_char("d", 4)

    first_build = builder.build()
    assert first_build == "aabcabdddd"
    # build() is repeatable and does not consume the builder...
    assert first_build == builder.build()
    # ...and appending after build() keeps working.
    builder.append("x")
    assert builder.build() == first_build + "x"
class Utf8StringBuilder(object):
    """StringBuilder wrapper that also tracks a logical length.

    Bytes are collected in ``self._s``; ``self._lgt`` is incremented on
    every append and is what ``getlength()`` reports (not the byte count
    of the underlying builder).
    """

    @always_inline
    def __init__(self, size=0):
        self._s = StringBuilder(size)
        self._lgt = 0

    @always_inline
    def append(self, s):
        """Append the string ``s``; its length comes from get_utf8_length."""
        self._s.append(s)
        self._lgt += get_utf8_length(s)

    @always_inline
    def append_slice(self, s, start, end):
        """Append ``s[start:end]``, counting it with get_utf8_length."""
        self._s.append_slice(s, start, end)
        self._lgt += get_utf8_length(s, start, end)

    @signature(types.self(), char(), returns=none())
    @always_inline
    def append_char(self, s):
        """Append a single (ascii) character, counted as length 1."""
        self._s.append(s)
        self._lgt += 1

    @try_inline
    def append_code(self, code):
        """Append the code ``code`` via unichr_as_utf8_append (length 1)."""
        unichr_as_utf8_append(self._s, code, True)
        self._lgt += 1

    @always_inline
    def append_utf8(self, utf8, length):
        """Append ``utf8`` whose logical length the caller already knows."""
        self._s.append(utf8)
        self._lgt += length

    @always_inline
    def append_multiple_char(self, utf8, times):
        """Append ``utf8`` repeated ``times`` times; length grows by ``times``."""
        repeated = utf8 * times
        self._s.append(repeated)
        self._lgt += times

    @always_inline
    def build(self):
        """Return the accumulated string."""
        return self._s.build()

    @always_inline
    def getlength(self):
        """Return the tracked logical length."""
        return self._lgt
def _decode_latin_1_slowpath(s):
    """Re-encode the byte string ``s`` for input containing high bytes.

    Runs of bytes <= 0x7F are copied as slices; every byte above 0x7F is
    passed through ``unichr_as_utf8_append``.
    """
    size = len(s)
    out = StringBuilder(size)
    pos = 0
    while pos < size:
        if ord(s[pos]) > 0x7F:
            # Run of high bytes: convert them one at a time.
            while pos < size and ord(s[pos]) > 0x7F:
                unichr_as_utf8_append(out, ord(s[pos]))
                pos += 1
        else:
            # Run of 7-bit bytes: find its end and copy it in one go.
            run_end = pos + 1
            while run_end < size and ord(s[run_end]) <= 0x7F:
                run_end += 1
            out.append_slice(s, pos, run_end)
            pos = run_end
    return out.build()
def format(form, v):
    """Expand the directives found by ``format_regex`` in ``form.value``.

    Text between matches is copied unchanged.  Each match is looked up in
    ``format_dict``; a ``None`` entry means "consume the next value from
    ``v``" and is replaced by that value's ``tostring()``.
    """
    template = form.value
    out = StringBuilder()
    cursor = 0
    for directive in format_regex.finditer(template):
        directive_start = directive.start()
        assert directive_start >= 0
        # literal text preceding this directive
        out.append_slice(template, cursor, directive_start)
        replacement = format_dict[directive.group()]
        if replacement is None:
            # directive takes an argument: pop it off the front of v
            replacement = v[0].tostring()
            v = v[1:]
        out.append(replacement)
        cursor = directive.end()
        assert cursor >= 0
    # trailing literal text after the last directive
    out.append_slice(template, cursor, len(template))
    return out.build()
def str_zfill__String_ANY(space, w_self, w_width):
    """Implement ``str.zfill``: left-pad with '0' up to ``w_width``.

    A leading '+' or '-' stays in front of the inserted zeros.  When no
    padding is needed a freshly wrapped copy of the value is returned.
    """
    value = w_self._value
    width = space.int_w(w_width)
    padding = width - len(value)
    if padding <= 0:
        # cannot return w_self, in case it is a subclass of str
        return space.wrap(value)

    out = StringBuilder(width)
    body_start = 0
    if len(value) > 0:
        sign = value[0]
        if sign == '+' or sign == '-':
            # keep the sign ahead of the zeros
            out.append(sign)
            body_start = 1
    out.append_multiple_char('0', padding)
    out.append_slice(value, body_start, len(value))
    return space.wrap(out.build())
def format(self):
    """Interpret ``self.fmt`` as a %-format string and return the result.

    Literal text is copied straight into the output builder.  Each ``%``
    directive is parsed with ``self.parse_fmt()`` and dispatched to the
    matching ``fmt_<char>`` method; an unrecognised character ends up in
    ``self.unknown_fmtchar()``.
    """
    size_hint = len(self.fmt) + 4 * len(self.values_w) + 10
    out = StringBuilder(size_hint)
    self.result = out
    while True:
        # fast path: consume as many characters as possible
        fmt = self.fmt
        literal_start = self.fmtpos
        i = literal_start
        while i < len(fmt) and fmt[i] != '%':
            i += 1
        if i >= len(fmt):
            # end of 'fmt' string: flush the trailing literal text
            out.append_slice(fmt, literal_start, len(fmt))
            break
        out.append_slice(fmt, literal_start, i)
        self.fmtpos = i + 1

        # interpret the next formatter
        w_value = self.parse_fmt()
        c = self.peekchr()
        self.forward()
        if c == '%':
            self.std_wp('%', False)
            continue
        if w_value is None:
            w_value = self.nextinputvalue()

        # dispatch on the formatter
        # (this turns into a switch after translation)
        for known in FORMATTER_CHARS:
            if c == known:
                # 'known' is an annotation constant here,
                # so this getattr() is ok
                handler = getattr(self, 'fmt_' + known)
                handler(w_value)
                break
        else:
            self.unknown_fmtchar()

    self.checkconsumed()
    return out.build()
def decode_string_escaped(self, start, nonascii):
    """Decode the rest of a string literal that contains escapes.

    ``self.s[start:self.pos]`` is copied verbatim first; scanning then
    continues from ``self.pos`` through ``self.ll_chars`` until the closing
    double quote.  Escape sequences are handed to
    ``self.decode_escape_sequence_to_utf8``; characters below ``'\\x20'``
    go to ``self._raise_control_char_in_string``.  On success ``self.pos``
    is moved past the closing quote and a wrapped utf8 string is returned.
    ``nonascii`` is not used by this method.
    """
    cursor = self.pos
    out = StringBuilder((cursor - start) * 2)  # just an estimate
    assert start >= 0
    assert cursor >= 0
    out.append_slice(self.s, start, cursor)
    while True:
        ch = self.ll_chars[cursor]
        cursor += 1
        if ch == '"':
            # Closing quote: validate the collected bytes and finish.
            utf8_content = out.build()
            codepoints = unicodehelper.check_utf8_or_raise(
                self.space, utf8_content)
            self.pos = cursor
            return self.space.newutf8(utf8_content, codepoints)
        if ch == '\\':
            cursor = self.decode_escape_sequence_to_utf8(cursor, out)
            continue
        if ch < '\x20':
            self._raise_control_char_in_string(ch, start, cursor)
            continue
        out.append(ch)
def readline(self):
    """Return the next line, including its trailing newline if present.

    The line is taken from ``self.buf`` when a newline is already
    buffered.  Otherwise ``self.do_read(self.bufsize)`` is called until a
    newline shows up or it returns an empty result, in which case whatever
    was collected is returned.  ``self.buf`` / ``self.pos`` are left
    pointing just after the returned data.
    """
    search_from = self.pos
    assert search_from >= 0
    newline_at = self.buf.find("\n", search_from)
    begin = self.pos
    assert begin >= 0
    if newline_at >= 0:
        # new line found in the current buffer
        end = newline_at + 1
        line = self.buf[begin:end]
        self.pos = end
        return line

    leftover = self.buf[begin:]
    # read one buffer and most of the time a new line will be found
    self.buf = self.do_read(self.bufsize)
    newline_at = self.buf.find("\n")
    if newline_at >= 0:
        end = newline_at + 1
        line = leftover + self.buf[:end]
        self.pos = end
        return line
    if not self.buf:
        # nothing more to read: the leftover is the last line
        self.pos = 0
        return leftover

    # need to keep getting data until we find a new line
    out = StringBuilder(len(leftover) + len(self.buf))  # at least
    out.append(leftover)
    out.append(self.buf)
    while True:
        self.buf = self.do_read(self.bufsize)
        if not self.buf:
            self.pos = 0
            break
        newline_at = self.buf.find("\n")
        if newline_at >= 0:
            end = newline_at + 1
            out.append_slice(self.buf, 0, end)
            self.pos = end
            break
        out.append(self.buf)
    return out.build()
def backslashreplace_errors(space, w_exc):
    """Error callback replacing unencodable characters by backslash escapes.

    Only UnicodeEncodeError is handled; anything else raises an app-level
    TypeError.  Each code point in the exception's ``start``..``end`` range
    is rendered as ``\\xNN``, ``\\uNNNN`` or ``\\UNNNNNNNN`` depending on its
    value.  Returns a wrapped tuple ``(replacement, end)``.
    """
    check_exception(space, w_exc)
    if not space.isinstance_w(w_exc, space.w_UnicodeEncodeError):
        raise oefmt(space.w_TypeError,
                    "don't know how to handle %T in error callback", w_exc)

    w_obj = space.getattr(w_exc, space.newtext('object'))
    space.realutf8_w(w_obj)  # for errors
    w_obj = space.convert_arg_to_w_unicode(w_obj)
    start = space.int_w(space.getattr(w_exc, space.newtext('start')))
    w_end = space.getattr(w_exc, space.newtext('end'))
    end = space.int_w(w_end)
    # convert the indices through the object's own _index_to_byte
    byte_start = w_obj._index_to_byte(start)
    byte_end = w_obj._index_to_byte(end)

    out = StringBuilder()
    utf8 = w_obj._utf8
    pos = byte_start
    while pos < byte_end:
        code = rutf8.codepoint_at_pos(utf8, pos)
        hexform = hex(code)
        if code >= 0x10000:
            out.append("\\U")
            digits = 8
        elif code >= 0x100:
            out.append("\\u")
            digits = 4
        else:
            out.append("\\x")
            digits = 2
        hexlen = len(hexform)
        missing = digits + 2 - hexlen  # hexform starts with '0x'
        if missing > 0:
            out.append_multiple_char('0', missing)
        out.append_slice(hexform, 2, hexlen)
        pos = rutf8.next_codepoint_pos(utf8, pos)

    replacement = out.build()
    length = rutf8.check_utf8(replacement, True)
    return space.newtuple([space.newutf8(replacement, length), w_end])
def readall(self):
    """Read and return everything that is left.

    Starts with the unread part of ``self.buf`` and then calls
    ``self.do_read`` with a chunk size that doubles each round, capped at
    ``self.bigsize``, until it returns an empty result.  An ``OSError`` is
    re-raised only when nothing at all was collected; otherwise the
    partial data is returned.
    """
    begin = self.pos
    assert begin >= 0
    out = StringBuilder()
    if self.buf:
        out.append_slice(self.buf, begin, len(self.buf))
    # the buffer is fully consumed from now on
    self.buf = ""
    self.pos = 0

    chunk_size = self.bufsize
    while True:
        try:
            data = self.do_read(chunk_size)
        except OSError:
            # like CPython < 3.4, partial results followed by an error
            # are returned as data
            if out.getlength() == 0:
                raise
            break
        if not data:
            break
        out.append(data)
        chunk_size = min(chunk_size * 2, self.bigsize)
    return out.build()
def string_escape_encode(s, quotes):
    """Return ``s`` with backslash escapes, optionally as a b'...' literal.

    Backslash, the active quote character, tab, CR and LF get a two
    character escape; any other character outside ``'\\x20'``..``'\\x7e'``
    becomes ``\\xNN``.  Runs of characters needing no escape are copied as
    slices.  With ``quotes`` true the result is wrapped in ``b'...'``, or
    ``b"..."`` when ``s`` contains a single quote but no double quote.
    """
    size = len(s)
    out = StringBuilder(size + 2)
    quote = "'"
    if quotes:
        if quote in s and '"' not in s:
            quote = '"'
            out.append('b"')
        else:
            out.append("b'")

    hexdigits = "0123456789abcdef"
    pending = 0  # start of the run of plain characters not yet copied
    for i in range(size):
        c = s[i]
        needs_backslash = False  # character quoted by a backslash
        if c == '\\' or c == quote:
            escaped = c
            needs_backslash = True
        elif c == '\t':
            escaped = 't'
            needs_backslash = True
        elif c == '\r':
            escaped = 'r'
            needs_backslash = True
        elif c == '\n':
            escaped = 'n'
            needs_backslash = True
        elif c < '\x20' or c >= '\x7f':
            # non-printable: flush the pending run, then emit \xNN
            code = ord(c)
            if i != pending:
                out.append_slice(s, pending, i)
            pending = i + 1
            out.append('\\x')
            out.append(hexdigits[code >> 4])
            out.append(hexdigits[code & 0xF])
        if needs_backslash:
            if i != pending:
                out.append_slice(s, pending, i)
            pending = i + 1
            out.append('\\')
            out.append(escaped)

    if size != pending:
        out.append_slice(s, pending, size)
    if quotes:
        out.append(quote)
    return out.build()
def string_escape_encode(s, quote):
    """Return ``s`` escaped and enclosed in the ``quote`` character.

    Backslash, ``quote``, tab, CR and LF get a two character escape; any
    other character outside ``"\\x20"``..``"\\x7e"`` becomes ``\\xNN``.
    Runs of characters needing no escape are copied as slices.
    """
    size = len(s)
    out = StringBuilder(size + 2)
    out.append(quote)

    hexdigits = "0123456789abcdef"
    pending = 0  # start of the run of plain characters not yet copied
    for i in range(size):
        c = s[i]
        needs_backslash = False  # character quoted by a backslash
        if c == "\\" or c == quote:
            escaped = c
            needs_backslash = True
        elif c == "\t":
            escaped = "t"
            needs_backslash = True
        elif c == "\r":
            escaped = "r"
            needs_backslash = True
        elif c == "\n":
            escaped = "n"
            needs_backslash = True
        elif c < "\x20" or c >= "\x7f":
            # non-printable: flush the pending run, then emit \xNN
            code = ord(c)
            if i != pending:
                out.append_slice(s, pending, i)
            pending = i + 1
            out.append("\\x")
            out.append(hexdigits[code >> 4])
            out.append(hexdigits[code & 0xF])
        if needs_backslash:
            if i != pending:
                out.append_slice(s, pending, i)
            pending = i + 1
            out.append("\\")
            out.append(escaped)

    if size != pending:
        out.append_slice(s, pending, size)
    out.append(quote)
    return out.build()
class Pack(object):
    """Driver for pack(): interprets a format string and packs arguments.

    ``build()`` parses ``fmt`` into ``(fmtdesc, repetitions)`` pairs and
    lets each format descriptor append its output to ``self.result``.
    """

    def __init__(self, space, fmt, arg_w):
        self.space = space
        self.fmt = fmt
        self.arg_w = arg_w
        self.arg_index = 0  # index of the next argument to consume

    def pop_arg(self):
        """Return the next unused argument or raise FormatException."""
        index = self.arg_index
        if index >= len(self.arg_w):
            raise FormatException("too few arguments")
        self.arg_index = index + 1
        return self.arg_w[index]

    def _get_fmtdesc(self, char):
        """Return the descriptor in unroll_fmttable whose fmtchar is ``char``.

        Falls off the end (returning None) when nothing matches.
        """
        for fmtdesc in unroll_fmttable:
            if char == fmtdesc.fmtchar:
                return fmtdesc

    def _shrink(self, new_len):
        """Truncate the output built so far to ``new_len`` characters."""
        built = self.result.build()
        assert new_len < len(built)
        shorter = StringBuilder()
        shorter.append_slice(built, 0, new_len)
        self.result = shorter

    @jit.unroll_safe
    def interpret(self):
        """Parse ``self.fmt`` into a list of ``(fmtdesc, repetitions)``.

        Each format character may be followed by a decimal repeat count or
        by ``*`` (stored as -1); the default count is 1.
        """
        fmt = self.fmt
        size = len(fmt)
        parsed = []
        pos = 0
        while pos < size:
            char = fmt[pos]
            pos += 1
            rep = 1
            if pos < size:
                c = fmt[pos]
                if '0' <= c <= '9':
                    digits_start = pos
                    while pos < size and '0' <= fmt[pos] <= '9':
                        pos += 1
                    rep = int(fmt[digits_start:pos])
                elif c == '*':
                    pos += 1
                    rep = -1
            parsed.append((self._get_fmtdesc(char), rep))
        return parsed

    @jit.unroll_safe
    def build(self):
        """Pack all arguments according to the format and return the string.

        A FormatException from a descriptor is turned into a warning, as
        is any argument left unused at the end.
        """
        self.fmt_interpreted = self.interpret()
        self.result = StringBuilder()
        for fmtdesc, repetitions in self.fmt_interpreted:
            if repetitions == -1 and fmtdesc.many_args:
                # '*': use up every remaining argument
                repetitions = len(self.arg_w) - self.arg_index
            try:
                fmtdesc.pack(self, fmtdesc, repetitions)
            except FormatException as e:
                self.space.ec.warn(
                    "pack(): Type %s: %s" % (fmtdesc.fmtchar, e.message))

        unused = len(self.arg_w) - self.arg_index
        if self.arg_index < len(self.arg_w):
            self.space.ec.warn(
                "pack(): %s "
                "arguments unused" % (unused))
        return self.result.build()
def unicode_escape(s):
    """Return an escaped rendering of the utf-8 string ``s``.

    Reads ``quotes``, ``prefix``, ``pass_printable`` and
    ``char_escape_helper`` from the enclosing scope.  With ``quotes`` the
    result is wrapped in quotes (after ``prefix`` if set), and quote
    characters and backslashes are backslash-escaped.  Tab, LF, CR and
    backslash get their usual escapes; other characters are either copied
    or passed to ``char_escape_helper``, depending on ``pass_printable``.
    """
    size = len(s)
    out = StringBuilder(size)

    if quotes:
        if prefix:
            out.append(prefix)
        # prefer double quotes only when that avoids escaping
        if s.find('\'') != -1 and s.find('\"') == -1:
            quote = ord('\"')
            out.append('"')
        else:
            quote = ord('\'')
            out.append('\'')
    else:
        quote = 0
        if size == 0:
            return ''

    pos = 0
    while pos < size:
        code = codepoint_at_pos(s, pos)
        ch = s[pos]

        # Escape quotes
        if quotes and (code == quote or ch == '\\'):
            out.append('\\')
            after = next_codepoint_pos(s, pos)
            out.append_slice(s, pos, after)
            pos = after
            continue

        # The following logic is enabled only if MAXUNICODE == 0xffff, or
        # for testing on top of a host Python where sys.maxunicode == 0xffff
        if (not we_are_translated() and sys.maxunicode == 0xFFFF and
                0xD800 <= code < 0xDC00 and pos + 3 < size):
            # Map UTF-16 surrogate pairs to Unicode \UXXXXXXXX escapes
            pos += 3
            low = codepoint_at_pos(s, pos)
            if 0xDC00 <= low <= 0xDFFF:
                combined = (((code & 0x03FF) << 10) |
                            (low & 0x03FF)) + 0x00010000
                char_escape_helper(out, combined)
                pos += 3
                continue
            # Fall through: isolated surrogates are copied as-is
            pos -= 3

        # Map special whitespace to '\t', '\n', '\r'
        if ch == '\t':
            out.append('\\t')
        elif ch == '\n':
            out.append('\\n')
        elif ch == '\r':
            out.append('\\r')
        elif ch == '\\':
            out.append('\\\\')
        # Map non-printable or non-ascii to '\xhh' or '\uhhhh'
        elif pass_printable and not (code <= 0x10ffff and
                                     unicodedb.isprintable(code)):
            char_escape_helper(out, code)
        elif not pass_printable and (code < 32 or code >= 0x7F):
            char_escape_helper(out, code)
        # Copy everything else as-is
        elif code < 128:
            out.append(ch)
        else:
            after = next_codepoint_pos(s, pos)
            out.append_slice(s, pos, after)
        pos = next_codepoint_pos(s, pos)

    if quotes:
        out.append(chr(quote))
    return out.build()
def raw_encode_basestring_ascii(space, w_string):
    """Return the ASCII-only JSON escaping of ``w_string`` (without quotes).

    A bytes object made only of safe printable ASCII is returned as is.
    Otherwise the safe prefix is copied and the remaining code points are
    escaped: ``"`` and ``\\`` get a backslash, characters below space use
    ``ESCAPE_BEFORE_SPACE``, and everything above ``~`` becomes ``\\uXXXX``
    (a surrogate pair for code points above 0xFFFF).
    """
    if space.isinstance_w(w_string, space.w_bytes):
        s = space.bytes_w(w_string)
        for i in range(len(s)):
            c = s[i]
            if c < ' ' or c > '~' or c == '"' or c == '\\':
                first = i
                break
        else:
            # the input is a string with only non-special ascii chars
            return w_string

        unicodehelper.check_utf8_or_raise(space, s)
        out = StringBuilder(len(s))
        out.append_slice(s, 0, first)
    else:
        # We used to check if 'u' contains only safe characters, and return
        # 'w_string' directly.  But this requires an extra pass over all
        # characters, and the expected use case of this function, from
        # json.encoder, will anyway re-encode a unicode result back to
        # a string (with the ascii encoding).  This requires two passes
        # over the characters.  So we may as well directly turn it into a
        # string here --- only one pass.
        s = space.utf8_w(w_string)
        out = StringBuilder(len(s))
        first = 0

    codepoints = rutf8.Utf8StringIterator(s)
    # skip the prefix that was already copied verbatim
    for i in range(first):
        codepoints.next()
    for c in codepoints:
        if c <= ord('~'):
            if c < ord(' '):
                out.append(ESCAPE_BEFORE_SPACE[c])
                continue
            if c == ord('"') or c == ord('\\'):
                out.append('\\')
            out.append(chr(c))
        elif c <= ord(u'\uffff'):
            out.append('\\u')
            out.append(HEX[c >> 12])
            out.append(HEX[(c >> 8) & 0x0f])
            out.append(HEX[(c >> 4) & 0x0f])
            out.append(HEX[c & 0x0f])
        else:
            # surrogate pair
            n = c - 0x10000
            high = 0xd800 | ((n >> 10) & 0x3ff)
            out.append('\\ud')
            out.append(HEX[(high >> 8) & 0x0f])
            out.append(HEX[(high >> 4) & 0x0f])
            out.append(HEX[high & 0x0f])
            low = 0xdc00 | (n & 0x3ff)
            out.append('\\ud')
            out.append(HEX[(low >> 8) & 0x0f])
            out.append(HEX[(low >> 4) & 0x0f])
            out.append(HEX[low & 0x0f])

    encoded = out.build()
    return space.newtext(encoded)
def raw_encode_basestring_ascii(space, w_string):
    """Return the ASCII-only JSON escaping of ``w_string`` (without quotes).

    A str object made only of safe printable ASCII is returned as is.
    Otherwise the input is decoded, the safe prefix is copied and the
    remaining characters are escaped: ``"`` and ``\\`` get a backslash,
    characters below space use ``ESCAPE_BEFORE_SPACE``, and everything
    above ``~`` becomes ``\\uXXXX`` (a surrogate pair above 0xFFFF).
    """
    if space.isinstance_w(w_string, space.w_str):
        s = space.str_w(w_string)
        for i in range(len(s)):
            c = s[i]
            if c < ' ' or c > '~' or c == '"' or c == '\\':
                first = i
                break
        else:
            # the input is a string with only non-special ascii chars
            return w_string

        eh = unicodehelper.decode_error_handler(space)
        u = str_decode_utf_8(
            s, len(s), None, final=True, errorhandler=eh,
            allow_surrogates=True)[0]
        out = StringBuilder(len(u))
        out.append_slice(s, 0, first)
    else:
        # We used to check if 'u' contains only safe characters, and return
        # 'w_string' directly.  But this requires an extra pass over all
        # characters, and the expected use case of this function, from
        # json.encoder, will anyway re-encode a unicode result back to
        # a string (with the ascii encoding).  This requires two passes
        # over the characters.  So we may as well directly turn it into a
        # string here --- only one pass.
        u = space.unicode_w(w_string)
        out = StringBuilder(len(u))
        first = 0

    # characters before 'first' were already copied verbatim
    for i in range(first, len(u)):
        c = u[i]
        if c <= u'~':
            if c < u' ':
                out.append(ESCAPE_BEFORE_SPACE[ord(c)])
                continue
            if c == u'"' or c == u'\\':
                out.append('\\')
            out.append(chr(ord(c)))
        elif c <= u'\uffff':
            code = ord(c)
            out.append('\\u')
            out.append(HEX[code >> 12])
            out.append(HEX[(code >> 8) & 0x0f])
            out.append(HEX[(code >> 4) & 0x0f])
            out.append(HEX[code & 0x0f])
        else:
            # surrogate pair
            n = ord(c) - 0x10000
            high = 0xd800 | ((n >> 10) & 0x3ff)
            out.append('\\ud')
            out.append(HEX[(high >> 8) & 0x0f])
            out.append(HEX[(high >> 4) & 0x0f])
            out.append(HEX[high & 0x0f])
            low = 0xdc00 | (n & 0x3ff)
            out.append('\\ud')
            out.append(HEX[(low >> 8) & 0x0f])
            out.append(HEX[(low >> 4) & 0x0f])
            out.append(HEX[low & 0x0f])

    encoded = out.build()
    return space.wrap(encoded)
def raw_encode_basestring_ascii(space, w_string):
    """Return the ASCII-only JSON escaping of ``w_string`` (without quotes).

    A bytes object made only of safe printable ASCII is returned as is.
    Otherwise the input is decoded, the safe prefix is copied and the
    remaining characters are escaped: ``"`` and ``\\`` get a backslash,
    characters below space use ``ESCAPE_BEFORE_SPACE``, and everything
    above ``~`` becomes ``\\uXXXX`` (a surrogate pair above 0xFFFF).
    """
    if space.isinstance_w(w_string, space.w_bytes):
        s = space.bytes_w(w_string)
        for i in range(len(s)):
            c = s[i]
            if c < ' ' or c > '~' or c == '"' or c == '\\':
                first = i
                break
        else:
            # the input is a string with only non-special ascii chars
            return w_string

        eh = unicodehelper.decode_error_handler(space)
        u = str_decode_utf_8(
            s, len(s), None, final=True, errorhandler=eh,
            allow_surrogates=True)[0]
        out = StringBuilder(len(u))
        out.append_slice(s, 0, first)
    else:
        # We used to check if 'u' contains only safe characters, and return
        # 'w_string' directly.  But this requires an extra pass over all
        # characters, and the expected use case of this function, from
        # json.encoder, will anyway re-encode a unicode result back to
        # a string (with the ascii encoding).  This requires two passes
        # over the characters.  So we may as well directly turn it into a
        # string here --- only one pass.
        u = space.unicode_w(w_string)
        out = StringBuilder(len(u))
        first = 0

    # characters before 'first' were already copied verbatim
    for i in range(first, len(u)):
        c = u[i]
        if c <= u'~':
            if c < u' ':
                out.append(ESCAPE_BEFORE_SPACE[ord(c)])
                continue
            if c == u'"' or c == u'\\':
                out.append('\\')
            out.append(chr(ord(c)))
        elif c <= u'\uffff':
            code = ord(c)
            out.append('\\u')
            out.append(HEX[code >> 12])
            out.append(HEX[(code >> 8) & 0x0f])
            out.append(HEX[(code >> 4) & 0x0f])
            out.append(HEX[code & 0x0f])
        else:
            # surrogate pair
            n = ord(c) - 0x10000
            high = 0xd800 | ((n >> 10) & 0x3ff)
            out.append('\\ud')
            out.append(HEX[(high >> 8) & 0x0f])
            out.append(HEX[(high >> 4) & 0x0f])
            out.append(HEX[high & 0x0f])
            low = 0xdc00 | (n & 0x3ff)
            out.append('\\ud')
            out.append(HEX[(low >> 8) & 0x0f])
            out.append(HEX[(low >> 4) & 0x0f])
            out.append(HEX[low & 0x0f])

    encoded = out.build()
    return space.newtext(encoded)
def replace_impl(interp, pce, replace_obj, subject, limit=-1):
    """Run a regex replace over ``subject`` and return ``(w_result, count)``.

    ``pce`` is the compiled pattern and ``replace_obj`` produces the
    replacement for every match through ``next_replace``.  At most
    ``limit`` replacements are made; the loop only stops on ``limit``
    when it reaches exactly 0, so the default of -1 never triggers it.
    On a pcre execution error ``handle_exec_error`` is called and
    ``(None, -1)`` is returned.  The raw buffers allocated here are freed
    on every exit path.
    """
    replace_obj.setup(interp, pce)
    space = interp.space
    rffi.setintfield(pce.extra, 'c_match_limit',
                     interp.regexp_backtrack_limit)
    rffi.setintfield(pce.extra, 'c_match_limit_recursion',
                     interp.regexp_recursion_limit)

    # Calculate the size of the offsets array
    size_offsets = (pce.capturecount + 1) * 3

    # Initialize some stuff
    out = StringBuilder(len(subject))

    # Allocate some more raw stuff
    rawsubject = rffi.str2charp(subject)
    offsets = lltype.malloc(rffi.INTP.TO, size_offsets, flavor='raw')
    try:
        exoptions = 0
        g_notempty = 0
        start_offset = 0
        original_limit = limit
        interp.regexp_error_code = PREG_NO_ERROR
        while limit != 0:
            # Execute the regular expression.
            count = _pcre.pcre_exec(pce.re, pce.extra, rawsubject,
                                    len(subject), start_offset,
                                    exoptions | g_notempty,
                                    offsets, size_offsets)
            # the string was already proved to be valid UTF-8
            exoptions |= _pcre.PCRE_NO_UTF8_CHECK

            # Check for too many substrings condition.
            if count == 0:
                interp.notice("Matched, but too many substrings")
                count = size_offsets // 3

            if count > 0:
                # Something matched: copy the part of the string before
                # the match...
                match_start = rffi.cast(lltype.Signed, offsets[0])
                out.append_slice(subject, start_offset, match_start)
                # ...and ask the replace_obj how to handle this match
                replace_obj.next_replace(out, subject, count, offsets)
                limit -= 1
            elif count == _pcre.PCRE_ERROR_NOMATCH:
                # If we previously set PCRE_NOTEMPTY after a null match,
                # this is not necessarily the end.  We need to advance
                # the start offset, and continue.  Fudge the offset
                # values to achieve this, unless we're already at the
                # end of the string.
                if g_notempty != 0 and start_offset < len(subject):
                    next_offset = (start_offset +
                                   pce.utf8size(subject, start_offset))
                    out.append_slice(subject, start_offset, next_offset)
                    offsets[0] = rffi.cast(rffi.INT, start_offset)
                    offsets[1] = rffi.cast(rffi.INT, next_offset)
                else:
                    out.append_slice(subject, start_offset, len(subject))
                    break
            else:
                handle_exec_error(interp, count)
                return None, -1

            # If we have matched an empty string, mimic what Perl's /g
            # options does.  This turns out to be rather cunning.  First
            # we set PCRE_NOTEMPTY and try the match again at the same
            # point.  If this fails (picked up above) we advance to the
            # next character.
            span_end = rffi.cast(lltype.Signed, offsets[1])
            span_start = rffi.cast(lltype.Signed, offsets[0])
            if span_end == span_start:
                g_notempty = _pcre.PCRE_NOTEMPTY | _pcre.PCRE_ANCHORED
            else:
                g_notempty = 0

            # Advance to the position right after the last full match
            start_offset = rffi.cast(lltype.Signed, offsets[1])
        else:
            # reached limit == 0: copy the end of the string
            out.append_slice(subject, start_offset, len(subject))
    finally:
        lltype.free(offsets, flavor='raw')
        rffi.free_charp(rawsubject)
    return space.newstr(out.build()), original_limit - limit
def replace_impl(interp, pce, replace_obj, subject, limit=-1):
    """Run a regex replace over ``subject`` and return ``(w_result, count)``.

    ``pce`` is the compiled pattern and ``replace_obj`` produces the
    replacement for every match through ``next_replace``.  At most
    ``limit`` replacements are made; the loop only stops on ``limit``
    when it reaches exactly 0, so the default of -1 never triggers it.
    On a pcre execution error ``handle_exec_error`` is called and
    ``(None, -1)`` is returned.  The raw buffers allocated here are freed
    on every exit path.
    """
    replace_obj.setup(interp, pce)
    space = interp.space
    rffi.setintfield(pce.extra, 'c_match_limit',
                     interp.regexp_backtrack_limit)
    rffi.setintfield(pce.extra, 'c_match_limit_recursion',
                     interp.regexp_recursion_limit)

    # Calculate the size of the offsets array
    size_offsets = (pce.capturecount + 1) * 3

    # Initialize some stuff
    out = StringBuilder(len(subject))

    # Allocate some more raw stuff
    rawsubject = rffi.str2charp(subject)
    offsets = lltype.malloc(rffi.INTP.TO, size_offsets, flavor='raw')
    try:
        exoptions = 0
        g_notempty = 0
        start_offset = 0
        original_limit = limit
        interp.regexp_error_code = PREG_NO_ERROR
        while limit != 0:
            # Execute the regular expression.
            count = _pcre.pcre_exec(pce.re, pce.extra, rawsubject,
                                    len(subject), start_offset,
                                    exoptions | g_notempty,
                                    offsets, size_offsets)
            # the string was already proved to be valid UTF-8
            exoptions |= _pcre.PCRE_NO_UTF8_CHECK

            # Check for too many substrings condition.
            if count == 0:
                interp.notice("Matched, but too many substrings")
                count = size_offsets // 3

            if count > 0:
                # Something matched: copy the part of the string before
                # the match...
                match_start = rffi.cast(lltype.Signed, offsets[0])
                out.append_slice(subject, start_offset, match_start)
                # ...and ask the replace_obj how to handle this match
                replace_obj.next_replace(out, subject, count, offsets)
                limit -= 1
            elif count == _pcre.PCRE_ERROR_NOMATCH:
                # If we previously set PCRE_NOTEMPTY after a null match,
                # this is not necessarily the end.  We need to advance
                # the start offset, and continue.  Fudge the offset
                # values to achieve this, unless we're already at the
                # end of the string.
                if g_notempty != 0 and start_offset < len(subject):
                    next_offset = (start_offset +
                                   pce.utf8size(subject, start_offset))
                    out.append_slice(subject, start_offset, next_offset)
                    offsets[0] = rffi.cast(rffi.INT, start_offset)
                    offsets[1] = rffi.cast(rffi.INT, next_offset)
                else:
                    out.append_slice(subject, start_offset, len(subject))
                    break
            else:
                handle_exec_error(interp, count)
                return None, -1

            # If we have matched an empty string, mimic what Perl's /g
            # options does.  This turns out to be rather cunning.  First
            # we set PCRE_NOTEMPTY and try the match again at the same
            # point.  If this fails (picked up above) we advance to the
            # next character.
            span_end = rffi.cast(lltype.Signed, offsets[1])
            span_start = rffi.cast(lltype.Signed, offsets[0])
            if span_end == span_start:
                g_notempty = _pcre.PCRE_NOTEMPTY | _pcre.PCRE_ANCHORED
            else:
                g_notempty = 0

            # Advance to the position right after the last full match
            start_offset = rffi.cast(lltype.Signed, offsets[1])
        else:
            # reached limit == 0: copy the end of the string
            out.append_slice(subject, start_offset, len(subject))
    finally:
        lltype.free(offsets, flavor='raw')
        rffi.free_charp(rawsubject)
    return space.newstr(out.build()), original_limit - limit