def __init__(self, *res, **options):
    """Compile a set of regular expressions into a single matcher.

    :param res:     Bytestring patterns, one per positional argument.
    :param options: Config options. ``flags`` (a bitmask, defaulting to
                    ``DEFAULT_FLAGS``) is removed from the mapping and
                    handed to the compiler; ``size_limit`` and
                    ``dfa_size_limit`` are applied to the options object
                    when present. The remaining mapping is stored on
                    ``self.options``.
    :raises TypeError: If any pattern is not a ``bytes`` instance.
    """
    flags = options.pop('flags', DEFAULT_FLAGS)

    # Reject non-bytes patterns before allocating any native resources.
    for pattern in res:
        if not isinstance(pattern, bytes):
            raise TypeError("'rure.lib.RureSet' must be instantiated with a "
                            "list of bytestrings as first argument.")

    # ffi.gc pairs each native handle with its matching free function.
    self._err = ffi.gc(_lib.rure_error_new(), _lib.rure_error_free)
    self._opts = ffi.gc(_lib.rure_options_new(), _lib.rure_options_free)
    self.options = options

    if 'size_limit' in options:
        _lib.rure_options_size_limit(self._opts, options['size_limit'])
    if 'dfa_size_limit' in options:
        _lib.rure_options_dfa_size_limit(self._opts,
                                         options['dfa_size_limit'])

    # The per-pattern buffers stay referenced by this list for the whole
    # compile call, alongside a parallel list of their lengths.
    pattern_bufs = [ffi.new("uint8_t []", pattern) for pattern in res]
    pattern_lens = [len(pattern) for pattern in res]

    compiled = checked_call(
        _lib.rure_compile_set,
        self._err,
        ffi.new("uint8_t *[]", pattern_bufs),
        ffi.new("size_t []", pattern_lens),
        len(pattern_bufs),
        flags,
        self._opts)
    self._ptr = ffi.gc(compiled, _lib.rure_set_free)
def captures(self, haystack, start=0):
    """Return the capture groups of the leftmost-first match in haystack.

    Capture group 0 always corresponds to the entire match. Each group is
    a ``RureMatch(start, end)``, or None when ``rure_captures_at`` reports
    nothing for that index. The groups are passed positionally to
    ``self.capture_cls`` and the resulting object is returned.

    If no match is found, None is returned.

    Use this only when submatches are needed; otherwise prefer ``find``
    for locating the overall match.
    """
    length = len(haystack)
    caps = ffi.gc(_lib.rure_captures_new(self._ptr),
                  _lib.rure_captures_free)
    span = ffi.new('rure_match *')

    if not _lib.rure_find_captures(self._ptr, haystack, length, start, caps):
        return None

    groups = []
    for index in range(0, _lib.rure_captures_len(caps)):
        # `span` is a scratch struct refilled for every group index.
        if _lib.rure_captures_at(caps, index, span):
            groups.append(RureMatch(span.start, span.end))
        else:
            groups.append(None)
    return self.capture_cls(*groups)
def matches(self, haystack, start=0):
    """Report which patterns of the set matched in haystack.

    Returns a list of ``len(self)`` booleans; the entry at each index
    tells whether the regex at that index matched.
    """
    # Output buffer filled in by the native call, one slot per pattern.
    hits = ffi.new("bool[]", len(self))
    _lib.rure_set_matches(self._ptr, haystack, len(haystack), start, hits)
    return list(map(bool, hits))
def shortest_match(self, haystack, start=0):
    """Return an end location if and only if the regex matches in haystack.

    The end location is the place at which the regex engine determined
    that a match exists; it may occur before the end of the proper
    leftmost-first match. None is returned when there is no match.
    """
    length = len(haystack)
    end_out = ffi.new('size_t *')  # out-parameter written by the C call
    matched = _lib.rure_shortest_match(
        self._ptr, haystack, length, start, end_out)
    return end_out[0] if matched else None
def find(self, haystack, start=0):
    """Return the byte range of the leftmost-first match in haystack.

    The result is a ``RureMatch(start, end)``, or None if no match exists.

    Use this only to discover the position of a match; ``is_match`` is
    the faster way to test whether a match exists at all.
    """
    span = ffi.new('rure_match *')  # out-parameter written by the C call
    found = _lib.rure_find(self._ptr, haystack, len(haystack), start, span)
    if not found:
        return None
    return RureMatch(span.start, span.end)
def capture_names(self):
    """Iterate over the names of all possible captures.

    An unnamed capture is yielded as None; the first element (capture 0,
    the whole matched region) is always unnamed.
    """
    names_iter = ffi.gc(_lib.rure_iter_capture_names_new(self._ptr),
                        _lib.rure_iter_capture_names_free)
    slot = ffi.new('char **')  # receives a pointer to each name in turn
    while _lib.rure_iter_capture_names_next(names_iter, slot):
        # An empty name is reported as None rather than an empty string.
        yield ffi.string(slot[0]) or None
def find_iter(self, haystack, start=0):
    """Iterate over the successive matches found in haystack.

    Yields one ``RureMatch(start, end)`` per match reported by the
    underlying iterator, until it reports no further match.

    NOTE(review): ``start`` is accepted but is not passed to any call in
    this method, so it currently has no effect on the iteration --
    confirm whether that is intended.
    """
    length = len(haystack)
    matcher = ffi.gc(_lib.rure_iter_new(self._ptr), _lib.rure_iter_free)
    span = ffi.new('rure_match *')  # scratch struct refilled on each step

    while True:
        if not _lib.rure_iter_next(matcher, haystack, length, span):
            break
        yield RureMatch(span.start, span.end)
def captures_iter(self, haystack, start=0):
    """Iterate over the capture groups of each successive match.

    Operationally the same as ``find_iter``, except that every yielded
    item carries submatch information: the groups of one match are
    passed positionally to ``self.capture_cls``, each group being a
    ``RureMatch(start, end)`` or None.

    NOTE(review): ``start`` is accepted but is not passed to any call in
    this method, so it currently has no effect on the iteration --
    confirm whether that is intended.
    """
    length = len(haystack)
    caps = ffi.gc(_lib.rure_captures_new(self._ptr),
                  _lib.rure_captures_free)
    stepper = ffi.gc(_lib.rure_iter_new(self._ptr), _lib.rure_iter_free)
    span = ffi.new('rure_match *')  # scratch struct refilled per group

    while _lib.rure_iter_next_captures(stepper, haystack, length, caps):
        groups = []
        for index in range(0, _lib.rure_captures_len(caps)):
            if _lib.rure_captures_at(caps, index, span):
                groups.append(RureMatch(span.start, span.end))
            else:
                groups.append(None)
        yield self.capture_cls(*groups)