Beispiel #1
0
    def __init__(self, *res, **options):
        """Compile several regular expressions into a single reusable set.

        :param res:      Patterns to compile; every one must be a bytestring.
        :param options:  Optional settings: ``flags`` (bitmask, defaults to
                         ``DEFAULT_FLAGS``), ``size_limit`` and
                         ``dfa_size_limit``.
        :raises TypeError: if any pattern is not a ``bytes`` instance.
        """
        flags = options.pop('flags', DEFAULT_FLAGS)
        for pattern in res:
            if not isinstance(pattern, bytes):
                raise TypeError(
                    "'rure.lib.RureSet' must be instantiated with a list of "
                    "bytestrings as first argument.")

        # Native error/options handles; ffi.gc attaches the matching free
        # function so they are released together with this object.
        self._err = ffi.gc(_lib.rure_error_new(), _lib.rure_error_free)
        self._opts = ffi.gc(_lib.rure_options_new(), _lib.rure_options_free)

        # 'flags' has already been popped, so only the remaining settings
        # are stored here.
        self.options = options
        if 'size_limit' in options:
            size_limit = options['size_limit']
            _lib.rure_options_size_limit(self._opts, size_limit)
        if 'dfa_size_limit' in options:
            dfa_size_limit = options['dfa_size_limit']
            _lib.rure_options_dfa_size_limit(self._opts, dfa_size_limit)

        # One C buffer per pattern plus a parallel list of byte lengths.
        # The buffers stay referenced by this local list while the compile
        # call below runs.
        pattern_bufs = [ffi.new("uint8_t []", pattern) for pattern in res]
        pattern_sizes = [len(pattern) for pattern in res]

        pattern_array = ffi.new("uint8_t *[]", pattern_bufs)
        size_array = ffi.new("size_t []", pattern_sizes)
        compiled = checked_call(_lib.rure_compile_set, self._err,
                                pattern_array, size_array, len(pattern_bufs),
                                flags, self._opts)
        self._ptr = ffi.gc(compiled, _lib.rure_set_free)
Beispiel #2
0
    def captures(self, haystack, start=0):
        """Return the capture groups of the leftmost-first match in haystack.

        Group 0 always covers the entire match. A group for which
        ``rure_captures_at`` reports nothing is returned as None. When the
        regex does not match at all, None is returned.

        Only use this when submatches are needed; ``find`` is faster for
        locating the overall match.

        :param haystack:  Text to search.
        :param start:     Offset passed to the search as its start position.
        :returns:         ``self.capture_cls`` built from one ``RureMatch``
                          (or None) per group, or None if nothing matched.
        """
        haystack_len = len(haystack)
        caps = ffi.gc(_lib.rure_captures_new(self._ptr),
                      _lib.rure_captures_free)
        # Scratch struct that rure_captures_at fills in for each group.
        span = ffi.new('rure_match *')

        found = _lib.rure_find_captures(self._ptr, haystack, haystack_len,
                                        start, caps)
        if not found:
            return None

        groups = []
        for index in range(_lib.rure_captures_len(caps)):
            if _lib.rure_captures_at(caps, index, span):
                groups.append(RureMatch(span.start, span.end))
            else:
                groups.append(None)
        return self.capture_cls(*groups)
Beispiel #3
0
 def matches(self, haystack, start=0):
     """Report, regex by regex, whether each one matched in haystack.

     :param haystack:  Text to search.
     :param start:     Offset passed to the search as its start position.
     :returns:         A list of ``len(self)`` booleans; entry i tells
                       whether the regex at index i matched.
     """
     # Output array filled in by the native call, one slot per regex.
     hits = ffi.new("bool[]", len(self))
     _lib.rure_set_matches(
         self._ptr, haystack, len(haystack), start, hits)
     return list(map(bool, hits))
Beispiel #4
0
 def shortest_match(self, haystack, start=0):
     """Return an end offset if the regex matches anywhere in haystack.

     The offset is the position at which the engine determined that a
     match exists; it may lie before the end of the proper leftmost-first
     match. None is returned when there is no match.

     :param haystack:  Text to search.
     :param start:     Offset passed to the search as its start position.
     """
     haystack_len = len(haystack)
     # Out-parameter that receives the end offset.
     end_slot = ffi.new('size_t *')
     matched = _lib.rure_shortest_match(self._ptr, haystack, haystack_len,
                                        start, end_slot)
     return end_slot[0] if matched else None
Beispiel #5
0
    def find(self, haystack, start=0):
        """Locate the leftmost-first match in haystack.

        Use ``is_match`` instead when only the existence of a match matters;
        it is faster.

        :param haystack:  Text to search.
        :param start:     Offset passed to the search as its start position.
        :returns:         ``RureMatch`` holding the start and end byte
                          offsets, or None when nothing matches.
        """
        # Out-parameter that the native call fills with the match bounds.
        span = ffi.new('rure_match *')
        found = _lib.rure_find(self._ptr, haystack, len(haystack), start,
                               span)
        if not found:
            return None
        return RureMatch(span.start, span.end)
Beispiel #6
0
 def capture_names(self):
     """Iterate over the names of all capture groups of this regex.

     An unnamed group is reported as None. The first item (group 0, the
     whole matched region) is always unnamed.
     """
     names = ffi.gc(_lib.rure_iter_capture_names_new(self._ptr),
                    _lib.rure_iter_capture_names_free)
     # Out-parameter that receives a pointer to the next name.
     name_slot = ffi.new('char **')
     while True:
         if not _lib.rure_iter_capture_names_next(names, name_slot):
             return
         # An empty name is reported as None.
         yield ffi.string(name_slot[0]) or None
Beispiel #7
0
    def find_iter(self, haystack, start=0):
        """Iterate over the successive matches of the regex in haystack.

        This is a generator: it yields one ``RureMatch`` carrying the start
        and end offsets of each match, and simply finishes when there are no
        more matches (it never returns None to the caller).

        :param haystack:  Text to search.
        :param start:     Accepted for signature parity with ``find``, but
                          NOTE: it is not forwarded to the native iterator,
                          so iteration always begins where the iterator
                          itself starts.
        """
        hlen = len(haystack)
        find_iter = ffi.gc(_lib.rure_iter_new(self._ptr), _lib.rure_iter_free)

        # A single rure_match struct is reused for every step; its fields are
        # copied into a fresh RureMatch before the next native call.
        match = ffi.new('rure_match *')
        while _lib.rure_iter_next(find_iter, haystack, hlen, match):
            yield RureMatch(match.start, match.end)
Beispiel #8
0
 def captures_iter(self, haystack, start=0):
     """Iterate over the capture groups of every non-overlapping match.

     Operationally the same as ``find_iter``, except that each item
     carries submatch information: a ``self.capture_cls`` built from one
     ``RureMatch`` (or None) per capture group.

     :param haystack:  Text to search.
     :param start:     NOTE: accepted but not used anywhere in this method.
     """
     haystack_len = len(haystack)
     caps = ffi.gc(_lib.rure_captures_new(self._ptr),
                   _lib.rure_captures_free)
     cursor = ffi.gc(_lib.rure_iter_new(self._ptr), _lib.rure_iter_free)
     # Scratch struct that rure_captures_at fills in for each group.
     span = ffi.new('rure_match *')
     while _lib.rure_iter_next_captures(cursor, haystack, haystack_len,
                                        caps):
         groups = []
         for index in range(_lib.rure_captures_len(caps)):
             if _lib.rure_captures_at(caps, index, span):
                 groups.append(RureMatch(span.start, span.end))
             else:
                 groups.append(None)
         yield self.capture_cls(*groups)