def search_for(pattern, string, flags=0, max_match=0, use_regex=False):
    """
    Looks up the occurrences of a pattern inside a string.

    :param pattern:   The pattern to look for.
    :param string:    The string that gets searched.
    :param flags:     Extra flags handed over to ``re.finditer``.
    :param max_match: Upper bound for the number of matches, forwarded to
                      ``limit``. If 0 or less is provided, the number of
                      matches is not limited.
    :param use_regex: If falsy, the pattern is passed through
                      ``re.escape`` first so it is matched literally.
                      Otherwise the pattern is used as a regex as-is.
    :return:          An iterator returning MatchObject's.
    """
    # A plain-string pattern has to be escaped so that regex
    # metacharacters inside it lose their special meaning.
    regex = pattern if use_regex else re.escape(pattern)
    found = re.finditer(regex, string, flags)
    return limit(found, max_match)
def nested_search_in_between(begin,
                             end,
                             string,
                             max_matches=0,
                             remove_empty_matches=False,
                             use_regex=False):
    """
    Looks for text enclosed between a begin- and an end-sequence, with
    support for nesting. Enclosed \\n also end up in the result.

    Escape sequences are not handled. Nested sequences are ignored during
    the match, so only the first nesting level is returned. To obtain
    deeper levels, invoke this function again on the returned value.
    Using the same begin- and end-sequence won't match anything.

    :param begin:                A pattern that defines where matching
                                 starts.
    :param end:                  A pattern that defines where matching
                                 ends.
    :param string:               The string to search in.
    :param max_matches:          Upper bound for the number of matches,
                                 forwarded to ``limit``. If 0 or less is
                                 provided, the number of matches is not
                                 limited.
    :param remove_empty_matches: If truthy, entries whose ``inside`` part
                                 converts to an empty string are dropped
                                 (regardless of the matched begin and end
                                 patterns).
    :param use_regex:            If falsy, ``begin`` and ``end`` are
                                 passed through ``re.escape`` and thus
                                 matched literally. Otherwise they are
                                 treated as regexes.
    :return:                     An iterator over the match entries; each
                                 entry exposes the enclosed text through
                                 its ``inside`` attribute.
    """
    if use_regex:
        begin_regex, end_regex = begin, end
    else:
        begin_regex, end_regex = re.escape(begin), re.escape(end)

    found = _nested_search_in_between(begin_regex, end_regex, string)

    if remove_empty_matches:
        # Keep only the entries that actually enclose something.
        found = (entry for entry in found if str(entry.inside) != "")

    return limit(found, max_matches)
def unescaped_search_for(pattern,
                         string,
                         flags=0,
                         max_match=0,
                         use_regex=False):
    """
    Looks up the occurrences of a pattern in a string that are not escaped.

    :param pattern:   The pattern to look for unescaped.
    :param string:    The string that gets searched.
    :param flags:     Extra flags, forwarded to ``search_for``.
    :param max_match: Upper bound for the number of unescaped matches,
                      forwarded to ``limit``. If 0 or less is provided,
                      the number of matches is not limited.
    :param use_regex: Whether the pattern is treated as a regex or as a
                      simple string (forwarded to ``search_for``).
    :return:          An iterator returning MatchObject's.
    """
    # The inner search runs without a limit (0): the limit must only
    # count matches that survive the escape check below.
    candidates = search_for(pattern, string, flags, 0, use_regex)
    unescaped = (found for found in candidates
                 if not position_is_escaped(string, found.start()))

    yield from limit(unescaped, max_match)
def test_finite(self):
    # A positive limit must yield exactly the leading `count` elements of
    # the sequence (or all of them when the sequence is shorter).
    for count in (1, 2, 3, 7, 8, 10, 22, 500000):
        limited = tuple(limit(self.sequence, count))
        expected = self.sequence[:count]
        self.assertEqual(limited, expected)
def test_infinite(self):
    # Zero and negative limits must leave the sequence untruncated.
    for count in (0, -1, -2, -6555123):
        limited = tuple(limit(self.sequence, count))
        self.assertEqual(limited, self.sequence)
def unescaped_search_in_between(begin,
                                end,
                                string,
                                max_matches=0,
                                remove_empty_matches=False,
                                use_regex=False):
    """
    Looks for text enclosed between a begin- and an end-sequence, taking
    only unescaped begin- and end-sequences into account. Enclosed \\n
    also end up in the result.

    .. warning::

        Using the escape character '\\' in the begin- or end-sequences
        the function can return strange results. The backslash can
        interfere with the escaping regex-sequence used internally to
        match the enclosed string.

    :param begin:                A pattern that defines where matching
                                 starts.
    :param end:                  A pattern that defines where matching
                                 ends.
    :param string:               The string to search in.
    :param max_matches:          Upper bound for the number of matches,
                                 forwarded to ``limit``. If 0 or less is
                                 provided, the number of matches is not
                                 limited.
    :param remove_empty_matches: If truthy, the matches are filtered with
                                 ``trim_empty_matches`` on the groups that
                                 hold the enclosed text. An entry is
                                 considered empty if no inner match was
                                 performed (regardless of matched start
                                 and end patterns).
    :param use_regex:            If falsy, ``begin`` and ``end`` are
                                 passed through ``re.escape`` and thus
                                 matched literally. Otherwise they are
                                 treated as regexes.
    :return:                     An iterator returning objects created by
                                 ``InBetweenMatch.from_values`` from the
                                 matched begin, inside and end strings
                                 and their start positions.
    """
    if use_regex:
        # Capturing groups inside `begin` shift the index of every group
        # that follows it, so their count has to be known.
        group_offset = re.compile(begin).groups
    else:
        begin = re.escape(begin)
        end = re.escape(end)
        # Escaped patterns cannot contain capturing groups.
        group_offset = 0

    # Group 1 wraps `begin`; the groups below come right after the
    # capturing groups `begin` itself may contain.
    inside_group = group_offset + 2
    escapes_group = group_offset + 3
    end_group = group_offset + 4

    # Layout of the assembled regex:
    #   (?<!\\)(?:\\\\)*     Makes sure the begin sequence is unescaped:
    #                        no lone backslash may precede it, while any
    #                        number of backslash pairs is fine, as an
    #                        even count escapes nothing that follows.
    #   (begin)              Captures the begin sequence.
    #   (.*?)                Lazily captures the enclosed text.
    #   (?<!\\)((?:\\\\)*)   The same unescape check for the end sequence,
    #                        but here the backslash pairs are captured.
    #   (end)                Captures the end sequence. Since the
    #                        enclosed text is matched lazily, this is the
    #                        next end sequence that occurs.
    regex = "".join((r"(?<!\\)(?:\\\\)*(",
                     begin,
                     r")(.*?)(?<!\\)((?:\\\\)*)(",
                     end,
                     ")"))

    found = re.finditer(regex, string, re.DOTALL)

    if remove_empty_matches:
        found = trim_empty_matches(found, (inside_group, escapes_group))

    for match in limit(found, max_matches):
        # The captured trailing backslashes are appended to the enclosed
        # text to form the reported inside string.
        yield InBetweenMatch.from_values(
            match.group(1),
            match.start(1),
            match.group(inside_group) + match.group(escapes_group),
            match.start(inside_group),
            match.group(end_group),
            match.start(end_group))
def search_in_between(begin,
                      end,
                      string,
                      max_matches=0,
                      remove_empty_matches=False,
                      use_regex=False):
    """
    Looks for text enclosed between a begin- and an end-sequence. Enclosed
    \\n also end up in the result. Escape sequences are not handled.

    :param begin:                A pattern that defines where matching
                                 starts.
    :param end:                  A pattern that defines where matching
                                 ends.
    :param string:               The string to search in.
    :param max_matches:          Upper bound for the number of matches,
                                 forwarded to ``limit``. If 0 or less is
                                 provided, the number of matches is not
                                 limited.
    :param remove_empty_matches: If truthy, the matches are filtered with
                                 ``trim_empty_matches`` on the group that
                                 holds the enclosed text. An entry is
                                 considered empty if no inner match was
                                 performed (regardless of matched start
                                 and end patterns).
    :param use_regex:            If falsy, ``begin`` and ``end`` are
                                 passed through ``re.escape`` and thus
                                 matched literally. Otherwise they are
                                 treated as regexes.
    :return:                     An iterator returning InBetweenMatch
                                 objects that hold information about the
                                 matched begin, inside and end string.
    """
    if use_regex:
        # Capturing groups inside `begin` shift the index of every group
        # that follows it, so their count has to be known.
        group_offset = re.compile(begin).groups
    else:
        begin = re.escape(begin)
        end = re.escape(end)
        # Escaped patterns cannot contain capturing groups.
        group_offset = 0

    # Group 1 wraps `begin`; the groups below come right after the
    # capturing groups `begin` itself may contain.
    inside_group = group_offset + 2
    end_group = group_offset + 3

    # Layout of the assembled regex:
    #   (begin)  Captures the begin sequence.
    #   (.*?)    Lazily captures the enclosed text.
    #   (end)    Captures the end sequence. Since the enclosed text is
    #            matched lazily, this is the next end sequence that
    #            occurs.
    regex = "".join(("(", begin, ")(.*?)(", end, ")"))

    found = re.finditer(regex, string, re.DOTALL)

    if remove_empty_matches:
        found = trim_empty_matches(found, (inside_group,))

    for match in limit(found, max_matches):
        yield InBetweenMatch.from_values(match.group(1),
                                         match.start(1),
                                         match.group(inside_group),
                                         match.start(inside_group),
                                         match.group(end_group),
                                         match.start(end_group))