def test_extended(self):
    # One list per entry of self.search_in_between_test_strings. Every tuple
    # is (begin, begin position, inside, inside position, end, end position)
    # and is turned into an InBetweenMatch via InBetweenMatch.from_values().
    expected_results = [
        [("(", 0, "", 1, ")", 1),
         ("(", 6, "This is a word", 7, ")", 21),
         ("(", 25, "(in a word", 26, ")", 36)],
        [("(", 4, "((((((((((((((((((1", 5, ")", 24)],
        [("(", 6, "do (it ", 7, ")", 14),
         ("(", 41, "", 42, ")", 42),
         ("(", 44, "hello.", 45, ")", 51)],
        [("(", 0, "", 1, ")", 1),
         ("(", 8, r"This\ is a word" + self.bs, 9, ")", 25),
         ("(", 29, r"(in a\\\ word" + 5 * self.bs, 30, ")", 48)],
        [("(", 5, r"\(\((((((\\\(((((((((((1", 6, ")", 30)],
        [("(", 7, "do (it ", 8, ")", 15),
         ("(", 45, "", 46, ")", 46),
         ("(", 48, "hello.", 49, ")", 55)],
    ]

    # The same expectations must hold for regex patterns and for the plain
    # string patterns configured on the test case.
    pattern_variants = [
        (True, r"\(", r"\)"),
        (False,
         self.search_in_between_begin_pattern,
         self.search_in_between_end_pattern),
    ]

    # Maps the positional arguments of search_in_between (max_matches=0,
    # remove_empty_matches=False) to the matches expected for that call.
    test_cases = {}
    for test_string, result in zip(self.search_in_between_test_strings,
                                   expected_results):
        for use_regex, begin_pattern, end_pattern in pattern_variants:
            arguments = (begin_pattern, end_pattern, test_string,
                         0, False, use_regex)
            test_cases[arguments] = [InBetweenMatch.from_values(*args)
                                     for args in result]

    self.assertResultsEqual(search_in_between, test_cases, list)
def test_extended(self):
    # One list per entry of self.search_in_between_test_strings. Every tuple
    # is (begin, begin position, inside, inside position, end, end position)
    # and is turned into an InBetweenMatch via InBetweenMatch.from_values().
    expected_results = [
        [("(", 0, "", 1, ")", 1),
         ("(", 6, "This is a word", 7, ")", 21),
         ("(", 25, "(in a word", 26, ")", 36)],
        [("(", 4, "((((((((((((((((((1", 5, ")", 24)],
        [("(", 6, "do (it ", 7, ")", 14),
         ("(", 41, "", 42, ")", 42),
         ("(", 44, "hello.", 45, ")", 51)],
        [("(", 0, "", 1, ")", 1),
         ("(", 8, r"This\ is a word" + self.bs, 9, ")", 25),
         ("(", 29, r"(in a\\\ word" + 5 * self.bs, 30, ")", 48)],
        [("(", 5, r"\(\((((((\\\(((((((((((1", 6, ")", 30)],
        [("(", 7, "do (it ", 8, ")", 15),
         ("(", 45, "", 46, ")", 46),
         ("(", 48, "hello.", 49, ")", 55)],
    ]

    # The same expectations must hold for regex patterns and for the plain
    # string patterns configured on the test case.
    pattern_variants = [
        (True, r"\(", r"\)"),
        (False,
         self.search_in_between_begin_pattern,
         self.search_in_between_end_pattern),
    ]

    # Maps the positional arguments of search_in_between (max_matches=0,
    # remove_empty_matches=False) to the matches expected for that call.
    test_cases = {}
    for test_string, result in zip(self.search_in_between_test_strings,
                                   expected_results):
        for use_regex, begin_pattern, end_pattern in pattern_variants:
            arguments = (begin_pattern, end_pattern, test_string,
                         0, False, use_regex)
            test_cases[arguments] = [InBetweenMatch.from_values(*args)
                                     for args in result]

    self.assertResultsEqual(search_in_between, test_cases, list)
def test_auto_trim(self):
    # One list per entry of self.search_in_between_test_strings. Every tuple
    # is (begin, begin position, inside, inside position, end, end position).
    expected_results = [
        [("(", 6, "This is a word", 7, ")", 21),
         ("(", 25, "(in a word) another ", 26, ")", 46)],
        [("(", 4, "((((((((((((((((((1)2)3))))))))))))))))", 5, ")", 44)],
        [("(", 6, "do (it ) more ", 7, ")", 21),
         ("(", 44, "hello.", 45, ")", 51)],
        [("(", 8, r"This\ is a word" + self.bs, 9, ")", 25),
         ("(", 29, r"(in a\\\ word\\\\\) another " + self.bs, 30, ")", 59)],
        [("(", 5,
          r"\(\((((((\\\(((((((((((1)2)3))\\\\\)))))))))))))\)" + self.bs,
          6, ")", 57)],
        [("(", 7, "do (it ) more ", 8, ")", 22),
         ("(", 48, "hello.", 49, ")", 55)],
    ]

    # Both the regex and the plain-string flavour of the patterns are run.
    pattern_variants = [
        (True, r"\(", r"\)"),
        (False,
         self.search_in_between_begin_pattern,
         self.search_in_between_end_pattern),
    ]

    # NOTE(review): nested_search_in_between is not visible here; the fifth
    # positional argument (True) is presumably remove_empty_matches, as in
    # search_in_between -- confirm against its signature.
    test_cases = {}
    for test_string, result in zip(self.search_in_between_test_strings,
                                   expected_results):
        for use_regex, begin_pattern, end_pattern in pattern_variants:
            arguments = (begin_pattern, end_pattern, test_string,
                         0, True, use_regex)
            test_cases[arguments] = [InBetweenMatch.from_values(*args)
                                     for args in result]

    self.assertResultsEqual(nested_search_in_between, test_cases, list)
def test_auto_trim(self):
    # One list per entry of self.auto_trim_test_strings. Every tuple is
    # (begin, begin position, inside, inside position, end, end position).
    expected_results = [
        [],
        [(";", 2, r"\\\\\;\\#", 3, ";", 12),
         (";", 25, "+ios", 26, ";", 30)],
        [(";", 1, "2", 2, ";", 3),
         (";", 5, "4", 6, ";", 7),
         (";", 9, "6", 10, ";", 11)],
        [(";", 1, "2", 2, ";", 3),
         (";", 5, "4", 6, ";", 7),
         (";", 9, "6", 10, ";", 11)],
        [],
        [],
        [],
        [],
        [(";", 3, "a", 4, ";", 5)],
    ]

    # The same pattern is used as begin and end sequence; empty matches are
    # removed (remove_empty_matches=True) with and without regex handling.
    test_cases = {}
    for test_string, result in zip(self.auto_trim_test_strings,
                                   expected_results):
        for use_regex in [True, False]:
            arguments = (self.auto_trim_test_pattern,
                         self.auto_trim_test_pattern,
                         test_string, 0, True, use_regex)
            test_cases[arguments] = [InBetweenMatch.from_values(*args)
                                     for args in result]

    self.assertResultsEqual(unescaped_search_in_between, test_cases, list)
def test_regex_pattern(self):
    # One list per entry of self.multi_patterns. Every tuple is
    # (begin, begin position, inside, inside position, end, end position).
    expected_results = [
        [("abc", 0, "", 3, "abc", 3)],
        [("ab", 0, "c", 2, "ab", 3)],
        [("ab", 0, "c", 2, "ab", 3),
         ("ab", 21, r"bc\+'**'", 23, "ac", 31)],
        [(self.bs, 12, r"\13q4ujsabbc", 13, self.bs, 25)],
        [("###", 9, r"\\13q4ujsabbc\+'**'ac", 12, "###", 33),
         ("#", 37, ".", 38, "####", 39)],
        [("a", 0, "", 1, "b", 1),
         ("a", 3, "", 4, "b", 4),
         ("b", 7, "", 8, "a", 8),
         ("##", 9, "", 11, "#\\", 11),
         ("a", 21, "", 22, "b", 22),
         ("b", 23, r"c\+'**'", 24, "a", 31),
         ("##", 33, "", 35, "#.", 35),
         ("#.", 37, "", 39, "##", 39),
         ("##", 41, "-", 43, "b", 44)],
        [("abcabc", 0, r"cba###\\13q4ujs", 6, "abbc", 21)],
        [],
    ]

    # Every pattern serves as begin and end sequence at once and is always
    # interpreted as a regex (use_regex=True) on the same test string.
    test_cases = {}
    for pattern, result in zip(self.multi_patterns, expected_results):
        arguments = (pattern, pattern, self.multi_pattern_test_string,
                     0, False, True)
        test_cases[arguments] = [InBetweenMatch.from_values(*args)
                                 for args in result]

    self.assertResultsEqual(unescaped_search_in_between, test_cases, list)
def search_in_between(begin,
                      end,
                      string,
                      max_matches=0,
                      remove_empty_matches=False,
                      use_regex=False):
    """
    Finds the text enclosed between a begin- and an end-sequence.

    The enclosed text may span several lines, because the search pattern is
    matched with ``re.DOTALL``. Escape sequences get no special treatment.

    :param begin:                A pattern that defines where to start
                                 matching.
    :param end:                  A pattern that defines where to end
                                 matching.
    :param string:               The string where to search in.
    :param max_matches:          Defines the maximum number of matches. If 0
                                 or less is provided, the number of matches
                                 is not limited.
    :param remove_empty_matches: Defines whether empty entries should be
                                 removed from the result. An entry is
                                 considered empty if no inner match was
                                 performed (regardless of matched start and
                                 end patterns).
    :param use_regex:            Specifies whether to treat the begin and end
                                 patterns as regexes or simple strings.
    :return:                     An iterator returning InBetweenMatch objects
                                 that hold information about the matched
                                 begin, inside and end string.
    """
    if use_regex:
        # The begin pattern may bring capturing groups of its own; they shift
        # the indices of the groups that follow it in the final regex.
        begin_pattern_groups = re.compile(begin).groups
    else:
        # Escaped patterns cannot contain capturing groups.
        begin, end = re.escape(begin), re.escape(end)
        begin_pattern_groups = 0

    # Group 1 wraps the begin sequence, the next free group holds the lazily
    # matched inside and the one after that wraps the end sequence.
    inside_group = begin_pattern_groups + 2
    end_group = begin_pattern_groups + 3

    # Because the inside group is lazy, the next occurring end sequence
    # terminates each match.
    regex = "".join(("(", begin, ")(.*?)(", end, ")"))

    found = re.finditer(regex, string, re.DOTALL)
    if remove_empty_matches:
        found = trim_empty_matches(found, (inside_group,))

    for m in limit(found, max_matches):
        yield InBetweenMatch.from_values(m.group(1),
                                         m.start(1),
                                         m.group(inside_group),
                                         m.start(inside_group),
                                         m.group(end_group),
                                         m.start(end_group))
def test_regex_pattern(self):
    # The patterns carry an empty non-capturing group "(?:)" and are passed
    # with the regex flag set; the expectations are the ones shared with
    # the basic test (self.test_basic_expected_results).
    test_cases = {}
    for test_string, result in zip(self.search_in_between_test_strings,
                                   self.test_basic_expected_results):
        arguments = (r"(?:)\(", r"\)(?:)", test_string, 0, False, True)
        test_cases[arguments] = [InBetweenMatch.from_values(*args)
                                 for args in result]

    self.assertResultsEqual(nested_search_in_between, test_cases, list)
def test_basic(self):
    # Pairs every test string with its expected matches and calls
    # nested_search_in_between with the plain (non-regex) patterns.
    test_cases = {}
    for test_string, result in zip(self.search_in_between_test_strings,
                                   self.test_basic_expected_results):
        arguments = (self.search_in_between_begin_pattern,
                     self.search_in_between_end_pattern,
                     test_string, 0, False, False)
        test_cases[arguments] = [InBetweenMatch.from_values(*args)
                                 for args in result]

    self.assertResultsEqual(nested_search_in_between, test_cases, list)
def test_from_values(self):
    uut = InBetweenMatch.from_values("hello", 47, "world", 77, "rises", 90)

    # (part of the match, expected text, expected position)
    expectations = (
        (uut.begin, "hello", 47),
        (uut.inside, "world", 77),
        (uut.end, "rises", 90),
    )
    for part, text, position in expectations:
        self.assertEqual(str(part), text)
        self.assertEqual(part.position, position)
def test_properties(self):
    uut = InBetweenMatch(Match("ABC", 0), Match("DEF", 3), Match("GHI", 6))

    # (part of the match, expected text, expected position)
    expectations = (
        (uut.begin, "ABC", 0),
        (uut.inside, "DEF", 3),
        (uut.end, "GHI", 6),
    )
    for part, text, position in expectations:
        self.assertEqual(str(part), text)
        self.assertEqual(part.position, position)
def test_max_match(self):
    # For every limit the expectations of the basic test are cut down to at
    # most max_match entries per test string.
    test_cases = {}
    for max_match in [1, 2, 5, 22]:
        limited_results = [elem[0:max_match]
                           for elem in self.test_basic_expected_results]
        for test_string, result in zip(self.search_in_between_test_strings,
                                       limited_results):
            arguments = (self.search_in_between_begin_pattern,
                         self.search_in_between_end_pattern,
                         test_string, max_match, False, False)
            test_cases[arguments] = [InBetweenMatch.from_values(*args)
                                     for args in result]

    self.assertResultsEqual(nested_search_in_between, test_cases, list)
def test_basic(self):
    # The same pattern acts as begin and end sequence; every test string is
    # searched with and without regex handling and must give equal results.
    test_cases = {}
    for test_string, result in zip(self.test_strings,
                                   self.test_basic_expected_results):
        for use_regex in [True, False]:
            arguments = (self.test_basic_pattern, self.test_basic_pattern,
                         test_string, 0, False, use_regex)
            test_cases[arguments] = [InBetweenMatch.from_values(*args)
                                     for args in result]

    self.assertResultsEqual(unescaped_search_in_between, test_cases, list)
def test_max_match(self):
    pattern = self.test_basic_pattern

    # For every limit the expectations of the basic test are cut down to at
    # most max_match entries per test string.
    test_cases = {}
    for max_match in [1, 2, 3, 4, 5, 100]:
        limited_results = [elem[0:max_match]
                           for elem in self.test_basic_expected_results]
        for test_string, result in zip(self.test_strings, limited_results):
            for use_regex in [True, False]:
                arguments = (pattern, pattern, test_string,
                             max_match, False, use_regex)
                test_cases[arguments] = [InBetweenMatch.from_values(*args)
                                         for args in result]

    self.assertResultsEqual(unescaped_search_in_between, test_cases, list)
def test_basic(self):
    # The same pattern acts as begin and end sequence; every test string is
    # searched with and without regex handling and must give equal results.
    test_cases = {}
    for test_string, result in zip(self.test_strings,
                                   self.test_basic_expected_results):
        for use_regex in [True, False]:
            arguments = (self.test_basic_pattern, self.test_basic_pattern,
                         test_string, 0, False, use_regex)
            test_cases[arguments] = [InBetweenMatch.from_values(*args)
                                     for args in result]

    self.assertResultsEqual(unescaped_search_in_between, test_cases, list)
def _nested_search_in_between(begin, end, string):
    """
    Searches for a string enclosed between a specified begin- and
    end-sequence, taking nesting into account. Matches infinite times.

    Only outermost pairs are reported: nested begin- and end-sequences become
    part of the inside string. End-sequences without an open begin-sequence
    are ignored and a begin-sequence that is never closed yields nothing.

    This is a function specifically designed to be invoked from
    ``nested_search_in_between()``.

    :param begin:  A regex pattern that defines where to start matching.
    :param end:    A regex pattern that defines where to end matching.
    :param string: The string where to search in.
    :return:       An iterator returning an InBetweenMatch object for every
                   outermost begin/end pair.
    """
    # Regex explanation:
    # 1. (begin) A capturing group that matches the begin sequence.
    # 2. (end)   A capturing group that matches the end sequence.
    # The '|' in the regex matches either the first or the second part, so
    # every match is exactly one begin- or one end-sequence.
    regex = "(" + begin + ")|(" + end + ")"

    left_match = None
    nesting_level = 0
    for match in re.finditer(regex, string, re.DOTALL):
        if match.group(1) is not None:
            if nesting_level == 0:
                # Store the match of the first nesting level to be able to
                # return the string until the next fitting end sequence.
                left_match = match

            nesting_level += 1
        else:
            # The second group matched. This is the only alternative if group
            # 1 didn't, otherwise no match would be performed. No need to
            # compile the begin and end sequences to get the number of
            # capturing groups in them.
            if nesting_level > 0:
                nesting_level -= 1

            # Identity comparison is the correct way to test for None.
            if nesting_level == 0 and left_match is not None:
                yield InBetweenMatch.from_values(
                    left_match.group(),
                    left_match.start(),
                    string[left_match.end():match.start()],
                    left_match.end(),
                    match.group(),
                    match.start())

                left_match = None
def _nested_search_in_between(begin, end, string):
    """
    Searches for a string enclosed between a specified begin- and
    end-sequence, taking nesting into account. Matches infinite times.

    Only outermost pairs are reported: nested begin- and end-sequences become
    part of the inside string. End-sequences without an open begin-sequence
    are ignored and a begin-sequence that is never closed yields nothing.

    This is a function specifically designed to be invoked from
    ``nested_search_in_between()``.

    :param begin:  A regex pattern that defines where to start matching.
    :param end:    A regex pattern that defines where to end matching.
    :param string: The string where to search in.
    :return:       An iterator returning an InBetweenMatch object for every
                   outermost begin/end pair.
    """
    # Regex explanation:
    # 1. (begin) A capturing group that matches the begin sequence.
    # 2. (end)   A capturing group that matches the end sequence.
    # The '|' in the regex matches either the first or the second part, so
    # every match is exactly one begin- or one end-sequence.
    regex = "(" + begin + ")|(" + end + ")"

    left_match = None
    nesting_level = 0
    for match in re.finditer(regex, string, re.DOTALL):
        if match.group(1) is not None:
            if nesting_level == 0:
                # Store the match of the first nesting level to be able to
                # return the string until the next fitting end sequence.
                left_match = match

            nesting_level += 1
        else:
            # The second group matched. This is the only alternative if group
            # 1 didn't, otherwise no match would be performed. No need to
            # compile the begin and end sequences to get the number of
            # capturing groups in them.
            if nesting_level > 0:
                nesting_level -= 1

            # Identity comparison is the correct way to test for None.
            if nesting_level == 0 and left_match is not None:
                yield InBetweenMatch.from_values(
                    left_match.group(),
                    left_match.start(),
                    string[left_match.end():match.start()],
                    left_match.end(),
                    match.group(),
                    match.start())

                left_match = None
def test_max_match(self):
    # For every limit the expectations of the basic test are cut down to at
    # most max_match entries per test string.
    test_cases = {}
    for max_match in [1, 2, 5, 22]:
        limited_results = [elem[0:max_match]
                           for elem in self.test_basic_expected_results]
        for test_string, result in zip(self.search_in_between_test_strings,
                                       limited_results):
            arguments = (self.search_in_between_begin_pattern,
                         self.search_in_between_end_pattern,
                         test_string, max_match, False, False)
            test_cases[arguments] = [InBetweenMatch.from_values(*args)
                                     for args in result]

    self.assertResultsEqual(nested_search_in_between, test_cases, list)
def test_disabled_regex(self):
    search_pattern = r"\'"

    # With regex handling disabled no test string is expected to produce a
    # match, so each of them is paired with an empty result list.
    expected_results = [[] for _ in range(len(self.test_strings))]

    test_cases = {}
    for test_string, result in zip(self.test_strings, expected_results):
        # For remove_empty_matches both works, True and False.
        for auto_trim in [True, False]:
            arguments = (search_pattern, search_pattern, test_string,
                         0, auto_trim, False)
            test_cases[arguments] = [InBetweenMatch.from_values(*args)
                                     for args in result]

    self.assertResultsEqual(search_in_between, test_cases, list)
def test_max_match(self):
    pattern = self.test_basic_pattern

    # For every limit the expectations of the basic test are cut down to at
    # most max_match entries per test string.
    test_cases = {}
    for max_match in [1, 2, 3, 4, 5, 100]:
        limited_results = [elem[0:max_match]
                           for elem in self.test_basic_expected_results]
        for test_string, result in zip(self.test_strings, limited_results):
            for use_regex in [True, False]:
                arguments = (pattern, pattern, test_string,
                             max_match, False, use_regex)
                test_cases[arguments] = [InBetweenMatch.from_values(*args)
                                         for args in result]

    self.assertResultsEqual(unescaped_search_in_between, test_cases, list)
def test_auto_trim(self):
    # One list per entry of self.search_in_between_test_strings. Every tuple
    # is (begin, begin position, inside, inside position, end, end position).
    expected_results = [
        [("(", 6, "This is a word", 7, ")", 21),
         ("(", 25, "(in a word) another ", 26, ")", 46)],
        [("(", 4, "((((((((((((((((((1)2)3))))))))))))))))", 5, ")", 44)],
        [("(", 6, "do (it ) more ", 7, ")", 21),
         ("(", 44, "hello.", 45, ")", 51)],
        [("(", 8, r"This\ is a word" + self.bs, 9, ")", 25),
         ("(", 29, r"(in a\\\ word\\\\\) another " + self.bs, 30, ")", 59)],
        [("(", 5,
          r"\(\((((((\\\(((((((((((1)2)3))\\\\\)))))))))))))\)" + self.bs,
          6, ")", 57)],
        [("(", 7, "do (it ) more ", 8, ")", 22),
         ("(", 48, "hello.", 49, ")", 55)],
    ]

    # Both the regex and the plain-string flavour of the patterns are run.
    pattern_variants = [
        (True, r"\(", r"\)"),
        (False,
         self.search_in_between_begin_pattern,
         self.search_in_between_end_pattern),
    ]

    # NOTE(review): nested_search_in_between is not visible here; the fifth
    # positional argument (True) is presumably remove_empty_matches, as in
    # search_in_between -- confirm against its signature.
    test_cases = {}
    for test_string, result in zip(self.search_in_between_test_strings,
                                   expected_results):
        for use_regex, begin_pattern, end_pattern in pattern_variants:
            arguments = (begin_pattern, end_pattern, test_string,
                         0, True, use_regex)
            test_cases[arguments] = [InBetweenMatch.from_values(*args)
                                     for args in result]

    self.assertResultsEqual(nested_search_in_between, test_cases, list)
def unescaped_search_in_between(begin,
                                end,
                                string,
                                max_matches=0,
                                remove_empty_matches=False,
                                use_regex=False):
    """
    Finds the text enclosed between an unescaped begin- and an unescaped
    end-sequence.

    The enclosed text may span several lines, because the search pattern is
    matched with ``re.DOTALL``. Begin- and end-sequences that are escaped
    with a backslash are not treated as delimiters.

    .. warning::

        Using the backslash escape character in the begin- or end-sequences
        the function can return strange results. The backslash can interfere
        with the escaping regex-sequence used internally to match the
        enclosed string.

    :param begin:                A regex pattern that defines where to start
                                 matching.
    :param end:                  A regex pattern that defines where to end
                                 matching.
    :param string:               The string where to search in.
    :param max_matches:          Defines the maximum number of matches. If 0
                                 or less is provided, the number of matches
                                 is not limited.
    :param remove_empty_matches: Defines whether empty entries should be
                                 removed from the result. An entry is
                                 considered empty if no inner match was
                                 performed (regardless of matched start and
                                 end patterns).
    :param use_regex:            Specifies whether to treat the begin and end
                                 patterns as regexes or simple strings.
    :return:                     An iterator returning InBetweenMatch objects
                                 built from the matched begin, inside and end
                                 strings and their start positions.
    """
    if use_regex:
        # The begin pattern may bring capturing groups of its own; they shift
        # the indices of the groups that follow it in the final regex.
        begin_pattern_groups = re.compile(begin).groups
    else:
        # Escaped patterns cannot contain capturing groups.
        begin, end = re.escape(begin), re.escape(end)
        begin_pattern_groups = 0

    # Group 1 wraps the begin sequence. After the groups of 'begin' follow
    # the lazily matched inside, the captured backslash pairs that precede
    # the end sequence, and the end sequence itself.
    inside_group = begin_pattern_groups + 2
    escapes_group = begin_pattern_groups + 3
    end_group = begin_pattern_groups + 4

    # Regex pieces:
    # - (?<!\\)(?:\\\\)*   Only continue if no single backslash precedes;
    #                      pairs of backslashes are consumed, so the
    #                      following sequence is not escaped.
    # - (begin)            The begin sequence.
    # - (.*?)              The inside, as short as possible.
    # - (?<!\\)((?:\\\\)*) The unescaping part again, but this time the
    #                      backslash pairs are captured, as they belong to
    #                      the inside.
    # - (end)              The end sequence. As the inside is lazy, the next
    #                      occurring unescaped end sequence is taken.
    regex = "".join((r"(?<!\\)(?:\\\\)*(",
                     begin,
                     r")(.*?)(?<!\\)((?:\\\\)*)(",
                     end,
                     ")"))

    found = re.finditer(regex, string, re.DOTALL)
    if remove_empty_matches:
        found = trim_empty_matches(found, (inside_group, escapes_group))

    for m in limit(found, max_matches):
        # The captured backslash pairs are appended to the inside string.
        yield InBetweenMatch.from_values(
            m.group(1),
            m.start(1),
            m.group(inside_group) + m.group(escapes_group),
            m.start(inside_group),
            m.group(end_group),
            m.start(end_group))
def unescaped_search_in_between(begin,
                                end,
                                string,
                                max_matches=0,
                                remove_empty_matches=False,
                                use_regex=False):
    """
    Finds the text enclosed between an unescaped begin- and an unescaped
    end-sequence.

    The enclosed text may span several lines, because the search pattern is
    matched with ``re.DOTALL``. Begin- and end-sequences that are escaped
    with a backslash are not treated as delimiters.

    .. warning::

        Using the backslash escape character in the begin- or end-sequences
        the function can return strange results. The backslash can interfere
        with the escaping regex-sequence used internally to match the
        enclosed string.

    :param begin:                A regex pattern that defines where to start
                                 matching.
    :param end:                  A regex pattern that defines where to end
                                 matching.
    :param string:               The string where to search in.
    :param max_matches:          Defines the maximum number of matches. If 0
                                 or less is provided, the number of matches
                                 is not limited.
    :param remove_empty_matches: Defines whether empty entries should be
                                 removed from the result. An entry is
                                 considered empty if no inner match was
                                 performed (regardless of matched start and
                                 end patterns).
    :param use_regex:            Specifies whether to treat the begin and end
                                 patterns as regexes or simple strings.
    :return:                     An iterator returning InBetweenMatch objects
                                 built from the matched begin, inside and end
                                 strings and their start positions.
    """
    if use_regex:
        # The begin pattern may bring capturing groups of its own; they shift
        # the indices of the groups that follow it in the final regex.
        begin_pattern_groups = re.compile(begin).groups
    else:
        # Escaped patterns cannot contain capturing groups.
        begin, end = re.escape(begin), re.escape(end)
        begin_pattern_groups = 0

    # Group 1 wraps the begin sequence. After the groups of 'begin' follow
    # the lazily matched inside, the captured backslash pairs that precede
    # the end sequence, and the end sequence itself.
    inside_group = begin_pattern_groups + 2
    escapes_group = begin_pattern_groups + 3
    end_group = begin_pattern_groups + 4

    # Regex pieces:
    # - (?<!\\)(?:\\\\)*   Only continue if no single backslash precedes;
    #                      pairs of backslashes are consumed, so the
    #                      following sequence is not escaped.
    # - (begin)            The begin sequence.
    # - (.*?)              The inside, as short as possible.
    # - (?<!\\)((?:\\\\)*) The unescaping part again, but this time the
    #                      backslash pairs are captured, as they belong to
    #                      the inside.
    # - (end)              The end sequence. As the inside is lazy, the next
    #                      occurring unescaped end sequence is taken.
    regex = "".join((r"(?<!\\)(?:\\\\)*(",
                     begin,
                     r")(.*?)(?<!\\)((?:\\\\)*)(",
                     end,
                     ")"))

    found = re.finditer(regex, string, re.DOTALL)
    if remove_empty_matches:
        found = trim_empty_matches(found, (inside_group, escapes_group))

    for m in limit(found, max_matches):
        # The captured backslash pairs are appended to the inside string.
        yield InBetweenMatch.from_values(
            m.group(1),
            m.start(1),
            m.group(inside_group) + m.group(escapes_group),
            m.start(inside_group),
            m.group(end_group),
            m.start(end_group))
def search_in_between(begin,
                      end,
                      string,
                      max_matches=0,
                      remove_empty_matches=False,
                      use_regex=False):
    """
    Finds the text enclosed between a begin- and an end-sequence.

    The enclosed text may span several lines, because the search pattern is
    matched with ``re.DOTALL``. Escape sequences get no special treatment.

    :param begin:                A pattern that defines where to start
                                 matching.
    :param end:                  A pattern that defines where to end
                                 matching.
    :param string:               The string where to search in.
    :param max_matches:          Defines the maximum number of matches. If 0
                                 or less is provided, the number of matches
                                 is not limited.
    :param remove_empty_matches: Defines whether empty entries should be
                                 removed from the result. An entry is
                                 considered empty if no inner match was
                                 performed (regardless of matched start and
                                 end patterns).
    :param use_regex:            Specifies whether to treat the begin and end
                                 patterns as regexes or simple strings.
    :return:                     An iterator returning InBetweenMatch objects
                                 that hold information about the matched
                                 begin, inside and end string.
    """
    if use_regex:
        # The begin pattern may bring capturing groups of its own; they shift
        # the indices of the groups that follow it in the final regex.
        begin_pattern_groups = re.compile(begin).groups
    else:
        # Escaped patterns cannot contain capturing groups.
        begin, end = re.escape(begin), re.escape(end)
        begin_pattern_groups = 0

    # Group 1 wraps the begin sequence, the next free group holds the lazily
    # matched inside and the one after that wraps the end sequence.
    inside_group = begin_pattern_groups + 2
    end_group = begin_pattern_groups + 3

    # Because the inside group is lazy, the next occurring end sequence
    # terminates each match.
    regex = "".join(("(", begin, ")(.*?)(", end, ")"))

    found = re.finditer(regex, string, re.DOTALL)
    if remove_empty_matches:
        found = trim_empty_matches(found, (inside_group,))

    for m in limit(found, max_matches):
        yield InBetweenMatch.from_values(m.group(1),
                                         m.start(1),
                                         m.group(inside_group),
                                         m.start(inside_group),
                                         m.group(end_group),
                                         m.start(end_group))