Example #1
0
    def test_extended(self):
        """
        Runs ``search_in_between`` over ``search_in_between_test_strings`` and
        compares the outcome with hand-written expectations. Every test string
        is checked twice: with regex patterns and with the plain-string
        patterns stored on the test case.
        """
        # One list per test string. Each tuple holds the arguments passed on
        # to ``InBetweenMatch.from_values``.
        expected_results = [
            [
                ("(", 0, "", 1, ")", 1),
                ("(", 6, "This is a word", 7, ")", 21),
                ("(", 25, "(in a word", 26, ")", 36),
            ],
            [
                ("(", 4, "((((((((((((((((((1", 5, ")", 24),
            ],
            [
                ("(", 6, "do (it ", 7, ")", 14),
                ("(", 41, "", 42, ")", 42),
                ("(", 44, "hello.", 45, ")", 51),
            ],
            [
                ("(", 0, "", 1, ")", 1),
                ("(", 8, r"This\ is a word" + self.bs, 9, ")", 25),
                ("(", 29, r"(in a\\\ word" + 5 * self.bs, 30, ")", 48),
            ],
            [
                ("(", 5, r"\(\((((((\\\(((((((((((1", 6, ")", 30),
            ],
            [
                ("(", 7, "do (it ", 8, ")", 15),
                ("(", 45, "", 46, ")", 46),
                ("(", 48, "hello.", 49, ")", 55),
            ],
        ]

        # Maps the positional call arguments to the expected list of matches.
        test_cases = {}
        for test_string, raw_matches in zip(
                self.search_in_between_test_strings, expected_results):
            for use_regex, begin_pattern, end_pattern in [
                    (True, r"\(", r"\)"),
                    (False,
                     self.search_in_between_begin_pattern,
                     self.search_in_between_end_pattern)]:
                call_args = (begin_pattern, end_pattern, test_string, 0,
                             False, use_regex)
                test_cases[call_args] = [
                    InBetweenMatch.from_values(*values)
                    for values in raw_matches]

        self.assertResultsEqual(search_in_between, test_cases, list)
Example #2
0
    def test_extended(self):
        """
        Feeds every string of ``search_in_between_test_strings`` to
        ``search_in_between`` -- once in regex mode, once in plain-string
        mode -- and checks the matches against the expectations below.
        """
        # Each inner tuple is an argument list for
        # ``InBetweenMatch.from_values``; there is one outer entry per test
        # string.
        expected_results = [
            [
                ("(", 0, "", 1, ")", 1),
                ("(", 6, "This is a word", 7, ")", 21),
                ("(", 25, "(in a word", 26, ")", 36),
            ],
            [
                ("(", 4, "((((((((((((((((((1", 5, ")", 24),
            ],
            [
                ("(", 6, "do (it ", 7, ")", 14),
                ("(", 41, "", 42, ")", 42),
                ("(", 44, "hello.", 45, ")", 51),
            ],
            [
                ("(", 0, "", 1, ")", 1),
                ("(", 8, r"This\ is a word" + self.bs, 9, ")", 25),
                ("(", 29, r"(in a\\\ word" + 5 * self.bs, 30, ")", 48),
            ],
            [
                ("(", 5, r"\(\((((((\\\(((((((((((1", 6, ")", 30),
            ],
            [
                ("(", 7, "do (it ", 8, ")", 15),
                ("(", 45, "", 46, ")", 46),
                ("(", 48, "hello.", 49, ")", 55),
            ],
        ]

        def build_matches(raw_matches):
            # Turns the raw tuples into comparable InBetweenMatch objects.
            return [InBetweenMatch.from_values(*values)
                    for values in raw_matches]

        cases = {}
        for test_string, raw_matches in zip(
                self.search_in_between_test_strings, expected_results):
            for use_regex, begin_pattern, end_pattern in [
                    (True, r"\(", r"\)"),
                    (False,
                     self.search_in_between_begin_pattern,
                     self.search_in_between_end_pattern)]:
                key = (begin_pattern, end_pattern, test_string, 0, False,
                       use_regex)
                cases[key] = build_matches(raw_matches)

        self.assertResultsEqual(search_in_between, cases, list)
Example #3
0
    def test_auto_trim(self):
        """
        Checks ``nested_search_in_between`` with ``True`` passed as the fifth
        positional argument (the empty-match removal flag), once with regex
        patterns and once with the plain-string patterns of the test case.
        """
        # One list per test string; the tuples are argument lists for
        # ``InBetweenMatch.from_values``.
        expected_results = [
            [
                ("(", 6, "This is a word", 7, ")", 21),
                ("(", 25, "(in a word) another ", 26, ")", 46),
            ],
            [
                ("(", 4, "((((((((((((((((((1)2)3))))))))))))))))", 5, ")", 44),
            ],
            [
                ("(", 6, "do (it ) more ", 7, ")", 21),
                ("(", 44, "hello.", 45, ")", 51),
            ],
            [
                ("(", 8, r"This\ is a word" + self.bs, 9, ")", 25),
                ("(", 29, r"(in a\\\ word\\\\\) another " + self.bs, 30, ")",
                 59),
            ],
            [
                ("(",
                 5,
                 r"\(\((((((\\\(((((((((((1)2)3))\\\\\)))))))))))))\)"
                 + self.bs,
                 6,
                 ")",
                 57),
            ],
            [
                ("(", 7, "do (it ) more ", 8, ")", 22),
                ("(", 48, "hello.", 49, ")", 55),
            ],
        ]

        test_cases = {}
        for test_string, raw_matches in zip(
                self.search_in_between_test_strings, expected_results):
            for use_regex, begin_pattern, end_pattern in [
                    (True, r"\(", r"\)"),
                    (False,
                     self.search_in_between_begin_pattern,
                     self.search_in_between_end_pattern)]:
                call_args = (begin_pattern, end_pattern, test_string, 0,
                             True, use_regex)
                test_cases[call_args] = [
                    InBetweenMatch.from_values(*values)
                    for values in raw_matches]

        self.assertResultsEqual(nested_search_in_between, test_cases, list)
    def test_auto_trim(self):
        """
        Checks ``unescaped_search_in_between`` with ``True`` passed as the
        fifth positional argument (the empty-match removal flag). The same
        pattern, ``auto_trim_test_pattern``, serves as begin and end sequence
        and each test string is run in regex and in plain-string mode.
        """
        # One list per entry of ``auto_trim_test_strings``; the tuples are
        # argument lists for ``InBetweenMatch.from_values``.
        expected_results = [
            [],
            [
                (";", 2, r"\\\\\;\\#", 3, ";", 12),
                (";", 25, "+ios", 26, ";", 30),
            ],
            [
                (";", 1, "2", 2, ";", 3),
                (";", 5, "4", 6, ";", 7),
                (";", 9, "6", 10, ";", 11),
            ],
            [
                (";", 1, "2", 2, ";", 3),
                (";", 5, "4", 6, ";", 7),
                (";", 9, "6", 10, ";", 11),
            ],
            [],
            [],
            [],
            [],
            [
                (";", 3, "a", 4, ";", 5),
            ],
        ]

        test_cases = {}
        for test_string, raw_matches in zip(self.auto_trim_test_strings,
                                            expected_results):
            for use_regex in [True, False]:
                call_args = (self.auto_trim_test_pattern,
                             self.auto_trim_test_pattern,
                             test_string,
                             0,
                             True,
                             use_regex)
                test_cases[call_args] = [
                    InBetweenMatch.from_values(*values)
                    for values in raw_matches]

        self.assertResultsEqual(unescaped_search_in_between,
                                test_cases,
                                list)
    def test_regex_pattern(self):
        """
        Checks ``unescaped_search_in_between`` in regex mode. Every entry of
        ``multi_patterns`` is used as begin and as end pattern on
        ``multi_pattern_test_string``.
        """
        # One list per pattern; the tuples are argument lists for
        # ``InBetweenMatch.from_values``.
        expected_results = [
            [
                ("abc", 0, "", 3, "abc", 3),
            ],
            [
                ("ab", 0, "c", 2, "ab", 3),
            ],
            [
                ("ab", 0, "c", 2, "ab", 3),
                ("ab", 21, r"bc\+'**'", 23, "ac", 31),
            ],
            [
                (self.bs, 12, r"\13q4ujsabbc", 13, self.bs, 25),
            ],
            [
                ("###", 9, r"\\13q4ujsabbc\+'**'ac", 12, "###", 33),
                ("#", 37, ".", 38, "####", 39),
            ],
            [
                ("a", 0, "", 1, "b", 1),
                ("a", 3, "", 4, "b", 4),
                ("b", 7, "", 8, "a", 8),
                ("##", 9, "", 11, "#\\", 11),
                ("a", 21, "", 22, "b", 22),
                ("b", 23, r"c\+'**'", 24, "a", 31),
                ("##", 33, "", 35, "#.", 35),
                ("#.", 37, "", 39, "##", 39),
                ("##", 41, "-", 43, "b", 44),
            ],
            [
                ("abcabc", 0, r"cba###\\13q4ujs", 6, "abbc", 21),
            ],
            [],
        ]

        test_cases = {}
        for pattern, raw_matches in zip(self.multi_patterns,
                                        expected_results):
            call_args = (pattern,
                         pattern,
                         self.multi_pattern_test_string,
                         0,
                         False,
                         True)
            test_cases[call_args] = [InBetweenMatch.from_values(*values)
                                     for values in raw_matches]

        self.assertResultsEqual(unescaped_search_in_between,
                                test_cases,
                                list)
Example #6
0
def search_in_between(begin, end, string, max_matches=0, remove_empty_matches=False, use_regex=False):
    """
    Yields every part of ``string`` that is enclosed by a begin- and an
    end-sequence. Line breaks inside the enclosed part are kept. Escape
    sequences get no special treatment.

    :param begin:                The pattern that opens a match.
    :param end:                  The pattern that closes a match.
    :param string:               The string to search through.
    :param max_matches:          The maximum number of matches to yield. A
                                 value of 0 or less means there is no limit.
    :param remove_empty_matches: Whether empty entries are dropped from the
                                 result. An entry counts as empty if nothing
                                 was matched between the begin and the end
                                 sequence (whatever those two matched).
    :param use_regex:            Whether ``begin`` and ``end`` are regexes
                                 (``True``) or plain strings (``False``).
    :return:                     An iterator over InBetweenMatch objects
                                 describing the matched begin sequence, the
                                 enclosed string and the matched end sequence.
    """
    if use_regex:
        # A regex begin pattern may bring capturing groups of its own, which
        # shift the indices of every group behind it. Compiling the pattern
        # tells how many there are.
        begin_pattern_groups = re.compile(begin).groups
    else:
        # Escaped patterns can't contain capturing groups.
        begin, end = re.escape(begin), re.escape(end)
        begin_pattern_groups = 0

    inside_group = begin_pattern_groups + 2
    end_group = begin_pattern_groups + 3

    # Layout of the combined regex:
    # (begin) Group 1, the begin sequence.
    # (.*?)   The enclosed part. It is lazy, so it stops at the first
    #         end-sequence that follows.
    # (end)   The end sequence.
    regex = "".join(("(", begin, ")(.*?)(", end, ")"))

    # DOTALL lets the enclosed part run across line breaks.
    matches = re.finditer(regex, string, re.DOTALL)
    if remove_empty_matches:
        matches = trim_empty_matches(matches, (inside_group,))

    for found in limit(matches, max_matches):
        yield InBetweenMatch.from_values(
            found.group(1),
            found.start(1),
            found.group(inside_group),
            found.start(inside_group),
            found.group(end_group),
            found.start(end_group))
Example #7
0
 def test_regex_pattern(self):
     """
     Checks ``nested_search_in_between`` in regex mode. The parenthesis
     patterns carry an additional empty non-capturing group ``(?:)`` and
     the outcome is compared with ``test_basic_expected_results``.
     """
     test_cases = {}
     for test_string, raw_matches in zip(
             self.search_in_between_test_strings,
             self.test_basic_expected_results):
         call_args = (r"(?:)\(", r"\)(?:)", test_string, 0, False, True)
         test_cases[call_args] = [InBetweenMatch.from_values(*values)
                                  for values in raw_matches]

     self.assertResultsEqual(nested_search_in_between, test_cases, list)
Example #8
0
 def test_basic(self):
     """
     Checks ``nested_search_in_between`` with the plain-string begin and
     end patterns of the test case against
     ``test_basic_expected_results``.
     """
     test_cases = {}
     for test_string, raw_matches in zip(
             self.search_in_between_test_strings,
             self.test_basic_expected_results):
         call_args = (self.search_in_between_begin_pattern,
                      self.search_in_between_end_pattern,
                      test_string,
                      0,
                      False,
                      False)
         test_cases[call_args] = [InBetweenMatch.from_values(*values)
                                  for values in raw_matches]

     self.assertResultsEqual(nested_search_in_between, test_cases, list)
Example #9
0
    def test_from_values(self):
        """
        Checks that ``InBetweenMatch.from_values`` stores each given string
        and position in the matching ``begin``/``inside``/``end`` part.
        """
        uut = InBetweenMatch.from_values("hello", 47, "world", 77, "rises", 90)

        expectations = (("begin", "hello", 47),
                        ("inside", "world", 77),
                        ("end", "rises", 90))
        for attribute, text, position in expectations:
            part = getattr(uut, attribute)
            self.assertEqual(str(part), text)
            self.assertEqual(part.position, position)
Example #10
0
    def test_from_values(self):
        """
        Builds an InBetweenMatch through ``from_values`` and verifies the
        string and the position of its begin, inside and end parts.
        """
        uut = InBetweenMatch.from_values("hello", 47, "world", 77, "rises", 90)

        for name, expected_string, expected_position in (
                ("begin", "hello", 47),
                ("inside", "world", 77),
                ("end", "rises", 90)):
            match_part = getattr(uut, name)
            self.assertEqual(str(match_part), expected_string)
            self.assertEqual(match_part.position, expected_position)
Example #11
0
    def test_properties(self):
        """
        Checks that an InBetweenMatch built directly from three Match
        objects exposes them through ``begin``, ``inside`` and ``end``.
        """
        uut = InBetweenMatch(Match("ABC", 0), Match("DEF", 3), Match("GHI", 6))

        expectations = (("begin", "ABC", 0),
                        ("inside", "DEF", 3),
                        ("end", "GHI", 6))
        for attribute, text, position in expectations:
            part = getattr(uut, attribute)
            self.assertEqual(str(part), text)
            self.assertEqual(part.position, position)
 def test_regex_pattern(self):
     """
     Runs ``nested_search_in_between`` with regex patterns that wrap the
     parentheses with an empty non-capturing group ``(?:)`` and expects the
     same matches as listed in ``test_basic_expected_results``.
     """
     begin_pattern = r"(?:)\("
     end_pattern = r"\)(?:)"

     cases = {}
     for test_string, raw_matches in zip(
             self.search_in_between_test_strings,
             self.test_basic_expected_results):
         key = (begin_pattern, end_pattern, test_string, 0, False, True)
         cases[key] = [InBetweenMatch.from_values(*values)
                       for values in raw_matches]

     self.assertResultsEqual(nested_search_in_between, cases, list)
Example #13
0
 def test_max_match(self):
     """
     Checks the match limit of ``nested_search_in_between``. For each
     limit the expected result is the corresponding prefix of
     ``test_basic_expected_results``.
     """
     test_cases = {}
     for max_match in [1, 2, 5, 22]:
         # Only the first ``max_match`` matches per string are expected.
         truncated_results = [
             elem[0:max_match]
             for elem in self.test_basic_expected_results]
         for test_string, raw_matches in zip(
                 self.search_in_between_test_strings, truncated_results):
             call_args = (self.search_in_between_begin_pattern,
                          self.search_in_between_end_pattern,
                          test_string,
                          max_match,
                          False,
                          False)
             test_cases[call_args] = [
                 InBetweenMatch.from_values(*values)
                 for values in raw_matches]

     self.assertResultsEqual(nested_search_in_between, test_cases, list)
Example #14
0
    def test_basic(self):
        """
        Checks ``unescaped_search_in_between`` with ``test_basic_pattern`` as
        begin and end sequence, in regex and in plain-string mode, against
        ``test_basic_expected_results``.
        """
        test_cases = {}
        for test_string, raw_matches in zip(
                self.test_strings, self.test_basic_expected_results):
            for use_regex in [True, False]:
                call_args = (self.test_basic_pattern,
                             self.test_basic_pattern,
                             test_string,
                             0,
                             False,
                             use_regex)
                test_cases[call_args] = [
                    InBetweenMatch.from_values(*values)
                    for values in raw_matches]

        self.assertResultsEqual(unescaped_search_in_between,
                                test_cases,
                                list)
Example #15
0
    def test_max_match(self):
        """
        Checks the match limit of ``unescaped_search_in_between``. For each
        limit the expected result is the corresponding prefix of
        ``test_basic_expected_results``; regex and plain-string mode are both
        exercised.
        """
        search_pattern = self.test_basic_pattern
        expected_master_results = self.test_basic_expected_results

        test_cases = {}
        for max_match in [1, 2, 3, 4, 5, 100]:
            # Only the first ``max_match`` matches per string are expected.
            truncated_results = [elem[0:max_match]
                                 for elem in expected_master_results]
            for test_string, raw_matches in zip(self.test_strings,
                                                truncated_results):
                for use_regex in [True, False]:
                    call_args = (search_pattern, search_pattern, test_string,
                                 max_match, False, use_regex)
                    test_cases[call_args] = [
                        InBetweenMatch.from_values(*values)
                        for values in raw_matches]

        self.assertResultsEqual(unescaped_search_in_between,
                                test_cases,
                                list)
 def test_basic(self):
     """
     Runs ``nested_search_in_between`` in plain-string mode on every entry
     of ``search_in_between_test_strings`` and compares the matches with
     ``test_basic_expected_results``.
     """
     cases = {}
     for test_string, raw_matches in zip(
             self.search_in_between_test_strings,
             self.test_basic_expected_results):
         key = (self.search_in_between_begin_pattern,
                self.search_in_between_end_pattern,
                test_string, 0, False, False)
         cases[key] = [InBetweenMatch.from_values(*values)
                       for values in raw_matches]

     self.assertResultsEqual(nested_search_in_between, cases, list)
    def test_basic(self):
        """
        Runs ``unescaped_search_in_between`` on every entry of
        ``test_strings`` using ``test_basic_pattern`` on both ends, with and
        without regex handling, and compares the matches with
        ``test_basic_expected_results``.
        """
        cases = {}
        for test_string, raw_matches in zip(
                self.test_strings, self.test_basic_expected_results):
            for use_regex in [True, False]:
                key = (self.test_basic_pattern, self.test_basic_pattern,
                       test_string, 0, False, use_regex)
                cases[key] = [InBetweenMatch.from_values(*values)
                              for values in raw_matches]

        self.assertResultsEqual(unescaped_search_in_between, cases, list)
Example #18
0
def _nested_search_in_between(begin, end, string):
    """
    Searches for a string enclosed between a specified begin- and end-sequence
    while respecting nesting. Matches infinite times.

    This is a function specifically designed to be invoked from
    ``nested_search_in_between()``.

    Begin- and end-sequences are counted like brackets: only the end-sequence
    that brings the nesting level back to zero closes a match, so inner
    begin/end pairs become part of the enclosed string. An end-sequence
    without an open begin-sequence is ignored and a begin-sequence that is
    never closed yields nothing.

    :param begin:  A regex pattern that defines where to start matching.
    :param end:    A regex pattern that defines where to end matching.
    :param string: The string where to search in.
    :return:       An iterator returning InBetweenMatch objects built from
                   the outermost begin match, the enclosed string and the
                   closing end match.
    """
    # Regex explanation:
    # 1. (begin) A capturing group that matches the begin sequence.
    # 2. (end)   A capturing group that matches the end sequence.
    # The '|' in the regex matches either the first or the second part, so
    # every match found is exactly one begin- or one end-sequence.
    regex = "(" + begin + ")|(" + end + ")"

    left_match = None
    nesting_level = 0
    for match in re.finditer(regex, string, re.DOTALL):
        if match.group(1) is not None:
            if nesting_level == 0:
                # Store the match of the first nesting level to be able to
                # return the string until the next fitting end sequence.
                left_match = match
            nesting_level += 1
        else:
            # The second group matched. This is the only alternative if group 1
            # didn't, otherwise no match would be performed. No need to compile
            # the begin and end sequences to get the number of capturing groups
            # in them.
            if nesting_level > 0:
                nesting_level -= 1

            # ``left_match`` is None for an end sequence without an open begin
            # sequence; such a stray end sequence is skipped.
            if nesting_level == 0 and left_match is not None:
                yield InBetweenMatch.from_values(
                    left_match.group(),
                    left_match.start(),
                    string[left_match.end():match.start()],
                    left_match.end(),
                    match.group(),
                    match.start(),
                )

                left_match = None
Example #19
0
def _nested_search_in_between(begin, end, string):
    """
    Searches for a string enclosed between a specified begin- and end-sequence
    while respecting nesting. Matches infinite times.

    This is a function specifically designed to be invoked from
    ``nested_search_in_between()``.

    Begin- and end-sequences are counted like brackets: only the end-sequence
    that brings the nesting level back to zero closes a match, so inner
    begin/end pairs become part of the enclosed string. An end-sequence
    without an open begin-sequence is ignored and a begin-sequence that is
    never closed yields nothing.

    :param begin:  A regex pattern that defines where to start matching.
    :param end:    A regex pattern that defines where to end matching.
    :param string: The string where to search in.
    :return:       An iterator returning InBetweenMatch objects built from
                   the outermost begin match, the enclosed string and the
                   closing end match.
    """
    # Regex explanation:
    # 1. (begin) A capturing group that matches the begin sequence.
    # 2. (end)   A capturing group that matches the end sequence.
    # The '|' in the regex matches either the first or the second part, so
    # every match found is exactly one begin- or one end-sequence.
    regex = "(" + begin + ")|(" + end + ")"

    left_match = None
    nesting_level = 0
    for match in re.finditer(regex, string, re.DOTALL):
        if match.group(1) is not None:
            if nesting_level == 0:
                # Store the match of the first nesting level to be able to
                # return the string until the next fitting end sequence.
                left_match = match
            nesting_level += 1
        else:
            # The second group matched. This is the only alternative if group 1
            # didn't, otherwise no match would be performed. No need to compile
            # the begin and end sequences to get the number of capturing groups
            # in them.
            if nesting_level > 0:
                nesting_level -= 1

            # ``left_match`` is None for an end sequence without an open begin
            # sequence; such a stray end sequence is skipped.
            if nesting_level == 0 and left_match is not None:
                yield InBetweenMatch.from_values(
                    left_match.group(),
                    left_match.start(),
                    string[left_match.end():match.start()],
                    left_match.end(),
                    match.group(),
                    match.start())

                left_match = None
 def test_max_match(self):
     """
     Runs ``nested_search_in_between`` with several match limits and
     expects, per test string, the first ``max_match`` entries of
     ``test_basic_expected_results``.
     """
     cases = {}
     for max_match in [1, 2, 5, 22]:
         limited_results = [
             elem[0:max_match]
             for elem in self.test_basic_expected_results]
         for test_string, raw_matches in zip(
                 self.search_in_between_test_strings, limited_results):
             key = (self.search_in_between_begin_pattern,
                    self.search_in_between_end_pattern,
                    test_string, max_match, False, False)
             cases[key] = [InBetweenMatch.from_values(*values)
                           for values in raw_matches]

     self.assertResultsEqual(nested_search_in_between, cases, list)
Example #21
0
    def test_disabled_regex(self):
        """
        Checks ``search_in_between`` with regex handling switched off. The
        pattern ``\\'`` is passed as begin and end sequence and an empty
        result is expected for every entry of ``test_strings``.
        """
        search_pattern = r"\'"
        # No matches are expected for any of the test strings.
        expected_results = [[] for _ in self.test_strings]

        test_cases = {}
        for test_string, raw_matches in zip(self.test_strings,
                                            expected_results):
            # The same (empty) outcome is expected for both values of the
            # empty-match removal flag, so both are exercised.
            for auto_trim in [True, False]:
                call_args = (search_pattern, search_pattern, test_string, 0,
                             auto_trim, False)
                test_cases[call_args] = [
                    InBetweenMatch.from_values(*values)
                    for values in raw_matches]

        self.assertResultsEqual(search_in_between, test_cases, list)
    def test_max_match(self):
        """
        Runs ``unescaped_search_in_between`` with several match limits, in
        regex and in plain-string mode, and expects per test string the first
        ``max_match`` entries of ``test_basic_expected_results``.
        """
        search_pattern = self.test_basic_pattern
        expected_master_results = self.test_basic_expected_results

        cases = {}
        for max_match in [1, 2, 3, 4, 5, 100]:
            limited_results = [elem[0:max_match]
                               for elem in expected_master_results]
            for test_string, raw_matches in zip(self.test_strings,
                                                limited_results):
                for use_regex in [True, False]:
                    key = (search_pattern, search_pattern, test_string,
                           max_match, False, use_regex)
                    cases[key] = [InBetweenMatch.from_values(*values)
                                  for values in raw_matches]

        self.assertResultsEqual(unescaped_search_in_between, cases, list)
    def test_auto_trim(self):
        """
        Runs ``nested_search_in_between`` with ``True`` as the fifth
        positional argument (the empty-match removal flag) on every entry of
        ``search_in_between_test_strings``, in regex and in plain-string mode.
        """
        # One list per test string; the tuples are argument lists for
        # ``InBetweenMatch.from_values``.
        expected_results = [
            [
                ("(", 6, "This is a word", 7, ")", 21),
                ("(", 25, "(in a word) another ", 26, ")", 46),
            ],
            [
                ("(", 4, "((((((((((((((((((1)2)3))))))))))))))))", 5, ")", 44),
            ],
            [
                ("(", 6, "do (it ) more ", 7, ")", 21),
                ("(", 44, "hello.", 45, ")", 51),
            ],
            [
                ("(", 8, r"This\ is a word" + self.bs, 9, ")", 25),
                ("(", 29, r"(in a\\\ word\\\\\) another " + self.bs, 30, ")",
                 59),
            ],
            [
                ("(", 5,
                 r"\(\((((((\\\(((((((((((1)2)3))\\\\\)))))))))))))\)"
                 + self.bs,
                 6, ")", 57),
            ],
            [
                ("(", 7, "do (it ) more ", 8, ")", 22),
                ("(", 48, "hello.", 49, ")", 55),
            ],
        ]

        cases = {}
        for test_string, raw_matches in zip(
                self.search_in_between_test_strings, expected_results):
            for use_regex, begin_pattern, end_pattern in [
                    (True, r"\(", r"\)"),
                    (False,
                     self.search_in_between_begin_pattern,
                     self.search_in_between_end_pattern)]:
                key = (begin_pattern, end_pattern, test_string, 0, True,
                       use_regex)
                cases[key] = [InBetweenMatch.from_values(*values)
                              for values in raw_matches]

        self.assertResultsEqual(nested_search_in_between, cases, list)
Example #24
0
def unescaped_search_in_between(begin, end, string, max_matches=0, remove_empty_matches=False, use_regex=False):
    """
    Yields every part of ``string`` that is enclosed by an unescaped begin-
    and an unescaped end-sequence. Line breaks inside the enclosed part are
    kept. A begin- or end-sequence preceded by an odd number of backslashes
    counts as escaped and is skipped.

    .. warning::

        Using the escape character '\\' in the begin- or end-sequences
        the function can return strange results. The backslash can
        interfere with the escaping regex-sequence used internally to
        match the enclosed string.

    :param begin:                The pattern that opens a match.
    :param end:                  The pattern that closes a match.
    :param string:               The string to search through.
    :param max_matches:          The maximum number of matches to yield. A
                                 value of 0 or less means there is no limit.
    :param remove_empty_matches: Whether empty entries are dropped from the
                                 result. An entry counts as empty if nothing
                                 was matched between the begin and the end
                                 sequence (whatever those two matched).
    :param use_regex:            Whether ``begin`` and ``end`` are regexes
                                 (``True``) or plain strings (``False``).
    :return:                     An iterator over InBetweenMatch objects
                                 describing the matched begin sequence, the
                                 enclosed string and the matched end sequence.
    """
    if use_regex:
        # A regex begin pattern may bring capturing groups of its own, which
        # shift the indices of every group behind it. Compiling the pattern
        # tells how many there are.
        begin_pattern_groups = re.compile(begin).groups
    else:
        # Escaped patterns can't contain capturing groups.
        begin, end = re.escape(begin), re.escape(end)
        begin_pattern_groups = 0

    inside_group = begin_pattern_groups + 2
    escapes_group = begin_pattern_groups + 3
    end_group = begin_pattern_groups + 4

    # Layout of the combined regex:
    # (?<!\\)(?:\\\\)*   Accepts only an even number of backslashes (zero
    #                    included) ahead of the begin sequence, i.e. the begin
    #                    sequence itself is not escaped.
    # (begin)            Group 1, the begin sequence.
    # (.*?)              The enclosed part. It is lazy, so it stops at the
    #                    first unescaped end-sequence that follows.
    # (?<!\\)((?:\\\\)*) The same even-backslash check for the end sequence.
    #                    Here the backslashes are captured, because they
    #                    belong to the enclosed string.
    # (end)              The end sequence.
    regex = "".join((r"(?<!\\)(?:\\\\)*(",
                     begin,
                     r")(.*?)(?<!\\)((?:\\\\)*)(",
                     end,
                     ")"))

    # DOTALL lets the enclosed part run across line breaks.
    matches = re.finditer(regex, string, re.DOTALL)
    if remove_empty_matches:
        matches = trim_empty_matches(matches, (inside_group, escapes_group))

    for found in limit(matches, max_matches):
        yield InBetweenMatch.from_values(
            found.group(1),
            found.start(1),
            # The trailing backslashes are part of the enclosed string.
            found.group(inside_group) + found.group(escapes_group),
            found.start(inside_group),
            found.group(end_group),
            found.start(end_group))
Example #25
0
def unescaped_search_in_between(begin,
                                end,
                                string,
                                max_matches=0,
                                remove_empty_matches=False,
                                use_regex=False):
    """
    Yields every part of ``string`` that is enclosed by an unescaped begin-
    and an unescaped end-sequence. Line breaks inside the enclosed part are
    kept. A begin- or end-sequence preceded by an odd number of backslashes
    counts as escaped and is skipped.

    .. warning::

        Using the escape character '\\' in the begin- or end-sequences
        the function can return strange results. The backslash can
        interfere with the escaping regex-sequence used internally to
        match the enclosed string.

    :param begin:                The pattern that opens a match.
    :param end:                  The pattern that closes a match.
    :param string:               The string to search through.
    :param max_matches:          The maximum number of matches to yield. A
                                 value of 0 or less means there is no limit.
    :param remove_empty_matches: Whether empty entries are dropped from the
                                 result. An entry counts as empty if nothing
                                 was matched between the begin and the end
                                 sequence (whatever those two matched).
    :param use_regex:            Whether ``begin`` and ``end`` are regexes
                                 (``True``) or plain strings (``False``).
    :return:                     An iterator over InBetweenMatch objects
                                 describing the matched begin sequence, the
                                 enclosed string and the matched end sequence.
    """
    if use_regex:
        # A regex begin pattern may bring capturing groups of its own, which
        # shift the indices of every group behind it. Compiling the pattern
        # tells how many there are.
        begin_pattern_groups = re.compile(begin).groups
    else:
        # Escaped patterns can't contain capturing groups.
        begin, end = re.escape(begin), re.escape(end)
        begin_pattern_groups = 0

    inside_group = begin_pattern_groups + 2
    escapes_group = begin_pattern_groups + 3
    end_group = begin_pattern_groups + 4

    # Layout of the combined regex:
    # (?<!\\)(?:\\\\)*   Accepts only an even number of backslashes (zero
    #                    included) ahead of the begin sequence, i.e. the begin
    #                    sequence itself is not escaped.
    # (begin)            Group 1, the begin sequence.
    # (.*?)              The enclosed part. It is lazy, so it stops at the
    #                    first unescaped end-sequence that follows.
    # (?<!\\)((?:\\\\)*) The same even-backslash check for the end sequence.
    #                    Here the backslashes are captured, because they
    #                    belong to the enclosed string.
    # (end)              The end sequence.
    regex = "".join((r"(?<!\\)(?:\\\\)*(",
                     begin,
                     r")(.*?)(?<!\\)((?:\\\\)*)(",
                     end,
                     ")"))

    # DOTALL lets the enclosed part run across line breaks.
    matches = re.finditer(regex, string, re.DOTALL)
    if remove_empty_matches:
        matches = trim_empty_matches(matches, (inside_group, escapes_group))

    for found in limit(matches, max_matches):
        yield InBetweenMatch.from_values(
            found.group(1),
            found.start(1),
            # The trailing backslashes are part of the enclosed string.
            found.group(inside_group) + found.group(escapes_group),
            found.start(inside_group),
            found.group(end_group),
            found.start(end_group))
Example #26
0
def search_in_between(begin,
                      end,
                      string,
                      max_matches=0,
                      remove_empty_matches=False,
                      use_regex=False):
    """
    Yields every part of ``string`` that is enclosed by a begin- and an
    end-sequence. Line breaks inside the enclosed part are kept. Escape
    sequences get no special treatment.

    :param begin:                The pattern that opens a match.
    :param end:                  The pattern that closes a match.
    :param string:               The string to search through.
    :param max_matches:          The maximum number of matches to yield. A
                                 value of 0 or less means there is no limit.
    :param remove_empty_matches: Whether empty entries are dropped from the
                                 result. An entry counts as empty if nothing
                                 was matched between the begin and the end
                                 sequence (whatever those two matched).
    :param use_regex:            Whether ``begin`` and ``end`` are regexes
                                 (``True``) or plain strings (``False``).
    :return:                     An iterator over InBetweenMatch objects
                                 describing the matched begin sequence, the
                                 enclosed string and the matched end sequence.
    """
    if use_regex:
        # A regex begin pattern may bring capturing groups of its own, which
        # shift the indices of every group behind it. Compiling the pattern
        # tells how many there are.
        begin_pattern_groups = re.compile(begin).groups
    else:
        # Escaped patterns can't contain capturing groups.
        begin, end = re.escape(begin), re.escape(end)
        begin_pattern_groups = 0

    inside_group = begin_pattern_groups + 2
    end_group = begin_pattern_groups + 3

    # Layout of the combined regex:
    # (begin) Group 1, the begin sequence.
    # (.*?)   The enclosed part. It is lazy, so it stops at the first
    #         end-sequence that follows.
    # (end)   The end sequence.
    regex = "".join(("(", begin, ")(.*?)(", end, ")"))

    # DOTALL lets the enclosed part run across line breaks.
    matches = re.finditer(regex, string, re.DOTALL)
    if remove_empty_matches:
        matches = trim_empty_matches(matches, (inside_group,))

    for found in limit(matches, max_matches):
        yield InBetweenMatch.from_values(
            found.group(1),
            found.start(1),
            found.group(inside_group),
            found.start(inside_group),
            found.group(end_group),
            found.start(end_group))