def test_extended(self):
        """
        Runs ``search_in_between`` over the parenthesis test strings, once
        with regex patterns and once with the plain-string patterns stored on
        the test case.

        Every expected tuple is unpacked into ``InBetweenMatch.from_values``,
        i.e. it reads (begin, begin position, inside, inside position, end,
        end position).
        """
        expected_results = [
            [("(", 0, "", 1, ")", 1),
             ("(", 6, "This is a word", 7, ")", 21),
             ("(", 25, "(in a word", 26, ")", 36)],
            [("(", 4, "((((((((((((((((((1", 5, ")", 24)],
            [("(", 6, "do (it ", 7, ")", 14),
             ("(", 41, "", 42, ")", 42),
             ("(", 44, "hello.", 45, ")", 51)],
            [("(", 0, "", 1, ")", 1),
             ("(", 8, r"This\ is a word" + self.bs, 9, ")",
              25),
             ("(", 29, r"(in a\\\ word" + 5 * self.bs, 30, ")",
              48)],
            [("(", 5, r"\(\((((((\\\(((((((((((1", 6, ")", 30)
             ],
            [("(", 7, "do (it ", 8, ")", 15),
             ("(", 45, "", 46, ")", 46),
             ("(", 48, "hello.", 49, ")", 55)]]

        # (use_regex, begin pattern, end pattern): the regex variant needs the
        # parentheses escaped, the plain variant takes the fixture patterns.
        pattern_variants = [
            (True, r"\(", r"\)"),
            (False, self.search_in_between_begin_pattern,
             self.search_in_between_end_pattern)]

        # Maps the positional arguments of each call (begin, end, string,
        # max_matches, remove_empty_matches, use_regex) to its expected
        # matches.
        test_cases = {}
        for test_string, result in zip(
                self.search_in_between_test_strings, expected_results):
            for use_regex, begin_pattern, end_pattern in pattern_variants:
                call_args = (begin_pattern, end_pattern, test_string, 0,
                             False, use_regex)
                test_cases[call_args] = [
                    InBetweenMatch.from_values(*args) for args in result]

        self.assertResultsEqual(search_in_between, test_cases, list)
    def test_regex_pattern(self):
        """
        Runs ``search_in_between`` in regex mode, using each entry of
        ``self.multi_patterns`` as both the begin and the end pattern on
        ``self.multi_pattern_test_string``.

        Every expected tuple is unpacked into ``InBetweenMatch.from_values``,
        i.e. it reads (begin, begin position, inside, inside position, end,
        end position).
        """
        expected_results = [
            [("abc", 0, "", 3, "abc", 3)],
            [("ab", 0, "c", 2, "ab", 3)],
            [("ab", 0, "c", 2, "ab", 3),
             ("ab", 21, r"bc\+'**'", 23, "ac", 31)],
            [(self.bs, 12, "", 13, self.bs, 13)],
            [("###", 9, r"\\13q4ujsabbc\+'**'ac", 12, "###",
              33), ("#", 37, ".", 38, "####", 39)],
            [("a", 0, "", 1, "b", 1), ("a", 3, "", 4, "b", 4),
             ("b", 7, "", 8, "a", 8),
             ("##", 9, "", 11, "#\\", 11),
             ("a", 21, "", 22, "b", 22),
             ("b", 23, r"c\+'**'", 24, "a", 31),
             ("##", 33, "", 35, "#.", 35),
             ("#.", 37, "", 39, "##", 39),
             ("##", 41, "-", 43, "b", 44)],
            [("abcabc", 0, r"cba###\\13q4ujs", 6, "abbc", 21)],
            [("1", 14, "3q4ujsabbc" + self.bs, 15, "+", 26)]]

        # One call per pattern; the key holds the positional arguments
        # (begin, end, string, max_matches, remove_empty_matches, use_regex).
        test_cases = {}
        for pattern, result in zip(self.multi_patterns, expected_results):
            call_args = (pattern, pattern, self.multi_pattern_test_string, 0,
                         False, True)
            test_cases[call_args] = [
                InBetweenMatch.from_values(*args) for args in result]

        self.assertResultsEqual(search_in_between, test_cases, list)
    def test_auto_trim(self):
        """
        Runs ``unescaped_search_in_between`` with ``remove_empty_matches``
        enabled on the auto-trim test strings, both in regex and in
        plain-string mode.

        Every expected tuple is unpacked into ``InBetweenMatch.from_values``,
        i.e. it reads (begin, begin position, inside, inside position, end,
        end position).
        """
        expected_results = [
            [],
            [(";", 2, r"\\\\\;\\#", 3, ";", 12),
             (";", 25, "+ios", 26, ";", 30)],
            [(";", 1, "2", 2, ";", 3),
             (";", 5, "4", 6, ";", 7),
             (";", 9, "6", 10, ";", 11)],
            [(";", 1, "2", 2, ";", 3),
             (";", 5, "4", 6, ";", 7),
             (";", 9, "6", 10, ";", 11)],
            [],
            [],
            [],
            [],
            [(";", 3, "a", 4, ";", 5)]]

        # The key holds the positional arguments (begin, end, string,
        # max_matches, remove_empty_matches, use_regex) of one call.
        test_cases = {}
        for test_string, result in zip(self.auto_trim_test_strings,
                                       expected_results):
            for use_regex in [True, False]:
                call_args = (self.auto_trim_test_pattern,
                             self.auto_trim_test_pattern,
                             test_string,
                             0,
                             True,
                             use_regex)
                test_cases[call_args] = [
                    InBetweenMatch.from_values(*args) for args in result]

        self.assertResultsEqual(unescaped_search_in_between,
                                test_cases,
                                list)
    def test_from_values(self):
        """
        Checks that ``InBetweenMatch.from_values`` exposes the given strings
        and positions through the ``begin``, ``inside`` and ``end`` members.
        """
        uut = InBetweenMatch.from_values(
            "hello", 47, "world", 77, "rises", 90)

        # (member name, expected string, expected position)
        expectations = (("begin", "hello", 47),
                        ("inside", "world", 77),
                        ("end", "rises", 90))

        for member, text, position in expectations:
            self.assertEqual(str(getattr(uut, member)), text)
            self.assertEqual(getattr(uut, member).position, position)
    def test_properties(self):
        """
        Checks that an ``InBetweenMatch`` built directly from three ``Match``
        objects exposes them as ``begin``, ``inside`` and ``end``.
        """
        uut = InBetweenMatch(
            Match("ABC", 0), Match("DEF", 3), Match("GHI", 6))

        # (member name, expected string, expected position)
        expectations = (("begin", "ABC", 0),
                        ("inside", "DEF", 3),
                        ("end", "GHI", 6))

        for member, text, position in expectations:
            self.assertEqual(str(getattr(uut, member)), text)
            self.assertEqual(getattr(uut, member).position, position)
    def test_basic(self):
        """
        Runs ``search_in_between`` on every basic test string with the basic
        pattern as begin and end, in regex and in plain-string mode, and
        compares against ``self.test_basic_expected_results``.
        """
        test_cases = {}
        for test_string, result in zip(self.test_strings,
                                       self.test_basic_expected_results):
            for use_regex in [True, False]:
                # Positional arguments: begin, end, string, max_matches,
                # remove_empty_matches, use_regex.
                call_args = (self.test_basic_pattern, self.test_basic_pattern,
                             test_string, 0, False, use_regex)
                test_cases[call_args] = [
                    InBetweenMatch.from_values(*args) for args in result]

        self.assertResultsEqual(search_in_between, test_cases, list)
    def test_max_match(self):
        """
        Runs ``search_in_between`` with several ``max_matches`` limits and
        expects the basic expected results cut down to that many entries.
        """
        pattern = self.test_basic_pattern
        full_results = self.test_basic_expected_results

        test_cases = {}
        for max_match in [1, 2, 3, 4, 5, 100]:
            # Only the first ``max_match`` matches of each string are expected.
            truncated_results = [elem[0:max_match] for elem in full_results]
            for test_string, result in zip(self.test_strings,
                                           truncated_results):
                for use_regex in [True, False]:
                    call_args = (pattern, pattern, test_string, max_match,
                                 False, use_regex)
                    test_cases[call_args] = [
                        InBetweenMatch.from_values(*args) for args in result]

        self.assertResultsEqual(search_in_between, test_cases, list)
Exemple #8
0
def _nested_search_in_between(begin, end, string):
    """
    Searches for a string enclosed between a specified begin- and end-sequence.
    Matches infinite times.

    This is a function specifically designed to be invoked from
    ``nested_search_in_between()``.

    :param begin:  A regex pattern that defines where to start matching.
    :param end:    A regex pattern that defines where to end matching.
    :param string: The string where to search in.
    :return:       An iterator returning the matched strings.
    """
    # Regex explanation:
    # 1. (begin) A capturing group that matches the begin sequence.
    # 2. (end)   A capturing group that matches the end sequence. Because the
    #            1st group is lazy (matches as few times as possible) the next
    #            occurring end-sequence is matched.
    # The '|' in the regex matches either the first or the second part.
    regex = "(" + begin + ")|(" + end + ")"

    left_match = None
    nesting_level = 0
    for match in re.finditer(regex, string, re.DOTALL):
        if match.group(1) is not None:
            if nesting_level == 0:
                # Store the match of the first nesting level to be able to
                # return the string until the next fitting end sequence.
                left_match = match
            nesting_level += 1
        else:
            # The second group matched. This is the only alternative if group 1
            # didn't, otherwise no match would be performed. No need to compile
            # the begin and end sequences to get the number of capturing groups
            # in them.
            if nesting_level > 0:
                nesting_level -= 1

            if nesting_level == 0 and left_match != None:
                yield InBetweenMatch.from_values(
                    left_match.group(), left_match.start(),
                    string[left_match.end():match.start()], left_match.end(),
                    match.group(), match.start())

                left_match = None
    def test_disabled_regex(self):
        """
        Runs ``unescaped_search_in_between`` with ``use_regex`` disabled and
        a pattern made of regex metacharacters, expecting no match for any
        test string.
        """
        search_pattern = r"'()?"
        # One empty expectation per test string.
        # NOTE(review): presumably none of the test strings contains the
        # literal text of ``search_pattern`` - confirm against the fixtures.
        expected_results = [[] for _ in self.test_strings]

        self.assertResultsEqual(
            unescaped_search_in_between,
            {(search_pattern,
              search_pattern,
              test_string,
              0,
              # For remove_empty_matches both works, True and False.
              remove_empty_matches,
              False): [InBetweenMatch.from_values(*args)
                       for args in result]
             for test_string, result in zip(self.test_strings,
                                            expected_results)
             for remove_empty_matches in [True, False]},
            list)
Exemple #10
0
def unescaped_search_in_between(begin,
                                end,
                                string,
                                max_matches=0,
                                remove_empty_matches=False,
                                use_regex=False):
    """
    Searches for a string enclosed between a specified begin- and
    end-sequence, ignoring begin- and end-sequences that are escaped with a
    backslash. Newlines inside the enclosed text are matched as well.

    .. warning::

        Using the escape character (a backslash) in the begin- or
        end-sequences can make the function return strange results, because
        it can interfere with the escaping regex-sequence used internally to
        match the enclosed string.

    :param begin:                A pattern that defines where to start
                                 matching.
    :param end:                  A pattern that defines where to end
                                 matching.
    :param string:               The string where to search in.
    :param max_matches:          Defines the maximum number of matches. If 0 or
                                 less is provided, the number of matches is not
                                 limited.
    :param remove_empty_matches: Defines whether empty entries should
                                 be removed from the result. An entry is
                                 considered empty if no inner match was
                                 performed (regardless of matched start and
                                 end patterns).
    :param use_regex:            Specifies whether to treat the begin and end
                                 patterns as regexes or simple strings.
    :return:                     An iterator returning InBetweenMatch objects
                                 that hold information about the matched begin,
                                 inside and end string matched.
    """
    if use_regex:
        # The begin pattern may bring capturing groups of its own, which
        # shift the indices of all groups that follow it.
        nested_groups = re.compile(begin).groups
    else:
        begin = re.escape(begin)
        end = re.escape(end)
        # Escaped patterns cannot contain capturing groups.
        nested_groups = 0

    # Group 1 always is the begin sequence. The remaining groups come after
    # the groups contained in 'begin'.
    inside_group = nested_groups + 2
    escapes_group = nested_groups + 3
    end_group = nested_groups + 4

    # Pieces of the final regex:
    # - (?<!\\)(?:\\\\)*    Only an even number of backslashes (including
    #                       none) may precede the begin sequence, so the
    #                       begin sequence itself is not escaped.
    # - (begin)             Captures the begin sequence.
    # - (.*?)               Lazily captures the enclosed text.
    # - (?<!\\)((?:\\\\)*)  The same unescape check in front of the end
    #                       sequence, but this time the backslashes get
    #                       captured since they belong to the enclosed text.
    # - (end)               Captures the end sequence. As the enclosed text
    #                       is matched lazily, this is the next unescaped
    #                       occurrence.
    regex = "".join((r"(?<!\\)(?:\\\\)*(",
                     begin,
                     r")(.*?)(?<!\\)((?:\\\\)*)(",
                     end,
                     ")"))

    found = re.finditer(regex, string, re.DOTALL)

    if remove_empty_matches:
        found = trim_empty_matches(found, (inside_group, escapes_group))

    for hit in limit(found, max_matches):
        # The captured trailing backslashes are part of the enclosed text.
        inside = hit.group(inside_group) + hit.group(escapes_group)
        yield InBetweenMatch.from_values(
            hit.group(1), hit.start(1),
            inside, hit.start(inside_group),
            hit.group(end_group), hit.start(end_group))
Exemple #11
0
def search_in_between(begin,
                      end,
                      string,
                      max_matches=0,
                      remove_empty_matches=False,
                      use_regex=False):
    """
    Searches for a string enclosed between a specified begin- and
    end-sequence. Newlines inside the enclosed text are matched as well.
    Escape sequences are not handled.

    :param begin:                A pattern that defines where to start
                                 matching.
    :param end:                  A pattern that defines where to end matching.
    :param string:               The string where to search in.
    :param max_matches:          Defines the maximum number of matches. If 0 or
                                 less is provided, the number of matches is not
                                 limited.
    :param remove_empty_matches: Defines whether empty entries should
                                 be removed from the result. An entry is
                                 considered empty if no inner match was
                                 performed (regardless of matched start and
                                 end patterns).
    :param use_regex:            Specifies whether to treat the begin and end
                                 patterns as regexes or simple strings.
    :return:                     An iterator returning InBetweenMatch objects
                                 that hold information about the matched begin,
                                 inside and end string matched.
    """
    if use_regex:
        # The begin pattern may bring capturing groups of its own, which
        # shift the indices of all groups that follow it.
        nested_groups = re.compile(begin).groups
    else:
        begin = re.escape(begin)
        end = re.escape(end)
        # Escaped patterns cannot contain capturing groups.
        nested_groups = 0

    # Group 1 always is the begin sequence. The remaining groups come after
    # the groups contained in 'begin'.
    inside_group = nested_groups + 2
    end_group = nested_groups + 3

    # Pieces of the final regex:
    # - (begin) Captures the begin sequence.
    # - (.*?)   Lazily captures the enclosed text.
    # - (end)   Captures the end sequence. As the enclosed text is matched
    #           lazily, this is the next occurring end sequence.
    regex = "".join(("(", begin, ")(.*?)(", end, ")"))

    found = re.finditer(regex, string, re.DOTALL)

    if remove_empty_matches:
        found = trim_empty_matches(found, (inside_group, ))

    for hit in limit(found, max_matches):
        yield InBetweenMatch.from_values(
            hit.group(1), hit.start(1),
            hit.group(inside_group), hit.start(inside_group),
            hit.group(end_group), hit.start(end_group))