Ejemplo n.º 1
0
def search_in_between(begin, end, string, max_matches=0, remove_empty_matches=False, use_regex=False):
    """
    Yield each section of ``string`` enclosed by a begin- and end-sequence.

    The enclosed part is matched lazily with ``re.DOTALL``, so it may span
    line breaks and always stops at the first end-sequence that follows.
    Escape sequences are not handled.

    :param begin:                A pattern that defines where to start
                                 matching.
    :param end:                  A pattern that defines where to end matching.
    :param string:               The string to search in.
    :param max_matches:          Maximum number of matches, forwarded to
                                 ``limit``. If 0 or less is provided, the
                                 number of matches is not limited.
    :param remove_empty_matches: If true, the matches are passed through
                                 ``trim_empty_matches`` for the inner group,
                                 so entries without an inner match are
                                 dropped (regardless of the matched begin and
                                 end patterns).
    :param use_regex:            Whether ``begin`` and ``end`` are regexes.
                                 If false they are escaped and matched
                                 literally.
    :return:                     An iterator of InBetweenMatch objects built
                                 from the matched begin, inside and end
                                 strings together with their start positions.
    """
    if use_regex:
        # A user-supplied begin regex may contain capturing groups of its
        # own; each of them shifts the index of the groups that follow.
        extra_groups = re.compile(begin).groups
    else:
        # Escaped literals cannot contain capturing groups.
        begin = re.escape(begin)
        end = re.escape(end)
        extra_groups = 0

    inside_group = extra_groups + 2
    end_group = extra_groups + 3

    # Layout: (begin)(.*?)(end) -- the lazy middle group makes the match stop
    # at the nearest end-sequence.
    pattern = "".join(("(", begin, ")(.*?)(", end, ")"))

    found = re.finditer(pattern, string, re.DOTALL)
    if remove_empty_matches:
        found = trim_empty_matches(found, (inside_group,))

    for hit in limit(found, max_matches):
        yield InBetweenMatch.from_values(
            hit.group(1),
            hit.start(1),
            hit.group(inside_group),
            hit.start(inside_group),
            hit.group(end_group),
            hit.start(end_group),
        )
Ejemplo n.º 2
0
def search_for(pattern, string, flags=0, max_match=0, use_regex=False):
    """
    Find occurrences of a pattern in a string.

    :param pattern:   A pattern that defines what to match.
    :param string:    The string to search in.
    :param flags:     Additional flags passed on to ``re.finditer``.
    :param max_match: Maximum number of matches, forwarded to ``limit``. If 0
                      or less is provided, the number of matches is not
                      limited.
    :param use_regex: Whether ``pattern`` is a regex. If false it is escaped
                      and matched literally.
    :return:          The iterator produced by ``limit`` over the regex match
                      objects.
    """
    regex = pattern if use_regex else re.escape(pattern)
    return limit(re.finditer(regex, string, flags), max_match)
Ejemplo n.º 3
0
def search_for(pattern, string, flags=0, max_match=0, use_regex=False):
    """
    Search ``string`` for every occurrence of ``pattern``.

    :param pattern:   A pattern that defines what to match.
    :param string:    The string to search in.
    :param flags:     Additional flags to hand to the regex processor.
    :param max_match: Upper bound on the number of matches, applied through
                      ``limit``. If 0 or less is provided, the number of
                      matches is not limited.
    :param use_regex: Specifies whether to treat the pattern as a regex or
                      as a simple string (which is escaped first).
    :return:          An iterator returning regex match objects.
    """
    if use_regex:
        expression = pattern
    else:
        # Neutralise regex metacharacters so the text is matched literally.
        expression = re.escape(pattern)

    occurrences = re.finditer(expression, string, flags)
    return limit(occurrences, max_match)
Ejemplo n.º 4
0
def nested_search_in_between(begin,
                             end,
                             string,
                             max_matches=0,
                             remove_empty_matches=False,
                             use_regex=False):
    """
    Searches for strings enclosed between a begin- and an end-sequence, with
    support for nesting. Enclosed line breaks become part of the result.
    Escape sequences are not handled.

    Nested sequences are ignored during the match, so only the first nesting
    level is returned. To acquire deeper levels, invoke this function again
    on the returned values.

    Using the same begin- and end-sequence won't match anything.

    :param begin:                A pattern that defines where to start
                                 matching.
    :param end:                  A pattern that defines where to end matching.
    :param string:               The string to search in.
    :param max_matches:          Maximum number of matches, forwarded to
                                 ``limit``. If 0 or less is provided, the
                                 number of matches is not limited.
    :param remove_empty_matches: If true, results whose ``inside`` attribute
                                 converts to an empty string are filtered
                                 out (regardless of the matched begin and end
                                 patterns).
    :param use_regex:            Specifies whether to treat the begin and end
                                 patterns as regexes or simple strings.
    :return:                     The iterator produced by ``limit`` over the
                                 results of ``_nested_search_in_between``.
    """

    def has_content(candidate):
        return str(candidate.inside) != ""

    if not use_regex:
        # Plain strings are escaped so they are matched literally.
        begin, end = re.escape(begin), re.escape(end)

    results = _nested_search_in_between(begin, end, string)
    if remove_empty_matches:
        results = filter(has_content, results)

    return limit(results, max_matches)
Ejemplo n.º 5
0
def nested_search_in_between(begin,
                             end,
                             string,
                             max_matches=0,
                             remove_empty_matches=False,
                             use_regex=False):
    """
    Find nesting-aware sections of ``string`` that sit between a begin- and
    an end-sequence. Line breaks inside a section are kept. Escape sequences
    are not handled.

    Only the outermost nesting level is reported; nested sequences are
    ignored while matching. Call the function again on a returned value to
    descend one more level.

    Identical begin- and end-sequences won't match anything.

    :param begin:                A pattern that defines where to start
                                 matching.
    :param end:                  A pattern that defines where to end matching.
    :param string:               The string to search in.
    :param max_matches:          Defines the maximum number of matches. If 0
                                 or less is provided, the number of matches
                                 is not limited.
    :param remove_empty_matches: Defines whether empty entries should be
                                 removed from the result. An entry counts as
                                 empty when the string form of its ``inside``
                                 attribute is empty.
    :param use_regex:            Specifies whether to treat the begin and end
                                 patterns as regexes or simple strings.
    :return:                     The iterator produced by ``limit`` over the
                                 results of ``_nested_search_in_between``.
    """
    if use_regex:
        begin_expr = begin
        end_expr = end
    else:
        begin_expr = re.escape(begin)
        end_expr = re.escape(end)

    found = _nested_search_in_between(begin_expr, end_expr, string)

    if not remove_empty_matches:
        return limit(found, max_matches)

    non_empty = filter(lambda entry: str(entry.inside) != "", found)
    return limit(non_empty, max_matches)
Ejemplo n.º 6
0
def unescaped_search_for(pattern,
                         string,
                         flags=0,
                         max_match=0,
                         use_regex=False):
    """
    Searches a string for occurrences of a pattern that are not escaped.

    All matches are obtained from ``search_for`` without a limit; those whose
    start position is reported as escaped by ``position_is_escaped`` are
    discarded, and only then is ``max_match`` applied.

    :param pattern:   A pattern that defines what to match unescaped.
    :param string:    The string to search in.
    :param flags:     Additional flags to pass to the regex processor.
    :param max_match: Defines the maximum number of matches to yield. If 0 or
                      less is provided, the number of matches is not limited.
    :param use_regex: Specifies whether to treat the pattern as a regex or
                      simple string.
    :return:          An iterator returning regex match objects.
    """
    # The inner search is unlimited (0) so that escaped hits do not count
    # towards max_match.
    candidates = search_for(pattern, string, flags, 0, use_regex)
    unescaped = filter(
        lambda hit: not position_is_escaped(string, hit.start()),
        candidates)

    for hit in limit(unescaped, max_match):
        yield hit
Ejemplo n.º 7
0
def unescaped_search_for(pattern, string, flags=0, max_match=0, use_regex=False):
    """
    Yield the matches of ``pattern`` in ``string`` that are not escaped.

    :param pattern:   A pattern that defines what to match unescaped.
    :param string:    The string to search in.
    :param flags:     Additional flags to pass to the regex processor.
    :param max_match: Maximum number of matches to yield, applied after the
                      escaped matches have been filtered out. If 0 or less is
                      provided, the number of matches is not limited.
    :param use_regex: Specifies whether to treat the pattern as a regex or
                      simple string.
    :return:          An iterator returning regex match objects.
    """

    def starts_unescaped(found):
        return not position_is_escaped(string, found.start())

    # Ask search_for for every match (limit 0); the cap is enforced below,
    # once escaped matches are gone.
    every_match = search_for(pattern, string, flags, 0, use_regex)
    kept = limit(filter(starts_unescaped, every_match), max_match)

    for found in kept:
        yield found
Ejemplo n.º 8
0
def unescaped_search_in_between(begin,
                                end,
                                string,
                                max_matches=0,
                                remove_empty_matches=False,
                                use_regex=False):
    """
    Searches for strings enclosed between an unescaped begin- and an
    unescaped end-sequence. Enclosed line breaks become part of the result.

    A sequence counts as escaped when a single backslash precedes it; pairs
    of backslashes do not escape. Backslash pairs found directly in front of
    the end-sequence are appended to the enclosed text.

    .. warning::

        Using the backslash escape character inside the begin- or
        end-sequence can produce strange results, because it can interfere
        with the escape-handling regex used internally.

    :param begin:                A pattern that defines where to start
                                 matching.
    :param end:                  A pattern that defines where to end
                                 matching.
    :param string:               The string to search in.
    :param max_matches:          Defines the maximum number of matches. If 0
                                 or less is provided, the number of matches
                                 is not limited.
    :param remove_empty_matches: If true, the matches are passed through
                                 ``trim_empty_matches`` for the inner group
                                 and the trailing-backslash group, so entries
                                 without an inner match are dropped
                                 (regardless of the matched begin and end
                                 patterns).
    :param use_regex:            Specifies whether to treat the begin and end
                                 patterns as regexes or simple strings.
    :return:                     An iterator of InBetweenMatch objects built
                                 from the matched begin, inside and end
                                 strings together with their start positions.
    """
    if use_regex:
        # Capturing groups inside the begin regex shift the index of every
        # group that comes after it.
        shift = re.compile(begin).groups
    else:
        # Escaped literals cannot contain capturing groups.
        begin = re.escape(begin)
        end = re.escape(end)
        shift = 0

    inside_idx = shift + 2
    slashes_idx = shift + 3
    end_idx = shift + 4

    # Pieces of the regex, in order:
    #   (?<!\\)(?:\\\\)*    no single backslash in front, any number of
    #                       backslash pairs: what follows is unescaped.
    #   (begin)             the begin sequence.
    #   (.*?)               lazily matched content.
    #   (?<!\\)((?:\\\\)*)  the same unescape check, but the backslash pairs
    #                       are captured this time.
    #   (end)               the nearest end sequence (the content group is
    #                       lazy).
    regex = "".join((r"(?<!\\)(?:\\\\)*(", begin,
                     r")(.*?)(?<!\\)((?:\\\\)*)(", end, ")"))

    found = re.finditer(regex, string, re.DOTALL)
    if remove_empty_matches:
        found = trim_empty_matches(found, (inside_idx, slashes_idx))

    for hit in limit(found, max_matches):
        # The captured backslash pairs belong to the enclosed text.
        enclosed = hit.group(inside_idx) + hit.group(slashes_idx)
        yield InBetweenMatch.from_values(hit.group(1),
                                         hit.start(1),
                                         enclosed,
                                         hit.start(inside_idx),
                                         hit.group(end_idx),
                                         hit.start(end_idx))
Ejemplo n.º 9
0
def search_in_between(begin,
                      end,
                      string,
                      max_matches=0,
                      remove_empty_matches=False,
                      use_regex=False):
    """
    Searches for strings enclosed between a begin- and an end-sequence.
    Enclosed line breaks become part of the result. Escape sequences are not
    handled.

    :param begin:                A pattern that defines where to start
                                 matching.
    :param end:                  A pattern that defines where to end matching.
    :param string:               The string to search in.
    :param max_matches:          Defines the maximum number of matches. If 0
                                 or less is provided, the number of matches
                                 is not limited.
    :param remove_empty_matches: Defines whether empty entries should be
                                 removed from the result (done by
                                 ``trim_empty_matches`` on the inner group).
                                 An entry is considered empty if no inner
                                 match was performed, regardless of the
                                 matched begin and end patterns.
    :param use_regex:            Specifies whether to treat the begin and end
                                 patterns as regexes or simple strings.
    :return:                     An iterator returning InBetweenMatch objects
                                 that hold the matched begin, inside and end
                                 strings and their start positions.
    """
    # Number of capturing groups contributed by the begin pattern itself.
    # Escaped literals contribute none; a regex has to be compiled to count.
    groups_in_begin = re.compile(begin).groups if use_regex else 0
    if not use_regex:
        begin = re.escape(begin)
        end = re.escape(end)

    middle = groups_in_begin + 2
    closing = groups_in_begin + 3

    # (begin) (.*?) (end): the middle group is lazy, therefore the next
    # occurring end-sequence terminates the match.
    expression = "".join(["(", begin, ")(.*?)(", end, ")"])

    candidates = re.finditer(expression, string, re.DOTALL)
    if remove_empty_matches:
        candidates = trim_empty_matches(candidates, (middle, ))

    for found in limit(candidates, max_matches):
        yield InBetweenMatch.from_values(found.group(1),
                                         found.start(1),
                                         found.group(middle),
                                         found.start(middle),
                                         found.group(closing),
                                         found.start(closing))
Ejemplo n.º 10
0
def unescaped_search_in_between(begin, end, string, max_matches=0, remove_empty_matches=False, use_regex=False):
    """
    Yield each section of ``string`` enclosed by an unescaped begin- and an
    unescaped end-sequence. Line breaks inside a section are kept.

    A sequence directly preceded by a single backslash is treated as escaped
    and skipped; pairs of backslashes do not escape. Backslash pairs located
    right before the end-sequence are added to the enclosed text.

    .. warning::

        If the begin- or end-sequence itself contains the backslash escape
        character, results can be strange: the backslash can interfere with
        the escape-handling regex used internally.

    :param begin:                A pattern that defines where to start
                                 matching.
    :param end:                  A pattern that defines where to end
                                 matching.
    :param string:               The string to search in.
    :param max_matches:          Maximum number of matches, forwarded to
                                 ``limit``. If 0 or less is provided, the
                                 number of matches is not limited.
    :param remove_empty_matches: Defines whether empty entries should be
                                 removed from the result (done by
                                 ``trim_empty_matches`` on the inner and the
                                 trailing-backslash group). An entry is
                                 considered empty if no inner match was
                                 performed, regardless of the matched begin
                                 and end patterns.
    :param use_regex:            Whether ``begin`` and ``end`` are regexes.
                                 If false they are escaped and matched
                                 literally.
    :return:                     An iterator returning InBetweenMatch objects
                                 that hold the matched begin, inside and end
                                 strings and their start positions.
    """
    # How many capturing groups the begin pattern adds on its own. Escaped
    # literals add none; a regex must be compiled to find out.
    begin_groups = re.compile(begin).groups if use_regex else 0
    if not use_regex:
        begin = re.escape(begin)
        end = re.escape(end)

    content_group = begin_groups + 2
    backslash_group = begin_groups + 3
    end_group = begin_groups + 4

    # Regex structure:
    # - (?<!\\)(?:\\\\)*    : look-behind rejecting a single backslash,
    #                         followed by any number of backslash pairs, so
    #                         the next item is unescaped.
    # - (begin)             : the begin sequence.
    # - (.*?)               : lazy content group.
    # - (?<!\\)((?:\\\\)*)  : the unescape check again, this time capturing
    #                         the backslash pairs.
    # - (end)               : the end sequence; the lazy content group makes
    #                         this the next occurring one.
    unescape_prefix = r"(?<!\\)(?:\\\\)*("
    unescape_infix = r")(.*?)(?<!\\)((?:\\\\)*)("
    regex = unescape_prefix + begin + unescape_infix + end + ")"

    candidates = re.finditer(regex, string, re.DOTALL)
    if remove_empty_matches:
        candidates = trim_empty_matches(candidates, (content_group, backslash_group))

    for found in limit(candidates, max_matches):
        # Captured backslash pairs are part of the enclosed text.
        inner_text = found.group(content_group) + found.group(backslash_group)
        yield InBetweenMatch.from_values(
            found.group(1),
            found.start(1),
            inner_text,
            found.start(content_group),
            found.group(end_group),
            found.start(end_group),
        )
Ejemplo n.º 11
0
 def test_finite(self):
     """
     Check that a positive limit yields exactly the leading ``test_limit``
     items of ``self.sequence`` (or all of them if the slice runs past the
     end).
     """
     # NOTE(review): the comparison against a tuple presumes self.sequence
     # is itself a tuple -- confirm in the fixture setup.
     positive_limits = (1, 2, 3, 7, 8, 10, 22, 500000)
     for test_limit in positive_limits:
         produced = tuple(limit(self.sequence, test_limit))
         expected = self.sequence[0:test_limit]
         self.assertEqual(produced, expected)
Ejemplo n.º 12
0
 def test_infinite(self):
     """
     Check that a limit of zero or below leaves the sequence untruncated:
     ``limit`` must yield every item of ``self.sequence``.
     """
     # NOTE(review): the comparison against a tuple presumes self.sequence
     # is itself a tuple -- confirm in the fixture setup.
     non_positive_limits = (0, -1, -2, -6555123)
     for test_limit in non_positive_limits:
         produced = tuple(limit(self.sequence, test_limit))
         self.assertEqual(produced, self.sequence)