Example #1
0
    def match(self, segments: Tuple[BaseSegment, ...],
              parse_context: ParseContext) -> MatchResult:
        """Match against any of the elements a relevant number of times.

        If it matches multiple, it returns the longest, and if any are the same
        length it returns the first (unless we explicitly just match first).

        Args:
            segments: The segments to match against.
            parse_context: The parse context, handed on to the ``exclude``
                grammar (one level deeper) and to ``_match_once``.

        Returns:
            On success, a ``MatchResult`` of everything matched (including
            any non-code gaps consumed between repeats) plus the remainder.
            If the ``exclude`` grammar matches, or fewer than ``min_times``
            repeats could be matched, an unmatched result holding *all* of
            the original ``segments``.
        """
        # First if we have an *exclude* option, we should check that
        # which would prevent the rest of this grammar from matching.
        if self.exclude:
            with parse_context.deeper_match() as ctx:
                if self.exclude.match(segments, parse_context=ctx):
                    return MatchResult.from_unmatched(segments)

        # Match on each of the options
        matched_segments: MatchResult = MatchResult.from_empty()
        unmatched_segments: Tuple[BaseSegment, ...] = segments
        n_matches = 0
        while True:
            # A falsy max_times (None or 0) means "no upper limit".
            if self.max_times and n_matches >= self.max_times:
                # We've matched as many times as we can
                return MatchResult(matched_segments.matched_segments,
                                   unmatched_segments)

            # Is there anything left to match?
            if len(unmatched_segments) == 0:
                # No...
                if n_matches >= self.min_times:
                    return MatchResult(matched_segments.matched_segments,
                                       unmatched_segments)
                # We didn't meet the hurdle. Hand back the *original* input,
                # not the (empty) remainder, so no segments are dropped.
                return MatchResult.from_unmatched(segments)

            # If we've already matched once...
            if n_matches > 0 and self.allow_gaps:
                # Consume any non-code if there is any
                pre_seg, mid_seg, post_seg = trim_non_code_segments(
                    unmatched_segments)
                unmatched_segments = mid_seg + post_seg
            else:
                pre_seg = ()  # empty tuple

            match = self._match_once(unmatched_segments,
                                     parse_context=parse_context)
            if match:
                # Claim the gap before this repeat along with the repeat.
                matched_segments += pre_seg + match.matched_segments
                unmatched_segments = match.unmatched_segments
                n_matches += 1
            else:
                # If we get here, then we've not managed to match. And the next
                # unmatched segments are meaningful, i.e. they're not what we're
                # looking for.
                if n_matches >= self.min_times:
                    # Give the unclaimed leading gap back to the remainder.
                    return MatchResult(matched_segments.matched_segments,
                                       pre_seg + unmatched_segments)
                # We didn't meet the hurdle. As above, return everything we
                # were given rather than only what is left after the
                # partial matches.
                return MatchResult.from_unmatched(segments)
Example #2
0
def test__parser__helper_trim_non_code_segments(
    token_list,
    pre_len,
    mid_len,
    post_len,
    generate_test_segments,
):
    """Check that trim_non_code_segments splits into the expected three parts."""
    test_segments = generate_test_segments(token_list)
    leading, core, trailing = trim_non_code_segments(test_segments)

    # Each part must have the expected size...
    observed_lengths = tuple(len(part) for part in (leading, core, trailing))
    assert observed_lengths == (pre_len, mid_len, post_len)

    # ...and must carry the matching slice of the raw tokens.
    core_end = pre_len + mid_len
    trailing_start = len(test_segments) - post_len
    expected_by_part = (
        (leading, token_list[:pre_len]),
        (core, token_list[pre_len:core_end]),
        (trailing, token_list[trailing_start:]),
    )
    for part, expected_tokens in expected_by_part:
        assert [segment.raw for segment in part] == list(expected_tokens)
Example #3
0
    def parse(self, parse_context=None, parse_grammar=None):
        """Use the parse grammar to find subsegments within this segment.

        A large chunk of the logic around this can be found in the `expand` method.

        Use the parse setting in the context for testing, mostly to check how deep to go.
        True/False for yes or no, an integer allows a certain number of levels.

        Optionally, this method allows a custom parse grammar to be
        provided which will override any existing parse grammar
        on the segment.

        Args:
            parse_context: The parse context. NOTE(review): although this
                defaults to None, its `logger`, `matching_segment` and
                `may_recurse` attributes are used unconditionally below, so a
                real context appears to be required whenever this segment
                has child segments.
            parse_grammar: Optional grammar used instead of
                `self.parse_grammar`.

        Returns:
            This segment (`self`), with `self.segments` replaced by the
            parsed (and, if recursion is allowed, expanded) children.

        Raises:
            ValueError: If the segment may not start/end with non-code but
                its first or last child is neither code nor meta.
            TypeError: If the grammar's `match` does not return a
                `MatchResult`.
        """
        # Clear the blacklist cache to avoid missteps
        if parse_context:
            parse_context.blacklist.clear()

        # the parse_depth and recurse kwargs control how deep we will recurse for testing.
        if not self.segments:
            # This means we're a root segment, just return an unmutated self
            return self

        # Check the Parse Grammar
        parse_grammar = parse_grammar or self.parse_grammar
        if parse_grammar is None:
            # No parse grammar, go straight to expansion
            parse_context.logger.debug(
                "{0}.parse: no grammar. Going straight to expansion".format(
                    self.__class__.__name__))
        else:
            # For debugging purposes. Ensure that we don't have non-code elements
            # at the start or end of the segments. They should always be in the middle,
            # or in the parent expression.
            segments = self.segments
            if self.can_start_end_non_code:
                # Set the surrounding non-code aside; it is re-attached
                # around whatever the grammar produces below.
                pre_nc, segments, post_nc = trim_non_code_segments(segments)
            else:
                pre_nc = ()
                post_nc = ()
                if (not segments[0].is_code) and (not segments[0].is_meta):
                    raise ValueError(
                        "Segment {0} starts with non code segment: {1!r}.\n{2!r}"
                        .format(self, segments[0].raw, segments))
                if (not segments[-1].is_code) and (not segments[-1].is_meta):
                    raise ValueError(
                        "Segment {0} ends with non code segment: {1!r}.\n{2!r}"
                        .format(self, segments[-1].raw, segments))

            # NOTE: No match_depth kwarg, because this is the start of the matching.
            with parse_context.matching_segment(
                    self.__class__.__name__) as ctx:
                m = parse_grammar.match(segments=segments, parse_context=ctx)

            if not isinstance(m, MatchResult):
                raise TypeError(
                    "[PD:{0}] {1}.match. Result is {2}, not a MatchResult!".
                    format(parse_context.parse_depth, self.__class__.__name__,
                           type(m)))

            # Basic Validation, that we haven't dropped anything.
            check_still_complete(segments, m.matched_segments,
                                 m.unmatched_segments)

            if m.has_match():
                if m.is_complete():
                    # Complete match, happy days!
                    self.segments = pre_nc + m.matched_segments + post_nc
                else:
                    # Incomplete match.
                    # For now this means the parsing has failed. Let's add the unmatched bit at the
                    # end as something unparsable. Note that the trailing non-code is folded
                    # into the unparsable segment here rather than kept outside it.
                    # TODO: Do something more intelligent here.
                    self.segments = (
                        pre_nc + m.matched_segments + (UnparsableSegment(
                            segments=m.unmatched_segments + post_nc,
                            expected="Nothing...",
                        ), ))
            elif self.allow_empty and not segments:
                # Very edge case, but some segments are allowed to be empty other than non-code
                self.segments = pre_nc + post_nc
            else:
                # If there's no match at this stage, then it's unparsable. That's
                # a problem at this stage so wrap it in an unparsable segment and carry on.
                self.segments = (
                    pre_nc + (
                        UnparsableSegment(
                            segments=segments,
                            expected=self.name,
                        ),  # NB: tuple
                    ) + post_nc)

        # NOTE(review): the timer is only created here, after the matching
        # above has finished, so it presumably does not time the parse
        # itself - confirm against BenchIt.
        bencher = BenchIt()  # starts the timer
        bencher("Parse complete of {0!r}".format(self.__class__.__name__))

        # Recurse if allowed (using the expand method to deal with the expansion)
        parse_context.logger.debug(
            "{0}.parse: Done Parse. Plotting Recursion. Recurse={1!r}".format(
                self.__class__.__name__, parse_context.recurse))
        # The message (including the stringified structure) is built whether
        # or not it ends up being logged.
        parse_depth_msg = "###\n#\n# Beginning Parse Depth {0}: {1}\n#\n###\nInitial Structure:\n{2}".format(
            parse_context.parse_depth + 1, self.__class__.__name__,
            self.stringify())
        if parse_context.may_recurse():
            parse_context.logger.debug(parse_depth_msg)
            with parse_context.deeper_parse() as ctx:
                self.segments = self.expand(self.segments, parse_context=ctx)

        return self
Example #4
0
    def _longest_trimmed_match(
        cls,
        segments: Tuple["BaseSegment", ...],
        matchers: List["MatchableType"],
        parse_context: ParseContext,
        trim_noncode=True,
    ) -> Tuple[MatchResult, Optional["MatchableType"]]:
        """Return longest match from a selection of matchers.

        Prioritise the first match, and if multiple match at the same point the longest.
        If two matches of the same length match at the same time, then it's the first in
        the iterable of matchers.

        Args:
            segments: The segments to match against.
            matchers: The candidate matchers, tried in order.
            parse_context: The parse context passed to every matcher.
            trim_noncode: If true, leading and trailing non-code segments are
                trimmed before matching and re-attached to the result.

        Returns:
            `tuple` of (match_object, matcher). The first complete match is
            returned immediately; otherwise the longest partial match. When
            nothing matches, the match object is an unmatched result holding
            the full, untrimmed input and the matcher is `None`.

        """
        # Have we been passed an empty list?
        if len(segments) == 0:
            return MatchResult.from_empty(), None

        # Keep hold of the untrimmed input so that a "no match" result can
        # hand back everything we were given, non-code included.
        original_segments = segments

        # If gaps are allowed, trim the ends.
        if trim_noncode:
            pre_nc, segments, post_nc = trim_non_code_segments(segments)

        best_match = None
        best_match_length = 0
        # iterate at this position across all the matchers
        for matcher in matchers:
            # MyPy seems to require a type hint here. Not quite sure why.
            res_match: MatchResult = matcher.match(
                segments, parse_context=parse_context
            )
            if res_match.is_complete():
                # Just return it! (WITH THE RIGHT OTHER STUFF)
                if trim_noncode:
                    # Everything matched, so the trimmed non-code on both
                    # sides is claimed as part of the match.
                    return (
                        MatchResult.from_matched(
                            pre_nc + res_match.matched_segments + post_nc
                        ),
                        matcher,
                    )
                else:
                    return res_match, matcher
            elif res_match:
                # We've got an incomplete match, if it's the best so far keep it.
                # The strict ">" means ties go to the earlier matcher.
                if res_match.matched_length > best_match_length:
                    best_match = res_match, matcher
                    best_match_length = res_match.matched_length

        # If we get here, then there wasn't a complete match. If we
        # have a best_match, return that.
        if best_match is not None and best_match_length > 0:
            if trim_noncode:
                # Leading non-code joins the match; trailing non-code goes
                # back onto the remainder.
                return (
                    MatchResult(
                        pre_nc + best_match[0].matched_segments,
                        best_match[0].unmatched_segments + post_nc,
                    ),
                    best_match[1],
                )
            else:
                return best_match
        # If no match at all, return nothing matched - but return *all* of
        # the input, not just the trimmed middle.
        return MatchResult.from_unmatched(original_segments), None
Example #5
0
    def match(self, segments, parse_context):
        """Match a specific sequence of elements.

        Each element of ``self._elements`` is taken in order and matched
        against whatever segments are still unmatched:

        * Meta elements are not matched; they are instantiated and buffered
          so they can be placed around the next matched content.
        * ``Conditional`` elements which are not enabled are skipped.
        * Optional elements which fail to match are skipped.
        * A required element which fails to match fails the whole sequence.

        Args:
            segments: The segments to match against.
            parse_context: The parse context, used for conditionals and
                passed (one level deeper) to each element's ``match``.

        Returns:
            A ``MatchResult`` with the (re-positioned) matched segments and
            any leftovers, or an unmatched result holding the original
            ``segments`` if a required element could not be matched.
        """
        # NOTE(review): tuple(segments) iterates over the segment rather than
        # wrapping it in a 1-tuple - confirm that is the intent.
        if isinstance(segments, BaseSegment):
            segments = tuple(segments)

        matched_segments = MatchResult.from_empty()
        unmatched_segments = segments

        # Buffers of uninstantiated meta segments.
        meta_pre_nc = ()
        meta_post_nc = ()
        early_break = False

        for idx, elem in enumerate(self._elements):
            # Check for an early break.
            if early_break:
                break

            # NB: every path through this loop body breaks or returns, so it
            # runs at most once per element.
            while True:
                # Consume non-code if appropriate
                if self.allow_gaps:
                    pre_nc, mid_seg, post_nc = trim_non_code_segments(
                        unmatched_segments)
                else:
                    pre_nc = ()
                    mid_seg = unmatched_segments
                    post_nc = ()

                # Is it an indent or dedent?
                if elem.is_meta:
                    # Elements with a negative indent value come AFTER
                    # the whitespace. Positive or neutral come BEFORE.
                    if elem.indent_val < 0:
                        meta_post_nc += (elem(), )
                    else:
                        meta_pre_nc += (elem(), )
                    break

                # Is it a conditional? If so is it active
                if isinstance(
                        elem,
                        Conditional) and not elem.is_enabled(parse_context):
                    # If it's not active, skip it.
                    break

                if len(pre_nc + mid_seg + post_nc) == 0:
                    # We've run out of sequence without matching everything.
                    # Do only optional, meta or conditional elements remain?
                    # Each remaining element ``e`` must be judged on its own
                    # merits (not on the current ``elem``).
                    if all(e.is_optional() or e.is_meta
                           or isinstance(e, Conditional)
                           for e in self._elements[idx:]):
                        # then it's ok, and we can return what we've got so far.
                        # No need to deal with anything left over because we're at the end,
                        # unless it's a meta segment.

                        # We'll add those meta segments after any existing ones. So
                        # they go on the meta_post_nc stack.
                        for e in self._elements[idx:]:
                            # If it's meta, instantiate it.
                            if e.is_meta:
                                meta_post_nc += (e(), )
                            # If it's conditional and it's enabled, match it.
                            if isinstance(e, Conditional) and e.is_enabled(
                                    parse_context):
                                meta_match = e.match(tuple(), parse_context)
                                if meta_match:
                                    meta_post_nc += meta_match.matched_segments

                        # Early break to exit via the happy match path.
                        early_break = True
                        break
                    else:
                        # we've got to the end of the sequence without matching all
                        # required elements.
                        return MatchResult.from_unmatched(segments)
                else:
                    # We've already dealt with potential whitespace above, so carry on to matching
                    with parse_context.deeper_match() as ctx:
                        elem_match = elem.match(mid_seg, parse_context=ctx)

                    if elem_match.has_match():
                        # We're expecting mostly partial matches here, but complete
                        # matches are possible. Don't be greedy with whitespace!
                        matched_segments += (meta_pre_nc + pre_nc +
                                             meta_post_nc +
                                             elem_match.matched_segments)
                        meta_pre_nc = ()
                        meta_post_nc = ()
                        # Trailing non-code is handed back, not claimed.
                        unmatched_segments = elem_match.unmatched_segments + post_nc
                        # Each time we do this, we do a sense check to make sure we haven't
                        # dropped anything. (Because it's happened before!).
                        check_still_complete(
                            segments,
                            matched_segments.matched_segments,
                            unmatched_segments,
                        )

                        # Break out of the while loop and move to the next element.
                        break
                    else:
                        # If we can't match an element, we should ascertain whether it's
                        # required. If it's optional then fine, move on, but otherwise we
                        # should crash out without a match. We have not matched the sequence.
                        if elem.is_optional():
                            # This will crash us out of the while loop and move us
                            # onto the next matching element
                            break
                        else:
                            return MatchResult.from_unmatched(segments)

        # If we get to here, we've matched all of the elements (or skipped them)
        # but still have some segments left (or perhaps have precisely zero left).
        # In either case, we're golden. Return successfully, with any leftovers as
        # the unmatched elements. Meta all go at the end regardless of any trailing
        # whitespace.
        return MatchResult(
            BaseSegment._position_segments(
                matched_segments.matched_segments + meta_pre_nc +
                meta_post_nc, ),
            unmatched_segments,
        )
Example #6
0
    def match(self, segments: Tuple["BaseSegment", ...],
              parse_context: ParseContext) -> MatchResult:
        """Match a bracketed sequence whose content matches one of the elements.

        The steps are:

        1. Try to match the opening bracket at the start of the buffer (with
           non-code trimmed from both ends when gaps are allowed). If it is
           not there, there is no match.
        2. Use ``_bracket_sensitive_look_ahead_match`` to find the partner
           closing bracket.
        3. Match what sits between the brackets using the match method of
           Sequence. Only a complete match of the content counts; anything
           else gives an unmatched result.

        Raises:
            SQLParseError: If the opening bracket has no closing partner.
        """
        # Trim ends if allowed.
        seg_buff = segments
        if self.allow_gaps:
            _, seg_buff, _ = trim_non_code_segments(segments)

        # Rehydrate the brackets from the dialect, allowing explicit
        # overrides for special bracket-like things.
        dialect_start, dialect_end = self.get_bracket_from_dialect(
            parse_context)
        start_bracket = self.start_bracket or dialect_start
        end_bracket = self.end_bracket or dialect_end

        # Step 1: the opening bracket.
        with parse_context.deeper_match() as ctx:
            start_match = start_bracket.match(seg_buff, parse_context=ctx)
        if not start_match:
            # Can't find the opening bracket. No Match.
            return MatchResult.from_unmatched(segments)

        # Step 2: the closing bracket.
        content_segs, end_match, _ = self._bracket_sensitive_look_ahead_match(
            segments=start_match.unmatched_segments,
            matchers=[end_bracket],
            parse_context=parse_context,
            start_bracket=start_bracket,
            end_bracket=end_bracket,
            bracket_pairs_set=self.bracket_pairs_set,
        )
        if not end_match:
            raise SQLParseError(
                "Couldn't find closing bracket for opening bracket.",
                segment=start_match.matched_segments[0],
            )

        opening = start_match.matched_segments
        closing = end_match.matched_segments
        remainder = end_match.unmatched_segments

        # Step 3: the content. Start with TOTALLY EMPTY BRACKETS e.g. "()",
        # which only match if nothing is required inside them.
        if not content_segs:
            if not self._elements or all(e.is_optional()
                                         for e in self._elements):
                return MatchResult(opening + closing, remainder)
            return MatchResult.from_unmatched(segments)

        # Then trim whitespace, to catch brackets with no code content
        # e.g. "(   )".
        pre_nc = ()
        post_nc = ()
        if self.allow_gaps:
            pre_nc, content_segs, post_nc = trim_non_code_segments(
                content_segs)

        if not content_segs:
            nothing_required = not self._elements or (
                all(e.is_optional() for e in self._elements)
                and self.allow_gaps)
            if nothing_required:
                return MatchResult(opening + pre_nc + post_nc + closing,
                                   remainder)
            return MatchResult.from_unmatched(segments)

        # Match using super. Sequence will interpret the content of the elements.
        with parse_context.deeper_match() as ctx:
            content_match = super().match(content_segs, parse_context=ctx)

        # We require a complete match for the content.
        if not content_match.is_complete():
            return MatchResult.from_unmatched(segments)

        # Wrap the content in indent/dedent metas. NB: the non-code segments
        # sit between the metas and the content.
        bracketed = (opening + (Indent(), ) + pre_nc +
                     content_match.matched_segments + post_nc +
                     (Dedent(), ) + closing)
        # Realign the segments so the pos markers are correct.
        return MatchResult(
            BaseSegment._position_segments(bracketed, ),
            remainder,
        )
Example #7
0
    def greedy_match(
        cls,
        segments,
        parse_context,
        matchers,
        enforce_whitespace_preceeding_terminator,
        include_terminator=False,
    ):
        """Match everything up to the first acceptable terminator.

        Args:
            segments: The segments to match against.
            parse_context: The parse context, passed (one level deeper) to
                the look-ahead match.
            matchers: The terminator matchers. ``[None]`` means there are no
                terminators and everything is matched.
            enforce_whitespace_preceeding_terminator: If true, a terminator
                only counts when it starts with, or is directly preceded by,
                whitespace or a newline (meta segments are ignored), or when
                nothing precedes it in the current buffer. Rejected
                terminators are skipped over and the search continues.
            include_terminator: If true, the terminator itself is included
                in the matched segments.

        Returns:
            A ``MatchResult``. If no acceptable terminator is found, all of
            ``segments`` is matched. Otherwise everything before the
            terminator is matched; unless ``include_terminator`` is set, the
            non-code directly before the terminator and the terminator
            itself are left unmatched.
        """
        seg_buff = segments
        # Segments already skipped over because they belonged to, or came
        # before, a rejected terminator.
        seg_bank = ()  # Empty tuple
        # If no terminators then just return the whole thing.
        if matchers == [None]:
            return MatchResult.from_matched(segments)

        while True:
            with parse_context.deeper_match() as ctx:
                pre, mat, _ = cls._bracket_sensitive_look_ahead_match(
                    seg_buff, matchers, parse_context=ctx)

            # No (further) terminator: return everything.
            if not mat:
                return MatchResult.from_matched(segments)

            # Do we need to enforce whitespace preceding?
            if enforce_whitespace_preceeding_terminator:
                # Does the match include some whitespace already?
                # Work forward to the first non-meta segment of the match.
                # If the match is nothing but meta segments we fall through
                # to the backward check rather than indexing off the end.
                allowable_match = False
                for elem in mat.matched_segments:
                    if elem.is_meta:
                        continue
                    allowable_match = bool(
                        elem.is_type("whitespace", "newline"))
                    break

                # If we're not ok yet, work backward through the preceding
                # segments. Running off the start counts as ok.
                if not allowable_match:
                    allowable_match = True
                    for elem in reversed(pre):
                        if elem.is_meta:
                            continue
                        allowable_match = bool(
                            elem.is_type("whitespace", "newline"))
                        break

                # If this match isn't preceded by whitespace and that is
                # a requirement, then we can't use it. Carry on...
                if not allowable_match:
                    # Update our buffers and continue onward. The bank must
                    # *accumulate*, otherwise anything skipped on an earlier
                    # pass would be lost from the final result.
                    seg_bank = seg_bank + pre + mat.matched_segments
                    seg_buff = mat.unmatched_segments
                    # Loop around, don't return yet
                    continue

            # We have a usable terminator. We match up until (but not
            # including, unless include_terminator is true) that terminator.
            if include_terminator:
                return MatchResult(
                    seg_bank + pre + mat.matched_segments,
                    mat.unmatched_segments,
                )

            # We can't claim any non-code segments, so we trim them off the end.
            leading_nc, pre_seg_mid, trailing_nc = trim_non_code_segments(
                seg_bank + pre)
            return MatchResult(
                leading_nc + pre_seg_mid,
                trailing_nc + mat.all_segments(),
            )
Example #8
0
 def trimmed_matched_length(self) -> int:
     """Return the matched length, ignoring leading and trailing non-code.

     The result is the sum of ``matched_length`` over the segments which
     remain once ``trim_non_code_segments`` has trimmed both ends.
     """
     _leading, core_segments, _trailing = trim_non_code_segments(
         self.matched_segments)
     total_length = 0
     for segment in core_segments:
         total_length += segment.matched_length
     return total_length
Example #9
0
    def match(self, segments: Tuple["BaseSegment", ...],
              parse_context: ParseContext) -> MatchResult:
        """Match if a bracketed sequence, with content that matches one of the elements.

        1. work forwards to find the first bracket.
           If we find something other than whitespace, then fail out.
        2. Once we have the first bracket, we need to bracket count forward to find its
           partner.
        3. Assuming we find its partner then we try and match what goes between them
           using the match method of Sequence.
           If we match, great. If not, then we return an empty match.

        If the first segment is already of type "bracketed", its existing
        brackets are reused instead of searching for them.

        Args:
            segments: The segments to match against.
            parse_context: The parse context, used to fetch the brackets
                from the dialect and passed on to the sub-matches.

        Returns:
            A ``MatchResult`` holding either the bracketed segment itself or
            its child segments (depending on the ``bracket_persists`` value
            from the dialect), plus whatever trails the closing bracket. An
            unmatched result holding the original ``segments`` if there is
            nothing to match, no opening bracket, or the content does not
            match completely.

        Raises:
            SQLParseError: If an opening bracket is found but its closing
                partner is not.

        """
        # Trim ends if allowed.
        if self.allow_gaps:
            _, seg_buff, _ = trim_non_code_segments(segments)
        else:
            seg_buff = segments  # pragma: no cover TODO?

        # Nothing (or nothing but non-code) to look at: no match. Without
        # this guard the ``seg_buff[0]`` lookup below raises IndexError.
        if not seg_buff:
            return MatchResult.from_unmatched(segments)

        # Rehydrate the bracket segments in question.
        # bracket_persists controls whether we make a BracketedSegment or not.
        start_bracket, end_bracket, bracket_persists = self.get_bracket_from_dialect(
            parse_context)
        # Allow optional override for special bracket-like things
        start_bracket = self.start_bracket or start_bracket
        end_bracket = self.end_bracket or end_bracket

        # Are we dealing with a pre-existing BracketSegment?
        if seg_buff[0].is_type("bracketed"):
            seg: BracketedSegment = cast(BracketedSegment, seg_buff[0])
            # The content is everything between the stored brackets.
            content_segs = seg.segments[len(seg.start_bracket
                                            ):-len(seg.end_bracket)]
            bracket_segment = seg
            trailing_segments = seg_buff[1:]
        # Otherwise try and match the segments directly.
        else:
            # Look for the first bracket
            with parse_context.deeper_match() as ctx:
                start_match = start_bracket.match(seg_buff, parse_context=ctx)
            if start_match:
                seg_buff = start_match.unmatched_segments
            else:
                # Can't find the opening bracket. No Match.
                return MatchResult.from_unmatched(segments)

            # Look for the closing bracket
            content_segs, end_match, _ = self._bracket_sensitive_look_ahead_match(
                segments=seg_buff,
                matchers=[end_bracket],
                parse_context=parse_context,
                start_bracket=start_bracket,
                end_bracket=end_bracket,
                bracket_pairs_set=self.bracket_pairs_set,
            )
            if not end_match:  # pragma: no cover
                raise SQLParseError(
                    "Couldn't find closing bracket for opening bracket.",
                    segment=start_match.matched_segments[0],
                )

            # Construct a bracket segment
            bracket_segment = BracketedSegment(
                segments=(start_match.matched_segments + content_segs +
                          end_match.matched_segments),
                start_bracket=start_match.matched_segments,
                end_bracket=end_match.matched_segments,
            )
            trailing_segments = end_match.unmatched_segments

        # Then trim whitespace and deal with the case of non-code content e.g. "(   )"
        if self.allow_gaps:
            pre_segs, content_segs, post_segs = trim_non_code_segments(
                content_segs)
        else:  # pragma: no cover TODO?
            pre_segs = ()
            post_segs = ()

        # If we've got a case of empty brackets check whether that is allowed.
        if not content_segs:
            if not self._elements or (all(e.is_optional()
                                          for e in self._elements) and
                                      (self.allow_gaps or
                                       (not pre_segs and not post_segs))):
                return MatchResult(
                    (bracket_segment, )
                    if bracket_persists else bracket_segment.segments,
                    trailing_segments,
                )
            else:
                return MatchResult.from_unmatched(segments)

        # Match the content using super. Sequence will interpret the content of the
        # elements.
        with parse_context.deeper_match() as ctx:
            content_match = super().match(content_segs, parse_context=ctx)

        # We require a complete match for the content (hopefully for obvious reasons)
        if content_match.is_complete():
            # Reconstruct the bracket segment post match.
            # We need to realign the meta segments so the pos markers are correct.
            # Have we already got indents?
            meta_idx = None
            for idx, seg in enumerate(bracket_segment.segments):
                if (seg.is_meta and cast(MetaSegment, seg).indent_val > 0
                        and not cast(MetaSegment, seg).is_template):
                    meta_idx = idx
                    break
            # If we've already got indents, don't add more. Compare against
            # None explicitly so that an indent at index 0 still counts.
            if meta_idx is not None:
                bracket_segment.segments = BaseSegment._position_segments(
                    bracket_segment.start_bracket + pre_segs +
                    content_match.all_segments() + post_segs +
                    bracket_segment.end_bracket)
            # Append some indent and dedent tokens at the start and the end.
            else:
                bracket_segment.segments = BaseSegment._position_segments(
                    # NB: The non-code segments sit between the metas and
                    # the content.
                    bracket_segment.start_bracket +
                    (Indent(), )  # Add a meta indent here
                    + pre_segs + content_match.all_segments() + post_segs +
                    (Dedent(), )  # Add a meta dedent here
                    + bracket_segment.end_bracket)
            return MatchResult(
                (bracket_segment, )
                if bracket_persists else bracket_segment.segments,
                trailing_segments,
            )
        # No complete match. Fail.
        else:
            return MatchResult.from_unmatched(segments)
Example #10
0
    def match(
        self, segments: Tuple[BaseSegment, ...], parse_context: ParseContext
    ) -> MatchResult:
        """Match an arbitrary number of elements separated by a delimiter.

        Note that if there are multiple elements passed in that they will be treated
        as different options of what can be delimited, rather than a sequence.

        The method repeatedly looks ahead for the next delimiter or terminator,
        then requires the section before it to match one of ``self._elements``
        completely. When no further delimiter or terminator is found, the
        remainder is matched against the elements and a partial match is
        accepted.

        Args:
            segments: The segments to match against.
            parse_context: The current parse context; a deeper context is
                derived from it for every nested match.

        Returns:
            A `MatchResult`. Failure is reported via
            `MatchResult.from_unmatched`.

        Raises:
            RuntimeError: If the look-ahead reports a matcher which is neither
                the delimiter, the terminator nor the gap matcher.
        """
        # Have we been passed an empty list?
        if len(segments) == 0:
            return MatchResult.from_empty()

        # Make some buffers
        # seg_buff holds what is still to be processed, matched_segments
        # accumulates what has been accepted so far.
        seg_buff = segments
        matched_segments = MatchResult.from_empty()
        # delimiters is a list of tuples containing delimiter segments as we find
        # them. Only its length is used below (for the min_delimiters checks).
        delimiters: List[Tuple[BaseSegment, ...]] = []

        # First iterate through all the segments, looking for the delimiter.
        # Second, split the list on each of the delimiters, and ensure that
        # each sublist in turn matches one of the elements.

        # In more detail, match against delimiter, if we match, put a slice
        # up to that point onto a list of slices. Carry on.
        while True:
            # Check to see whether we've exhausted the buffer, either by iterating
            # through it, or by consuming all the non-code segments already.
            # NB: We can only get here with an empty buffer after a delimiter
            # consumed the last of it (the initial empty case returned above),
            # so this is the trailing-delimiter case.
            if len(seg_buff) == 0:
                # Append the remaining buffer in case we're in the not is_code case.
                # NOTE(review): seg_buff is empty on this branch, so this
                # presumably adds nothing - confirm before removing.
                matched_segments += seg_buff
                # Nothing left, this is potentially a trailing case?
                # A trailing delimiter is only accepted if trailing is allowed
                # and (where configured) enough delimiters have been seen.
                if self.allow_trailing and (
                    self.min_delimiters is None
                    or len(delimiters) >= self.min_delimiters
                ):
                    # It is! (nothing left so no unmatched segments to append)
                    return MatchResult.from_matched(matched_segments.matched_segments)
                else:
                    return MatchResult.from_unmatched(segments)

            # We rely on _bracket_sensitive_look_ahead_match to do the bracket counting
            # element of this now. We look ahead to find a delimiter or terminator.
            matchers = [self.delimiter]
            if self.terminator:
                matchers.append(self.terminator)
            # If gaps aren't allowed, a gap (or non-code segment), acts like a terminator.
            if not self.allow_gaps:
                matchers.append(NonCodeMatcher())

            with parse_context.deeper_match() as ctx:
                (
                    pre_content,
                    delimiter_match,
                    delimiter_matcher,
                ) = self._bracket_sensitive_look_ahead_match(
                    seg_buff,
                    matchers,
                    parse_context=ctx,
                    bracket_pairs_set=self.bracket_pairs_set,
                )

            # Store the mutated segments to reuse.
            # NOTE(review): this is built from the look-ahead over seg_buff, so
            # the failure returns below which use it presumably hand back only
            # the current buffer rather than the whole `segments` input -
            # confirm callers only rely on the truthiness of a failed match.
            mutated_segments = pre_content + delimiter_match.all_segments()

            # Have we found a delimiter or terminator looking forward?
            if delimiter_match:
                if delimiter_matcher is self.delimiter:
                    # Yes. Store it and then match the contents up to now.
                    delimiters.append(delimiter_match.matched_segments)

                # We now test the intervening section as to whether it matches one
                # of the things we're looking for. NB: If it's of zero length then
                # we return without trying it.
                if len(pre_content) > 0:
                    pre_non_code, pre_content, post_non_code = trim_non_code_segments(
                        pre_content
                    )
                    # Check for whitespace gaps.
                    # We do this explicitly here rather than relying on an
                    # untrimmed match so we can handle _whitespace_ explicitly
                    # compared to other non code segments like placeholders.
                    if not self.allow_gaps and any(
                        seg.is_whitespace for seg in pre_non_code + post_non_code
                    ):
                        return MatchResult.from_unmatched(mutated_segments)

                    with parse_context.deeper_match() as ctx:
                        match, _ = self._longest_trimmed_match(
                            segments=pre_content,
                            matchers=self._elements,
                            parse_context=ctx,
                            # We've already trimmed
                            trim_noncode=False,
                        )
                    # No match, or an incomplete match: Not allowed
                    if not match or not match.is_complete():
                        return MatchResult.from_unmatched(mutated_segments)

                    # We have a complete match!

                    # First add the segment up to the delimiter to the matched segments
                    matched_segments += (
                        pre_non_code + match.matched_segments + post_non_code
                    )
                    # Then it depends what we matched.
                    # Delimiter
                    if delimiter_matcher is self.delimiter:
                        # Then add the delimiter to the matched segments
                        matched_segments += delimiter_match.matched_segments
                        # Carry on with whatever follows the delimiter, looking
                        # for the next delimiter on the next pass of the loop.
                        seg_buff = delimiter_match.unmatched_segments
                        # Still got some buffer left. Carry on.
                        continue
                    # Terminator (or the gap terminator).
                    elif delimiter_matcher is self.terminator or isinstance(
                        delimiter_matcher, NonCodeMatcher
                    ):
                        # We just return straight away here. We don't add the terminator to
                        # this match, it should go with the unmatched parts.

                        # First check we've had enough delimiters
                        if (
                            self.min_delimiters
                            and len(delimiters) < self.min_delimiters
                        ):
                            return MatchResult.from_unmatched(mutated_segments)
                        else:
                            return MatchResult(
                                matched_segments.matched_segments,
                                delimiter_match.all_segments(),
                            )
                    else:
                        raise RuntimeError(
                            (
                                "I don't know how I got here. Matched instead on {}, which "
                                "doesn't appear to be delimiter or terminator"
                            ).format(delimiter_matcher)
                        )
                else:
                    # Zero length section between delimiters, or zero code
                    # elements if appropriate. Return unmatched.
                    return MatchResult.from_unmatched(mutated_segments)
            else:
                # No match for a delimiter looking forward, this means we're
                # at the end. In this case we look for a potential partial match
                # looking forward. We know it's a non-zero length section because
                # we checked that up front.

                # First check we've had enough delimiters, because if we haven't
                # then there's no sense in trying to match.
                if self.min_delimiters and len(delimiters) < self.min_delimiters:
                    return MatchResult.from_unmatched(mutated_segments)

                # Trim the non-code segments off both ends ourselves, then match
                # the remainder against the elements. The longest matcher wins;
                # we don't care which one it is.
                pre_non_code, trimmed_segments, post_non_code = trim_non_code_segments(
                    mutated_segments
                )
                # Check for whitespace gaps.
                # We do this explicitly here rather than relying on an
                # untrimmed match so we can handle _whitespace_ explicitly
                # compared to other non code segments like placeholders.
                if not self.allow_gaps and any(
                    seg.is_whitespace for seg in pre_non_code + post_non_code
                ):
                    return MatchResult.from_unmatched(mutated_segments)

                with parse_context.deeper_match() as ctx:
                    mat, _ = self._longest_trimmed_match(
                        trimmed_segments,
                        self._elements,
                        parse_context=ctx,
                        # We've already trimmed
                        trim_noncode=False,
                    )

                if mat:
                    # We've got something at the end. Return!
                    if mat.unmatched_segments:
                        # Something is left unmatched, so the trailing non-code
                        # segments stay with the unmatched part as well.
                        return MatchResult(
                            matched_segments.matched_segments
                            + pre_non_code
                            + mat.matched_segments,
                            mat.unmatched_segments + post_non_code,
                        )
                    else:
                        # Nothing is left unmatched by the most recent match, so
                        # we can consume the trailing non-code segments too.
                        return MatchResult.from_matched(
                            matched_segments.matched_segments
                            + pre_non_code
                            + mat.matched_segments
                            + post_non_code,
                        )
                else:
                    # No match at the end, are we allowed to trail? If we are then return,
                    # otherwise we fail because we can't match the last element.
                    if self.allow_trailing:
                        return MatchResult(matched_segments.matched_segments, seg_buff)
                    else:
                        return MatchResult.from_unmatched(mutated_segments)
Example #11
0
    def match(
        self,
        segments: Tuple[BaseSegment, ...],
        parse_context: ParseContext,
    ) -> MatchResult:
        """Match a run of elements separated by a delimiter.

        The input is consumed left to right, alternating between looking for
        one of ``self._elements`` and looking for ``self.delimiter``. If several
        elements are configured they are alternatives for each delimited item,
        not a sequence.

        The loop stops when a terminator is found, when nothing matches, when a
        match consumes everything that is left, or when a whitespace gap is hit
        while gaps are disallowed. The result is then assembled from the state
        at that point, subject to ``self.min_delimiters`` and
        ``self.allow_trailing``.
        """
        # Nothing to work with.
        if len(segments) == 0:
            return MatchResult.from_empty()

        # Working buffers: what is still to be processed, what has been
        # accepted, and what will be reported back as unmatched.
        remaining = segments
        matched: Tuple[BaseSegment, ...] = ()
        leftover: Tuple[BaseSegment, ...] = ()
        # Snapshot of the state just before the most recent delimiter, so that a
        # disallowed trailing delimiter can be rolled back.
        checkpoint_matched: Tuple[BaseSegment, ...] = ()
        checkpoint_remaining: Tuple[BaseSegment, ...] = ()

        delimiter_count = 0
        last_was_delimiter = False

        # The progress bar is only rendered for the outermost matching loop, and
        # only if it hasn't been switched off in the configuration.
        hide_progress = (
            parse_context.parse_depth > 0
            or progress_bar_configuration.disable_progress_bar
        )
        # The number of newlines is used as a rough estimate of how many steps
        # a large file will take. Not perfect, but good enough in most cases.
        newline_count = sum(
            1 for seg in segments if isinstance(seg, NewlineSegment)
        )
        progress = tqdm(
            total=newline_count,
            desc="matching",
            miniters=30,
            disable=hide_progress,
            leave=False,
        )

        expect_delimiter = False
        any_match = False
        hit_terminator = False

        delimiter_options = [self.delimiter]
        terminator_options = []
        if self.terminator:
            terminator_options.append(self.terminator)
        # With gaps disallowed, any non-code segment behaves like a terminator.
        if not self.allow_gaps:
            terminator_options.append(NonCodeMatcher())

        while True:
            progress.update(n=1)

            options = delimiter_options if expect_delimiter else self._elements

            if not remaining:
                break  # pragma: no cover

            lead_nc, core, trail_nc = trim_non_code_segments(remaining)

            # Leading whitespace ends the match when gaps are disallowed.
            if not self.allow_gaps and any(seg.is_whitespace for seg in lead_nc):
                leftover = remaining
                break

            # Only non-code is left: claim it and stop.
            if not core:  # pragma: no cover
                matched += lead_nc
                break

            # A terminator takes priority over content or delimiters.
            with parse_context.deeper_match() as ctx:
                match, _ = self._longest_trimmed_match(
                    segments=core,
                    matchers=terminator_options,
                    parse_context=ctx,
                    # Already trimmed above.
                    trim_noncode=False,
                )

                if match:
                    hit_terminator = True
                    leftover = lead_nc + match.all_segments() + trail_nc
                    break

            # Now try what we are actually looking for on this pass. When
            # matching content, the delimiters are passed on as terminators.
            with parse_context.deeper_match() as ctx:
                match, _ = self._longest_trimmed_match(
                    segments=core,
                    matchers=options,
                    parse_context=ctx,
                    # Already trimmed above.
                    trim_noncode=False,
                    terminators=(
                        delimiter_options
                        if options != delimiter_options
                        else None
                    ),
                )

            if not match:
                # Nothing matched: keep the leading non-code and hand back the
                # rest as unmatched.
                matched += lead_nc
                leftover = match.unmatched_segments + trail_nc
                break

            if options == delimiter_options:
                delimiter_count += 1
                last_was_delimiter = True
                checkpoint_matched = matched
                checkpoint_remaining = remaining
            else:
                last_was_delimiter = False

            any_match = True
            remaining = match.unmatched_segments + trail_nc
            leftover = match.unmatched_segments

            if match.is_complete():
                # Everything was consumed, so the trailing non-code is ours too.
                matched += lead_nc + match.matched_segments + trail_nc
                break

            matched += lead_nc + match.matched_segments
            # Swap between seeking content and seeking a delimiter.
            expect_delimiter = not expect_delimiter

        # Too few delimiters is a failure regardless of anything else.
        if self.min_delimiters and delimiter_count < self.min_delimiters:
            return MatchResult.from_unmatched(matched + leftover)

        # Stopped by a terminator: fine, provided something matched before it.
        if hit_terminator:
            if any_match:
                return MatchResult(matched, leftover)
            return MatchResult.from_unmatched(matched + leftover)

        # The last thing matched was a delimiter, but trailing delimiters are
        # not allowed. With nothing left that is a failure; otherwise roll back
        # to the state captured just before that delimiter.
        if last_was_delimiter and not self.allow_trailing:
            if leftover:
                return MatchResult(checkpoint_matched, checkpoint_remaining)
            return MatchResult.from_unmatched(matched + leftover)

        if not any_match:
            return MatchResult.from_unmatched(matched + leftover)

        if leftover:
            return MatchResult(matched, leftover)
        return MatchResult.from_matched(matched)
Example #12
0
    def _longest_trimmed_match(
        cls,
        segments: Tuple[BaseSegment, ...],
        matchers: List[MatchableType],
        parse_context: ParseContext,
        trim_noncode: bool = True,
        terminators: Optional[List[MatchableType]] = None,
    ) -> Tuple[MatchResult, Optional[MatchableType]]:
        """Return the best match from a selection of matchers.

        Every matcher is tried at the same position. The first matcher to give
        a complete match wins outright. Failing that, the partial match with
        the greatest ``trimmed_matched_length`` is returned, and if two are of
        equal length the one from the earlier matcher is kept.

        Args:
            segments: The segments to match against.
            matchers: The candidate matchers, in priority order.
            parse_context: The parse context handed on to each matcher.
            trim_noncode: If true, non-code segments are trimmed off both ends
                before matching and re-attached to the result afterwards.
            terminators: Optional matchers. Once a new best partial match is
                found, and what follows it (ignoring non-code) matches one of
                these, no further matchers are tried.

        Returns:
            `tuple` of (match_object, matcher). If nothing matches, the whole
            of the input is returned as unmatched and the matcher is `None`.

        """
        # Have we been passed an empty list?
        if len(segments) == 0:  # pragma: no cover
            return MatchResult.from_empty(), None

        # Keep hold of the untouched input. If nothing matches we must hand
        # *all* of it back, including any non-code trimmed off below.
        original_segments = segments

        # If gaps are allowed, trim the ends.
        if trim_noncode:
            pre_nc, segments, post_nc = trim_non_code_segments(segments)

        best_result: Optional[MatchResult] = None
        best_matcher: Optional[MatchableType] = None
        best_match_length = 0
        # Iterate at this position across all the matchers.
        for matcher in matchers:
            # MyPy seems to require a type hint here. Not quite sure why.
            res_match: MatchResult = matcher.match(segments,
                                                   parse_context=parse_context)
            if res_match.is_complete():
                # A complete match wins straight away. Put back anything we
                # trimmed so that the result still covers the whole input.
                if trim_noncode:
                    return (
                        MatchResult.from_matched(pre_nc +
                                                 res_match.matched_segments +
                                                 post_nc),
                        matcher,
                    )
                return res_match, matcher

            # An incomplete match is only interesting if it's the best so far.
            if res_match and res_match.trimmed_matched_length > best_match_length:
                best_result, best_matcher = res_match, matcher
                best_match_length = res_match.trimmed_matched_length

                if terminators:
                    # If the new best match runs right up to a terminator then
                    # stop looking at the other matchers.
                    _, segs, _ = trim_non_code_segments(
                        res_match.unmatched_segments)
                    if any(
                            terminator.match(
                                segs,
                                parse_context=parse_context).matched_segments
                            for terminator in terminators):
                        break

            # We could stash segments here, but given we might have some successful
            # matches here, we shouldn't, because they'll be mutated in the wrong way.
            # Eventually there might be a performance gain from doing that sensibly
            # here.

        # If we get here, then there wasn't a complete match. If we
        # have a best match, return that.
        if best_match_length > 0 and best_result is not None:
            if trim_noncode:
                return (
                    MatchResult(
                        pre_nc + best_result.matched_segments,
                        best_result.unmatched_segments + post_nc,
                    ),
                    best_matcher,
                )
            return best_result, best_matcher

        # No match at all: everything we were given is unmatched. Use the
        # original input here rather than the trimmed `segments`, otherwise
        # the leading and trailing non-code segments would be dropped.
        return MatchResult.from_unmatched(original_segments), None