def test__parser__grammar_noncode(seg_list, fresh_ansi_dialect):
    """Check that NonCodeMatcher consumes exactly one non-code segment.

    Also verifies that the matcher deliberately opts out of the
    "simple" matching shortcut by returning None from ``simple()``.
    """
    with RootParseContext(dialect=fresh_ansi_dialect) as ctx:
        match_result = NonCodeMatcher().match(seg_list[1:], parse_context=ctx)
    # The NonCodeMatcher can't be expressed as a "simple" match.
    assert NonCodeMatcher().simple(ctx) is None
    # One, and only one, segment should have been matched.
    assert len(match_result) == 1
def match(self, segments: Tuple[BaseSegment, ...], parse_context: ParseContext) -> MatchResult:
    """Match an arbitrary number of elements separated by a delimiter.

    Note that if there are multiple elements passed in that they will be treated
    as different options of what can be delimited, rather than a sequence.

    Args:
        segments: The raw segments to attempt to match against.
        parse_context: The active parse context, used here for depth
            tracking via ``deeper_match()``.

    Returns:
        MatchResult: A complete, partial or empty match depending on how
        much of ``segments`` could be consumed as delimited content.
    """
    # Have we been passed an empty list?
    if len(segments) == 0:
        return MatchResult.from_empty()

    # Make some buffers
    seg_buff = segments
    matched_segments = MatchResult.from_empty()
    # delimiters is a list of tuples containing delimiter segments as we find them.
    delimiters: List[BaseSegment] = []

    # First iterate through all the segments, looking for the delimiter.
    # Second, split the list on each of the delimiters, and ensure that
    # each sublist in turn matches one of the elements.

    # In more detail, match against delimiter, if we match, put a slice
    # up to that point onto a list of slices. Carry on.
    while True:
        # Check to see whether we've exhausted the buffer, either by iterating through it,
        # or by consuming all the non-code segments already.
        # NB: If we're here then we've already tried matching the remaining segments against
        # the content, so we must be in a trailing case.
        if len(seg_buff) == 0:
            # Append the remaining buffer in case we're in the not is_code case.
            matched_segments += seg_buff
            # Nothing left, this is potentially a trailing case?
            if self.allow_trailing and (
                    self.min_delimiters is None or len(delimiters) >= self.min_delimiters):
                # It is! (nothing left so no unmatched segments to append)
                return MatchResult.from_matched(
                    matched_segments.matched_segments)
            else:
                # Trailing content isn't allowed (or we haven't seen enough
                # delimiters), so the whole attempt fails.
                return MatchResult.from_unmatched(segments)

        # We rely on _bracket_sensitive_look_ahead_match to do the bracket counting
        # element of this now. We look ahead to find a delimiter or terminator.
        matchers = [self.delimiter]
        if self.terminator:
            matchers.append(self.terminator)
        # If gaps aren't allowed, a gap (or non-code segment), acts like a terminator.
        if not self.allow_gaps:
            matchers.append(NonCodeMatcher())

        with parse_context.deeper_match() as ctx:
            (
                pre_content,
                delimiter_match,
                delimiter_matcher,
            ) = self._bracket_sensitive_look_ahead_match(
                seg_buff,
                matchers,
                parse_context=ctx,
            )

        # Keep track of the *length* of this pre-content section before we start
        # to change it later. We need this for dealing with terminators.
        pre_content_len = len(pre_content)

        # Have we found a delimiter or terminator looking forward?
        if delimiter_match:
            if delimiter_matcher is self.delimiter:
                # Yes. Store it and then match the contents up to now.
                delimiters.append(delimiter_match.matched_segments)

            # We now test the intervening section as to whether it matches one
            # of the things we're looking for. NB: If it's of zero length then
            # we return without trying it.
            if len(pre_content) > 0:
                with parse_context.deeper_match() as ctx:
                    match, matcher = self._longest_trimmed_match(
                        segments=pre_content,
                        matchers=self._elements,
                        parse_context=ctx,
                        trim_noncode=self.allow_gaps,
                    )

                # No match, or an incomplete match: Not allowed
                if not match or not match.is_complete():
                    return MatchResult.from_unmatched(segments)

                # We have a complete match!

                # First add the segment up to the delimiter to the matched segments
                matched_segments += match.matched_segments
                # Then it depends what we matched.
                # Delimiter
                if delimiter_matcher is self.delimiter:
                    # Then add the delimiter to the matched segments
                    matched_segments += delimiter_match.matched_segments
                    # Break this for loop and move on, looking for the next delimiter
                    seg_buff = delimiter_match.unmatched_segments
                    # Still got some buffer left. Carry on.
                    continue
                # Terminator (or the gap terminator).
                elif delimiter_matcher is self.terminator or isinstance(
                        delimiter_matcher, NonCodeMatcher):
                    # We just return straight away here. We don't add the terminator to
                    # this match, it should go with the unmatched parts. The terminator
                    # may also have mutated the returned segments so we also DON'T want
                    # the mutated version, it can do that itself (so we return `seg_buff`
                    # and not `delimiter_match.all_segments()`)

                    # First check we've had enough delimiters
                    if (self.min_delimiters and
                            len(delimiters) < self.min_delimiters):
                        return MatchResult.from_unmatched(segments)
                    else:
                        return MatchResult(
                            matched_segments.matched_segments,
                            # Return the part of the seg_buff which isn't in the
                            # pre-content.
                            seg_buff[pre_content_len:],
                        )
                else:
                    # Defensive: the lookahead only uses the matchers built
                    # above, so anything else indicates an internal error.
                    raise RuntimeError((
                        "I don't know how I got here. Matched instead on {0}, which "
                        "doesn't appear to be delimiter or terminator"
                    ).format(delimiter_matcher))
            else:
                # Zero length section between delimiters, or zero code
                # elements if appropriate. Return unmatched.
                return MatchResult.from_unmatched(segments)
        else:
            # No match for a delimiter looking forward, this means we're
            # at the end. In this case we look for a potential partial match
            # looking forward. We know it's a non-zero length section because
            # we checked that up front.

            # First check we're had enough delimiters, because if we haven't then
            # there's no sense to try matching
            if self.min_delimiters and len(
                    delimiters) < self.min_delimiters:
                return MatchResult.from_unmatched(segments)

            # We use the whitespace padded match to hoover up whitespace if enabled,
            # and default to the longest matcher. We don't care which one matches.
            with parse_context.deeper_match() as ctx:
                mat, _ = self._longest_trimmed_match(
                    seg_buff,
                    self._elements,
                    parse_context=ctx,
                    trim_noncode=self.allow_gaps,
                )
            if mat:
                # We've got something at the end. Return!
                if mat.unmatched_segments:
                    # We have something unmatched and so we should let it also have the trailing elements
                    return MatchResult(
                        matched_segments.matched_segments + mat.matched_segments,
                        mat.unmatched_segments,
                    )
                else:
                    # If there's nothing unmatched in the most recent match, then we can consume the trailing
                    # non code segments
                    return MatchResult.from_matched(
                        matched_segments.matched_segments + mat.matched_segments,
                    )
            else:
                # No match at the end, are we allowed to trail? If we are then return,
                # otherwise we fail because we can't match the last element.
                if self.allow_trailing:
                    return MatchResult(matched_segments.matched_segments, seg_buff)
                else:
                    return MatchResult.from_unmatched(segments)
def match(
    self,
    segments: Tuple[BaseSegment, ...],
    parse_context: ParseContext,
) -> MatchResult:
    """Match an arbitrary number of elements separated by a delimiter.

    Note that if there are multiple elements passed in that they will be treated
    as different options of what can be delimited, rather than a sequence.

    This implementation alternates between matching content and matching a
    delimiter (tracked via ``seeking_delimiter``), accumulating results as
    plain segment tuples rather than intermediate ``MatchResult`` objects.

    Args:
        segments: The raw segments to attempt to match against.
        parse_context: The active parse context, used for depth tracking
            and to decide whether the progress bar should render.

    Returns:
        MatchResult: A complete, partial or empty match depending on how
        much of ``segments`` could be consumed as delimited content.
    """
    # Have we been passed an empty list?
    if len(segments) == 0:
        return MatchResult.from_empty()

    # Make some buffers
    seg_buff = segments
    matched_segments: Tuple[BaseSegment, ...] = ()
    unmatched_segments: Tuple[BaseSegment, ...] = ()
    # Snapshot of the state taken each time a delimiter matches. Used to
    # roll back a trailing delimiter when allow_trailing is False.
    cached_matched_segments: Tuple[BaseSegment, ...] = ()
    cached_unmatched_segments: Tuple[BaseSegment, ...] = ()

    delimiters = 0
    matched_delimiter = False

    # We want to render progress bar only for the main matching loop,
    # so disable it when in deeper parsing.
    disable_progress_bar = (
        parse_context.parse_depth > 0
        or progress_bar_configuration.disable_progress_bar)

    # We use amount of `NewLineSegment` to estimate how many steps could be in
    # a big file. It's not perfect, but should do a job in most cases.
    new_line_segments = [
        s for s in segments if isinstance(s, NewlineSegment)
    ]
    progressbar_matching = tqdm(
        total=len(new_line_segments),
        desc="matching",
        miniters=30,
        disable=disable_progress_bar,
        leave=False,
    )

    # State flags for the alternating content/delimiter loop below.
    seeking_delimiter = False
    has_matched_segs = False
    terminated = False

    delimiter_matchers = [self.delimiter]
    terminator_matchers = []

    if self.terminator:
        terminator_matchers.append(self.terminator)
    # If gaps aren't allowed, a gap (or non-code segment), acts like a terminator.
    if not self.allow_gaps:
        terminator_matchers.append(NonCodeMatcher())

    while True:
        progressbar_matching.update(n=1)

        # Alternate between looking for content and looking for a delimiter.
        if seeking_delimiter:
            elements = delimiter_matchers
        else:
            elements = self._elements

        if len(seg_buff) > 0:
            pre_non_code, seg_content, post_non_code = trim_non_code_segments(
                seg_buff)

            # A whitespace gap terminates the match when gaps aren't allowed.
            if not self.allow_gaps and any(seg.is_whitespace for seg in pre_non_code):
                unmatched_segments = seg_buff
                break

            if not seg_content:  # pragma: no cover
                matched_segments += pre_non_code
                break

            # Check whether there is a terminator before checking for content
            with parse_context.deeper_match() as ctx:
                match, _ = self._longest_trimmed_match(
                    segments=seg_content,
                    matchers=terminator_matchers,
                    parse_context=ctx,
                    # We've already trimmed
                    trim_noncode=False,
                )

            if match:
                terminated = True
                # The terminator (and surrounding non-code) stays unmatched.
                unmatched_segments = (
                    pre_non_code + match.all_segments() + post_non_code)
                break

            with parse_context.deeper_match() as ctx:
                match, _ = self._longest_trimmed_match(
                    segments=seg_content,
                    matchers=elements,
                    parse_context=ctx,
                    # We've already trimmed
                    trim_noncode=False,
                    # Content matching is bounded by the next delimiter;
                    # delimiter matching needs no extra terminators.
                    terminators=delimiter_matchers
                    if elements != delimiter_matchers
                    else None,
                )

            if match:
                if elements == delimiter_matchers:
                    delimiters += 1
                    matched_delimiter = True
                    # Snapshot state so a trailing delimiter can be undone.
                    cached_matched_segments = matched_segments
                    cached_unmatched_segments = seg_buff
                else:
                    matched_delimiter = False

                has_matched_segs = True
                seg_buff = match.unmatched_segments + post_non_code
                unmatched_segments = match.unmatched_segments

                if match.is_complete():
                    # Everything consumed; fold in surrounding non-code too.
                    matched_segments += (
                        pre_non_code + match.matched_segments + post_non_code)
                    unmatched_segments = match.unmatched_segments
                    break

                matched_segments += pre_non_code + match.matched_segments
                # Flip between content-seeking and delimiter-seeking.
                seeking_delimiter = not seeking_delimiter
            else:
                matched_segments += pre_non_code
                unmatched_segments = match.unmatched_segments + post_non_code
                break
        else:
            break  # pragma: no cover

    # --- Post-loop: decide what kind of result to return. ---

    # Not enough delimiters seen: fail outright.
    if self.min_delimiters:
        if delimiters < self.min_delimiters:
            return MatchResult.from_unmatched(matched_segments + unmatched_segments)

    if terminated:
        if has_matched_segs:
            return MatchResult(matched_segments, unmatched_segments)
        else:
            return MatchResult.from_unmatched(matched_segments + unmatched_segments)

    # A trailing delimiter with no following content: roll back to the
    # snapshot unless trailing delimiters are explicitly allowed.
    if matched_delimiter and not self.allow_trailing:
        if not unmatched_segments:
            return MatchResult.from_unmatched(matched_segments + unmatched_segments)
        else:
            return MatchResult(cached_matched_segments, cached_unmatched_segments)

    if not has_matched_segs:
        return MatchResult.from_unmatched(matched_segments + unmatched_segments)

    if not unmatched_segments:
        return MatchResult.from_matched(matched_segments)

    return MatchResult(matched_segments, unmatched_segments)
def match(
    self,
    segments: Tuple[BaseSegment, ...],
    parse_context: ParseContext,
) -> MatchResult:
    """Match an arbitrary number of elements separated by a delimiter.

    Note that if there are multiple elements passed in that they will be treated
    as different options of what can be delimited, rather than a sequence.

    Args:
        segments: The raw segments to attempt to match against.
        parse_context: The active parse context, used for depth tracking
            and to decide whether the progress bar should render.

    Returns:
        MatchResult: A complete, partial or empty match depending on how
        much of ``segments`` could be consumed as delimited content.
    """
    # Have we been passed an empty list?
    if len(segments) == 0:
        return MatchResult.from_empty()

    # Make some buffers
    seg_buff = segments
    matched_segments = MatchResult.from_empty()
    # delimiters is a list of tuples containing delimiter segments as we find them.
    delimiters: List[BaseSegment] = []

    # We want to render progress bar only for the main matching loop,
    # so disable it when in deeper parsing.
    disable_progress_bar = (
        parse_context.parse_depth > 0
        or progress_bar_configuration.disable_progress_bar
    )

    # We use amount of `NewLineSegment` to estimate how many steps could be in
    # a big file. It's not perfect, but should do a job in most cases.
    new_line_segments = [s for s in segments if isinstance(s, NewlineSegment)]
    progressbar_matching = tqdm(
        total=len(new_line_segments),
        desc="matching",
        miniters=30,
        disable=disable_progress_bar,
        leave=False,
    )

    # First iterate through all the segments, looking for the delimiter.
    # Second, split the list on each of the delimiters, and ensure that
    # each sublist in turn matches one of the elements.

    # In more detail, match against delimiter, if we match, put a slice
    # up to that point onto a list of slices. Carry on.
    while True:
        progressbar_matching.update(n=1)

        # Check to see whether we've exhausted the buffer, either by iterating
        # through it, or by consuming all the non-code segments already.
        # NB: If we're here then we've already tried matching the remaining segments
        # against the content, so we must be in a trailing case.
        if len(seg_buff) == 0:
            # Append the remaining buffer in case we're in the not is_code case.
            matched_segments += seg_buff
            # Nothing left, this is potentially a trailing case?
            if self.allow_trailing and (
                self.min_delimiters is None
                or len(delimiters) >= self.min_delimiters
            ):  # pragma: no cover TODO?
                # It is! (nothing left so no unmatched segments to append)
                return MatchResult.from_matched(matched_segments.matched_segments)
            else:  # pragma: no cover TODO?
                return MatchResult.from_unmatched(segments)

        # We rely on _bracket_sensitive_look_ahead_match to do the bracket counting
        # element of this now. We look ahead to find a delimiter or terminator.
        matchers = [self.delimiter]
        if self.terminator:
            matchers.append(self.terminator)
        # If gaps aren't allowed, a gap (or non-code segment), acts like a
        # terminator.
        if not self.allow_gaps:
            matchers.append(NonCodeMatcher())

        with parse_context.deeper_match() as ctx:
            (
                pre_content,
                delimiter_match,
                delimiter_matcher,
            ) = self._bracket_sensitive_look_ahead_match(
                seg_buff,
                matchers,
                parse_context=ctx,
                bracket_pairs_set=self.bracket_pairs_set,
            )

        # Store the mutated segments to reuse.
        mutated_segments = pre_content + delimiter_match.all_segments()

        # Have we found a delimiter or terminator looking forward?
        if delimiter_match:
            if delimiter_matcher is self.delimiter:
                # Yes. Store it and then match the contents up to now.
                delimiters.append(delimiter_match.matched_segments)

            # We now test the intervening section as to whether it matches one
            # of the things we're looking for. NB: If it's of zero length then
            # we return without trying it.
            if len(pre_content) > 0:
                pre_non_code, pre_content, post_non_code = trim_non_code_segments(
                    pre_content
                )
                # Check for whitespace gaps.
                # We do this explicitly here rather than relying on an
                # untrimmed match so we can handle _whitespace_ explicitly
                # compared to other non code segments like placeholders.
                if not self.allow_gaps and any(
                    seg.is_whitespace for seg in pre_non_code + post_non_code
                ):
                    return MatchResult.from_unmatched(
                        mutated_segments
                    )  # pragma: no cover TODO?

                with parse_context.deeper_match() as ctx:
                    match, _ = self._longest_trimmed_match(
                        segments=pre_content,
                        matchers=self._elements,
                        parse_context=ctx,
                        # We've already trimmed
                        trim_noncode=False,
                    )
                # No match - Not allowed
                if not match:
                    if self.allow_trailing:
                        # If we reach this point, the lookahead match has hit a
                        # delimiter beyond the scope of this Delimited section.
                        # Trailing delimiters are allowed, so return matched up to
                        # this section.
                        return MatchResult(
                            matched_segments.matched_segments,
                            pre_non_code
                            + match.unmatched_segments
                            + post_non_code
                            + delimiter_match.all_segments(),
                        )
                    else:
                        return MatchResult.from_unmatched(mutated_segments)

                if not match.is_complete():
                    # If we reach this point, the lookahead match has hit a
                    # delimiter beyond the scope of this Delimited section. We
                    # should return a partial match, and the delimiter as unmatched.
                    return MatchResult(
                        matched_segments.matched_segments
                        + pre_non_code
                        + match.matched_segments,
                        match.unmatched_segments
                        + post_non_code
                        + delimiter_match.all_segments(),
                    )

                # We have a complete match!

                # First add the segment up to the delimiter to the matched segments
                matched_segments += (
                    pre_non_code + match.matched_segments + post_non_code
                )
                # Then it depends what we matched.
                # Delimiter
                if delimiter_matcher is self.delimiter:
                    # Then add the delimiter to the matched segments
                    matched_segments += delimiter_match.matched_segments
                    # Break this for loop and move on, looking for the next
                    # delimiter
                    seg_buff = delimiter_match.unmatched_segments
                    # Still got some buffer left. Carry on.
                    continue
                # Terminator (or the gap terminator).
                elif delimiter_matcher is self.terminator or isinstance(
                    delimiter_matcher, NonCodeMatcher
                ):
                    # We just return straight away here. We don't add the terminator
                    # to this match, it should go with the unmatched parts.

                    # First check we've had enough delimiters
                    if (
                        self.min_delimiters
                        and len(delimiters) < self.min_delimiters
                    ):
                        return MatchResult.from_unmatched(mutated_segments)
                    else:
                        return MatchResult(
                            matched_segments.matched_segments,
                            delimiter_match.all_segments(),
                        )
                else:  # pragma: no cover
                    # Defensive: the lookahead only uses the matchers built
                    # above, so anything else indicates an internal error.
                    raise RuntimeError(
                        (
                            "I don't know how I got here. Matched instead on {}, "
                            "which doesn't appear to be delimiter or terminator"
                        ).format(delimiter_matcher)
                    )
            else:
                # Zero length section between delimiters, or zero code
                # elements if appropriate. Return unmatched.
                return MatchResult.from_unmatched(mutated_segments)
        else:
            # No match for a delimiter looking forward, this means we're
            # at the end. In this case we look for a potential partial match
            # looking forward. We know it's a non-zero length section because
            # we checked that up front.

            # First check we're had enough delimiters, because if we haven't then
            # there's no sense to try matching
            if self.min_delimiters and len(delimiters) < self.min_delimiters:
                return MatchResult.from_unmatched(mutated_segments)

            # We use the whitespace padded match to hoover up whitespace if enabled,
            # and default to the longest matcher. We don't care which one matches.
            pre_non_code, trimmed_segments, post_non_code = trim_non_code_segments(
                mutated_segments
            )
            # Check for whitespace gaps.
            # We do this explicitly here rather than relying on an
            # untrimmed match so we can handle _whitespace_ explicitly
            # compared to other non code segments like placeholders.
            if not self.allow_gaps and any(
                seg.is_whitespace for seg in pre_non_code + post_non_code
            ):
                return MatchResult.from_unmatched(
                    mutated_segments
                )  # pragma: no cover TODO?

            with parse_context.deeper_match() as ctx:
                mat, _ = self._longest_trimmed_match(
                    trimmed_segments,
                    self._elements,
                    parse_context=ctx,
                    # We've already trimmed
                    trim_noncode=False,
                )
            if mat:
                # We've got something at the end. Return!
                if mat.unmatched_segments:
                    # We have something unmatched and so we should let it also have
                    # the trailing elements
                    return MatchResult(
                        matched_segments.matched_segments
                        + pre_non_code
                        + mat.matched_segments,
                        mat.unmatched_segments + post_non_code,
                    )
                else:
                    # If there's nothing unmatched in the most recent match, then we
                    # can consume the trailing non code segments
                    return MatchResult.from_matched(
                        matched_segments.matched_segments
                        + pre_non_code
                        + mat.matched_segments
                        + post_non_code,
                    )
            else:
                # No match at the end, are we allowed to trail? If we are then
                # return, otherwise we fail because we can't match the last element.
                if self.allow_trailing:
                    return MatchResult(matched_segments.matched_segments, seg_buff)
                else:
                    return MatchResult.from_unmatched(mutated_segments)