def find_tokens(matcher):
    '''
    Returns a set of Tokens. Also asserts that children of tokens are not
    themselves Tokens. Should we also check that a Token occurs somewhere
    on every path to a leaf node?
    '''
    (tokens, visited, non_tokens) = (set(), set(), set())
    stack = deque([matcher])
    while stack:
        matcher = stack.popleft()
        if matcher not in visited:
            if is_child(matcher, NonToken):
                non_tokens.add(matcher)
            if isinstance(matcher, BaseToken):
                tokens.add(matcher)
                if matcher.content:
                    assert_not_token(matcher.content, visited)
            else:
                for child in matcher:
                    if isinstance(child, Matcher):
                        stack.append(child)
            visited.add(matcher)
    if tokens and non_tokens:
        raise LexerError(
            fmt('The grammar contains a mix of Tokens and non-Token '
                'matchers at the top level. If Tokens are used then '
                'non-token matchers that consume input must only '
                'appear "inside" Tokens. The non-Token matchers '
                'include: {0}.',
                '; '.join(str(n) for n in non_tokens)))
    return tokens

def assert_not_token(node, visited):
    '''
    Assert that neither this nor any child node is a Token.
    '''
    if isinstance(node, Matcher) and node not in visited:
        visited.add(node)
        if isinstance(node, BaseToken):
            raise LexerError(fmt('Nested token: {0}', node))
        else:
            for child in node:
                assert_not_token(child, visited)

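# A minimal, self-contained sketch of the traversal above. ToyMatcher and
# ToyToken are illustrative stand-ins (not LEPL classes): a breadth-first
# walk over a possibly cyclic matcher graph that collects token nodes, does
# not descend into a token's children, and uses `visited` to terminate on
# recursive grammars.

from collections import deque

class ToyMatcher(object):
    def __init__(self, *children):
        self.children = list(children)

class ToyToken(ToyMatcher):
    pass

def toy_find_tokens(root):
    (tokens, visited) = (set(), set())
    stack = deque([root])
    while stack:
        node = stack.popleft()
        if node not in visited:
            visited.add(node)
            if isinstance(node, ToyToken):
                tokens.add(node)            # do not queue a token's children
            else:
                stack.extend(node.children)
    return tokens

# a recursive grammar: a -> b -> a, with a single token hanging off b
a = ToyMatcher()
b = ToyMatcher(a, ToyToken())
a.children.append(b)
assert len(toy_find_tokens(a)) == 1        # terminates despite the cycle
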
def _match(self, stream):
    '''
    On matching we first assert that the token type is correct and then
    delegate to the content.
    '''
    if not self.compiled:
        raise LexerError(
            fmt('A {0} token has not been compiled. '
                'You must use the lexer rewriter with Tokens. '
                'This can be done by using matcher.config.lexer().',
                self.__class__.__name__))
    ((tokens, _), next_stream) = s_next(stream)
    if self.id_ in tokens:
        yield ([], next_stream)

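# A sketch of the check above with a toy token stream: a plain list of
# (token-ids, text) pairs stands in for LEPL's lexed stream, and the names
# below are invented. The token matches when its id appears among the ids
# the lexer attached to the head of the stream; having no content, it
# contributes nothing to the result.

def toy_match_empty_token(id_, stream):
    if stream:
        (ids, _text) = stream[0]
        if id_ in ids:
            yield ([], stream[1:])

toy_stream = [(('NUMBER',), '42'), (('SYMBOL',), '+')]
assert next(toy_match_empty_token('NUMBER', toy_stream)) == ([], toy_stream[1:])
assert list(toy_match_empty_token('SYMBOL', toy_stream)) == []
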
def __to_regexp(regexp, alphabet):
    '''
    The regexp may be a matcher; if so we try to convert it to a regular
    expression and extract the equivalent text.
    '''
    if isinstance(regexp, Matcher):
        rewriter = CompileRegexp(alphabet)
        rewrite = rewriter(regexp)
        if isinstance(rewrite, BaseRegexp):
            regexp = str(rewrite.regexp)
        else:
            raise LexerError(
                fmt('A Token was specified with a matcher, '
                    'but the matcher could not be converted to '
                    'a regular expression: {0}', rewrite))
    return regexp

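# A toy illustration of the conversion idea. The tuples and the function
# below are hypothetical (they are not LEPL's matchers or its CompileRegexp
# rewriter): walk a small matcher description and emit the equivalent
# regular-expression text, failing when no equivalent exists.

import re

def toy_to_regexp(node):
    kind = node[0]
    if kind == 'literal':                  # ('literal', 'abc')
        return re.escape(node[1])
    elif kind == 'any':                    # ('any', 'abc') -> character class
        return '[' + re.escape(node[1]) + ']'
    elif kind == 'repeat':                 # ('repeat', child) -> zero or more
        return '(?:' + toy_to_regexp(node[1]) + ')*'
    else:
        raise ValueError('cannot express {0} as a regexp'.format(kind))

assert toy_to_regexp(('repeat', ('any', 'ab'))) == '(?:[ab])*'
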
def _match(self, stream):
    '''
    On matching we first assert that the token type is correct and then
    delegate to the content.
    '''
    if not self.compiled:
        raise LexerError(
            fmt('A {0} token has not been compiled. '
                'You must use the lexer rewriter with Tokens. '
                'This can be done by using matcher.config.lexer().',
                self.__class__.__name__))
    ((tokens, line_stream), next_stream) = s_next(stream)
    if self.id_ in tokens:
        if self.content is None:
            # result contains all data (use s_next not s_line to set max)
            (line, _) = s_line(line_stream, True)
            (line, _) = s_next(line_stream, count=len(line))
            yield ([line], next_stream)
        else:
            generator = self.content._match(line_stream)
            while True:
                # yield the sub-generator to the trampoline, which runs it
                # and sends back its (result, stream) pair
                (result, next_line_stream) = yield generator
                # accept only if the content consumed the whole token text,
                # unless complete matching was disabled for this token
                if s_empty(next_line_stream) or not self.complete:
                    yield (result, next_stream)

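# The `(result, ...) = yield generator` lines above rely on LEPL's trampoline:
# a matcher generator yields a sub-generator, and the trampoline runs it and
# sends the sub-generator's (result, stream) pair back in. Below is a highly
# simplified, self-contained sketch of that control flow (toy names, first
# result only, no backtracking), not the real trampoline.

import types

def toy_trampoline(root):
    stack, to_send = [root], None
    while stack:
        try:
            yielded = stack[-1].send(to_send)
        except StopIteration:
            return None                     # no match along the chain
        if isinstance(yielded, types.GeneratorType):
            stack.append(yielded)           # delegate to the sub-generator
            to_send = None
        else:
            stack.pop()                     # a (result, stream) pair
            if not stack:
                return yielded              # final result from the root
            to_send = yielded               # resume parent at `yield generator`

def toy_literal(text):
    def match(stream):
        if stream.startswith(text):
            yield ([text], stream[len(text):])
    return match

def toy_pair(first, second):
    def match(stream):
        (result1, stream1) = yield first(stream)    # delegate, get result back
        (result2, stream2) = yield second(stream1)
        yield (result1 + result2, stream2)
    return match

toy_matcher = toy_pair(toy_literal('ab'), toy_literal('cd'))
assert toy_trampoline(toy_matcher('abcd')) == (['ab', 'cd'], '')
assert toy_trampoline(toy_matcher('abxx')) is None
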
def _match(self, stream):
    '''
    On matching we first assert that the token type is correct and then
    delegate to the content.
    '''
    if not self.compiled:
        raise LexerError(
            fmt('A {0} token has not been compiled. '
                'You must use the lexer rewriter with Tokens. '
                'This can be done by using matcher.config.lexer().',
                self.__class__.__name__))
    if stream:
        (tokens, contents) = stream[0]
        if self.id_ in tokens:
            if self.content is None:
                # result contains all data
                yield ([contents], stream[1:])
            else:
                # wrap the token's text in a new stream for the sub-matcher
                new_stream = self.__new_stream(contents, stream)
                generator = self.content._match(new_stream)
                while True:
                    (result, stream_out) = yield generator
                    # accept only if the content consumed all the text,
                    # unless complete matching was disabled
                    if not stream_out or not self.complete:
                        yield (result, stream[1:])

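# A small sketch of the `complete` test in the two loops above (toy code,
# names invented): a sub-match of the token's text is accepted only when it
# consumed all of the text, unless the token was built with complete=False.

def toy_accept(remaining_text, complete=True):
    return (not remaining_text) or (not complete)

assert toy_accept('')                       # content consumed the whole token
assert not toy_accept('3')                  # leftover text: partial match rejected
assert toy_accept('3', complete=False)      # allowed when complete=False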