def _add_token(self, token: _TokenT) -> None:
    """
    This is the only core function for parsing. Here happens basically
    everything. Everything is well prepared by the parser generator and we
    only apply the necessary steps here.

    Raises ``ParserSyntaxError`` when the token cannot be shifted:
    "incomplete input" when the current DFA has no transition and isn't
    final, "too much input" when the stack is already empty, and
    "internal error" when popping (convert_nonterminal) itself fails.
    """
    grammar = self._pgen_grammar
    stack = self.stack
    # pyre-fixme[6]: Expected `_TokenTypeT` for 2nd param but got `TokenType`.
    transition = _token_to_transition(grammar, token.type, token.string)

    while True:
        try:
            plan = stack[-1].dfa.transitions[transition]
            break
        except KeyError:
            if stack[-1].dfa.is_final:
                try:
                    self._pop()
                except Exception as ex:
                    # convert_nonterminal may fail, try to recover enough to at
                    # least tell us where in the file it failed.
                    #
                    # FIX: bind the exception and chain it (`from ex`) so the
                    # real failure is preserved as __cause__ instead of being
                    # silently replaced by the generic "internal error".
                    raise ParserSyntaxError(
                        message="internal error",
                        encountered="",  # TODO: encountered/expected are nonsense
                        expected=[],
                        pos=token.start_pos,
                        lines=self.lines,
                    ) from ex
            else:
                # The DFA isn't final and has no arc for this token: the input
                # stopped satisfying the grammar mid-production.
                raise ParserSyntaxError(
                    message="incomplete input",
                    encountered=token.string,
                    expected=list(stack[-1].dfa.arcs.keys()),
                    pos=token.start_pos,
                    lines=self.lines,
                )
        except IndexError:
            # stack[-1] failed: everything was already reduced, yet more
            # tokens arrived.
            raise ParserSyntaxError(
                message="too much input",
                encountered=token.string,
                expected=None,  # EOF
                pos=token.start_pos,
                lines=self.lines,
            )

    # Logically, `plan` is always defined, but pyre can't reasonably
    # determine that.
    # pyre-fixme[18]: Global name `plan` is undefined.
    stack[-1].dfa = plan.next_dfa
    for push in plan.dfa_pushes:
        stack.append(StackNode(push))
    leaf = self.convert_terminal(token)
    stack[-1].nodes.append(leaf)
def parse(self) -> _NodeT:
    """
    Feed every token to the parser, then unwind the stack and return the
    root node produced by ``convert_nonterminal``.

    Raises ``Exception`` on a second call (parsers are single-use) and
    ``ParserSyntaxError`` when the input ends mid-production.
    """
    # Ensure that we don't re-use parsers.
    if self.__was_parse_called:
        raise Exception("Each parser object may only be used to parse once.")
    self.__was_parse_called = True

    for token in self.tokens:
        self._add_token(token)

    # Reduce the stack to a single node; every DFA we pop through must be in
    # an accepting (final) state, otherwise the input ended too early.
    while True:
        top = self.stack[-1]
        if not top.dfa.is_final:
            expected_str = get_expected_str(
                EOFSentinel.EOF, top.dfa.transitions.keys()
            )
            raise ParserSyntaxError(
                f"Incomplete input. {expected_str}",
                lines=self.lines,
                raw_line=len(self.lines),
                raw_column=len(self.lines[-1]),
            )
        if len(self.stack) == 1:
            return self.convert_nonterminal(top.nonterminal, top.nodes)
        self._pop()
def parse(self) -> _NodeT:
    """
    Drive the parser: shift every token, then collapse the parse stack into
    the root nonterminal node.
    """
    # Parsers are one-shot objects; a second parse() is a programming error.
    if self.__was_parse_called:
        raise Exception("Each parser object may only be used to parse once.")
    self.__was_parse_called = True

    for tok in self.tokens:
        self._add_token(tok)

    while True:
        top = self.stack[-1]
        if not top.dfa.is_final:
            # We never broke out -- EOF is too soon -- Unfinished statement.
            raise ParserSyntaxError(
                message="incomplete input",
                encountered=None,
                expected=list(top.dfa.arcs.keys()),
                pos=(len(self.lines), len(self.lines[-1])),
                lines=self.lines,
            )
        if len(self.stack) == 1:
            return self.convert_nonterminal(top.nonterminal, top.nodes)
        self._pop()
class ExceptionsTest(UnitTest):
    # Verifies ParserSyntaxError.__str__ rendering: header with message and
    # editor position, the encountered/expected sentence, the offending
    # source line, and a caret marking the column.
    @data_provider(
        [
            (
                # EOF case: `encountered=None` / `expected=None` act as EOF
                # sentinels, rendered as "end of file (EOF)".
                ParserSyntaxError(
                    message="some message",
                    encountered=None,  # EOF
                    expected=None,  # EOF
                    pos=(1, 0),
                    lines=["abcd"],
                ),
                dedent(
                    """
                    Syntax Error: some message @ 1:1.
                    Encountered end of file (EOF), but expected end of file (EOF).
                    abcd
                    ^
                    """
                ).strip(),
            ),
            (
                # Tab-handling case: raw column 2 inside "\tabcd" renders as
                # editor column 10 (tab expansion), and the displayed line is
                # shown without the leading tab.
                # NOTE(review): caret indentation inside the dedent literal is
                # reconstructed from a whitespace-mangled source — confirm the
                # exact expected rendering against ParserSyntaxError.__str__.
                ParserSyntaxError(
                    message="some message",
                    encountered="encountered_value",
                    expected=["expected_value"],
                    pos=(1, 2),
                    lines=["\tabcd\r\n"],
                ),
                dedent(
                    """
                    Syntax Error: some message @ 1:10.
                    Encountered 'encountered_value', but expected one of ['expected_value'].
                    abcd
                     ^
                    """
                ).strip(),
            ),
        ]
    )
    def test_parser_syntax_error_str(
        self, err: ParserSyntaxError, expected: str
    ) -> None:
        # str(err) must match the pre-rendered expected block exactly.
        self.assertEqual(str(err), expected)
def parse(self):
    """
    Consume all tokens, then pop the stack down to a single node and return
    the converted root nonterminal.
    """
    for tok in self.tokens:
        self._add_token(tok)

    while True:
        top = self.stack[-1]
        if not top.dfa.is_final:
            # Tokens ran out while a production was still open.
            expected_str = get_expected_str(
                EOFSentinel.EOF, top.dfa.transitions.keys()
            )
            raise ParserSyntaxError(
                f"{expected_str}",
                lines=self.lines,
                raw_line=len(self.lines),
                raw_column=len(self.lines[-1]),
            )
        if len(self.stack) == 1:
            return self.convert_nonterminal(top.nonterminal, top.nodes)
        self._pop()
def _convert_token(  # noqa: C901: too complex
    state: _TokenizeState, curr_token: OrigToken, next_token: Optional[OrigToken]
) -> Token:
    """
    Convert one raw tokenizer token into our richer ``Token``, threading
    whitespace state between consecutive tokens via ``state``.

    Mutates ``state`` (indents, parenthesis/f-string stack, and
    ``previous_whitespace_state``). Raises ``ParserSyntaxError`` for error
    tokens, inconsistent dedents, and unmatched closing braces.
    ``next_token`` is peeked at only to maintain the indent stack.
    """
    ct_type = curr_token.type
    ct_string = curr_token.string
    ct_start_pos = curr_token.start_pos

    # Fail fast on tokens the tokenizer itself flagged as invalid.
    if ct_type is _ERRORTOKEN:
        raise ParserSyntaxError(
            f"{ct_string!r} is not a valid token.",
            lines=state.lines,
            raw_line=ct_start_pos[0],
            raw_column=ct_start_pos[1],
        )
    if ct_type is _ERROR_DEDENT:
        raise ParserSyntaxError(
            "Inconsistent indentation. Expected a dedent.",
            lines=state.lines,
            raw_line=ct_start_pos[0],
            raw_column=ct_start_pos[1],
        )

    # Compute relative indent changes for indent/dedent nodes
    relative_indent: Optional[str] = None
    if ct_type is _INDENT:
        # The new indent is the previous indent plus some suffix; keep only
        # the suffix that this INDENT token added.
        old_indent = "" if len(state.indents) < 2 else state.indents[-2]
        new_indent = state.indents[-1]
        relative_indent = new_indent[len(old_indent) :]

    if next_token is not None:
        # Peek ahead to keep the indent stack in sync before the next token
        # is processed.
        nt_type = next_token.type
        if nt_type is _INDENT:
            nt_line, nt_column = next_token.start_pos
            state.indents.append(state.lines[nt_line - 1][:nt_column])
        elif nt_type is _DEDENT:
            state.indents.pop()

    whitespace_before = state.previous_whitespace_state

    if ct_type is _INDENT or ct_type is _DEDENT or ct_type is _ENDMARKER:
        # Don't update whitespace state for these dummy tokens. This makes it possible
        # to partially parse whitespace for IndentedBlock footers, and then parse the
        # rest of the whitespace in the following statement's leading_lines.
        # Unfortunately, that means that the indentation is either wrong for the footer
        # comments, or for the next line. We've chosen to allow it to be wrong for the
        # IndentedBlock footer and manually override the state when parsing whitespace
        # in that particular node.
        whitespace_after = whitespace_before
        ct_end_pos = ct_start_pos
    else:
        # Not a dummy token, so update the whitespace state.

        # Compute our own end_pos, since parso's end_pos is wrong for triple-strings.
        lines = split_lines(ct_string)
        if len(lines) > 1:
            # Multi-line token: end line advances, end column restarts on the
            # last line.
            ct_end_pos = ct_start_pos[0] + len(lines) - 1, len(lines[-1])
        else:
            ct_end_pos = (ct_start_pos[0], ct_start_pos[1] + len(ct_string))

        # Figure out what mode the whitespace parser should use. If we're inside
        # parentheses, certain whitespace (e.g. newlines) are allowed where they would
        # otherwise not be. f-strings override and disable this behavior, however.
        #
        # Parso's tokenizer tracks this internally, but doesn't expose it, so we have to
        # duplicate that logic here.
        pof_stack = state.parenthesis_or_fstring_stack
        try:
            if ct_type is _FSTRING_START:
                pof_stack.append(_FSTRING_STACK_ENTRY)
            elif ct_type is _FSTRING_END:
                pof_stack.pop()
            elif ct_type is _OP:
                if ct_string in "([{":
                    pof_stack.append(_PARENTHESIS_STACK_ENTRY)
                elif ct_string in ")]}":
                    pof_stack.pop()
        except IndexError:
            # pof_stack may be empty by the time we need to read from it due to
            # mismatched braces.
            raise ParserSyntaxError(
                "Encountered a closing brace without a matching opening brace.",
                lines=state.lines,
                raw_line=ct_start_pos[0],
                raw_column=ct_start_pos[1],
            )
        is_parenthesized = (
            len(pof_stack) > 0 and pof_stack[-1] == _PARENTHESIS_STACK_ENTRY
        )
        whitespace_after = WhitespaceState(
            ct_end_pos[0], ct_end_pos[1], state.indents[-1], is_parenthesized
        )

    # Hold onto whitespace_after, so we can use it as whitespace_before in the next
    # node.
    state.previous_whitespace_state = whitespace_after

    return Token(
        ct_type,
        ct_string,
        ct_start_pos,
        ct_end_pos,
        whitespace_before,
        whitespace_after,
        relative_indent,
    )
def _add_token(self, token: _TokenT) -> None:
    """
    This is the only core function for parsing. Here happens basically
    everything. Everything is well prepared by the parser generator and we
    only apply the necessary steps here.

    Raises ``ParserSyntaxError`` when the token cannot be consumed:
    "Incomplete input" when no transition exists and the DFA isn't final,
    "Too much input" when tokens remain after the stack has emptied, and an
    upconverted/internal error when popping (convert_nonterminal) fails.
    """
    grammar = self._pgen_grammar
    stack = self.stack
    # pyre-fixme[6]: Expected `_TokenTypeT` for 2nd param but got `TokenType`.
    transition = _token_to_transition(grammar, token.type, token.string)

    while True:
        try:
            plan = stack[-1].dfa.transitions[transition]
            break
        except KeyError:
            if stack[-1].dfa.is_final:
                try:
                    self._pop()
                except PartialParserSyntaxError as ex:
                    # Upconvert the PartialParserSyntaxError to a ParserSyntaxError
                    # by backfilling the line/column information.
                    #
                    # FIX: chain with `from ex` so the partial error remains
                    # visible as __cause__ in tracebacks.
                    raise ParserSyntaxError(
                        ex.message,
                        lines=self.lines,
                        raw_line=token.start_pos[0],
                        raw_column=token.start_pos[1],
                    ) from ex
                except Exception as ex:
                    # convert_nonterminal may fail due to a bug in our code. Try to
                    # recover enough to at least tell us where in the file it
                    # failed.
                    #
                    # FIX: chain with `from ex` so the original bug's traceback
                    # is preserved instead of only its str() in the message.
                    raise ParserSyntaxError(
                        f"Internal error: {ex}",
                        lines=self.lines,
                        raw_line=token.start_pos[0],
                        raw_column=token.start_pos[1],
                    ) from ex
            else:
                # We never broke out -- EOF is too soon -- Unfinished statement.
                #
                # BUG: The `expected_str` may not be complete because we already
                # popped the other possibilities off the stack at this point, but
                # it still seems useful to list some of the possibilities that we
                # could've expected.
                expected_str = get_expected_str(
                    token, stack[-1].dfa.transitions.keys()
                )
                raise ParserSyntaxError(
                    f"Incomplete input. {expected_str}",
                    lines=self.lines,
                    raw_line=token.start_pos[0],
                    raw_column=token.start_pos[1],
                )
        except IndexError:
            # I don't think this will ever happen with Python's grammar, because if
            # there are any extra tokens at the end of the input, we'll instead
            # complain that we expected ENDMARKER.
            #
            # However, let's leave it just in case.
            expected_str = get_expected_str(token, EOFSentinel.EOF)
            raise ParserSyntaxError(
                f"Too much input. {expected_str}",
                lines=self.lines,
                raw_line=token.start_pos[0],
                raw_column=token.start_pos[1],
            )

    # Logically, `plan` is always defined, but pyre can't reasonably
    # determine that.
    # pyre-fixme[18]: Global name `plan` is undefined.
    stack[-1].dfa = plan.next_dfa
    for push in plan.dfa_pushes:
        stack.append(StackNode(push))
    leaf = self.convert_terminal(token)
    stack[-1].nodes.append(leaf)