    def testNegativeLookahead(self):
        tokenize = lexer.LexicalGrammar('a b')
        rules = {
            'goal': [
                [LookaheadRule(frozenset({'a'}), False), 'abs'],
            ],
            'abs': [
                ['a'],
                ['b'],
                ['abs', 'a'],
                ['abs', 'b'],
            ],
        }

        parse = gen.compile(Grammar(rules))
        self.assertRaisesRegex(SyntaxError,
                               r"expected 'b', got 'a'",
                               lambda: parse(tokenize, "a b"))
        self.assertEqual(
            parse(tokenize, 'b a'),
            ('goal', ('abs 2', 'b', 'a')))

        # In simple cases like this, the lookahead restriction can even
        # disambiguate a grammar that would otherwise be ambiguous.
        rules['goal'].append(prod(['a'], 'goal_a'))
        parse = gen.compile(Grammar(rules))
        self.assertEqual(
            parse(tokenize, 'a'),
            ('goal_a', 'a'))
    def disabledNegativeLookaheadDisambiguation(self):
        tokenize = lexer.LexicalGrammar(
            '( ) { } ; function =',
            IDENT=r'[A-Za-z_][A-Za-z_0-9]*')
        grammar = Grammar({
            'stmts': [
                ['stmt'],
                ['stmts', 'stmt'],
            ],
            'stmt': [
                [LookaheadRule(set=frozenset({'function'}), positive=False),
                 'expr', ';'],
                ['fndecl'],
            ],
            'fndecl': [
                ['function', 'IDENT', '(', ')', '{', Optional('stmt'), '}'],
            ],
            'expr': [
                ['term'],
                ['IDENT', '=', 'expr'],
            ],
            'term': [
                ['(', 'expr', ')'],
                ['fndecl'],
                ['term', '(', 'expr', ')'],
            ],
        })
        parse = gen.compile(grammar)

        # Test that without the lookahead restriction, we reject this grammar
        # (it's ambiguous):
        del grammar['stmt'][0][0]
        self.assertRaisesRegex(ValueError,
                               'banana',
                               lambda: gen.compile(grammar))

        self.assertEqual(
            parse(tokenize, 'function f() { x = function y() {}; }'),
            ('stmt', 1,
             ('fndecl', 'function', 'f', '(', ')', '{',
              ('stmt', 0,
               ('expr', 1, 'x', '=',
                ('expr', 0,
                 ('term', 1,
                  ('fndecl', 'function', 'y', '(', ')', '{', None, '}')))),
               ';'))))

        self.assertEqual(
            parse(tokenize, '(function g(){});'),
            ('stmts', 0,
             ('stmt', 0,
              ('term', 1,
               ('fndecl', 'function', 'g', '(', ')', '{', None, '}')),
              ';')))
    def testList(self):
        list_grammar = Grammar({
            'prelist': [
                ['word', 'list'],
            ],
            'list': [
                ['word'],
                ['list', 'word'],
            ],
            'word': [
                ['SYMBOL'],
            ],
        })
        parse = gen.compile(list_grammar)
        self.assertEqual(
            parse(LispTokenizer, "the quick brown fox jumped over the lazy dog"),
            ('prelist', 'the',
             ('list 1',
              ('list 1',
               ('list 1',
                ('list 1',
                 ('list 1',
                  ('list 1',
                   ('list 1', 'quick', 'brown'),
                   'fox'),
                  'jumped'),
                 'over'),
                'the'),
               'lazy'),
              'dog')))
    def testDeepRecursion(self):
        grammar = Grammar({
            'expr': [
                ['SYMBOL'],
                ['(', ')'],
                ['(', 'exprs', ')'],
            ],
            'exprs': [
                ['expr'],
                ['exprs', 'expr'],
            ],
        })
        parse = gen.compile(grammar)

        N = 3000
        s = "x"
        t = ('expr 0', 'x')
        for i in range(N):
            s = "(" + s + ")"
            t = ('expr 2', '(', t, ')')

        result = parse(LispTokenizer, s)

        # Python can't check that result == t; it causes a RecursionError.
        # Testing that repr(result) == repr(t), same deal. So:
        for i in range(N):
            self.assertIsInstance(result, tuple)
            self.assertEqual(len(result), 4)
            self.assertEqual(result[0], 'expr 2')
            self.assertEqual(result[1], '(')
            self.assertEqual(result[3], ')')
            result = result[2]
    def testSimple(self):
        grammar = Grammar({
            'expr': [
                ['SYMBOL'],
                ['(', 'tail'],
            ],
            'tail': [
                [')'],
                ['expr', 'tail'],
            ],
        })
        parse = gen.compile(grammar)

        parsed = parse(LispTokenizer, "(lambda (x) (* x x))")
        self.assertEqual(
            parsed,
            ('expr 1', '(',
             ('tail 1', 'lambda',
              ('tail 1',
               ('expr 1', '(', ('tail 1', 'x', ')')),
               ('tail 1',
                ('expr 1', '(',
                 ('tail 1', '*',
                  ('tail 1', 'x',
                   ('tail 1', 'x', ')')))),
                ')')))))
    def testLeftFactorMultiLevel(self):
        """Test left-factoring again on a nonterminal introduced by left-factoring."""
        tokenize = lexer.LexicalGrammar("FOR IN TO BY ( ) = ;",
                                        VAR=r'[A-Za-z]+')

        # The first left-factoring pass on `stmt` will left-factor `FOR ( VAR`.
        # A second pass is needed to left-factor `= expr TO expr`.
        grammar = Grammar({
            'stmt': [
                ['expr', ';'],
                ['FOR', '(', 'VAR', 'IN', 'expr', ')', 'stmt'],
                ['FOR', '(', 'VAR', '=', 'expr', 'TO', 'expr', ')', 'stmt'],
                ['FOR', '(', 'VAR', '=', 'expr', 'TO', 'expr',
                 'BY', 'expr', ')', 'stmt'],
                ['IF', '(', 'expr', ')', 'stmt'],
            ],
            'expr': [
                ['VAR'],
            ],
        })
        parse = gen.compile(grammar)

        self.assertEqual(
            parse(tokenize, "FOR (x IN y) z;"),
            ('stmt 1', 'FOR', '(', 'x', 'IN', 'y', ')',
             ('stmt 0', 'z', ';')))
        self.assertEqual(
            parse(tokenize, "FOR (x = y TO z) x;"),
            ('stmt 2', 'FOR', '(', 'x', '=', 'y', 'TO', 'z', ')',
             ('stmt 0', 'x', ';')))
        self.assertEqual(
            parse(tokenize, "FOR (x = y TO z BY w) x;"),
            ('stmt 3', 'FOR', '(', 'x', '=', 'y', 'TO', 'z', 'BY', 'w', ')',
             ('stmt 0', 'x', ';')))
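    # For reference, a rough sketch of what two rounds of left-factoring could
    # produce for the FOR productions above. This is illustrative only; the
    # generator's actual internal nonterminal names and ordering will differ.
    #
    #   stmt     ::= expr ';'
    #              | 'FOR' '(' 'VAR' for_tail
    #              | 'IF' '(' expr ')' stmt
    #   for_tail ::= 'IN' expr ')' stmt            # factored out: FOR ( VAR
    #              | '=' expr 'TO' expr to_tail
    #   to_tail  ::= ')' stmt                      # factored out: = expr TO expr
    #              | 'BY' expr ')' stmt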
    def testTrailingLookahead(self):
        """Lookahead at the end of a production is banned."""
        grammar = gen.Grammar({
            'stmt': [
                ['OTHER', ';'],
                ['IF', '(', 'X', ')', 'stmt',
                 LookaheadRule(frozenset({'ELSE'}), False)],
                ['IF', '(', 'X', ')', 'stmt', 'ELSE', 'stmt'],
            ],
        })
        self.assertRaisesRegex(
            ValueError,
            r"invalid grammar: lookahead restriction at end of production",
            lambda: gen.compile(grammar))
    def testLeftFactorMulti(self):
        """Test left-factoring with common prefix of length >1."""
        tokenize = lexer.LexicalGrammar("A B C D E")
        grammar = Grammar({
            'goal': [
                ['A', 'B', 'C', 'D'],
                ['A', 'B', 'C', 'E'],
            ],
        })
        parse = gen.compile(grammar)
        self.assertEqual(
            parse(tokenize, "A B C D"),
            ('goal 0', 'A', 'B', 'C', 'D'))
        self.assertEqual(
            parse(tokenize, "A B C E"),
            ('goal 1', 'A', 'B', 'C', 'E'))
    def compile_as_js(
            self,
            grammar_source: str,
            goals: typing.Optional[typing.Iterable[str]] = None,
            verbose: bool = False,
    ) -> None:
        """Like self.compile(), but generate a parser from ESGrammar,
        with ASI support, using the JS lexer.
        """
        from js_parser.lexer import JSLexer
        from js_parser import load_es_grammar
        from js_parser import generate_js_parser_tables

        grammar = parse_esgrammar(
            grammar_source,
            filename="es-simplified.esgrammar",
            extensions=[],
            goals=goals,
            synthetic_terminals=load_es_grammar.ECMASCRIPT_SYNTHETIC_TERMINALS,
            terminal_names=load_es_grammar.TERMINAL_NAMES_FOR_SYNTACTIC_GRAMMAR)
        grammar = generate_js_parser_tables.hack_grammar(grammar)
        base_parser_class = gen.compile(grammar, verbose=verbose)

        # "type: ignore" because poor mypy can't cope with the runtime codegen
        # we're doing here.
        class JSParser(base_parser_class):  # type: ignore
            def __init__(self, goal='Script', builder=None):
                super().__init__(goal, builder)
                self._goal = goal
                # self.debug = True

            def clone(self):
                return JSParser(self._goal, self.methods)

            def on_recover(self, error_code, lexer, stv):
                """Check that ASI error recovery is really acceptable."""
                if error_code == 'asi':
                    if (not self.closed and stv.term != '}'
                            and not lexer.saw_line_terminator()):
                        lexer.throw("missing semicolon")
                else:
                    assert error_code == 'do_while_asi'

        self.tokenize = JSLexer
        self.parser_class = JSParser
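    # Illustrative example of the ASI check in on_recover() above; the JS
    # snippets are hypothetical test inputs, not part of this file.
    #
    #     x = f()        // a line terminator precedes the next token, so the
    #     g()            // 'asi' recovery is allowed: parses as two statements.
    #
    #     x = f() g()    // no line terminator, and the next token is not '}',
    #                    // so the lexer throws "missing semicolon" instead.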
    def testArithmetic(self):
        tokenize = lexer.LexicalGrammar(
            "+ - * / ( )",
            NUM=r'[0-9]\w*',
            VAR=r'[A-Za-z]\w*')
        arith_grammar = Grammar({
            'expr': [
                ['term'],
                ['expr', '+', 'term'],
                ['expr', '-', 'term'],
            ],
            'term': [
                ['prim'],
                ['term', '*', 'prim'],
                ['term', '/', 'prim'],
            ],
            'prim': [
                ['NUM'],
                ['VAR'],
                ['(', 'expr', ')'],
            ],
        })
        parse = gen.compile(arith_grammar)

        self.assertEqual(
            parse(tokenize, '2 * 3 + 4 * (5 + 7)'),
            ('expr 1',
             ('term 1', '2', '*', '3'),
             '+',
             ('term 1', '4', '*',
              ('prim 2', '(', ('expr 1', '5', '+', '7'), ')'))))

        self.assertRaisesRegex(
            SyntaxError,
            r"unexpected end of input",
            lambda: parse(tokenize, "("))
        self.assertRaisesRegex(
            SyntaxError,
            r"expected one of \['\(', 'NUM', 'VAR'], got '\)'",
            lambda: parse(tokenize, ")"))
    def testOptionalEmpty(self):
        tokenize = lexer.LexicalGrammar("X Y")
        grammar = Grammar({
            'a': [
                [Optional('b'), Optional('c')],
            ],
            'b': [
                prod(['X'], 'b'),
            ],
            'c': [
                prod(['Y'], 'c'),
            ],
        })
        parse = gen.compile(grammar)
        self.assertEqual(parse(tokenize, ""), ('a', None, None))
        self.assertEqual(parse(tokenize, "X"), ('a', ('b', 'X'), None))
        self.assertEqual(parse(tokenize, "Y"), ('a', None, ('c', 'Y')))
        self.assertEqual(parse(tokenize, "X Y"), ('a', ('b', 'X'), ('c', 'Y')))
    def testOptional(self):
        tokenize = lexer.LexicalGrammar('[ ] , X')
        grammar = Grammar({
            'array': [
                ['[', Optional('elision'), ']'],
                ['[', 'elements', ']'],
                ['[', 'elements', ',', Optional('elision'), ']'],
            ],
            'elements': [
                [Optional('elision'), 'X'],
                ['elements', ',', Optional('elision'), 'X'],
            ],
            'elision': [
                [','],
                ['elision', ','],
            ],
        })
        parse = gen.compile(grammar)
        self.assertEqual(parse(tokenize, "[]"),
                         ('array 0', '[', None, ']'))
        self.assertEqual(parse(tokenize, "[,]"),
                         ('array 0', '[', ',', ']'))
        self.assertEqual(
            parse(tokenize, "[,,X,,X,]"),
            ('array 2',
             '[',
             ('elements 1',
              ('elements 0',
               ('elision 1', ',', ','),
               'X'),
              ',',
              ',',
              'X'),
             ',',
             None,
             ']'))
    def compile(self, tokenize, grammar):
        """Compile a grammar. Use this when you expect compilation to succeed."""
        self.tokenize = tokenize
        self.parse = gen.compile(grammar)
    # prose not wrapped in square brackets
    # To avoid conflict with the `>` token, this is recognized only after a space.
    PROSE=r'(?<= )>[^\n]*',

    # prose wrapped in square brackets
    WPROSE=r'\[>[^]]*\]',

    # expression denoting a matched terminal or nonterminal
    MATCH_REF=r'\$(?:0|[1-9][0-9]*)',

    # Rust-style line comments
    RUSTCOMMENT=r'//.*\n',
)

ESGrammarParser = gen.compile(
    parse_pgen.load_grammar(
        os.path.join(os.path.dirname(__file__), "esgrammar.pgen")))

SIGIL_FALSE = '~'
SIGIL_TRUE = '+'

# Abbreviations for single-character terminals, used in the lexical grammar.
ECMASCRIPT_CODE_POINTS = {
    # From <https://tc39.es/ecma262/#table-31>
    '<ZWNJ>': grammar.Literal('\u200c'),
    '<ZWJ>': grammar.Literal('\u200d'),
    '<ZWNBSP>': grammar.Literal('\ufeff'),

    # From <https://tc39.es/ecma262/#table-32>
    '<TAB>': grammar.Literal('\t'),
    '<VT>': grammar.Literal('\u000b'),
    # nonterminals wrapped in vertical bars for no apparent reason
    NTALT=r'\|[A-Z]\w+\|',

    # the spec also gives a few productions names
    PRODID=r'#[A-Za-z]\w*',

    # prose to the end of the line
    PROSE=r'>.*',

    # prose wrapped in square brackets
    WPROSE=r'\[>[^]]*\]',
)

parse_esgrammar_generic = gen.compile(
    parse_pgen.load_grammar(
        os.path.join(os.path.dirname(__file__), "esgrammar.pgen")))

SIGIL_FALSE = '~'
SIGIL_TRUE = '+'

# Productions like
#
#     Expression : AssignmentExpression
#     PrimaryExpression : ArrayLiteral
#     Statement : IfStatement
#
# should not cause an extra method call; the action for each of these
# productions should be `$0`, i.e. just return the right-hand side unchanged.
# Then type inference will make sure that the two nonterminals (Statement and
# IfStatement, for example) are given the same type.
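# Worked example of the rule above (illustrative grammar notation, not output
# from this module): in a chain such as
#
#     Statement : IfStatement
#     IfStatement : `if` `(` Expression `)` Statement
#
# only the second production triggers a builder method call. The first gets
# the pass-through action `$0`, so the IfStatement node produced below it is
# reused unchanged in Statement position, and type inference assigns Statement
# and IfStatement the same type.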