Example #1
    def testNegativeLookahead(self):
        tokenize = lexer.LexicalGrammar('a b')
        rules = {
            'goal': [
                [LookaheadRule(frozenset({'a'}), False), 'abs'],
            ],
            'abs': [
                ['a'],
                ['b'],
                ['abs', 'a'],
                ['abs', 'b'],
            ],
        }

        parse = gen.compile(Grammar(rules))
        self.assertRaisesRegex(SyntaxError,
                               r"expected 'b', got 'a'",
                               lambda: parse(tokenize, "a b"))
        self.assertEqual(
            parse(tokenize, 'b a'),
            ('goal', ('abs 2', 'b', 'a'))
        )

        # In simple cases like this, the lookahead restriction can even
        # disambiguate a grammar that would otherwise be ambiguous.
        rules['goal'].append(prod(['a'], 'goal_a'))
        parse = gen.compile(Grammar(rules))
        self.assertEqual(
            parse(tokenize, 'a'),
            ('goal_a', 'a')
        )
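A note on the conventions visible in these assertions (inferred from the output, not from separate documentation): each parse result is a tuple whose first element names the nonterminal and the index of the production that matched, so ('abs 2', 'b', 'a') is production 2 of 'abs', i.e. ['abs', 'a']; a nonterminal with only one production, like 'goal' before the append, is tagged with its bare name. The prod(['a'], 'goal_a') helper appears to attach an explicit name to the new production, which is why the final assertion yields ('goal_a', 'a') rather than an indexed tag.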
Example #2
    def disabledNegativeLookaheadDisambiguation(self):
        tokenize = lexer.LexicalGrammar(
            '( ) { } ; function =',
            IDENT=r'[A-Za-z_][A-Za-z_0-9]*')
        grammar = Grammar({
            'stmts': [
                ['stmt'],
                ['stmts', 'stmt'],
            ],
            'stmt': [
                [LookaheadRule(set=frozenset({'function'}), positive=False),
                 'expr', ';'],
                ['fndecl'],
            ],
            'fndecl': [
                ['function', 'IDENT', '(', ')', '{', Optional('stmt'), '}'],
            ],
            'expr': [
                ['term'],
                ['IDENT', '=', 'expr'],
            ],
            'term': [
                ['(', 'expr', ')'],
                ['fndecl'],
                ['term', '(', 'expr', ')'],
            ],
        })
        parse = gen.compile(grammar)

        # Test that without the lookahead restriction, we reject this grammar
        # as ambiguous: a statement beginning with `function` could be either
        # a function declaration or the start of an expression statement.
        del grammar['stmt'][0][0]
        self.assertRaisesRegex(ValueError,
                               'banana',
                               lambda: gen.compile(grammar))

        self.assertEqual(
            parse(tokenize, 'function f() { x = function y() {}; }'),
            ('stmt', 1,
                ('fndecl',
                    'function', 'f', '(', ')', '{',
                    ('stmt', 0,
                        ('expr', 1,
                            'x',
                            '=',
                            ('expr', 0,
                                ('term', 1,
                                    ('fndecl',
                                        'function', 'y', '(', ')',
                                        '{', None, '}')))),
                        ';'))))

        self.assertEqual(
            parse(tokenize, '(function g(){});'),
            ('stmts', 0,
                ('stmt', 0,
                    ('term', 1,
                        ('fndecl',
                            'function', 'g', '(', ')', '{', None, '}')),
                    ';')))
Example #3
 def testList(self):
     list_grammar = Grammar({
         'prelist': [
             ['word', 'list']
         ],
         'list': [
             ['word'],
             ['list', 'word'],
         ],
         'word': [
             ['SYMBOL']
         ],
     })
     parse = gen.compile(list_grammar)
     self.assertEqual(
         parse(LispTokenizer,
               "the quick brown fox jumped over the lazy dog"),
         ('prelist',
             'the',
             ('list 1',
                 ('list 1',
                     ('list 1',
                         ('list 1',
                             ('list 1',
                                 ('list 1',
                                     ('list 1',
                                         'quick',
                                         'brown'),
                                     'fox'),
                                 'jumped'),
                             'over'),
                         'the'),
                     'lazy'),
                 'dog')))
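The shape of this tree falls out of the left recursion in 'list' ('list' -> 'list' 'word'): each reduction wraps the list parsed so far as the left child and appends one word on the right, so the earliest words end up deepest in the nesting and the final word, 'dog', sits at the top.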
Example #4
    def testDeepRecursion(self):
        grammar = Grammar({
            'expr': [
                ['SYMBOL'],
                ['(', ')'],
                ['(', 'exprs', ')'],
            ],
            'exprs': [
                ['expr'],
                ['exprs', 'expr'],
            ],
        })
        parse = gen.compile(grammar)

        N = 3000
        s = "x"
        t = ('expr 0', 'x')
        for i in range(N):
            s = "(" + s + ")"
            t = ('expr 2', '(', t, ')')

        result = parse(LispTokenizer, s)

        # Comparing result == t directly raises RecursionError, and comparing
        # repr(result) == repr(t) has the same problem. So unroll the check
        # (an iterative alternative is sketched after this example):
        for i in range(N):
            self.assertIsInstance(result, tuple)
            self.assertEqual(len(result), 4)
            self.assertEqual(result[0], 'expr 2')
            self.assertEqual(result[1], '(')
            self.assertEqual(result[3], ')')
            result = result[2]
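Since the nesting here runs strictly along result[2], the hand-unrolled loop above is sufficient. A more general alternative, sketched below, is an explicit-stack comparison helper; this is not part of the original test, but it avoids RecursionError for any nesting shape:

    def assertDeepTupleEqual(self, actual, expected):
        # Hypothetical helper: compare nested tuples iteratively rather than
        # recursively, so deeply nested parse trees don't exhaust the stack.
        stack = [(actual, expected)]
        while stack:
            a, b = stack.pop()
            if isinstance(a, tuple) and isinstance(b, tuple):
                self.assertEqual(len(a), len(b))
                stack.extend(zip(a, b))
            else:
                self.assertEqual(a, b)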
Example #5
    def testSimple(self):
        grammar = Grammar({
            'expr': [
                ['SYMBOL'],
                ['(', 'tail'],
            ],
            'tail': [
                [')'],
                ['expr', 'tail'],
            ],
        })
        parse = gen.compile(grammar)

        parsed = parse(LispTokenizer, "(lambda (x) (* x x))")
        self.assertEqual(
            parsed,
            ('expr 1',
                '(',
                ('tail 1',
                    'lambda',
                    ('tail 1',
                        ('expr 1', '(', ('tail 1', 'x', ')')),
                        ('tail 1',
                            ('expr 1',
                                '(',
                                ('tail 1',
                                    '*',
                                    ('tail 1',
                                        'x',
                                        ('tail 1', 'x', ')')))),
                            ')')))))
Example #6
    def testLeftFactorMultiLevel(self):
        """Test left-factoring again on a nonterminal introduced by
        left-factoring."""
        tokenize = lexer.LexicalGrammar("FOR IN TO BY ( ) = ;",
                                        VAR=r'[A-Za-z]+')

        # The first left-factoring pass on `stmt` will left-factor `FOR ( VAR`.
        # A second pass is needed to left-factor `= expr TO expr`. (A sketch
        # of the factored result follows this example.)
        grammar = Grammar({
            'stmt': [
                ['expr', ';'],
                ['FOR', '(', 'VAR', 'IN', 'expr', ')', 'stmt'],
                ['FOR', '(', 'VAR', '=', 'expr', 'TO', 'expr', ')', 'stmt'],
                ['FOR', '(', 'VAR', '=', 'expr', 'TO', 'expr',
                 'BY', 'expr', ')', 'stmt'],
                ['IF', '(', 'expr', ')', 'stmt'],
            ],
            'expr': [
                ['VAR'],
            ],
        })
        parse = gen.compile(grammar)
        self.assertEqual(
            parse(tokenize, "FOR (x IN y) z;"),
            ('stmt 1', 'FOR', '(', 'x', 'IN', 'y', ')',
             ('stmt 0', 'z', ';')))
        self.assertEqual(
            parse(tokenize, "FOR (x = y TO z) x;"),
            ('stmt 2', 'FOR', '(', 'x', '=', 'y', 'TO', 'z', ')',
             ('stmt 0', 'x', ';')))
        self.assertEqual(
            parse(tokenize, "FOR (x = y TO z BY w) x;"),
            ('stmt 3', 'FOR', '(', 'x', '=', 'y', 'TO', 'z', 'BY', 'w', ')',
             ('stmt 0', 'x', ';')))
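To make the two passes concrete, the factored grammar looks roughly like the sketch below (hypothetical nonterminal names; the generator's internal names will differ):

    stmt      -> expr ';'
               | 'FOR' '(' 'VAR' for_tail
               | 'IF' '(' expr ')' stmt
    for_tail  -> 'IN' expr ')' stmt
               | '=' expr 'TO' expr for_tail2
    for_tail2 -> ')' stmt
               | 'BY' expr ')' stmt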
Example #7
 def testTrailingLookahead(self):
     """Lookahead at the end of a production is banned."""
     grammar = gen.Grammar({
         'stmt': [
             ['OTHER', ';'],
             ['IF', '(', 'X', ')', 'stmt',
              LookaheadRule(frozenset({'ELSE'}), False)],
             ['IF', '(', 'X', ')', 'stmt', 'ELSE', 'stmt'],
         ],
     })
     self.assertRaisesRegex(
         ValueError,
         r"invalid grammar: lookahead restriction at end of production",
         lambda: gen.compile(grammar))
Example #8
 def testLeftFactorMulti(self):
     """Test left-factoring with common prefix of length >1."""
     tokenize = lexer.LexicalGrammar("A B C D E")
     grammar = Grammar({
         'goal': [
             ['A', 'B', 'C', 'D'],
             ['A', 'B', 'C', 'E'],
         ],
     })
     parse = gen.compile(grammar)
     self.assertEqual(
         parse(tokenize, "A B C D"),
         ('goal 0', 'A', 'B', 'C', 'D'))
     self.assertEqual(
         parse(tokenize, "A B C E"),
         ('goal 1', 'A', 'B', 'C', 'E'))
Example #9
    def compile_as_js(
        self,
        grammar_source: str,
        goals: typing.Optional[typing.Iterable[str]] = None,
        verbose: bool = False,
    ) -> None:
        """Like self.compile(), but generate a parser from ESGrammar,
        with ASI support, using the JS lexer.
        """
        from js_parser.lexer import JSLexer
        from js_parser import load_es_grammar
        from js_parser import generate_js_parser_tables

        grammar = parse_esgrammar(
            grammar_source,
            filename="es-simplified.esgrammar",
            extensions=[],
            goals=goals,
            synthetic_terminals=load_es_grammar.ECMASCRIPT_SYNTHETIC_TERMINALS,
            terminal_names=load_es_grammar.TERMINAL_NAMES_FOR_SYNTACTIC_GRAMMAR
        )
        grammar = generate_js_parser_tables.hack_grammar(grammar)
        base_parser_class = gen.compile(grammar, verbose=verbose)

        # "type: ignore" because poor mypy can't cope with the runtime codegen
        # we're doing here.
        class JSParser(base_parser_class):  # type: ignore
            def __init__(self, goal='Script', builder=None):
                super().__init__(goal, builder)
                self._goal = goal
                # self.debug = True

            def clone(self):
                return JSParser(self._goal, self.methods)

            def on_recover(self, error_code, lexer, stv):
                """Check that ASI error recovery is really acceptable."""
                if error_code == 'asi':
                    if (not self.closed and stv.term != '}'
                            and not lexer.saw_line_terminator()):
                        lexer.throw("missing semicolon")
                else:
                    assert error_code == 'do_while_asi'

        self.tokenize = JSLexer
        self.parser_class = JSParser
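A rough sketch of how a test might drive the classes produced by compile_as_js (the grammar constant ES_GRAMMAR_FRAGMENT, the goal, and the lexer's write/close calls are assumptions for illustration, not taken from this file):

    def testAsiSketch(self):
        # Hypothetical usage; ES_GRAMMAR_FRAGMENT and the lexer method names
        # are assumptions, not part of the real test harness.
        self.compile_as_js(ES_GRAMMAR_FRAGMENT, goals=['Script'])
        parser = self.parser_class(goal='Script')
        lexer_obj = self.tokenize(parser)
        lexer_obj.write('x = 1\ny = 2')  # ASI should terminate each statement
        result = lexer_obj.close()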
Example #10
    def testArithmetic(self):
        tokenize = lexer.LexicalGrammar(
            "+ - * / ( )",
            NUM=r'[0-9]\w*',
            VAR=r'[A-Za-z]\w*')
        arith_grammar = Grammar({
            'expr': [
                ['term'],
                ['expr', '+', 'term'],
                ['expr', '-', 'term'],
            ],
            'term': [
                ['prim'],
                ['term', '*', 'prim'],
                ['term', '/', 'prim'],
            ],
            'prim': [
                ['NUM'],
                ['VAR'],
                ['(', 'expr', ')'],
            ],
        })
        parse = gen.compile(arith_grammar)

        self.assertEqual(
            parse(tokenize, '2 * 3 + 4 * (5 + 7)'),
            ('expr 1',
                ('term 1', '2', '*', '3'),
                '+',
                ('term 1',
                    '4',
                    '*',
                    ('prim 2',
                        '(',
                        ('expr 1', '5', '+', '7'),
                        ')'))))

        self.assertRaisesRegex(
            SyntaxError,
            r"unexpected end of input",
            lambda: parse(tokenize, "("))
        self.assertRaisesRegex(
            SyntaxError,
            r"expected one of \['\(', 'NUM', 'VAR'], got '\)'",
            lambda: parse(tokenize, ")"))
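The expr/term/prim layering is what encodes operator precedence: '*' and '/' are introduced one level below '+' and '-', so they bind tighter, and parentheses climb back up to 'expr' via 'prim 2'. By analogy with the result above (a sketch, not a verified expected value), an input like "1 + 2 * 3" would reduce to roughly:

    ('expr 1', '1', '+', ('term 1', '2', '*', '3'))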
Example #11
 def testOptionalEmpty(self):
     tokenize = lexer.LexicalGrammar("X Y")
     grammar = Grammar({
         'a': [
             [Optional('b'), Optional('c')],
         ],
         'b': [
             prod(['X'], 'b'),
         ],
         'c': [
             prod(['Y'], 'c'),
         ]
     })
     parse = gen.compile(grammar)
     self.assertEqual(parse(tokenize, ""), ('a', None, None))
     self.assertEqual(parse(tokenize, "X"), ('a', ('b', 'X'), None))
     self.assertEqual(parse(tokenize, "Y"), ('a', None, ('c', 'Y')))
     self.assertEqual(parse(tokenize, "X Y"), ('a', ('b', 'X'), ('c', 'Y')))
Example #12
 def testOptional(self):
     tokenize = lexer.LexicalGrammar('[ ] , X')
     grammar = Grammar({
         'array': [
             ['[', Optional('elision'), ']'],
             ['[', 'elements', ']'],
             ['[', 'elements', ',', Optional('elision'), ']']
         ],
         'elements': [
             [Optional('elision'), 'X'],
             ['elements', ',', Optional('elision'), 'X']
         ],
         'elision': [
             [','],
             ['elision', ',']
         ]
     })
     parse = gen.compile(grammar)
     self.assertEqual(parse(tokenize, "[]"),
                      ('array 0', '[', None, ']'))
     self.assertEqual(parse(tokenize, "[,]"),
                      ('array 0', '[', ',', ']'))
     self.assertEqual(
         parse(tokenize, "[,,X,,X,]"),
         ('array 2',
             '[',
             ('elements 1',
                 ('elements 0',
                     ('elision 1',
                         ',',
                         ','),
                     'X'),
                 ',',
                 ',',
                 'X'),
             ',',
             None,
             ']'))
Example #13
 def compile(self, tokenize, grammar):
     """Compile a grammar. Use this when you expect compilation to
     succeed."""
     self.tokenize = tokenize
     self.parse = gen.compile(grammar)
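For context, a test using this helper might look like the sketch below (the grammar, input, and assertion are invented for illustration; only self.tokenize and self.parse come from the helper):

 def testViaCompileHelper(self):
     # Hypothetical test; by analogy with Example #1, a single-production
     # 'goal' should yield a tuple tagged with the bare nonterminal name.
     self.compile(lexer.LexicalGrammar("A B"),
                  Grammar({'goal': [['A', 'B']]}))
     result = self.parse(self.tokenize, "A B")
     self.assertEqual(result[0], 'goal')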
Example #14
    # prose not wrapped in square brackets
    # To avoid conflict with the `>` token, this is recognized only after a space.
    PROSE=r'(?<= )>[^\n]*',

    # prose wrapped in square brackets
    WPROSE=r'\[>[^]]*\]',

    # expression denoting a matched terminal or nonterminal
    MATCH_REF=r'\$(?:0|[1-9][0-9]*)',

    # Rust-style line comments
    RUSTCOMMENT=r'//.*\n',
)

ESGrammarParser = gen.compile(
    parse_pgen.load_grammar(
        os.path.join(os.path.dirname(__file__), "esgrammar.pgen")))

SIGIL_FALSE = '~'
SIGIL_TRUE = '+'

# Abbreviations for single-character terminals, used in the lexical grammar.
ECMASCRIPT_CODE_POINTS = {
    # From <https://tc39.es/ecma262/#table-31>
    '<ZWNJ>': grammar.Literal('\u200c'),
    '<ZWJ>': grammar.Literal('\u200d'),
    '<ZWNBSP>': grammar.Literal('\ufeff'),

    # From <https://tc39.es/ecma262/#table-32>
    '<TAB>': grammar.Literal('\t'),
    '<VT>': grammar.Literal('\u000b'),
Example #15
    # nonterminals wrapped in vertical bars for no apparent reason
    NTALT=r'\|[A-Z]\w+\|',

    # the spec also gives names to a few productions
    PRODID=r'#[A-Za-z]\w*',

    # prose to the end of the line
    PROSE=r'>.*',

    # prose wrapped in square brackets
    WPROSE=r'\[>[^]]*\]',
)

parse_esgrammar_generic = gen.compile(
    parse_pgen.load_grammar(
        os.path.join(os.path.dirname(__file__), "esgrammar.pgen")))

SIGIL_FALSE = '~'
SIGIL_TRUE = '+'

# Productions like
#
#     Expression : AssignmentExpression
#     PrimaryExpression : ArrayLiteral
#     Statement : IfStatement
#
# should not cause an extra method call; the action for each of these
# productions should be `$0`, i.e. just return the right-hand side unchanged.
# Then type inference will make sure that the two nonterminals (Statement and
# IfStatement, for example) are given the same type.