def test_null_production(self):
    pg = ParserGenerator(["VALUE", "SPACE"])

    @pg.production("main : values")
    def main(p):
        return p[0]

    @pg.production("values : none")
    def values_empty(p):
        return []

    @pg.production("values : VALUE")
    def values_value(p):
        return [p[0].getstr()]

    @pg.production("values : values SPACE VALUE")
    def values_values(p):
        return p[0] + [p[2].getstr()]

    @pg.production("none :")
    def none(p):
        return None

    parser = pg.build()

    assert parser.parse(FakeLexer([
        Token("VALUE", "abc"),
        Token("SPACE", " "),
        Token("VALUE", "def"),
        Token("SPACE", " "),
        Token("VALUE", "ghi"),
    ])) == ["abc", "def", "ghi"]
    assert parser.parse(FakeLexer([])) == []
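# `FakeLexer` is used throughout these tests but is not defined in this
# snippet. A minimal sketch, assuming the parser only needs an iterator of
# Token objects; the real test helper may differ:
class FakeLexer(object):
    def __init__(self, tokens):
        self.tokens = iter(tokens)

    def __iter__(self):
        return self

    def __next__(self):
        return next(self.tokens)

    next = __next__  # Python 2 spelling of the iterator protocol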
def test_per_rule_precedence(self):
    pg = ParserGenerator(["NUMBER", "MINUS"], precedence=[
        ("right", ["UMINUS"]),
    ])

    @pg.production("main : expr")
    def main_expr(p):
        return p[0]

    @pg.production("expr : expr MINUS expr")
    def expr_minus(p):
        return BoxInt(p[0].getint() - p[2].getint())

    @pg.production("expr : MINUS expr", precedence="UMINUS")
    def expr_uminus(p):
        return BoxInt(-p[1].getint())

    @pg.production("expr : NUMBER")
    def expr_number(p):
        return BoxInt(int(p[0].getstr()))

    with self.assert_warns(ParserGeneratorWarning, "1 shift/reduce conflict"):
        parser = pg.build()

    assert parser.parse(FakeLexer([
        Token("MINUS", "-"),
        Token("NUMBER", "4"),
        Token("MINUS", "-"),
        Token("NUMBER", "5"),
    ])) == BoxInt(-9)
def mod_mod_lex(lexer, filename):
    # Retag NAME tokens whose (filename, text) pair is registered in one of
    # the module-level macro/keyword tables; everything else passes through.
    for token in lexer:
        if token is REPL_CONTINUE:
            yield token
        else:
            token.filename = filename
            token_str = token.getstr()
            macro_key = (filename, token_str)
            if token.gettokentype() == 'NAME' and macro_key in macro_names:
                yield Token('MACRO_NAME', token_str, token.getsourcepos())
            elif token.gettokentype() == 'NAME' and macro_key in infix_macro_names:
                yield Token('INFIX_MACRO_NAME', token_str, token.getsourcepos())
            elif token.gettokentype() == 'NAME' and macro_key in infix_1_macro_names:
                yield Token('INFIX_1_MACRO_NAME', token_str, token.getsourcepos())
            elif token.gettokentype() == 'NAME' and macro_key in infix_2_macro_names:
                yield Token('INFIX_2_MACRO_NAME', token_str, token.getsourcepos())
            elif token.gettokentype() == 'NAME' and macro_key in user_defined_keywords:
                yield Token('USER_DEFINED_KEYWORD', token_str, token.getsourcepos())
            else:
                yield token
def test_state(self):
    pg = ParserGenerator(["NUMBER", "PLUS"], precedence=[
        ("left", ["PLUS"]),
    ])

    @pg.production("main : expression")
    def main(state, p):
        state.count += 1
        return p[0]

    @pg.production("expression : expression PLUS expression")
    def expression_plus(state, p):
        state.count += 1
        return BoxInt(p[0].getint() + p[2].getint())

    @pg.production("expression : NUMBER")
    def expression_number(state, p):
        state.count += 1
        return BoxInt(int(p[0].getstr()))

    parser = pg.build()

    state = ParserState()
    assert parser.parse(FakeLexer([
        Token("NUMBER", "10"),
        Token("PLUS", "+"),
        Token("NUMBER", "12"),
        Token("PLUS", "+"),
        Token("NUMBER", "-2"),
    ]), state=state) == BoxInt(20)
    assert state.count == 6
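# `ParserState` is not defined in this snippet. A minimal sketch of the
# shape test_state assumes (a mutable counter that every reduction bumps);
# the real helper may carry more state:
class ParserState(object):
    def __init__(self):
        # Three NUMBER reductions, two PLUS reductions, and one main
        # reduction give count == 6 in test_state above.
        self.count = 0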
def test_single_if_statement(self):
    '''Can parse a single if statement: `if (1) { duck = 3; }`'''
    given = iter([
        Token('IF', 'if'),
        Token('LPAREN', '('),
        Token('INTEGER', '1'),
        Token('RPAREN', ')'),
        Token('LCURLY', '{'),
        Token('ID', 'duck'),
        Token('EQUAL', '='),
        Token('INTEGER', '3'),
        Token('SEMI', ';'),
        Token('RCURLY', '}'),
    ])
    expected = ast.Block([
        ast.IfStatement(
            ast.Integer(1),
            ast.Block([
                ast.Assignment(ast.ID('duck'), ast.Integer(3))
            ]),
            ast.Block([])
        )
    ])
    result = parser.parse(given)
    assert expected == result
def test_unary_op_expression(self):
    '''Unary operations are supported: `(5 + -1) - !3;`'''
    given = iter([
        Token('LPAREN', '('),
        Token('INTEGER', '5'),
        Token('PLUS', '+'),
        Token('MINUS', '-'),
        Token('INTEGER', '1'),
        Token('RPAREN', ')'),
        Token('MINUS', '-'),
        Token('BANG', '!'),
        Token('INTEGER', '3'),
        Token('SEMI', ';')
    ])
    result = parser.parse(given)
    expected = ast.Block([
        ast.Statement(
            ast.BinOp(
                '-',
                ast.BinOp(
                    '+',
                    ast.Integer(5),
                    ast.UnaryOp('-', ast.Integer(1))
                ),
                ast.UnaryOp('!', ast.Integer(3))
            )
        )
    ])
    assert result == expected
def test_expr_assignment_with_ids(self):
    '''Can parse assignment to an expression with ids: `duck = (goose + 10) * duck;`'''
    given = iter([
        Token('ID', 'duck'),
        Token('EQUAL', '='),
        Token('LPAREN', '('),
        Token('ID', 'goose'),
        Token('PLUS', '+'),
        Token('INTEGER', '10'),
        Token('RPAREN', ')'),
        Token('MUL', '*'),
        Token('ID', 'duck'),
        Token('SEMI', ';')
    ])
    result = parser.parse(given)
    expected = ast.Block([
        ast.Assignment(
            ast.ID('duck'),
            ast.BinOp(
                '*',
                ast.BinOp('+', ast.ID('goose'), ast.Integer(10)),
                ast.ID('duck')
            )
        )
    ])
    assert result == expected
def test_arithmetic(self):
    pg = ParserGenerator(["NUMBER", "PLUS"])

    @pg.production("main : expr")
    def main(p):
        return p[0]

    @pg.production("expr : expr PLUS expr")
    def expr_op(p):
        return BoxInt(p[0].getint() + p[2].getint())

    @pg.production("expr : NUMBER")
    def expr_num(p):
        return BoxInt(int(p[0].getstr()))

    with self.assert_warns(
        ParserGeneratorWarning, "1 shift/reduce conflict"
    ):
        parser = pg.build()

    assert parser.parse(iter([
        Token("NUMBER", "1"),
        Token("PLUS", "+"),
        Token("NUMBER", "4")
    ])) == BoxInt(5)
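# `BoxInt` is referenced by the arithmetic tests but not defined here. A
# minimal sketch, assuming a boxed integer with value-based equality; the
# actual helper may differ:
class BoxInt(object):
    def __init__(self, value):
        self.value = value

    def getint(self):
        return self.value

    def __eq__(self, other):
        return isinstance(other, BoxInt) and self.value == other.value

    def __repr__(self):
        return "BoxInt(%r)" % self.value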
def test_build_ast_assembly(self):
    tokens = iter((
        Token('IDENTIFIER', 'uHO'),
        Token('SEMICOLON', ';'),
        Token('IDENTIFIER', 'pADH1'),
    ))
    ast = self.parser.parse(tokens)
    self.assertEqual(type(ast), Program)
    self.assertEqual(
        ast.to_dict(), {
            'definitions': {
                'node': 'DefinitionList',
                'items': [{
                    'label': None,
                    'node': 'Assembly',
                    'parts': [{
                        'node': 'SymbolReference',
                        'identifier': 'uHO',
                        'invert': False,
                        'slice': None
                    }, {
                        'node': 'SymbolReference',
                        'identifier': 'pADH1',
                        'invert': False,
                        'slice': None
                    }],
                }]
            },
            'imports': [],
            'node': 'Program'
        })
def f(n):
    return parser.parse(FakeLexer([
        Token("NUMBER", str(n)),
        Token("PLUS", "+"),
        Token("NUMBER", str(n))
    ])).getint()
def test_precedence(self):
    pg = ParserGenerator(["NUMBER", "PLUS", "TIMES"], precedence=[
        ("left", ["PLUS"]),
        ("left", ["TIMES"]),
    ])

    @pg.production("main : expr")
    def main(p):
        return p[0]

    @pg.production("expr : expr PLUS expr")
    @pg.production("expr : expr TIMES expr")
    def expr_binop(p):
        return BoxInt({
            "+": operator.add,
            "*": operator.mul,
        }[p[1].getstr()](p[0].getint(), p[2].getint()))

    @pg.production("expr : NUMBER")
    def expr_num(p):
        return BoxInt(int(p[0].getstr()))

    parser = pg.build()

    assert parser.parse(FakeLexer([
        Token("NUMBER", "3"),
        Token("TIMES", "*"),
        Token("NUMBER", "4"),
        Token("PLUS", "+"),
        Token("NUMBER", "5")
    ])) == BoxInt(17)
def test_build_ast_sequence_constant_amino_acid(self):
    """Parse to an AST for a constant amino acid sequence."""
    tokens = iter((
        Token('FORWARD_SLASH', '/'),
        Token('AMINO_ACID_SEQUENCE', '$NYWKDGGSSGRS*'),
        Token('FORWARD_SLASH', '/'),
    ))
    ast = self.parser.parse(tokens)
    self.assertEqual(type(ast), Program)
    self.assertEqual(
        ast.to_dict(), {
            'definitions': {
                'node': 'DefinitionList',
                'items': [{
                    'label': None,
                    'node': 'Assembly',
                    'parts': [{
                        'node': 'SequenceConstant',
                        'sequence': 'NYWKDGGSSGRS*',
                        'type': 'PROTEIN'
                    }]
                }],
            },
            'imports': [],
            'node': 'Program'
        })
def test_build_ast_sequence_constant(self):
    """Parse to an AST for a constant nucleotide sequence."""
    tokens = iter((
        Token('FORWARD_SLASH', '/'),
        Token('IDENTIFIER', 'ATGG'),
        Token('FORWARD_SLASH', '/'),
    ))
    ast = self.parser.parse(tokens)
    self.assertEqual(type(ast), Program)
    self.assertEqual(
        ast.to_dict(), {
            'definitions': {
                'node': 'DefinitionList',
                'items': [{
                    'label': None,
                    'node': 'Assembly',
                    'parts': [{
                        'node': 'SequenceConstant',
                        'sequence': 'ATGG',
                        'type': 'DNA'
                    }]
                }],
            },
            'imports': [],
            'node': 'Program'
        })
def f():
    state = ParserState()
    return parser.parse(iter([
        Token("NUMBER", "10"),
        Token("PLUS", "+"),
        Token("NUMBER", "12"),
        Token("PLUS", "+"),
        Token("NUMBER", "-2"),
    ]), state=state).getint() + state.count
def test_simple(self):
    pg = ParserGenerator(["VALUE"])

    @pg.production("main : VALUE")
    def main(p):
        return p[0]

    parser = pg.build()

    assert parser.parse(FakeLexer([Token("VALUE", "abc")])) == Token("VALUE", "abc")
def test_simple_caching(self):
    pg = ParserGenerator(["VALUE"], cache_id="simple")

    @pg.production("main : VALUE")
    def main(p):
        return p[0]

    pg.build()
    parser = pg.build()

    assert parser.parse(iter([Token("VALUE", "3")])) == Token("VALUE", "3")
def test_simple_caching(self):
    # Generate a random cache_id so that every test run exercises both the
    # cache write path and the cache read path.
    pg = ParserGenerator(["VALUE"], cache_id=str(uuid.uuid4()))

    @pg.production("main : VALUE")
    def main(p):
        return p[0]

    pg.build()
    parser = pg.build()

    assert parser.parse(iter([Token("VALUE", "3")])) == Token("VALUE", "3")
def test_mixed_expression(self):
    '''An expression can have both numbers and ids: `x * (y + 10);`'''
    given = iter([
        Token('ID', 'x'),
        Token('MUL', '*'),
        Token('LPAREN', '('),
        Token('ID', 'y'),
        Token('PLUS', '+'),
        Token('INTEGER', '10'),
        Token('RPAREN', ')'),
        Token('SEMI', ';')
    ])
    result = parser.parse(given)
    expected = ast.Block([
        ast.Statement(
            ast.BinOp(
                '*',
                ast.ID('x'),
                ast.BinOp('+', ast.ID('y'), ast.Integer(10))
            )
        )
    ])
    assert result == expected
def test_parens(self):
    '''Honors parentheses: `(1 + 5) * 20;`'''
    given = iter([
        Token('LPAREN', '('),
        Token('INTEGER', '1'),
        Token('PLUS', '+'),
        Token('INTEGER', '5'),
        Token('RPAREN', ')'),
        Token('MUL', '*'),
        Token('INTEGER', '20'),
        Token('SEMI', ';')
    ])
    result = parser.parse(given)
    expected = ast.Block([
        ast.Statement(
            ast.BinOp(
                '*',
                ast.BinOp('+', ast.Integer(1), ast.Integer(5)),
                ast.Integer(20)
            )
        )
    ])
    assert result == expected
def test_relational_op_order(self):
    '''Relational ops have the lowest precedence: `3 * 5 + 3 == 18;`'''
    given = iter([
        Token('INTEGER', '3'),
        Token('MUL', '*'),
        Token('INTEGER', '5'),
        Token('PLUS', '+'),
        Token('INTEGER', '3'),
        Token('EQUAL_EQUAL', '=='),
        Token('INTEGER', '18'),
        Token('SEMI', ';')
    ])
    result = parser.parse(given)
    expected = ast.Block([
        ast.Statement(
            ast.BinOp(
                '==',
                ast.BinOp(
                    '+',
                    ast.BinOp('*', ast.Integer(3), ast.Integer(5)),
                    ast.Integer(3)
                ),
                ast.Integer(18)
            )
        )
    ])
    assert result == expected
def handle_newline(token):
    text = token.getstr()
    indent_str = text.rsplit('\n', 1)[1]
    indent = indent_str.count(' ') + indent_str.count('\t') * tab_len

    if indent > indent_level[-1]:
        indent_level.append(indent)
        indent_token = Token('INDENT', indent_str)
        indent_token.source_pos = token.getsourcepos()
        token_queue.append(indent_token)
    else:
        while indent < indent_level[-1]:
            indent_level.pop()
            dedent_token = Token('DEDENT', indent_str)
            dedent_token.source_pos = token.getsourcepos()
            token_queue.append(dedent_token)
    return token
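# Worked example for handle_newline, assuming tab_len == 4: with
# indent_level == [0, 4], a NEWLINE whose text ends in eight spaces gives
# indent == 8 > 4, so one INDENT is queued and indent_level becomes
# [0, 4, 8]. A later bare "\n" gives indent == 0, which queues two DEDENT
# tokens and pops the levels back to [0].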
def test_simple_declaration(self):
    '''Can parse a simple declaration: `int a;`'''
    given = iter([
        Token('INT_TYPE', 'int'),
        Token('ID', 'a'),
        Token('SEMI', ';')
    ])
    expected = ast.Block([
        ast.Declaration('int', [ast.ID('a')])
    ])
    result = parser.parse(given)
    assert expected == result
def assertEqual(self, first: str, second: List[Tuple[str, str]], msg: Any = ...) -> None:
    super().assertEqual(
        list(lex(first)),
        [Token(name, value) for name, value in second],
        msg,
    )
def test_add(self):
    '''Can parse a simple addition: `1 + 1;`'''
    given = iter([
        Token('INTEGER', '1'),
        Token('PLUS', '+'),
        Token('INTEGER', '1'),
        Token('SEMI', ';')
    ])
    expected = ast.Block([
        ast.Statement(ast.BinOp('+', ast.Integer(1), ast.Integer(1)))
    ])
    result = parser.parse(given)
    assert expected == result
def test_simple_id(self):
    '''IDs are treated as expressions: `x;`'''
    given = iter([
        Token('ID', 'x'),
        Token('SEMI', ';')
    ])
    result = parser.parse(given)
    expected = ast.Block([
        ast.Statement(
            ast.ID('x')
        )
    ])
    assert result == expected
def test_parse_error(self):
    pg = ParserGenerator(["VALUE"])

    @pg.production("main : VALUE")
    def main(p):
        return p[0]

    parser = pg.build()

    with py.test.raises(ParsingError) as exc_info:
        parser.parse(FakeLexer([
            Token("VALUE", "hello"),
            Token("VALUE", "world", SourcePosition(5, 10, 2)),
        ]))

    assert exc_info.value.getsourcepos().lineno == 10
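# Note: rply constructs positions as SourcePosition(idx, lineno, colno), so
# the second token above sits at line 10, column 2; that lineno is what the
# raised ParsingError exposes through getsourcepos().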
def test_pipe_production(self):
    pg = ParserGenerator(["VALUE1", "VALUE2"])

    @pg.production("main : VALUE1 | VALUE2")
    def main(p):
        return p[0]

    parser = pg.build()
    assert len(pg.productions) == 2

    assert parser.parse(iter([Token("VALUE1", "3")])) == Token("VALUE1", "3")
    assert parser.parse(iter([Token("VALUE2", "3")])) == Token("VALUE2", "3")
def layout(tokens):
    lineno = -1
    precede = ""
    for token in tokens:
        if lineno < 0:
            lineno = token.source_pos.lineno
        if lineno < token.source_pos.lineno:
            # A new source line has started; emit a LINE marker when the
            # line begins at column 1.
            lineno = token.source_pos.lineno
            if token.source_pos.colno == 1:
                yield Token("LINE", "")
                precede = ""
        if token.gettokentype() == "LEFTPAREN" and precede != "ATOM":
            yield Token("LEFTPAREN0", token.getstr())
        else:
            yield token
        precede = token.gettokentype()
def __init__(self, mres, mpred, value, token, src):
    # def __init__(self, mres, mpred, mexprres, value, token, src):
    super().__init__(token, src)
    self._predicate = mpred
    self._value = value

    # Use the given result symbol if one exists; otherwise synthesize one
    # named after the source position.
    sp = token.getsourcepos()
    if mres:
        self._result = mres
    else:
        msym = "matchres_" + str(sp.lineno) + "_" + str(sp.colno)
        self._result = Symbol(
            msym, Token("MATCH_RES", msym, token.getsourcepos()), src)

    msym = "%0"
    self._exprres = Symbol(
        msym, Token("MATCH_EXPRREF", msym, token.getsourcepos()), src)

    self._ident = "match_" + str(sp.lineno) + "_" + str(sp.colno)
def test_default_reductions(self):
    pg = ParserGenerator(
        ["INTEGER_START", "INTEGER_VALUE", "COMPARE"],
        precedence=[
            ("nonassoc", ["COMPARE"])
        ]
    )
    record = []

    @pg.production("main : expr")
    def main(p):
        record.append("main")
        return p[0]

    @pg.production("expr : expr COMPARE expr")
    def expr_compare(p):
        record.append("expr:compare")
        return BoxInt(p[0].getint() - p[2].getint())

    @pg.production("expr : INTEGER_START INTEGER_VALUE")
    def expr_int(p):
        record.append("expr:int")
        return BoxInt(int(p[1].getstr()))

    parser = pg.build()

    assert parser.parse(RecordingLexer(record, [
        Token("INTEGER_START", ""),
        Token("INTEGER_VALUE", "10"),
        Token("COMPARE", "-"),
        Token("INTEGER_START", ""),
        Token("INTEGER_VALUE", "5")
    ])) == BoxInt(5)

    assert record == [
        "token:INTEGER_START",
        "token:INTEGER_VALUE",
        "expr:int",
        "token:COMPARE",
        "token:INTEGER_START",
        "token:INTEGER_VALUE",
        "expr:int",
        "expr:compare",
        "token:None",
        "main",
    ]
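# `RecordingLexer` is not defined in this snippet. A sketch of the behavior
# test_default_reductions relies on: it appends "token:<TYPE>" to the shared
# record list each time the parser pulls a token, and "token:None" once the
# stream is exhausted. The real helper may differ in detail:
class RecordingLexer(object):
    def __init__(self, record, tokens):
        self.record = record
        self.tokens = iter(tokens)

    def __iter__(self):
        return self

    def __next__(self):
        token = next(self.tokens, None)
        self.record.append(
            "token:%s" % (token.gettokentype() if token is not None else None)
        )
        if token is None:
            raise StopIteration
        return token

    next = __next__  # Python 2 spelling of the iterator protocol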
def mod_lex(lexer, repl_mode=False):
    paren_openers = {'LPAREN', 'LBRACE', 'LBRACK'}
    paren_closers = {'RPAREN', 'RBRACE', 'RBRACK'}
    token_queue = []
    indent_level = [0]
    ignore_newline = False
    paren_level = 0
    tab_len = 4

    def handle_newline(token):
        text = token.getstr()
        indent_str = text.rsplit('\n', 1)[1]
        indent = indent_str.count(' ') + indent_str.count('\t') * tab_len

        if indent > indent_level[-1]:
            indent_level.append(indent)
            indent_token = Token('INDENT', indent_str)
            indent_token.source_pos = token.getsourcepos()
            token_queue.append(indent_token)
        else:
            while indent < indent_level[-1]:
                indent_level.pop()
                dedent_token = Token('DEDENT', indent_str)
                dedent_token.source_pos = token.getsourcepos()
                token_queue.append(dedent_token)
        return token

    for token in lexer:
        # Drain queued INDENT/DEDENT tokens in FIFO order before handling
        # the current token.
        while len(token_queue) > 0:
            queued_token = token_queue.pop(0)
            if queued_token.gettokentype() in paren_openers:
                paren_level += 1
            elif queued_token.gettokentype() in paren_closers:
                paren_level -= 1
            ignore_newline = (paren_level > 0)
            yield queued_token

        if token.name == 'NAME':
            # Retag NAME tokens that match a keyword rule.
            for rule in klg.rules:
                if rule.matches(token.value, 0):
                    token.name = rule.name
                    break
        elif token.gettokentype() == 'NEWLINE':
            if not ignore_newline:
                yield handle_newline(token)
            continue

        if token.gettokentype() in paren_openers:
            paren_level += 1
        elif token.gettokentype() in paren_closers:
            paren_level -= 1
        ignore_newline = (paren_level > 0)

        if token.gettokentype() == 'NAME' and token.getstr().startswith('&'):
            # Expand `&name` into AMP COMMA NAME.
            amp = Token('AMP', '&')
            amp.source_pos = token.getsourcepos()
            comma = Token('COMMA', ',')
            comma.source_pos = token.getsourcepos()
            name = Token('NAME', token.getstr()[1:])
            name.source_pos = token.getsourcepos()
            yield amp
            yield comma
            yield name
        else:
            yield token

    if repl_mode and len(indent_level) > 1:
        yield REPL_CONTINUE
    elif repl_mode and paren_level > 0:
        yield REPL_CONTINUE
    else:
        while len(indent_level) > 1:
            indent_level.pop()
            yield Token('DEDENT', '')
        for token in token_queue:
            yield token
def mod_lex(lexer, repl_mode=False):
    paren_openers = {'LPAREN', 'LBRACE', 'LBRACK'}
    paren_closers = {'RPAREN', 'RBRACE', 'RBRACK'}
    token_queue = Queue()
    indent_level = [0]
    ignore_newline = False
    paren_level = 0
    tab_len = 4

    def handle_newline(token):
        text = token.getstr()
        indent_str = text.rsplit('\n', 1)[1]
        indent = indent_str.count(' ') + indent_str.count('\t') * tab_len

        if indent > indent_level[-1]:
            indent_level.append(indent)
            indent_token = Token('INDENT', indent_str)
            indent_token.source_pos = token.getsourcepos()
            token_queue.put(indent_token)
        else:
            while indent < indent_level[-1]:
                indent_level.pop()
                dedent_token = Token('DEDENT', indent_str)
                dedent_token.source_pos = token.getsourcepos()
                token_queue.put(dedent_token)
        return token

    for token in lexer:
        # Drain any queued tokens before handling the current one.
        while not token_queue.empty():
            queued_token = token_queue.get()
            if queued_token.gettokentype() in paren_openers:
                paren_level += 1
            elif queued_token.gettokentype() in paren_closers:
                paren_level -= 1
            ignore_newline = (paren_level > 0)

            if queued_token.gettokentype() == 'NAME' and queued_token.getstr().startswith('&'):
                # Expand `&name` into AMP COMMA NAME.
                amp = Token('AMP', '&')
                amp.source_pos = queued_token.getsourcepos()
                comma = Token('COMMA', ',')
                comma.source_pos = queued_token.getsourcepos()
                name = Token('NAME', queued_token.getstr()[1:])
                name.source_pos = queued_token.getsourcepos()
                yield amp
                yield comma
                yield name
            else:
                yield queued_token

        if token.name == 'NAME':
            # Retag NAME tokens that match a keyword rule.
            for rule in klg.rules:
                if rule.matches(token.value, 0):
                    token.name = rule.name
                    break

        if token.gettokentype() in INFIX_OPERATORS:
            # Peek ahead: a NEWLINE directly after an infix operator is a
            # line continuation, so it is dropped.
            ahead_token = next(lexer, None)
            if ahead_token is not None and ahead_token.gettokentype() != 'NEWLINE':
                token_queue.put(ahead_token)
        elif token.gettokentype() == 'NEWLINE':
            # Peek ahead: a line continuing with an infix operator or a
            # DOT_NAME access does not terminate the statement.
            try:
                ahead_token = next(lexer)
                _set_keyword(ahead_token)
                ahead_token_type = ahead_token.gettokentype()
            except StopIteration:
                ahead_token = None
                ahead_token_type = None
            if not (ignore_newline or
                    ((ahead_token is not None) and
                     ((ahead_token_type in INFIX_OPERATORS) or
                      ahead_token_type == 'DOT_NAME'))):
                yield handle_newline(token)
            if ahead_token is not None:
                token_queue.put(ahead_token)
            continue

        if token.gettokentype() in paren_openers:
            paren_level += 1
        elif token.gettokentype() in paren_closers:
            paren_level -= 1
        ignore_newline = (paren_level > 0)

        if token.gettokentype() == 'NAME' and token.getstr().startswith('&'):
            amp = Token('AMP', '&')
            amp.source_pos = token.getsourcepos()
            comma = Token('COMMA', ',')
            comma.source_pos = token.getsourcepos()
            name = Token('NAME', token.getstr()[1:])
            name.source_pos = token.getsourcepos()
            yield amp
            yield comma
            yield name
        else:
            yield token

    if repl_mode and len(indent_level) > 1:
        yield REPL_CONTINUE
    elif repl_mode and paren_level > 0:
        yield REPL_CONTINUE
    else:
        while not token_queue.empty():
            yield token_queue.get()
        while len(indent_level) > 1:
            indent_level.pop()
            yield Token('DEDENT', '')
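# A hypothetical driver showing where mod_lex sits in the pipeline: between
# the raw rply lexer stream and the parser. `lg` (a LexerGenerator) and
# `parser` come from the surrounding module and are assumed here.
def parse_source(source, repl_mode=False):
    tokens = mod_lex(lg.build().lex(source), repl_mode=repl_mode)
    return parser.parse(tokens)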