# Test-suite imports. The module names are inferred from the project files
# referenced later in this listing (glr.py defines the parser classes,
# lexer.py the lexer, actions.py the token data classes such as Number).
import unittest

from glr import CFGParser, CFGTerminal, CFGNonterminal
from lexer import CFGLexer
from actions import Number


class TestGLRList(unittest.TestCase):
    # Grammar under test:  S -> ( L ) | x    and    L -> S | L , S
    def setUp(self):
        S, L = CFGNonterminal.create("S", "L")
        self.op, self.cp, self.c, self.x = CFGTerminal.create("(", ")", ",", "x")
        S.production(lambda i, j, k: j, self.op, L, self.cp)
        S.production(lambda i: i, self.x)
        L.production(lambda i: [i], S)

        def append(l, i):
            l.append(i)
            return l

        L.production(lambda i, j, k: append(i, k), L, self.c, S)
        self.parser = CFGParser(S, (self.op, self.cp, self.c, self.x), (S, L))

    def test_single_list(self):
        l = self.parser.parse_items(self.op, self.x, self.cp)
        self.assertEqual(l, [self.x])
        l = self.parser.parse_items(self.op, self.x, self.c, self.x, self.cp)
        self.assertEqual(l, [self.x, self.x])

    def test_nested(self):
        l = self.parser.parse_items(self.op, self.x, self.c,
                                    self.op, self.x, self.cp, self.cp)
        self.assertEqual(l, [self.x, [self.x]])
class TestGLRLexer(unittest.TestCase):
    # Expression grammar with the usual precedence (* binds tighter than +);
    # semantic values are Number objects built by the lexer.
    def setUp( self ):
        plus, times, num = CFGTerminal.create("+", "*", "num")
        E, T, F = CFGNonterminal.create("E", "T", "F")
        E.production( lambda i: i, T )
        T.production( lambda i, j, k: i.add(k), T, plus, F )
        T.production( lambda i: i, F )
        F.production( lambda i, j, k: i.mul(k), F, times, num )
        F.production( lambda i: i, num )
        tokens = {
            '\\+': plus,
            '\\*': times,
            "[0-9][0-9]*": num.data_wrapper(Number),
        }
        self.lexer = CFGLexer( tokens )
        self.parser = CFGParser( E, (plus, times, num), (E, T, F) )

    def _n( self, s ):
        return self.parser.parse( self.lexer.tokenize( s ) ).value

    def test_add( self ):
        self.assertEqual( self._n("5+4"), 9 )
        self.assertEqual( self._n("1+3+0"), 4 )

    def test_mul( self ):
        self.assertEqual( self._n("3*2"), 6 )
        self.assertEqual( self._n("1*5*13"), 65 )
        self.assertEqual( self._n("4*0"), 0 )

    def test_oop( self ):
        self.assertEqual( self._n("4+2*8"), 20 )
        self.assertEqual( self._n("5+0*12+4*3*2"), 29 )
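# The Number wrapper used by TestGLRLexer lives in actions.py and is not
# shown in this listing. The sketch below (hypothetical name _NumberSketch,
# so it does not shadow the real import) shows the minimal API the test
# relies on: construction from the matched text via data_wrapper, add()/mul()
# combining rules, and a .value read off the final parse result. The
# (parsed, start, end) constructor convention is inferred from how
# CFGFileParser below constructs Operator(parsed, start, end).
class _NumberSketch:
    def __init__( self, parsed, start = None, end = None ):
        self.value = int( parsed )
        self.start, self.end = start, end

    def add( self, other ):
        # Combine two numbers under '+', returning the accumulated value.
        self.value += other.value
        return self

    def mul( self, other ):
        # Combine two numbers under '*'.
        self.value *= other.value
        return self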
class TestGLRValuesExpr(unittest.TestCase):
    def setUp(self):
        self.plus, self.mul, self.num = CFGTerminal.create("+", "*", "num")
        E, T, F = CFGNonterminal.create("E", "T", "F")
        E.production(lambda i: i, T)
        T.production(lambda i, j, k: i + k, T, self.plus, F)
        T.production(lambda i: i, F)
        F.production(lambda i, j, k: i * k, F, self.mul, self.num)
        F.production(lambda i: i, self.num)
        self.parser = CFGParser(E, (self.plus, self.mul, self.num), (E, T, F))

    def _n(self, v):
        return self.num.with_data(v)

    def test_add(self):
        v = self.parser.parse_items(self._n(5), self.plus, self._n(2))
        self.assertEqual(v, 7)
        v = self.parser.parse_items(self._n(1), self.plus, self._n(4),
                                    self.plus, self._n(10))
        self.assertEqual(v, 15)

    def test_mul(self):
        v = self.parser.parse_items(self._n(5), self.mul, self._n(2))
        self.assertEqual(v, 10)
        v = self.parser.parse_items(self._n(1), self.mul, self._n(4),
                                    self.mul, self._n(10))
        self.assertEqual(v, 40)

    def test_oop(self):
        v = self.parser.parse_items(self._n(7), self.mul, self._n(3),
                                    self.plus, self._n(2))
        self.assertEqual(v, 23)
        v = self.parser.parse_items(self._n(1), self.plus, self._n(4),
                                    self.mul, self._n(10))
        self.assertEqual(v, 41)

    def test_complex(self):
        v = self.parser.parse_items(self._n(7), self.mul, self._n(3),
                                    self.plus, self._n(3), self.mul, self._n(2))
        self.assertEqual(v, 27)
        v = self.parser.parse_items(
            self._n(1), self.plus, self._n(4), self.mul, self._n(5),
            self.plus, self._n(1), self.mul, self._n(9), self.mul, self._n(2),
        )
        self.assertEqual(v, 39)
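# Runner footer so the tests above can be executed directly. This is an
# addition: the original listing may rely on an external test runner instead.
if __name__ == '__main__':
    unittest.main()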
# CFGFileParser imports. As above, the module layout is inferred from the
# file names this class itself reads back in (actions.py, glr.py, lexer.py).
from itertools import chain

from glr import CFGParser, CFGTerminal, CFGNonterminal
from lexer import CFGLexer, CFGToken
from actions import ( EOF, Identifier, Operator, Number, Literal, Comment,
                      Newline, Indent, Dedent, Code, File, Regex, Production,
                      TokenSection, ProductionSection )


class CFGFileParser:
    def __init__( self ):
        self._build_parser()
        self.lexer = CFGLexer( self.tokens, eof_token = self.eof.data_wrapper(EOF) )
        self.parser = CFGParser( self.goal, self.terminals, self.nonterminals )
        self.reset()

    def _build_parser( self ):
        self.terminals = (
            start_section, end_section, tokens_id, productions_id,
            start_name, end_name, produces, colon, newline, indent, dedent,
            operator, comment, identifier, literal, number, eof
        ) = CFGTerminal.create(
            "start_section", "end_section", "tokens_id", "productions_id",
            "start_name", "end_name", "produces", "colon", "newline",
            "indent", "dedent", "operator", "comment", "identifier",
            "literal", "number", "eof"
        )
        for t in self.terminals:
            setattr( self, t.identifier, t )

        make_tuple = lambda *args: tuple(args)

        # Section and naming delimiters of the grammar-file syntax.
        self.tokens = []
        self.tokens.append(CFGToken(r"{%", start_section))
        self.tokens.append(CFGToken(r"%}", end_section))
        self.tokens.append(CFGToken(r"tokens", tokens_id.data_wrapper(Identifier)))
        self.tokens.append(CFGToken(r"productions", productions_id.data_wrapper(Identifier)))
        self.tokens.append(CFGToken(r"\(%", start_name))
        self.tokens.append(CFGToken(r"%\)", end_name))
        self.tokens.append(CFGToken(r"->", produces))
        self.tokens.append(CFGToken(r":", colon.data_wrapper(Operator)))

        # Python-ish operators, tokenized so embedded action code survives.
        operators = [
            ",", ";", "@", "+", "-", "\\*", "/", "//", "!", "\\|", "&",
            "<<", ">>", "<", ">", "=", "\\.", "%", "`", "~", "\\^"
        ]
        for o in operators:
            self.tokens.append(CFGToken(o, operator.data_wrapper(Operator)))
        assign_operators = [
            "+=", "-=", "\\*=", "/=", "//=", "!=", "\\|=", "&=", "<=", ">=",
            "==", "%=", "<<=", ">>="
        ]
        for o in assign_operators:
            self.tokens.append(CFGToken(o, operator.data_wrapper(Operator)))

        # Parens/brackets/braces adjust paren_level, so that indentation is
        # ignored inside bracketed expressions, as in Python itself.
        paren_operators = [ r"\(", r"\[", "{", r"\)", r"\]", "}" ]
        for p in paren_operators[:3]:
            self.tokens.append(CFGToken(p, self._paren_open))
        for p in paren_operators[3:]:
            self.tokens.append(CFGToken(p, self._paren_close))

        self.tokens.append(CFGToken(r"[a-zA-Z_][a-zA-Z0-9_]*",
                                    identifier.data_wrapper( Identifier )))
        self.tokens.append(CFGToken(
            r"([0-9][0-9]*(\.[0-9]*)?|\.[0-9][0-9]*)([eE][+-]?[0-9][0-9]*)?",
            number.data_wrapper( Number )))

        # String literals: plain, raw and triple-quoted forms.
        self.tokens.append(CFGToken(r'"([^\\"\n]*(\\.)?)*"', literal.data_wrapper( Literal )))
        self.tokens.append(CFGToken(r"'([^\\'\n]*(\\.)?)*'", literal.data_wrapper( Literal )))
        self.tokens.append(CFGToken(r'r"([^"\n]*(\\")?)*"', literal.data_wrapper( Literal )))
        self.tokens.append(CFGToken(r"r'([^'\n]*(\\')?)*'", literal.data_wrapper( Literal )))
        self.tokens.append(CFGToken(r'"""([^"]*("|"")?)*"""', literal.data_wrapper( Literal )))
        self.tokens.append(CFGToken(r"'''([^']*('|'')?)*'''", literal.data_wrapper( Literal )))

        # Layout: blank lines, indentation, newlines, escaped line
        # continuations and comments.
        self.tokens.append(CFGToken(r"^[ \t]*\n", self._newline_handler(True)))
        self.tokens.append(CFGToken(r"^[ \t]*", self._indent_handler, True))
        self.tokens.append(CFGToken(r"\n", self._newline_handler(False)))
        self.tokens.append(CFGToken(r"[ \t\r]", CFGToken.NoToken))
        self.tokens.append(CFGToken(r"\\\n^", CFGToken.NoToken))
        self.tokens.append(CFGToken(r"^[ \t]*#[^\n]*\n", self._comment_line_handler, True))
        self.tokens.append(CFGToken(r"#[^\n]*", comment.data_wrapper( Comment )))

        self.nonterminals = (
            goal, cfg_file, section, code, code_line, code_lines, code_bits,
            naming, regexes, products, productions, regex, production
        ) = CFGNonterminal.create(
            "goal", "cfg_file", "section", "code", "code_line", "code_lines",
            "code_bits", "naming", "regexes", "products", "productions",
            "regex", "production"
        )
        for t in self.nonterminals:
            setattr( self, t.identifier, t )

        make_list = lambda *args: list(args)

        def append( l, i ):
            l.append(i)
            return l

        def append_tuple( l, *a ):
            l.append( tuple(a) )
            return l

        # Positional selectors for production callbacks.
        first = lambda a, *args: a
        second = lambda a, b, *args: b
        third = lambda a, b, c, *args: c
        fourth = lambda a, b, c, d, *args: d

        goal.production( first, cfg_file, eof )
        cfg_file.production( File.append, cfg_file, section, code_lines )
        cfg_file.production( File, code_lines )

        # Verbatim code: any run of ordinary tokens up to a newline.
        code_toks = [ operator, identifier, number, literal, colon,
                      tokens_id, productions_id ]
        for t in code_toks:
            code_bits.production( append, code_bits, t )
            code_bits.production( make_list, t )
        code_line.production( append, code_bits, newline )
        code_line.production( lambda i: [], newline )
        code_lines.production( Code.append, code_lines, code_line )
        code_lines.production( Code.add_block, code_lines, indent, code_lines, dedent )
        code_lines.production( lambda: Code() )

        # Action code attached to a regex or production: inline after a
        # colon, an indented block, or nothing at all.
        code.production( lambda _, c, __: Code(c), colon, code_bits, newline )
        code.production( fourth, colon, newline, indent, code_lines, dedent )
        code.production( lambda n: Code(), newline )

        section.production( TokenSection.create, start_section, tokens_id,
                            regexes, end_section )
        section.production( ProductionSection.create, start_section,
                            productions_id, productions, end_section )

        naming.production( second, start_name, identifier, end_name )
        naming.production( lambda: Identifier("", None, None) )

        regex.production( Regex, literal, naming, code )
        regexes.production( append, regexes, regex )
        for white in [ newline, indent, dedent ]:
            regexes.production( first, regexes, white )
        regexes.production( lambda: [] )

        products.production( append_tuple, products, identifier, naming )
        products.production( lambda: [] )

        production.production( Production.create, identifier, produces, products, code )
        productions.production( append, productions, production )
        for white in [ newline, indent, dedent ]:
            productions.production( first, productions, white )
        productions.production( lambda: [] )

    def _paren_open( self, parsed, start, end ):
        self.paren_level += 1
        data = Operator(parsed, start, end)
        return self.operator.with_data(data)

    def _paren_close( self, parsed, start, end ):
        self.paren_level -= 1
        data = Operator(parsed, start, end)
        return self.operator.with_data(data)

    def _indent_handler( self, parsed, start, end ):
        # Measure leading whitespace; a tab rounds up to a multiple of four.
        indent = 0
        for ch in parsed:
            if ch == ' ':
                indent += 1
            elif ch == '\t':
                indent = (indent//4 + 1) * 4
        line, col = start
        if self.paren_level == 0:
            if indent > self.indent_levels[-1]:
                self.indent_levels.append( indent )
                data = Indent( parsed, start, end )
                yield self.indent.with_data( data )
            while indent < self.indent_levels[-1]:
                self.indent_levels.pop()
                data = Dedent( '', end, end )
                yield self.dedent.with_data( data )

    def _newline_handler( self, empty ):
        def cb( parsed, start, end ):
            line, col = start
            col += len(parsed) - 1  # column of the final '\n'
            paren = self.paren_level != 0
            data = Newline( '\n', (line, col), (line, col+1), empty, paren )
            return self.newline.with_data( data )
        return cb

    def _comment_line_handler( self, parsed, start, end ):
        # A comment-only line yields the comment followed by an empty newline.
        line, col = start
        index = parsed.index( '#' )
        col += index
        end = (line, col + len(parsed[index:-1]))
        data = Comment( parsed[index:-1], (line, col), end )
        yield self.comment.with_data( data )
        line, col = end
        data = Newline( '\n', end, (line, col+1), True )
        yield self.newline.with_data( data )

    def reset( self ):
        self.paren_level = 0
        self.indent_levels = [0]

    def tokenize( self, data, reset = True ):
        if reset:
            self.reset()
        for tok in self.lexer.tokenize( data ):
            yield tok

    def tokenize_nt( self, data, reset = True ):
        for t in self.tokenize( data, reset ):
            yield t.data

    def python_tokenize( self, data, reset = True ):
        for t in self.tokenize( data, reset ):
            yield t.data.python()

    def _ifind( self, it, val ):
        # Advance the iterator until val is found; return the match and the
        # partially consumed iterator.
        v = it.next()
        while val != v:
            v = it.next()
        return v, it

    def _indent_level( self, it, include_below = False ):
        # Yield the tokens of one indented block, tracking nesting depth.
        depth = 0
        while depth >= 0:
            t = it.next()
            if t == Indent():
                depth += 1
            elif t == Dedent():
                depth -= 1
            if include_below or depth == 0:
                yield t

    def _get_file_tokens( self, filename ):
        f = file( filename, 'r' )
        return self.tokenize_nt( chain.from_iterable( f ) )

    def _get_class_tokens( self, filename, clsname, include_header = False ):
        # Scan a source file for "class <clsname>" and yield its body tokens.
        tokens = self._get_file_tokens( filename )
        n = Identifier('')
        while n != Identifier( clsname ):
            cls, tokens = self._ifind( tokens, Identifier('class') )
            n = tokens.next()
        if include_header:
            yield cls
            yield n
        while n != Indent():
            n = tokens.next()
            if include_header:
                yield n
        for t in self._indent_level( tokens, True ):
            yield t

    def _get_fn_tokens( self, filename, clsname, fnname, args = False, header = False ):
        # Scan a class for "def <fnname>" and yield its body tokens.
        tokens = self._get_class_tokens( filename, clsname )
        n = Identifier( '' )
        while n != Identifier( fnname ):
            d, tokens = self._ifind( tokens, Identifier( 'def' ) )
            n = tokens.next()
        if header:
            yield d
            yield n
        while n != Indent():
            n = tokens.next()
            if args:
                yield n
        for t in self._indent_level( tokens, True ):
            yield t

    def _intro( self ):
        yield "\n\n" + "#"*70 + '\n'
        yield "# Begin automatically generated code\n"
        yield "#"*70 + '\n'
        yield "from collections import defaultdict\n"
        yield "from itertools import chain\n"

    def _parser_header( self ):
        yield "class Parser:\n"
        yield "\tdef __init__( self ):\n"
        yield "\t\tself._build_lexer()\n"
        yield "\t\tself._build_parser()\n"
        yield "\t\n"
        yield "\tdef tokenize_and_parse( self, input ):\n"
        yield "\t\treturn self.parse( self.tokenize( input ) )\n"
        yield "\t\n"

    def _fn_header( self, name ):
        yield "\tdef {0}".format( name )

    def _extro( self ):
        yield "#"*70 + '\n'
        yield "# End automatically generated code\n"
        yield "#"*70 + '\n\n\n'

    def parse( self, filename ):
        cfi = chain.from_iterable
        f = file( filename, 'r' )
        tokens = self.tokenize( cfi( f ) )
        cfg_file = self.parser.parse( tokens )
        f.close()

        tokens, prods = None, None
        for i in cfg_file.items:
            if isinstance( i, TokenSection ):
                tokens = i
            if isinstance( i, ProductionSection ):
                prods = i
        if tokens is None or prods is None:
            return
        terminals = tokens.build()
        prods.build( terminals )

        # Splice the user's verbatim code, the runtime support classes and
        # the generated lexer/parser tables into one output token stream.
        out = chain(
            (t for t in cfg_file.items[0].tokens),
            self.tokenize_nt( cfi(self._intro()) ),
            self._get_file_tokens( 'actions.py' ),
            self._get_class_tokens( 'glr.py', 'CFGNonterminal', True ),
            self._get_class_tokens( 'glr.py', 'CFGTerminal', True ),
            self._get_class_tokens( 'lexer.py', '_NamedEmptyObject', True ),
            self._get_class_tokens( 'lexer.py', 'CFGDFAState', True ),
            self.tokenize_nt( cfi(self._parser_header()), False ),
            self.tokenize_nt( cfi(self._fn_header( "parse" )), False ),
            self._get_fn_tokens( 'glr.py', 'CFGParser', 'parse', args = True ),
            self.tokenize_nt( cfi(self._fn_header( "_parse" )), False ),
            self._get_fn_tokens( 'glr.py', 'CFGParser', '_parse', args = True ),
            self.tokenize_nt( cfi(self._fn_header( "tokenize" )), False ),
            self._get_fn_tokens( 'lexer.py', 'CFGLexer', 'tokenize', args = True ),
            self.tokenize_nt( cfi(self._fn_header( "_tokenize" )), False ),
            self._get_fn_tokens( 'lexer.py', 'CFGLexer', '_tokenize', args = True ),
            self.tokenize_nt( cfi(self._fn_header( "_wrap_data" )), False ),
            self._get_fn_tokens( 'lexer.py', 'CFGLexer', '_wrap_data', args = True ),
            self.tokenize_nt( cfi(tokens.lexer_tokens()), False ),
            self.tokenize_nt( cfi(prods.parser_tokens()), False ),
            self.tokenize_nt( cfi(self._extro()), False ),
            (t for t in cfg_file.items[-1].tokens),
        )

        indents, previous = [''], None
        f = file( 'output.py', 'w' )
        for t in out:
            data, indents, previous = t.output( indents, previous )
            f.write( data )
        f.close()
        return cfg_file

    def output( self ):
        pass
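# Usage sketch (hypothetical driver; "grammar.cfg" is an assumed file name).
# parse() tokenizes the grammar file, separates the {% tokens ... %} and
# {% productions ... %} sections from the surrounding verbatim code, builds
# the grammar, and writes a self-contained generated parser to output.py by
# splicing in the support code from actions.py, glr.py and lexer.py.
if __name__ == '__main__':
    cfg = CFGFileParser()
    cfg.parse( 'grammar.cfg' )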