def test_nested_groups( self ):
    l = CFGLexer( {"a(b(cd)*e)f": "match"} )
    self.assertEqual( l.match( "abef" ), "match" )
    self.assertEqual( l.match( "abcdcdef" ), "match" )
    self.assertEqual( l.match( "af" ), None )
    self.assertEqual( l.match( "abf" ), None )
    self.assertEqual( l.match( "aef" ), None )
def test_empty_bol( self ):
    l = CFGLexer( {"b\\n": self._tuple, "^[ \\t]*": self._tuple } )
    fn = lambda s: list( l.tokenize(s) )
    self.assertEqual( fn("b\n b\n"), self._toks( "", "b\n", " ", "b\n", "" ) )
    self.assertEqual( fn(" \tb\n"), self._toks( " \t", "b\n", "" ) )
    self.assertEqual( fn(" \n "), self._toks( " ", (2, 0), " " ) )
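# Note on test_empty_bol above (an annotation, not original code): the
# "^[ \t]*" pattern also matches the empty string at the beginning of a line,
# which is why the expected token lists contain empty "" entries at each
# line start where no leading whitespace is present.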
def test_bol( self ):
    l = CFGLexer( {"ab\\n": self._tuple, "^c\\n": self._tuple } )
    fn = lambda s: list( l.tokenize(s) )
    self.assertEqual( fn( "ab\n" ), self._toks( "ab\n" ) )
    self.assertEqual( fn( "c\n" ), self._toks( "c\n" ) )
    self.assertEqual( fn( "ab\nc\n" ), self._toks( "ab\n", "c\n" ) )
    self.assertEqual( fn( "bac\n" ), self._toks( (1, 4) ) )
def test_empty_eol( self ):
    l = CFGLexer( {"\\nab": self._tuple, "c*$": self._tuple } )
    fn = lambda s: list( l.tokenize(s) )
    self.assertEqual( fn("c\nab"), self._toks( "c", "\nab", "" ) )
    self.assertEqual( fn("\nabcc\nab"), self._toks( "", "\nab", "cc", "\nab", "" ) )
    self.assertEqual( fn("\nab"), self._toks( "", "\nab", "" ) )
    self.assertEqual( fn("ca\nab"), self._toks( (1, 2), "", "\nab", "" ) )
def test_basic_regex( self ):
    l = CFGLexer( {"ab*": self._tuple} )
    fn = lambda s: list( l.tokenize(s) )
    self.assertEqual( fn("abb"), self._toks( "abb" ) )
    self.assertEqual( fn("aba"), self._toks( "ab", "a" ) )
    self.assertEqual( fn("abaca"), self._toks( "ab", "a", (1, 4), "a" ) )
    self.assertEqual( fn("caba"), self._toks( (1, 1), "ab", "a" ) )
    self.assertEqual( fn("abac"), self._toks( "ab", "a", (1, 4) ) )
def test_identifier_regex( self ):
    l = CFGLexer( { "if": self._tuple, "else": self._tuple, "elif": self._tuple,
                    "[a-zA-Z_][a-zA-Z0-9_]*": self._tuple } )
    fn = lambda s: list( l.tokenize(s) )
    self.assertEqual( fn("if elif"), self._toks( "if", (1, 3), "elif" ) )
    self.assertEqual( fn("ab 03if"), self._toks( "ab", (1, 5), "if" ) )
    self.assertEqual( fn("elin if 432"), self._toks( "elin", (1, 5), "if", (1, 11) ) )
def test_floating_point( self ):
    v = lambda d, s, e: float(d)
    s = r"([0-9][0-9]*(\.[0-9]*)?|\.[0-9][0-9]*)([eE][+-]?[0-9][0-9]*)?"
    l = CFGLexer( {s: v} )
    self.assertEqual( l.match( "12" ), float("12") )
    self.assertEqual( l.match( "120" ), float("120") )
    self.assertEqual( l.match( "6.43" ), float("6.43") )
    self.assertEqual( l.match( "9.003" ), float("9.003") )
    self.assertEqual( l.match( "1.04e-4" ), float("1.04e-4") )
    self.assertEqual( l.match( "54.2E+2" ), float("54.2E+2") )
    self.assertEqual( l.match( "0" ), float("0") )
    self.assertEqual( l.match( "a123" ), None )
    self.assertEqual( l.match( ".e" ), None )
    self.assertEqual( l.match( "." ), None )
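# A reading of the floating-point regex used above (an annotation, not
# original code):
#   ( [0-9][0-9]*(\.[0-9]*)?       digits with an optional fraction, e.g. "12", "6.43"
#   | \.[0-9][0-9]* )              or a leading dot that must be followed by digits, e.g. ".5"
#   ( [eE][+-]?[0-9][0-9]* )?      optional exponent, e.g. "e-4", "E+2"
# Both branches require at least one digit, which is why "." and ".e" are
# rejected in the assertions above.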
class TestGLRLexer(unittest.TestCase):
    def setUp( self ):
        plus, times, num = CFGTerminal.create("+", "*", "num")
        E, T, F = CFGNonterminal.create("E", "T", "F")
        E.production( lambda i: i, T )
        T.production( lambda i, j, k: i.add(k), T, plus, F )
        T.production( lambda i: i, F )
        F.production( lambda i, j, k: i.mul(k), F, times, num )
        F.production( lambda i: i, num )
        tokens = { '\\+': plus, '\\*': times,
                   "[0-9][0-9]*": num.data_wrapper(Number) }
        self.lexer = CFGLexer( tokens )
        self.parser = CFGParser( E, (plus, times, num), (E, T, F) )

    def _n( self, s ):
        return self.parser.parse( self.lexer.tokenize( s ) ).value

    def test_add( self ):
        self.assertEqual( self._n("5+4"), 9 )
        self.assertEqual( self._n("1+3+0"), 4 )

    def test_mul( self ):
        self.assertEqual( self._n("3*2"), 6 )
        self.assertEqual( self._n("1*5*13"), 65 )
        self.assertEqual( self._n("4*0"), 0 )

    def test_oop( self ):
        self.assertEqual( self._n("4+2*8"), 20 )
        self.assertEqual( self._n("5+0*12+4*3*2"), 29 )
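# The grammar wired up in TestGLRLexer.setUp above, read back as plain
# productions (a paraphrase of the E/T/F setup, not generated output):
#
#   E -> T
#   T -> T '+' F  |  F
#   F -> F '*' num  |  num
#
# Products are grouped under F and sums under T, so '*' binds tighter than
# '+', which is what test_oop checks.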
def test_multiple_options( self ):
    l = CFGLexer( {"a|b|c(d|e|f)*": "match"} )
    print l.base_state
    print l.base_state.edges['a']
    print l.base_state.edges['b']
    print l.base_state.edges['c']
    print l.base_state.edges['c'].edges['e']
    self.assertEqual( l.match( "a" ), "match" )
    self.assertEqual( l.match( "b" ), "match" )
    self.assertEqual( l.match( "c" ), "match" )
    self.assertEqual( l.match( "ce" ), "match" )
    self.assertEqual( l.match( "cfed" ), "match" )
    self.assertEqual( l.match( "cffdee" ), "match" )
    self.assertEqual( l.match( "adef" ), None )
    self.assertEqual( l.match( "bdef" ), None )
    self.assertEqual( l.match( "ac" ), None )
    self.assertEqual( l.match( "acef" ), None )
    self.assertEqual( l.match( "def" ), None )
def test_kleene( self ):
    l = CFGLexer( {"ab*": "match"} )
    self.assertEqual( l.match( "a" ), "match" )
    self.assertEqual( l.match( "ab" ), "match" )
    self.assertEqual( l.match( "abbbb" ), "match" )
    self.assertEqual( l.match( "aab" ), None )
    self.assertEqual( l.match( "b" ), None )
def test_options( self ):
    l = CFGLexer( {"(a|b)*(c|d)": "match"} )
    print l.base_state
    print l.base_state.edges['c']
    print l.base_state.edges['d']
    print l.base_state.edges['a']
    print l.base_state.edges['b']
    self.assertEqual( l.match( "c" ), "match" )
    self.assertEqual( l.match( "d" ), "match" )
    self.assertEqual( l.match( "ac" ), "match" )
    self.assertEqual( l.match( "aabad" ), "match" )
    self.assertEqual( l.match( "aadc" ), None )
    self.assertEqual( l.match( "aba" ), None )
    self.assertEqual( l.match( "dd" ), None )
def test_eol( self ):
    l = CFGLexer( {"\\na": self._tuple, "$\\nc": self._tuple } )
    fn = lambda s: list( l.tokenize(s) )
    self.assertEqual( fn( "\na\nc" ), self._toks( "\na", "\nc" ) )
    self.assertEqual( fn( "\nb\nc" ), self._toks( (2, 1), "\nc" ) )
def test_groups( self ):
    l = CFGLexer( {"a|ac*(de)*": "match"} )
    print l.base_state
    self.assertEqual( l.match( "a" ), "match" )
    self.assertEqual( l.match( "acde" ), "match" )
    self.assertEqual( l.match( "ac" ), "match" )
    self.assertEqual( l.match( "ade" ), "match" )
    self.assertEqual( l.match( "accdede" ), "match" )
    self.assertEqual( l.match( "aa" ), None )
    self.assertEqual( l.match( "accd" ), None )
    self.assertEqual( l.match( "acded" ), None )
    self.assertEqual( l.match( "cc" ), None )
    self.assertEqual( l.match( "acdec" ), None )
class CFGFileParser:
    def __init__( self ):
        self._build_parser()
        self.lexer = CFGLexer( self.tokens, eof_token = self.eof.data_wrapper(EOF) )
        self.parser = CFGParser( self.goal, self.terminals, self.nonterminals )
        self.reset()

    def _build_parser( self ):
        self.terminals = ( start_section, end_section, tokens_id, productions_id,
                           start_name, end_name, produces, colon, newline, indent,
                           dedent, operator, comment, identifier, literal, number,
                           eof ) = ( CFGTerminal.create(
            "start_section", "end_section", "tokens_id", "productions_id",
            "start_name", "end_name", "produces", "colon", "newline", "indent",
            "dedent", "operator", "comment", "identifier", "literal", "number",
            "eof" ) )
        for t in self.terminals:
            setattr( self, t.identifier, t )

        make_tuple = lambda *args: tuple(args)

        self.tokens = []
        self.tokens.append(CFGToken(r"{%", start_section))
        self.tokens.append(CFGToken(r"%}", end_section))
        self.tokens.append(CFGToken(r"tokens", tokens_id.data_wrapper(Identifier)))
        self.tokens.append(CFGToken(r"productions", productions_id.data_wrapper(Identifier)))
        self.tokens.append(CFGToken(r"\(%", start_name))
        self.tokens.append(CFGToken(r"%\)", end_name))
        self.tokens.append(CFGToken(r"->", produces))
        self.tokens.append(CFGToken(r":", colon.data_wrapper(Operator)))

        operators = [ ",", ";", "@", "+", "-", "\\*", "/", "//", "!", "\\|", "&",
                      "<<", ">>", "<", ">", "=", "\\.", "%", "`", "~", "\\^" ]
        for o in operators:
            self.tokens.append(CFGToken(o, operator.data_wrapper(Operator)))
        assign_operators = [ "+=", "-=", "\\*=", "/=", "//=", "!=", "\\|=", "&=",
                             "<=", ">=", "==", "%=", "<<=", ">>=" ]
        for o in assign_operators:
            self.tokens.append(CFGToken(o, operator.data_wrapper(Operator)))

        paren_operators = [ "\(", "\[", "{", "\)", "\]", "}" ]
        for p in paren_operators[:3]:
            self.tokens.append(CFGToken(p, self._paren_open))
        for p in paren_operators[3:]:
            self.tokens.append(CFGToken(p, self._paren_close))

        self.tokens.append(CFGToken(r"[a-zA-Z_][a-zA-Z0-9_]*", identifier.data_wrapper( Identifier )))
        self.tokens.append(CFGToken(
            r"([0-9][0-9]*(\.[0-9]*)?|\.[0-9][0-9]*)([eE][+-]?[0-9][0-9]*)?",
            number.data_wrapper( Number )))
        self.tokens.append(CFGToken(r'"([^\\"\n]*(\\.)?)*"', literal.data_wrapper( Literal )))
        self.tokens.append(CFGToken(r"'([^\\'\n]*(\\.)?)*'", literal.data_wrapper( Literal )))
        self.tokens.append(CFGToken(r'r"([^"\n]*(\\")?)*"', literal.data_wrapper( Literal )))
        self.tokens.append(CFGToken(r"r'([^'\n]*(\\')?)*'", literal.data_wrapper( Literal )))
        self.tokens.append(CFGToken( r'"""([^"]*("|"")?)*"""', literal.data_wrapper( Literal )))
        self.tokens.append(CFGToken( r"'''([^']*('|'')?)*'''", literal.data_wrapper( Literal )))

        self.tokens.append(CFGToken(r"^[ \t]*\n", self._newline_handler(True)))
        self.tokens.append(CFGToken(r"^[ \t]*", self._indent_handler, True))
        self.tokens.append(CFGToken(r"\n", self._newline_handler(False)))
        self.tokens.append(CFGToken(r"[ \t\r]", CFGToken.NoToken))
        self.tokens.append(CFGToken(r"\\\n^", CFGToken.NoToken))
        self.tokens.append(CFGToken(r"^[ \t]*#[^\n]*\n", self._comment_line_handler, True))
        self.tokens.append(CFGToken(r"#[^\n]*", comment.data_wrapper( Comment )))

        self.nonterminals = ( goal, cfg_file, section, code, code_line, code_lines,
                              code_bits, naming, regexes, products, productions,
                              regex, production ) = ( CFGNonterminal.create(
            "goal", "cfg_file", "section", "code", "code_line", "code_lines",
            "code_bits", "naming", "regexes", "products", "productions", "regex",
            "production" ) )
        for t in self.nonterminals:
            setattr( self, t.identifier, t )

        make_list = lambda *args: list(args)
        def append( l, i ):
            l.append(i); return l
        def append_tuple( l, *a ):
            l.append( tuple(a) ); return l
        first = lambda a, *args: a
        second = lambda a, b, *args: b
        third = lambda a, b, c, *args: c
        fourth = lambda a, b, c, d, *args: d

        goal.production( first, cfg_file, eof )
        cfg_file.production( File.append, cfg_file, section, code_lines )
        cfg_file.production( File, code_lines )

        code_toks = [ operator, identifier, number, literal, colon, tokens_id, productions_id ]
        for t in code_toks:
            code_bits.production( append, code_bits, t )
            code_bits.production( make_list, t )
        code_line.production( append, code_bits, newline )
        code_line.production( lambda i: [], newline )
        code_lines.production( Code.append, code_lines, code_line )
        code_lines.production( Code.add_block, code_lines, indent, code_lines, dedent )
        code_lines.production( lambda: Code() )

        code.production( lambda _, c, __: Code(c), colon, code_bits, newline )
        code.production( fourth, colon, newline, indent, code_lines, dedent )
        code.production( lambda n: Code(), newline )

        section.production( TokenSection.create, start_section, tokens_id, regexes, end_section )
        section.production( ProductionSection.create, start_section, productions_id, productions, end_section )

        naming.production( second, start_name, identifier, end_name )
        naming.production( lambda: Identifier("", None, None) )

        regex.production( Regex, literal, naming, code )
        regexes.production( append, regexes, regex )
        for white in [ newline, indent, dedent ]:
            regexes.production( first, regexes, white )
        regexes.production( lambda: [] )

        products.production( append_tuple, products, identifier, naming )
        products.production( lambda: [] )
        production.production( Production.create, identifier, produces, products, code )
        productions.production( append, productions, production )
        for white in [ newline, indent, dedent ]:
            productions.production( first, productions, white )
        productions.production( lambda: [] )

    def _paren_open( self, parsed, start, end ):
        self.paren_level += 1
        data = Operator(parsed, start, end)
        return self.operator.with_data(data)

    def _paren_close( self, parsed, start, end ):
        self.paren_level -= 1
        data = Operator(parsed, start, end)
        return self.operator.with_data(data)

    def _indent_handler( self, parsed, start, end ):
        indent = 0
        for ch in parsed:
            if ch == ' ':
                indent += 1
            elif ch == '\t':
                indent = (indent/4 +1) * 4
        line, col = start
        if self.paren_level == 0:
            if indent > self.indent_levels[-1]:
                self.indent_levels.append( indent )
                data = Indent( parsed, start, end )
                yield self.indent.with_data( data )
            while indent < self.indent_levels[-1]:
                self.indent_levels.pop()
                data = Dedent( '', end, end )
                yield self.dedent.with_data( data )

    def _newline_handler( self, empty ):
        def cb( parsed, start, end ):
            line, col = start
            for ch in parsed[:-1]:
                col += 1
            paren = self.paren_level != 0
            data = Newline( '\n', (line, col), (line, col+1), empty, paren )
            return self.newline.with_data( data )
        return cb

    def _comment_line_handler( self, parsed, start, end ):
        line, col = start
        index = parsed.index( '#' )
        line, col = line, col+index
        end = (line, col+len(parsed[index:-1]))
        data = Comment( parsed[index:-1], (line, col), end )
        yield self.comment.with_data( data )
        line, col = end
        data = Newline( '\n', end, (line, col+1), True )
        yield self.newline.with_data( data )

    def reset( self ):
        self.paren_level = 0
        self.indent_levels = [0]

    def tokenize( self, data, reset = True ):
        if reset:
            self.reset()
        for tok in self.lexer.tokenize( data ):
            yield tok

    def tokenize_nt( self, data, reset = True ):
        for t in self.tokenize( data, reset ):
            yield t.data

    def python_tokenize( self, data, reset = True ):
        for t in self.tokenize( data, reset ):
            yield t.data.python()

    def _ifind( self, it, val ):
        v = it.next()
        while val != v:
            v = it.next()
        return v, it

    def _indent_level( self, it, include_below = False ):
        depth = 0
        while depth >= 0:
            t = it.next()
            if t == Indent():
                depth += 1
            elif t == Dedent():
                depth -= 1
            if include_below or depth == 0:
                yield t

    def _get_file_tokens( self, filename ):
        f = file( filename, 'r' )
        return self.tokenize_nt( chain.from_iterable( f ) )

    def _get_class_tokens( self, filename, clsname, include_header = False ):
        tokens = self._get_file_tokens( filename )
        n = Identifier('')
        while n != Identifier( clsname ):
            cls, tokens = self._ifind( tokens, Identifier('class') )
            n = tokens.next()
        if include_header:
            yield cls
            yield n
        while n != Indent():
            n = tokens.next()
            if include_header:
                yield n
        for t in self._indent_level( tokens, True ):
            yield t

    def _get_fn_tokens( self, filename, clsname, fnname, args = False, header = False ):
        tokens = self._get_class_tokens( filename, clsname )
        n = Identifier( '' )
        while n != Identifier( fnname ):
            d, tokens = self._ifind( tokens, Identifier( 'def' ) )
            n = tokens.next()
        if header:
            yield d
            yield n
        while n != Indent():
            n = tokens.next()
            if args:
                yield n
        for t in self._indent_level( tokens, True ):
            yield t

    def _intro( self ):
        yield "\n\n" + "#"*70 + '\n'
        yield "# Begin automatically generated code\n"
        yield "#"*70 + '\n'
        yield "from collections import defaultdict\n"
        yield "from itertools import chain\n"

    def _parser_header( self ):
        yield "class Parser:\n"
        yield "\tdef __init__( self ):\n"
        yield "\t\tself._build_lexer()\n"
        yield "\t\tself._build_parser()\n"
        yield "\t\n"
        yield "\tdef tokenize_and_parse( self, input ):\n"
        yield "\t\treturn self.parse( self.tokenize( input ) )\n"
        yield "\t\n"

    def _fn_header( self, name ):
        yield "\tdef {0}".format( name )

    def _extro( self ):
        yield "#"*70 + '\n'
        yield "# End automatically generated code\n"
        yield "#"*70 + '\n\n\n'

    def parse( self, filename ):
        cfi = chain.from_iterable
        f = file( filename, 'r' )
        tokens = self.tokenize( cfi( f ) )
        cfg_file = self.parser.parse( tokens )
        f.close()

        tokens, prods = None, None
        for i in cfg_file.items:
            if isinstance( i, TokenSection ):
                tokens = i
            if isinstance( i, ProductionSection ):
                prods = i
        if tokens is None or prods is None:
            return
        terminals = tokens.build()
        prods.build( terminals )

        out = chain(
            (t for t in cfg_file.items[0].tokens),
            self.tokenize_nt( cfi(self._intro()) ),
            self._get_file_tokens( 'actions.py' ),
            self._get_class_tokens( 'glr.py', 'CFGNonterminal', True ),
            self._get_class_tokens( 'glr.py', 'CFGTerminal', True ),
            self._get_class_tokens( 'lexer.py', '_NamedEmptyObject', True ),
            self._get_class_tokens( 'lexer.py', 'CFGDFAState', True ),
            self.tokenize_nt( cfi(self._parser_header()), False ),
            self.tokenize_nt( cfi(self._fn_header( "parse" )), False ),
            self._get_fn_tokens( 'glr.py', 'CFGParser', 'parse', args = True ),
            self.tokenize_nt( cfi(self._fn_header( "_parse" )), False ),
            self._get_fn_tokens( 'glr.py', 'CFGParser', '_parse', args = True ),
            self.tokenize_nt( cfi(self._fn_header( "tokenize" )), False ),
            self._get_fn_tokens( 'lexer.py', 'CFGLexer', 'tokenize', args = True ),
            self.tokenize_nt( cfi(self._fn_header( "_tokenize" )), False ),
            self._get_fn_tokens( 'lexer.py', 'CFGLexer', '_tokenize', args = True ),
            self.tokenize_nt( cfi(self._fn_header( "_wrap_data" )), False ),
            self._get_fn_tokens( 'lexer.py', 'CFGLexer', '_wrap_data', args = True ),
            self.tokenize_nt( cfi(tokens.lexer_tokens()), False ),
            self.tokenize_nt( cfi(prods.parser_tokens()), False ),
            self.tokenize_nt( cfi(self._extro()), False ),
            (t for t in cfg_file.items[-1].tokens) )

        indents, previous = [''], None
        f = file( 'output.py', 'w' )
        for t in out:
            data, indents, previous = t.output( indents, previous )
            f.write( data )
        return cfg_file

    def output( self ):
        pass
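# Illustrative usage (a sketch, not part of the original module): parse() takes
# the path of a grammar file, tokenizes it, splices in the runtime support
# classes from actions.py / glr.py / lexer.py, and writes the generated parser
# to output.py.  The file name "grammar.cfg" below is a placeholder.
if __name__ == '__main__':
    generator = CFGFileParser()
    cfg = generator.parse( 'grammar.cfg' )   # writes output.py as a side effect
    if cfg is None:
        print "grammar file is missing a tokens or productions section"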