def test_nested_groups( self ):
		l = CFGLexer( {"a(b(cd)*e)f": "match"} )
		self.assertEqual( l.match( "abef" ), "match" )
		self.assertEqual( l.match( "abcdcdef" ), "match" )
		self.assertEqual( l.match( "af" ), None )
		self.assertEqual( l.match( "abf" ), None )
		self.assertEqual( l.match( "aef" ), None )
	def test_empty_bol( self ):
		l = CFGLexer( {"b\\n": self._tuple, "^[ \\t]*": self._tuple } )
		fn = lambda s: list( l.tokenize(s) )
		self.assertEqual( fn("b\n  b\n"), 
			self._toks( "", "b\n", "  ", "b\n", "" ) )
		self.assertEqual( fn(" \tb\n"), self._toks( " \t", "b\n", "" ) )
		self.assertEqual( fn(" \n "), self._toks( " ", (2, 0), " " ) )
	def test_bol( self ):
		l = CFGLexer( {"ab\\n": self._tuple, "^c\\n": self._tuple } )
		fn = lambda s: list( l.tokenize(s) )
		self.assertEqual( fn( "ab\n" ), self._toks( "ab\n" ) )
		self.assertEqual( fn( "c\n" ), self._toks( "c\n" ) )
		self.assertEqual( fn( "ab\nc\n" ), self._toks( "ab\n", "c\n" ) )
		self.assertEqual( fn( "bac\n" ), self._toks( (1, 4) ) )
	def test_empty_eol( self ):
		l = CFGLexer( {"\\nab": self._tuple, "c*$": self._tuple } )
		fn = lambda s: list( l.tokenize(s) )
		self.assertEqual( fn("c\nab"), self._toks( "c", "\nab", "" ) )
		self.assertEqual( fn("\nabcc\nab"), 
				self._toks( "", "\nab", "cc", "\nab", "" ) )
		self.assertEqual( fn("\nab"), self._toks( "", "\nab", "" ) )
		self.assertEqual( fn("ca\nab"), self._toks( (1, 2), "", "\nab", "" ) )
	def test_basic_regex( self ):
		l = CFGLexer( {"ab*": self._tuple} )
		fn = lambda s: list( l.tokenize(s) )
		self.assertEqual( fn("abb"), self._toks( "abb" ) )
		self.assertEqual( fn("aba"), self._toks( "ab", "a" ) )

		self.assertEqual( fn("abaca"), self._toks( "ab", "a", (1, 4), "a" ) )
		self.assertEqual( fn("caba"), self._toks( (1, 1), "ab", "a" ) )
		self.assertEqual( fn("abac"), self._toks( "ab", "a", (1, 4) ) )
	def test_identifier_regex( self ):
		l = CFGLexer( {	"if": self._tuple,  
						"else": self._tuple,
						"elif": self._tuple,
						"[a-zA-Z_][a-zA-Z0-9_]*": self._tuple } )
		fn = lambda s: list( l.tokenize(s) )
		self.assertEqual( fn("if elif"), self._toks( "if", (1, 3), "elif" ) )
		self.assertEqual( fn("ab 03if"), self._toks( "ab", (1, 5), "if" ) )
		self.assertEqual( fn("elin if 432"), 
			self._toks( "elin", (1, 5), "if", (1, 11) ) )
	def test_floating_point( self ):
		v = lambda d, s, e: float(d)
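		# Either digits with an optional fraction, or a leading dot followed by
		# digits, with an optional exponent; a bare "." or ".e" has no digits
		# and should not match.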
		s = r"([0-9][0-9]*(\.[0-9]*)?|\.[0-9][0-9]*)([eE][+-]?[0-9][0-9]*)?"
		l = CFGLexer( {s: v} )
		self.assertEqual( l.match( "12" ), float("12") )
		self.assertEqual( l.match( "120" ), float("120") )
		self.assertEqual( l.match( "6.43" ), float("6.43") )
		self.assertEqual( l.match( "9.003" ), float("9.003") )
		self.assertEqual( l.match( "1.04e-4" ), float("1.04e-4") )
		self.assertEqual( l.match( "54.2E+2" ), float("54.2E+2") )
		self.assertEqual( l.match( "0" ), float("0") )
		self.assertEqual( l.match( "a123" ), None )
		self.assertEqual( l.match( ".e" ), None )
		self.assertEqual( l.match( "." ), None )
class TestGLRLexer(unittest.TestCase):
	def setUp( self ):
		plus, times, num = CFGTerminal.create("+", "*", "num")
		E, T, F = CFGNonterminal.create("E", "T", "F")

		E.production( lambda i: i, T )
		T.production( lambda i, j, k: i.add(k), T, plus, F )
		T.production( lambda i: i, F )
		F.production( lambda i, j, k: i.mul(k), F, times, num )
		F.production( lambda i: i, num )
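		# "+" is handled at the T level and "*" at the F level, so
		# multiplication binds tighter than addition (see test_oop).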

		tokens = {
			'\\+': plus, '\\*': times,
			"[0-9][0-9]*": num.data_wrapper(Number)
			}

		self.lexer = CFGLexer( tokens )
		self.parser = CFGParser( E, (plus, times, num), (E, T, F) )

	def _n( self, s ):
		return self.parser.parse( self.lexer.tokenize( s ) ).value

	def test_add( self ):
		self.assertEqual( self._n("5+4"), 9 )
		self.assertEqual( self._n("1+3+0"), 4 )

	def test_mul( self ):
		self.assertEqual( self._n("3*2"), 6 )
		self.assertEqual( self._n("1*5*13"), 65 )
		self.assertEqual( self._n("4*0"), 0 )

	def test_oop( self ):
		self.assertEqual( self._n("4+2*8"), 20 )
		self.assertEqual( self._n("5+0*12+4*3*2"), 29 )
	def test_multiple_options( self ):
		l = CFGLexer( {"a|b|c(d|e|f)*": "match"} )
		print l.base_state
		print l.base_state.edges['a']
		print l.base_state.edges['b']
		print l.base_state.edges['c']
		print l.base_state.edges['c'].edges['e']
		self.assertEqual( l.match( "a" ), "match" )
		self.assertEqual( l.match( "b" ), "match" )
		self.assertEqual( l.match( "c" ), "match" )
		self.assertEqual( l.match( "ce" ), "match" )
		self.assertEqual( l.match( "cfed" ), "match" )
		self.assertEqual( l.match( "cffdee" ), "match" )
		self.assertEqual( l.match( "adef" ), None )
		self.assertEqual( l.match( "bdef" ), None )
		self.assertEqual( l.match( "ac" ), None )
		self.assertEqual( l.match( "acef" ), None )
		self.assertEqual( l.match( "def" ), None )
	def test_kleene( self ):
		l = CFGLexer( {"ab*": "match"} )
		self.assertEqual( l.match( "a" ), "match" )
		self.assertEqual( l.match( "ab" ), "match" )
		self.assertEqual( l.match( "abbbb" ), "match" )
		self.assertEqual( l.match( "aab" ), None )
		self.assertEqual( l.match( "b" ), None )
	def test_options( self ):
		l = CFGLexer( {"(a|b)*(c|d)": "match"} )
		print l.base_state
		print l.base_state.edges['c']
		print l.base_state.edges['d']
		print l.base_state.edges['a']
		print l.base_state.edges['b']
		self.assertEqual( l.match( "c" ), "match" )
		self.assertEqual( l.match( "d" ), "match" )
		self.assertEqual( l.match( "ac" ), "match" )
		self.assertEqual( l.match( "aabad" ), "match" )
		self.assertEqual( l.match( "aadc" ), None )
		self.assertEqual( l.match( "aba" ), None )
		self.assertEqual( l.match( "dd" ), None )
	def test_eol( self ):
		l = CFGLexer( {"\\na": self._tuple, "$\\nc": self._tuple } )
		fn = lambda s: list( l.tokenize(s) )
		self.assertEqual( fn( "\na\nc" ), self._toks( "\na", "\nc" ) )
		self.assertEqual( fn( "\nb\nc" ), self._toks( (2, 1), "\nc" ) )
	def test_groups( self ):
		l = CFGLexer( {"a|ac*(de)*": "match"} )
		print l.base_state
		self.assertEqual( l.match( "a" ), "match" )
		self.assertEqual( l.match( "acde" ), "match" )
		self.assertEqual( l.match( "ac" ), "match" )
		self.assertEqual( l.match( "ade" ), "match" )
		self.assertEqual( l.match( "accdede" ), "match" )
		self.assertEqual( l.match( "aa" ), None )
		self.assertEqual( l.match( "accd" ), None )
		self.assertEqual( l.match( "acded" ), None )
		self.assertEqual( l.match( "cc" ), None )
		self.assertEqual( l.match( "acdec" ), None )
class CFGFileParser:
	def __init__( self ):
		self._build_parser()
		self.lexer = CFGLexer( self.tokens, 
			eof_token = self.eof.data_wrapper(EOF) )
		self.parser = CFGParser( 
			self.goal, self.terminals, self.nonterminals )
		self.reset()

	def _build_parser( self ):
		self.terminals = ( start_section, end_section,
			tokens_id, productions_id,
			start_name, end_name, produces, colon,
			newline, indent, dedent, operator, comment, identifier,
			literal, number, eof ) = (
			CFGTerminal.create( "start_section", "end_section",
					"tokens_id", "productions_id",
					"start_name", "end_name", "produces", 
					"colon", "newline", "indent", "dedent",
					"operator", "comment", "identifier", "literal", "number",
					"eof" ) )
		for t in self.terminals:
			setattr( self, t.identifier, t )
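		# Each terminal is now also an attribute (self.newline, self.indent,
		# ...), which the token handlers below refer to directly.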

		make_tuple = lambda *args: tuple(args)

		self.tokens = []
		self.tokens.append(CFGToken(r"{%", start_section))
		self.tokens.append(CFGToken(r"%}", end_section))
		self.tokens.append(CFGToken(r"tokens", 
			tokens_id.data_wrapper(Identifier)))
		self.tokens.append(CFGToken(r"productions", 
			productions_id.data_wrapper(Identifier)))

		self.tokens.append(CFGToken(r"\(%", start_name))
		self.tokens.append(CFGToken(r"%\)", end_name))
		self.tokens.append(CFGToken(r"->", produces))
		self.tokens.append(CFGToken(r":", 
			colon.data_wrapper(Operator)))

		operators = [
			",", ";", "@", 
			"+", "-", "\\*", "/", "//", "!", "\\|", "&",
			"<<", ">>", "<", ">", "=", "\\.",
			"%", "`", "~", "\\^"
			]
		for o in operators:
			self.tokens.append(CFGToken(o, operator.data_wrapper(Operator)))

		assign_operators = [
			"+=", "-=", "\\*=", "/=", "//=",
			"!=", "\\|=", "&=", "<=", ">=", "==", "%=",
			"<<=", ">>=" ]
		for o in assign_operators:
			self.tokens.append(CFGToken(o, operator.data_wrapper(Operator)))
		
		paren_operators = [ r"\(", r"\[", "{", r"\)", r"\]", "}" ]
		for p in paren_operators[:3]:
			self.tokens.append(CFGToken(p, self._paren_open))
		for p in paren_operators[3:]:
			self.tokens.append(CFGToken(p, self._paren_close))
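		# Bracket tokens adjust paren_level; while it is non-zero the indent
		# handler emits nothing and newlines are flagged as being inside
		# parentheses.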

		self.tokens.append(CFGToken(r"[a-zA-Z_][a-zA-Z0-9_]*",
			identifier.data_wrapper( Identifier )))
		self.tokens.append(CFGToken(
			r"([0-9][0-9]*(\.[0-9]*)?|\.[0-9][0-9]*)([eE][+-]?[0-9][0-9]*)?",
			number.data_wrapper( Number )))
		
		self.tokens.append(CFGToken(r'"([^\\"\n]*(\\.)?)*"', 
			literal.data_wrapper( Literal )))
		self.tokens.append(CFGToken(r"'([^\\'\n]*(\\.)?)*'", 
			literal.data_wrapper( Literal )))

		self.tokens.append(CFGToken(r'r"([^"\n]*(\\")?)*"', 
			literal.data_wrapper( Literal )))
		self.tokens.append(CFGToken(r"r'([^'\n]*(\\')?)*'", 
			literal.data_wrapper( Literal )))
		
		self.tokens.append(CFGToken(
			r'"""([^"]*("|"")?)*"""', 
			literal.data_wrapper( Literal )))
		self.tokens.append(CFGToken(
			r"'''([^']*('|'')?)*'''", 
			literal.data_wrapper( Literal )))
	
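		# Line-structure tokens: a whitespace-only line yields an "empty"
		# newline, leading whitespace feeds the indent/dedent tracker, and any
		# other "\n" becomes a plain newline.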
		self.tokens.append(CFGToken(r"^[ \t]*\n", self._newline_handler(True))) 
		self.tokens.append(CFGToken(r"^[ \t]*", self._indent_handler, True))
		self.tokens.append(CFGToken(r"\n", self._newline_handler(False)))
		
		self.tokens.append(CFGToken(r"[ \t\r]", CFGToken.NoToken))
		self.tokens.append(CFGToken(r"\\\n^", CFGToken.NoToken))

		self.tokens.append(CFGToken(r"^[ \t]*#[^\n]*\n",
			self._comment_line_handler, True))
		self.tokens.append(CFGToken(r"#[^\n]*", 
			comment.data_wrapper( Comment )))

		self.nonterminals = ( goal, cfg_file, section, 
			code, code_line, code_lines, code_bits,
			naming, regexes, products, productions,
			regex, production ) = ( 
			CFGNonterminal.create(
				"goal", "cfg_file", "section", 
				"code", "code_line", "code_lines", "code_bits",
				"naming", "regexes", "products", "productions",
				"regex", "production"
				) )
		for t in self.nonterminals:
			setattr( self, t.identifier, t )

		make_list = lambda *args: list(args)
		def append( l, i ): l.append(i); return l
		def append_tuple( l, *a ): l.append( tuple(a) ); return l

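		# Positional selectors: return the n-th value from a production's
		# right-hand side.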
		first = lambda a, *args: a
		second = lambda a, b, *args: b
		third = lambda a, b, c, *args: c
		fourth = lambda a, b, c, d, *args: d

		goal.production( first, cfg_file, eof )
		cfg_file.production( File.append, cfg_file, section, code_lines )
		cfg_file.production( File, code_lines )

		code_toks = [ operator, identifier, number, literal, colon,
			tokens_id, productions_id ]
		for t in code_toks:
			code_bits.production( append, code_bits, t )
			code_bits.production( make_list, t )

		code_line.production( append, code_bits, newline )
		code_line.production( lambda i: [], newline )

		code_lines.production( Code.append, code_lines, code_line )
		code_lines.production( Code.add_block, 
			code_lines, indent, code_lines, dedent )
		code_lines.production( lambda: Code() )

		code.production( lambda _, c, __: Code(c), colon, code_bits, newline )
		code.production( fourth, colon, newline, indent, code_lines, dedent )
		code.production( lambda n: Code(), newline )

		section.production( TokenSection.create, 
			start_section, tokens_id, regexes, end_section )
		section.production( ProductionSection.create, 
			start_section, productions_id, productions, end_section )

		naming.production( second, start_name, identifier, end_name )
		naming.production( lambda: Identifier("", None, None) )

		regex.production( Regex, literal, naming, code )
		regexes.production( append, regexes, regex )
		for white in [ newline, indent, dedent ]:
			regexes.production( first, regexes, white )
		regexes.production( lambda: [] )

		products.production( append_tuple, products, identifier, naming )
		products.production( lambda: [] )

		production.production( Production.create, 
			identifier, produces, products, code )
		productions.production( append, productions, production )
		for white in [ newline, indent, dedent ]:
			productions.production( first, productions, white )
		productions.production( lambda: [] )

	def _paren_open( self, parsed, start, end ):
		self.paren_level += 1
		data = Operator(parsed, start, end)
		return self.operator.with_data(data)

	def _paren_close( self, parsed, start, end ):
		self.paren_level -= 1
		data = Operator(parsed, start, end)
		return self.operator.with_data(data)

	def _indent_handler( self, parsed, start, end ):
		indent = 0
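		# Count spaces directly and expand tabs to the next multiple of four
		# columns when measuring the indent width.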
		for ch in parsed:
			if ch == ' ':		indent += 1
			elif ch == '\t':	indent = (indent / 4 + 1) * 4

		line, col = start
		if self.paren_level == 0:
			if indent > self.indent_levels[-1]:
				self.indent_levels.append( indent )
				data = Indent( parsed, start, end )
				yield self.indent.with_data( data )
			while indent < self.indent_levels[-1]:
				self.indent_levels.pop()
				data = Dedent( '', end, end )
				yield self.dedent.with_data( data )
	
	def _newline_handler( self, empty ):
		def cb( parsed, start, end ):
			line, col = start
			for ch in parsed[:-1]:
				col += 1
			paren = self.paren_level != 0
			data = Newline( '\n', (line, col), (line, col+1), empty, paren )
			return self.newline.with_data( data )
		return cb

	def _comment_line_handler( self, parsed, start, end ):
		line, col = start
		index = parsed.index( '#' )
		line, col = line, col+index
		end = (line, col+len(parsed[index:-1]))
		data = Comment( parsed[index:-1], (line, col), end )
		yield self.comment.with_data( data )

		line, col = end
		data = Newline( '\n', end, (line, col+1), True )
		yield self.newline.with_data( data )
	
	def reset( self ):
		self.paren_level = 0
		self.indent_levels = [0]

	def tokenize( self, data, reset = True ):
		if reset:
			self.reset()
		for tok in self.lexer.tokenize( data ):
			yield tok

	def tokenize_nt( self, data, reset = True ):
		for t in self.tokenize( data, reset ):
			yield t.data

	def python_tokenize( self, data, reset = True ):
		for t in self.tokenize( data, reset ):
			yield t.data.python()

	def _ifind( self, it, val ):
		v = it.next()
		while val != v:
			v = it.next()
		return v, it

	def _indent_level( self, it, include_below = False ):
		depth = 0
		while depth >= 0:
			t = it.next()
			if t == Indent():	depth += 1
			elif t == Dedent():	depth -= 1
			if include_below or depth == 0:
				yield t

	def _get_file_tokens( self, filename ):
		f = file( filename, 'r' )
		return self.tokenize_nt( chain.from_iterable( f ) )

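	# _get_class_tokens and _get_fn_tokens scan the token stream for the named
	# "class"/"def", then yield the tokens of its indented body (optionally
	# including the header and argument list).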
	def _get_class_tokens( self, filename, clsname, include_header = False ):
		tokens = self._get_file_tokens( filename )
		n = Identifier('')
		while n != Identifier( clsname ):
			cls, tokens = self._ifind( tokens, Identifier('class') )
			n = tokens.next()

		if include_header:
			yield cls
			yield n
		while n != Indent():
			n = tokens.next()
			if include_header:
				yield n

		for t in self._indent_level( tokens, True ):
			yield t

	def _get_fn_tokens( self, filename, clsname, fnname, 
			args = False, header = False ):
		tokens = self._get_class_tokens( filename, clsname )
		n = Identifier( '' )
		while n != Identifier( fnname ):
			d, tokens = self._ifind( tokens, Identifier( 'def' ) )
			n = tokens.next()

		if header:
			yield d
			yield n
		while n != Indent():
			n = tokens.next()
			if args:
				yield n

		for t in self._indent_level( tokens, True ):
			yield t

	def _intro( self ):
		yield "\n\n" + "#"*70 + '\n'
		yield "# Begin automatically generated code\n"
		yield "#"*70 + '\n'
		yield "from collections import defaultdict\n"
		yield "from itertools import chain\n"

	def _parser_header( self ):
		yield "class Parser:\n"
		yield "\tdef __init__( self ):\n"
		yield "\t\tself._build_lexer()\n"
		yield "\t\tself._build_parser()\n"
		yield "\t\n"
		yield "\tdef tokenize_and_parse( self, input ):\n"
		yield "\t\treturn self.parse( self.tokenize( input ) )\n"
		yield "\t\n"

	def _fn_header( self, name ):
		yield "\tdef {0}".format( name )

	def _extro( self ):
		yield "#"*70 + '\n'
		yield "# End automatically generated code\n"
		yield "#"*70 + '\n\n\n'

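	# parse() tokenizes and parses a grammar file, builds its token and
	# production sections, then writes output.py by splicing the generated
	# code around classes and methods lifted from glr.py, lexer.py and
	# actions.py.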
	def parse( self, filename ):
		cfi = chain.from_iterable

		f = file( filename, 'r' )
		tokens = self.tokenize( cfi( f ) )
		cfg_file = self.parser.parse( tokens )
		f.close()

		tokens, prods = None, None
		for i in cfg_file.items:
			if isinstance( i, TokenSection ):		tokens = i
			if isinstance( i, ProductionSection ):	prods = i

		if tokens is None or prods is None:
			return

		terminals = tokens.build()
		prods.build( terminals )

		out = chain( 
			(t for t in cfg_file.items[0].tokens),
			self.tokenize_nt( cfi(self._intro()) ),
			self._get_file_tokens( 'actions.py' ),
			self._get_class_tokens( 'glr.py', 'CFGNonterminal', True ),
			self._get_class_tokens( 'glr.py', 'CFGTerminal', True ),
			self._get_class_tokens( 'lexer.py', '_NamedEmptyObject', True ),
			self._get_class_tokens( 'lexer.py', 'CFGDFAState', True ),
			self.tokenize_nt( cfi(self._parser_header()), False ),
			self.tokenize_nt( cfi(self._fn_header( "parse" )), False ),
			self._get_fn_tokens( 'glr.py', 'CFGParser', 'parse', 
				args = True ),
			self.tokenize_nt( cfi(self._fn_header( "_parse" )), False ),
			self._get_fn_tokens( 'glr.py', 'CFGParser', '_parse', 
				args = True ),
			self.tokenize_nt( cfi(self._fn_header( "tokenize" )), False ),
			self._get_fn_tokens( 'lexer.py', 'CFGLexer', 'tokenize', 
				args = True ),
			self.tokenize_nt( cfi(self._fn_header( "_tokenize" )), False ),
			self._get_fn_tokens( 'lexer.py', 'CFGLexer', '_tokenize', 
				args = True ),
			self.tokenize_nt( cfi(self._fn_header( "_wrap_data" )), False ),
			self._get_fn_tokens( 'lexer.py', 'CFGLexer', '_wrap_data', 
				args = True ),
			self.tokenize_nt( cfi(tokens.lexer_tokens()), False ),
			self.tokenize_nt( cfi(prods.parser_tokens()), False ),
			self.tokenize_nt( cfi(self._extro()), False ),
			(t for t in cfg_file.items[-1].tokens)
			)

		indents, previous = [''], None
		f = file( 'output.py', 'w' )
		for t in out:
			data, indents, previous = t.output( indents, previous )
			f.write( data )

		return cfg_file

	def output( self ):
		pass