def __init__(self, printfunc=print):
    """Create the evaluator state that is kept between evaluations.

    All structures are created here (and not per evaluation) so that they
    are retained for shell input, which is split on several ast.Programs.

    printfunc -- callable used to emit output; stored as self.printfunc.
    """
    self.printfunc = printfunc

    # Everything that can be accessed with a name/identifier is stored
    # either in globals or in locals. Locals is an array which acts as a
    # stack for local variables. Top-level statements behave like being
    # encapsulated in an implicit main()-function, i.e. their variables
    # are not global!
    # Variables are always accessed in the same way, which is
    # self.locals[symbol_tree.get_index()]. To enable this when declaring
    # variables, a list that automatically grows is used.
    self.locals = StackList()

    # Modules/imports, custom types and function definitions are accessed
    # through the ast.TranslationUnit directly. It starts out empty and is
    # filled with content when evaluating.
    empty_unit = ast.TranslationUnit(
        [],
        collections.OrderedDict(),
        collections.OrderedDict(),
        [],
        {},
    )
    self.current_unit = TranslationUnitRef(empty_unit)

    # All imported modules
    self.modules: typing.Dict[str, ast.TranslationUnit] = {}

    self.symbol_tree = SymbolTree()
def __init__(
        self,
        symbol_table_snapshot=None,
        modules: typing.Optional[typing.Dict[str, ast.TranslationUnit]] = None):
    """Create a typechecker.

    symbol_table_snapshot -- passed unchanged to the SymbolTree constructor.
    modules -- mapping of already known modules; it is used as-is (not
               copied) when it is a dict.
    """
    self.symbol_tree = SymbolTree(symbol_table_snapshot)
    if not isinstance(modules, dict):
        # Anything that is not a dict (including the default None)
        # starts with an empty module map.
        modules = {}
    self.modules: typing.Dict[str, ast.TranslationUnit] = modules
def parse_function_definition(self) -> ast.FunctionDefinition:
    """Parse a function definition: func name (params) : types { body }.

    The function name is registered in self.symbols_global (as
    UnknownType) before the parameters are parsed. The body is parsed
    with a fresh local symbol tree that contains the parameter names; a
    snapshot of that tree, taken before the body is parsed, is stored in
    the returned node.

    Raises Exception if the current token is not FUNC, and ParseException
    for every other syntax error, for an already existing name and for
    duplicated parameter names.
    """
    toks = TokenList()
    # FUNC foo (bar : int) : str { ... }
    if not toks.add(self.match(token.FUNC)):
        raise Exception("Expected function definition.")
    # func FOO (bar : int) : str { ... }
    if not toks.add(self.match(token.IDENTIFIER)):
        raise ParseException("Expected function name.")
    name = self.peek(-1).lexeme
    if name in self.symbols_global:
        raise ParseException(f"Name '{name}' already exists in symbol table. Function definition impossible.")
    # Register function name before parsing parameter names (no parameter
    # name should have the function name!)
    self.symbols_global[name] = bongtypes.UnknownType()
    # (
    if not toks.add(self.match(token.LPAREN)):
        raise ParseException("Expected ( to start the parameter list.")
    # Parameters
    parameter_names, parameter_types = self.parse_parameters()
    # )
    if not toks.add(self.match(token.RPAREN)):
        raise ParseException("Expected ) to end the parameter list.")
    # Return types
    return_types: typing.List[ast.BongtypeIdentifier] = []
    if toks.add(self.match(token.COLON)):
        self.check_eof("Return type list expected.")
        return_types.append(self.parse_type())
        while toks.add(self.match(token.COMMA)):
            return_types.append(self.parse_type())
    # {
    if self.peek().type != token.LBRACE:
        raise ParseException("Expected function body.")
    # New local symbol table (tree) for statement block
    # We could just store the global symbol table in the object because
    # it will always be the same. But remembering the previous symbol
    # table here theoretically allows to parse function definitions inside
    # other functions (the local symbol table would be properly restored
    # then).
    global_symbol_tree = self.symbol_tree
    self.symbol_tree = SymbolTree()
    try:
        # Parameters (only the names are needed here)
        for param, _typ in zip(parameter_names, parameter_types):
            if param in self.symbol_tree:
                raise ParseException(f"Argument name '{param}' appears twice in function definition")
            self.symbol_tree.register(param, bongtypes.UnknownType())
        # Snapshot before block is parsed (this changes the state of the tree)
        func_symbol_tree_snapshot = self.symbol_tree.take_snapshot()
        # Function body
        body = self.block_stmt()
    finally:
        # Restore symbol table/tree, also when parsing the parameters or
        # the body failed. Otherwise the parser would keep on using the
        # function-local tree after an error.
        self.symbol_tree = global_symbol_tree
    return ast.FunctionDefinition(toks, name, parameter_names, parameter_types, return_types, body, func_symbol_tree_snapshot)
def __init__(self, lexer, snapshot=None, basepath=None):
    """Create a parser reading from lexer.

    lexer -- stored as self.lexer.
    snapshot -- optional pair (global symbol dict, symbol tree snapshot)
                of a previous parser run. When it is None, the symbol
                table is initialized with the builtin functions and the
                basic types instead.
    basepath -- stored as self.basepath; defaults to the current working
                directory.
    """
    self.lexer = lexer
    self.basepath = basepath if basepath is not None else os.getcwd()
    self.symbols_global: typing.Dict[str, bongtypes.BaseNode]
    self.symbol_tree = SymbolTree()
    if snapshot is not None:
        # When restoring the global dictionary, we need to copy the dict.
        # Otherwise, we change the snapshot that the caller (the repl)
        # will (most probably) reuse.
        self.symbols_global = snapshot[0].copy()
        self.symbol_tree.restore_snapshot(snapshot[1])
    else:
        # Only when initializing symbol tables for the first time, register
        # builtin stuff
        self.symbols_global = {}
        for bfuncname, bfunc in bong_builtins.functions.items():
            self.symbols_global[bfuncname] = bongtypes.BuiltinFunction(bfunc[1])
        for btypename, btype in bongtypes.basic_types.items():
            self.symbols_global[btypename] = bongtypes.Typedef(btype())
def resolve_function_interface(self, function: ast.FunctionDefinition,
                               unit: ast.TranslationUnit):
    """Resolve the parameter and return types of a function definition.

    Every resolved parameter type is written into a SymbolTree built from
    the function's symbol tree snapshot. The resulting bongtypes.Function
    is stored in unit.symbols_global under the function's name.
    """
    param_types = bongtypes.TypeList([])
    return_types = bongtypes.TypeList([])

    declared_params = zip(function.parameter_names, function.parameter_types)
    for name, type_identifier in declared_params:
        resolved = self.resolve_type(type_identifier, unit, function)
        param_types.append(resolved)
        # Store the parameter's type in the function's own symbol tree
        scope = SymbolTree(function.symbol_tree_snapshot)
        scope[name] = resolved

    for type_identifier in function.return_types:
        resolved_return = self.resolve_type(type_identifier, unit, function)
        return_types.append(resolved_return)

    interface = bongtypes.Function(param_types, return_types)
    unit.symbols_global[function.name] = interface
class Eval:
    """Evaluator that walks the ast and executes it.

    evaluate() dispatches on the type of the given node and calls itself
    recursively for child nodes. Syscalls and pipelines are executed as
    subprocesses by callprogram().
    """

    # Defined here so that it can be used by the parser
    BUILTIN_ENVIRONMENT = {"sys_argv": sys.argv}

    def __init__(self, printfunc=print):
        """Create the evaluator state; printfunc is used by ast.Print."""
        self.printfunc = printfunc
        # The following structures must
        # already be defined here so that they are retained for shell
        # input (which is split on several ast.Programs).

        # Everything that can be accessed with a name/identifier, is stored
        # either in globals or locals.
        # Globals is a dictionary that maps global names to instances. It
        # contains modules, typedefs, functions.
        # Locals is an array which acts as a stack for local variables.
        # Top-level statements behave like being encapsulated in an implicit
        # main()-function, i.e. their variables are not global!
        # I think that we could always use a self.locals.append() whenever
        # new names are pushed on the stack (function arguments and in let
        # statements) but it looks more high-level to access the variables
        # always in the same way, which is: self.locals[symbol_tree.get_index()].
        # To enable this when declaring variables, we use a list that
        # automatically grows.
        self.locals = StackList()
        # Modules/Imports, custom types and function definitions are accessed through
        # the ast.TranslationUnit directly. The evaluator can access/read the symbol table
        # to identify stuff.
        # Here, initialized empty, filled with content when evaluating.
        self.current_unit = TranslationUnitRef(
            ast.TranslationUnit([], collections.OrderedDict(),
                                collections.OrderedDict(), [], {}))
        # All imported modules
        self.modules: typing.Dict[str, ast.TranslationUnit] = {}
        self.symbol_tree = SymbolTree()

    def restore_symbol_tree(self, node: "SymbolTreeNode"):
        """Restore the evaluator's symbol tree to the given snapshot."""
        self.symbol_tree.restore_snapshot(node)

    def evaluate(self, node: "ast.BaseNode") -> "ValueList":
        """Evaluate node and return the resulting values.

        Raises Exception for unknown nodes, operators and identifiers and
        for mismatching numbers of values in let statements/assignments.
        A return statement at the top level of a translation unit
        terminates the interpreter via sys.exit().
        """
        if isinstance(node, ast.Program):
            # Register all imported modules
            for k, m in node.modules.items():
                self.modules[k] = m
            # Then evaluate the main module/file/input
            return self.evaluate(node.main_unit)
        elif isinstance(node, ast.TranslationUnit):
            # First, retain/copy all function definitions. The other stuff seems
            # not to be required currently.
            # Here, we can not just set the current unit to node to retain
            # function definitions across evaluations in shell mode.
            for k, f in node.function_definitions.items():
                self.current_unit.unit.function_definitions[k] = f
            # Set the current symbol table (which could be a reused one)
            self.current_unit.unit.symbols_global = node.symbols_global
            # Afterwards, run all non-function statements
            res = ValueList([])
            for stmt in node.statements:
                res = self.evaluate(stmt)
                if res.returned():
                    # ast.Program is the top-level-node, return means exit then
                    # https://docs.python.org/3/library/sys.html#sys.exit says:
                    # int -> int, Null -> 0, other -> 1
                    # This behaviour seems reasonable here
                    sys.exit(res[0] if len(res) > 0 else None)
            return res
        elif isinstance(node, ast.Block):
            symtree = self.symbol_tree.take_snapshot()
            result = ValueList([])
            for stmt in node.stmts:
                result = self.evaluate(stmt)
                if result.returned():
                    break
            self.symbol_tree.restore_snapshot(symtree)
            return result
        elif isinstance(node, ast.Return):
            if node.result is None:
                return ValueList([], True)
            result = self.evaluate(node.result)
            result.unwind_return = True
            return result
        elif isinstance(node, ast.IfElseStatement):
            cond = node.cond
            if isTruthy(self.evaluate(cond)):
                return self.evaluate(node.thn)
            elif isinstance(node.els, ast.BaseNode):
                return self.evaluate(node.els)
            return ValueList([])
        elif isinstance(node, ast.WhileStatement):
            ret = ValueList([])
            while isTruthy(self.evaluate(node.cond)):
                ret = self.evaluate(node.t)
                if ret.returned():
                    break
            return ret
        elif isinstance(node, ast.AssignOp):
            values = self.evaluate(node.rhs)
            self.assign(node.lhs, values)
            return values
        elif isinstance(node, ast.BinOp):
            op = node.op
            lhs = self.evaluate(node.lhs)[0]
            rhs = self.evaluate(node.rhs)[0]
            if op == "+":
                res = lhs + rhs
            elif op == "-":
                res = lhs - rhs
            elif op == "*":
                res = lhs * rhs
            elif op == "/":
                # Integer division for an integer lhs
                if isinstance(lhs, int):
                    res = lhs // rhs
                else:
                    res = lhs / rhs
            elif op == "%":
                res = lhs % rhs
            elif op == "^":
                res = lhs**rhs
            elif op == "&&":
                res = lhs and rhs
            elif op == "||":
                res = lhs or rhs
            elif op == "==":
                res = lhs == rhs
            elif op == "!=":
                res = lhs != rhs
            elif op == "<":
                res = lhs < rhs
            elif op == ">":
                res = lhs > rhs
            elif op == "<=":
                res = lhs <= rhs
            elif op == ">=":
                res = lhs >= rhs
            else:
                raise Exception("unrecognised operator: " + str(node.op))
            return ValueList([res])
        elif isinstance(node, ast.UnaryOp):
            op = node.op
            if op == "!":
                val = not self.evaluate(node.rhs)[0]
            elif op == "-":
                val = -self.evaluate(node.rhs)[0]
            else:
                raise Exception("unrecognised unary operator: " +
                                str(node.op))
            return ValueList([val])
        elif isinstance(node, ast.Integer):
            return ValueList([node.value])
        elif isinstance(node, ast.Float):
            return ValueList([node.value])
        elif isinstance(node, ast.String):
            return ValueList([node.value])
        elif isinstance(node, ast.Bool):
            return ValueList([node.value])
        elif isinstance(node, ast.SysCall):
            return self.callprogram(node)
        elif isinstance(node, ast.Pipeline):
            if len(node.elements) < 2:
                raise Exception(
                    "Pipelines should have more than one element. This seems to be a parser bug."
                )
            syscalls = []
            # First pipeline element: First syscall or stdin
            if isinstance(node.elements[0], ast.SysCall):
                syscalls.append(node.elements[0])
                stdin = None
            else:
                stdin = self.evaluate(node.elements[0])
            # Other pipeline elements until last: syscalls
            for sc in node.elements[1:-1]:
                assert (isinstance(sc, ast.SysCall))
                syscalls.append(sc)
            # Last pipeline element: Last syscall or stdout (+stderr)
            if isinstance(node.elements[-1], ast.SysCall):
                syscalls.append(node.elements[-1])
                assignto = None
            else:
                assignto = node.elements[-1]
            # Special case: piping an ordinary expression into a variable
            if not syscalls:
                raise Exception("The special case, assigning regular values"
                                " via pipelines, is not supported currently.")
            processes = []
            for syscall in syscalls[:-1]:
                process = self.callprogram(syscall, stdin, True)
                processes.append(process)
                stdin = process.stdout
            numOutputPipes = (0 if assignto is None else
                              self.numInputsExpected(assignto))
            lastProcess = self.callprogram(syscalls[-1], stdin,
                                           numOutputPipes)
            # So, there is this single case that is different from everything else
            # and that needs special treatment:
            # Whenever the first process is opened with stdin=PIPE, we must
            # close its stdin except when this is the only process, then we
            # must not close the stdin, because then communicate() will fail.
            if not isinstance(node.elements[0], ast.SysCall) and processes:
                processes[0].stdin.close()
            outstreams = lastProcess.communicate()
            for process in processes:
                process.wait()
            # Assign stdout,stderr to variables
            results = ValueList([])
            for o in outstreams[:numOutputPipes]:
                results.append(o.decode('utf-8'))
            if isinstance(assignto, ast.PipelineLet):
                # copied from ast.Let
                if len(assignto.names) != len(results):
                    raise Exception(
                        "number of expressions between rhs and lhs do not match"
                    )
                self.symbol_tree.restore_snapshot(
                    assignto.symbol_tree_snapshot)
                for name, result in zip(assignto.names, results):
                    index = self.symbol_tree.get_index(name)
                    self.locals[index] = result
            elif isinstance(assignto, ast.ExpressionList):
                self.assign(assignto, results)
            elif isinstance(assignto, ast.BaseNode):
                self.assign(ast.ExpressionList(assignto.tokens, [assignto]),
                            results)
            # Return exitcode of subprocess
            return ValueList([lastProcess.returncode])
        elif isinstance(node, ast.Identifier):
            if node.name in self.symbol_tree:
                index = self.symbol_tree.get_index(node.name)
                return ValueList([self.locals[index]])
            elif node.name in self.current_unit.unit.symbols_global:
                pass
            # TODO Add global environment
            raise Exception(
                f"Unknown identifier '{node.name}' specified. TODO: global environment."
            )
        elif isinstance(node, ast.IndexAccess):
            index = self.evaluate(node.rhs)[0]
            lhs = self.evaluate(node.lhs)[0]
            return ValueList([lhs[index]])
        elif isinstance(node, ast.DotAccess):
            # The following is only used for StructValue, modules are only used
            # for module- and function-access which is handled in FunctionCall below.
            val = self.evaluate(node.lhs)[0][node.rhs]
            return ValueList([val])
        elif isinstance(node, ast.FunctionCall):
            # node.name should either be an ast.Identifier, then we call a function
            # in the current module/unit, or an ast.DotAccess, then we call a function
            # in the specified module/unit.
            if isinstance(node.name, ast.Identifier):
                unit = self.current_unit.unit
                funcname = node.name.name
            elif isinstance(node.name, ast.DotAccess):
                unit = self.get_module(node.name.lhs)
                funcname = node.name.rhs
            else:
                raise Exception(
                    "Identifier or DotAccess for function name expected.")
            # Evaluate arguments with the old scope, i.e. before the unit
            # of the called function is pushed. Otherwise, function calls
            # inside the arguments would be resolved in the callee's unit.
            args = [self.evaluate(a)[0] for a in node.args]
            # Call by value!
            args = copy.deepcopy(args)
            # Change (PUSH) the current unit. We also do this if we call a function
            # in the current unit/module because then we do not have to decide
            # afterwards if we have to pop the translation unit back, we just do it.
            self.current_unit = TranslationUnitRef(unit, self.current_unit)
            try:
                # Call function, either builtin or defined
                if isinstance(unit.symbols_global[funcname],
                              bongtypes.Function):
                    # Bong function
                    function = unit.function_definitions[funcname]
                    symbol_tree_snapshot = self.symbol_tree.take_snapshot()
                    self.symbol_tree.restore_snapshot(
                        function.symbol_tree_snapshot)
                    local_env_snapshot = self.locals
                    self.locals = StackList()
                    try:
                        # Add arguments to new local environment, then eval func
                        for name, arg in zip(function.parameter_names, args):
                            index = self.symbol_tree.get_index(name)
                            self.locals[index] = arg
                        result = self.evaluate(function.body)
                    finally:
                        self.symbol_tree.restore_snapshot(
                            symbol_tree_snapshot)
                        self.locals = local_env_snapshot
                    # A return ends at the function boundary, the caller
                    # must not be unwound any further.
                    if result.returned():
                        result.unwind_return = False
                    return result
                else:
                    # Builtin function
                    return bong_builtins.functions[funcname][0](args)
            finally:
                # Change back (POP) the current unit
                self.current_unit = self.current_unit.parent
        elif isinstance(node, ast.Print):
            self.printfunc(self.evaluate(node.expr))
        elif isinstance(node, ast.Let):
            # First, evaluate all rhses (those are possibly encapsulated in an
            # ExpressionList, so no need to iterate here
            results = self.evaluate(node.expr)
            # Then, assign results. This order of execution additionally prevents
            # the rhs of a let statement to use the variables declared on the
            # left side.
            if len(node.names) != len(results):
                raise Exception(
                    "number of expressions between rhs and lhs do not match")
            self.symbol_tree.restore_snapshot(node.symbol_tree_snapshot)
            for name, result in zip(node.names, results):
                index = self.symbol_tree.get_index(name)
                self.locals[index] = result
        elif isinstance(node, ast.Array):
            elements = []
            for e in node.elements:
                elements.append(self.evaluate(e)[0])
            return ValueList([elements])
        elif isinstance(node, ast.StructValue):
            assert (isinstance(node.name, ast.Identifier)
                    or isinstance(node.name, ast.DotAccess))
            structval = StructValue(node.name)
            for name, expr in node.fields.items():
                structval[name] = self.evaluate(expr)[0]
            return ValueList([structval])
        elif isinstance(node, ast.ExpressionList):
            results = ValueList([])
            for exp in node.elements:
                # ValueList is a FlatList and an append to FlatList is
                # automatically flattened. Not indexing into the result
                # of evaluate() here is crucial because the result could be
                # an empty ValueList (e.g. function calls)
                results.append(self.evaluate(exp))
            return results
        else:
            raise Exception("unknown ast node")
        return ValueList([])  # Satisfy mypy

    def assign(self, lhs: "ast.ExpressionList", rhs: "ValueList"):
        """Assign the values of rhs to the targets in lhs, element by element.

        Raises Exception if the lengths differ or if a target is neither
        an Identifier, an IndexAccess nor a DotAccess.
        """
        if len(rhs) != len(lhs):
            raise Exception("number of elements on lhs and rhs does not match")
        for l, value in zip(lhs, rhs):
            # lhs evaluation: The lhs can be a variable assignment, an
            # index access, a DotAccess
            if isinstance(l, ast.Identifier):
                name = l.name
                stack_index = self.symbol_tree.get_index(name)
                self.locals[stack_index] = value
            elif isinstance(l, ast.IndexAccess):
                index_access_index = self.evaluate(l.rhs)[0]
                array = self.evaluate(l.lhs)[0]
                array[index_access_index] = value
            elif isinstance(l, ast.DotAccess):
                struct = self.evaluate(l.lhs)[0]
                struct[l.rhs] = value
            else:
                raise Exception(
                    "Can only assign to variable or indexed variable")

    def numInputsExpected(self, assignto):
        """Return the number of values that assignto takes."""
        if isinstance(assignto, ast.PipelineLet):
            return len(assignto.names)
        elif isinstance(assignto, ast.ExpressionList):
            return len(assignto.elements)
        else:
            # Currently only used in pipelines, it's a single variable then
            return 1

    def callprogram(self, program, stdin=None, numOutputPipes=0):
        """Run the command given by program.args.

        program -- object whose args attribute is the list of command
                   line words (command name first).
        stdin -- None, the stdout stream of the previous process in a
                 pipeline, or a value that is written to the new process'
                 stdin.
        numOutputPipes -- stdout is piped if > 0, stderr is piped if > 1.

        Returns the exit code for an unpiped call (and for 'cd'), the
        subprocess.Popen object for a piped call, and None if the command
        was not found or 'cd' was used in a pipe.
        """
        # TODO We pass a whole ast.SysCall object to callprogram, only the args
        # list would be enough. Should we change that? This would simplify this
        # method itself and calling builtin functions.
        #
        # Before doing anything, expand ~ to user's home directory.
        # os.path.expanduser() also handles the '~user' form, which must
        # not be expanded with the current user's home directory.
        cmd = []
        for arg in program.args:
            if arg.startswith("~"):
                arg = os.path.expanduser(arg)
            cmd.append(arg)
        # Check bong builtins first. Until now, only 'cd' defined
        if cmd[0] == "cd":
            if stdin is not None or numOutputPipes != 0:
                print("bong: cd: can not be piped")
                # TODO Here, the calling pipe will crash :( return something
                # usable instead!
                return None
            return self.call_cd(cmd)
        # Fall back to the system's default search path if PATH is unset
        path_var = os.environ.get('PATH', os.defpath).split(':')
        # Special case: Syscalls with relative or absolute path
        # ('./foo', '../foo', '/foo/bar', 'foo/bar')
        if '/' in cmd[0]:
            path_var = [""]
        for path in path_var:
            if len(path) > 0:
                if not path.endswith('/'):
                    path += "/"
                filepath = path + cmd[0]
            else:
                filepath = cmd[0]
            if os.path.isfile(filepath) and os.access(filepath, os.X_OK):
                # Simple syscall
                if stdin is None and numOutputPipes == 0:
                    compl = subprocess.run(cmd)
                    return compl.returncode
                # Piped syscall
                # lhs = subprocess.Popen(["ls"], stdout=subprocess.PIPE)
                # rhs = subprocess.Popen(["grep", "foo"], stdin=lhs.stdout)
                # lhs.stdout.close()
                # rhs.communicate()
                # -> I call stdout.close() on all but the last subprocesses
                # -> I call communicate() only on the last subprocess
                # TODO Is that actually the right approach?
                else:
                    # a) this is the leftmost syscall of a pipe or
                    #    -> Create the process with stdin=None
                    # b) the previous step of the pipe was a syscall
                    #    -> Create the process with stdin=stdin
                    # c) lhs of the pipe was variable or function
                    #    -> Create the process with stdin=PIPE and write the value into stdin
                    case_a = stdin is None
                    case_b = isinstance(stdin,
                                        io.BufferedReader)  # _io.Buff...?
                    case_c = not (case_a or case_b)
                    stdin_arg = stdin if not case_c else subprocess.PIPE
                    stdout_arg = subprocess.PIPE if numOutputPipes > 0 else None
                    stderr_arg = subprocess.PIPE if numOutputPipes > 1 else None
                    proc = subprocess.Popen(cmd,
                                            stdin=stdin_arg,
                                            stdout=stdout_arg,
                                            stderr=stderr_arg)
                    if case_c:
                        # Prevent possible bytestreams from being interpreted
                        # as strings. Currently 2019-12-22, this is not strictly
                        # required because we don't have bytestreams yet and
                        # everything is nicely decoded to strings but in the
                        # future, we have to do this here!
                        if isinstance(stdin, bytes):
                            proc.stdin.write(stdin)
                        else:
                            proc.stdin.write(str(stdin).encode("utf-8"))
                        # stdin is not closed here, see the pipeline
                        # handling in evaluate().
                    # Now, after having created this process, we can run the
                    # stdout.close() on the previous process (if there was one)
                    # stdout of the previous is stdin here.
                    if case_b:
                        stdin.close()
                    return proc
        print("bong: {}: command not found".format(cmd[0]))

    def call_cd(self, args):
        """Builtin 'cd'; args is the whole command line including 'cd'.

        Without argument, the home directory is used. 'cd -' changes to
        the previous directory if there is one. Returns 0 on success and
        1 on failure (after printing a message).
        """
        if len(args) > 2:
            print("bong: cd: too many arguments")
            return 1
        try:
            if len(args) > 1:
                if args[1] == "-":
                    # Everything bash can do, we can do better.
                    if hasattr(self, "prev_directory"):
                        self.change_dir(self.prev_directory)
                else:
                    self.change_dir(args[1])
            else:
                self.change_dir(os.path.expanduser('~'))
            return 0
        except Exception as e:
            print("bong: cd: {}".format(str(e)))
            return 1

    def change_dir(self, new_dir):
        """Change the working directory and tell the terminal about it.

        The previous directory is remembered in self.prev_directory.
        Exceptions raised by os.chdir() are not caught here.
        """
        prev_dir = os.getcwd()
        os.chdir(
            new_dir)  # This can fail so everything else happens afterwards
        self.prev_directory = prev_dir
        # Now, we send the escape codes to tell the terminal (emulator) the
        # new directory
        current_dir = os.getcwd()
        sys.stdout.write("\x1b]7;file:" + current_dir +
                         "\x07")  # Tell the cwd to our terminal (emulator)
        home_directory = os.path.expanduser("~")
        if current_dir.startswith(home_directory):
            # ~ + home-dir skipped in current dir
            window_title = "~" + current_dir[len(home_directory):]
        else:
            window_title = current_dir
        sys.stdout.write("\x1b]2;bong " + window_title +
                         "\x07")  # Set the window title

    # Takes an Identifier or DotAccess which should describe a module
    # and returns the corresponding ast.TranslationUnit. The search
    # is started at self.current_unit's symbol table. For each resolution
    # step, another (the next) symbol table is used.
    def get_module(self, name: "ast.BaseNode") -> "ast.TranslationUnit":
        """Resolve name to the translation unit of the module it names.

        Raises Exception if name is neither an Identifier nor a DotAccess
        or if the resolved symbol is not a bongtypes.Module.
        """
        # DotAccesses are forwarded until an Identifier is found. The
        # Identifier uses the current_unit's symbol table to resolve
        # the module. The DotAccesses use the returned units to resolve
        # further modules afterwards.
        if isinstance(name, ast.Identifier):
            module = self.current_unit.unit.symbols_global[name.name]
        elif isinstance(name, ast.DotAccess):
            unit = self.get_module(name.lhs)
            module = unit.symbols_global[name.rhs]
        else:
            raise Exception("Identifier or DotAccess expected.")
        if not isinstance(module, bongtypes.Module):
            raise Exception("Module expected.")
        return self.modules[module.path]
class TypeChecker:
    """Type checker for a main translation unit and all units it imports.

    The checker assigns types in the symbol tables for
    - function definitions (parameter types, return types),
    - struct definitions,
    - let statements,
    and verifies that all statements and expressions are well-typed.

    Work is split into phases (see checkprogram_uncaught()):
    1. imports are loaded and parsed,
    2. custom (struct) types are resolved so that they are available for
       all subsequent steps,
    3. function interfaces are resolved so that their type requirements
       are available for function calls,
    4. function bodies and top-level statements are checked.
    This pattern resembles how the evaluator handles FunctionDefinitions
    and all other statements differently.

    If we want to be insane, we could do all of this in one single pass:
    Collect all type- and function-definitions first, then resolve types
    and function interfaces as needed (check the symbol-table if this has
    to be done yet). Anyways, it can safely be assumed that the split
    approach is more maintainable, debuggable, understandable. If you want
    to try out the insane approach, just insert resolve_type() and
    resolve_function_interface() at the appropriate places when check()ing
    the ast.
    """

    def __init__(
            self,
            symbol_table_snapshot=None,
            modules: typing.Optional[typing.Dict[
                str, ast.TranslationUnit]] = None):
        """Create a type checker.

        symbol_table_snapshot is handed to SymbolTree() unchanged.
        modules maps import paths to already parsed units; anything that
        is not a dict is replaced by a fresh empty dict.
        """
        self.symbol_tree = SymbolTree(symbol_table_snapshot)
        self.modules: typing.Dict[str, ast.TranslationUnit] = (
            modules if isinstance(modules, dict) else {})
        # The unit whose code is currently being checked. It is set by
        # checkprogram_uncaught() and used to resolve type hints in let
        # statements relative to the unit they are written in.
        self.current_unit: typing.Optional[ast.TranslationUnit] = None

    def checkprogram(
            self,
            main_unit: ast.TranslationUnit) -> typing.Optional[ast.Program]:
        """Typecheck main_unit and report type errors on stderr.

        Returns the ast.Program on success. If a TypecheckException is
        raised, its message (prefixed with the node's location, if a node
        is attached) is printed to stderr and None is returned.
        """
        try:
            return self.checkprogram_uncaught(main_unit)
        except TypecheckException as e:
            if e.node is not None:
                loc = e.node.get_location()
                posstring = f" in {loc[0]}, line {loc[1]} col {loc[2]} to line {loc[3]} col {loc[4]}"
            else:
                posstring = ""
            print(f"TypecheckError{posstring}: {str(e.msg)}", file=sys.stderr)
        return None

    def _enter_unit(self, unit: ast.TranslationUnit):
        """Make unit the one whose global symbols and types are in scope."""
        self.symbols_global = unit.symbols_global
        self.current_unit = unit

    def _unit_in_scope(self) -> ast.TranslationUnit:
        """Return the unit being checked, falling back to the main unit."""
        unit = getattr(self, "current_unit", None)
        return unit if unit is not None else self.main_unit

    def checkprogram_uncaught(self,
                              main_unit: ast.TranslationUnit) -> ast.Program:
        """Typecheck main_unit and its imports; type errors propagate.

        Raises TypecheckException on the first type error. Returns an
        ast.Program built from self.modules and main_unit.
        """
        # Theoretically, everything is accessible via this chain:
        # self.program.main_unit.symbols_global
        # Anyways, for convenience, we make everything accessible here.
        self.main_unit = main_unit
        self._enter_unit(main_unit)
        program = ast.Program(self.modules, main_unit)

        # Resolve module imports first
        self.parse_imports(main_unit)

        # Then resolve types
        self.resolve_types(main_unit)
        for unit in self.modules.values():
            self._enter_unit(unit)
            self.resolve_types(unit)

        # Resolve function interfaces
        self._enter_unit(main_unit)
        self.resolve_function_interfaces(main_unit)
        for unit in self.modules.values():
            self._enter_unit(unit)
            self.resolve_function_interfaces(unit)

        # Typecheck the rest (also assigning variable types)
        # Functions in modules
        for unit in self.modules.values():
            self._enter_unit(unit)
            for func in unit.function_definitions.values():
                self.check(func)
        # Functions in main_module / main_unit
        self._enter_unit(main_unit)
        for func in main_unit.function_definitions.values():
            self.check(func)
        # Statements in main_module / main_unit
        for stmt in main_unit.statements:
            res, turn = self.check(stmt)
            # If there is a possible return value, ensure it is an int
            if turn != Return.NO:
                expect = bongtypes.TypeList([bongtypes.Integer()])
                if not res.sametype(expect):
                    raise TypecheckException(
                        "Return type of program does not evaluate to int.",
                        stmt)
        return program

    def parse_imports(self, parent_unit: ast.TranslationUnit):
        """Load, lex and parse every module imported by parent_unit.

        Newly parsed units are stored in self.modules (keyed by import
        path) and their own imports are processed recursively. For every
        import statement, a bongtypes.Module is written to the symbol
        table of parent_unit under the import's name.

        Raises TypecheckException if a file can not be read or parsed.
        """
        for imp_stmt in parent_unit.import_statements:
            if imp_stmt.path not in self.modules:
                # Parse
                # TODO this should be encapsulated more nicely. Currently,
                # same code as in main.py
                try:
                    with open(imp_stmt.path) as f:
                        code = f.read()
                except Exception as e:
                    raise TypecheckException(
                        f"Importing {imp_stmt.path} impossible:"
                        f" '{e}'", imp_stmt) from e
                module_lexer = lexer.Lexer(code, imp_stmt.path)
                module_parser = parser.Parser(module_lexer)
                child_unit = module_parser.compile()
                # add2modmap
                if not isinstance(child_unit, ast.TranslationUnit):
                    raise TypecheckException(
                        f"Importing {imp_stmt.path}"
                        " failed.", imp_stmt)
                self.modules[imp_stmt.path] = child_unit
                # Recurse
                self.parse_imports(child_unit)
            # Add to symbol table
            parent_unit.symbols_global[imp_stmt.name] = bongtypes.Module(
                imp_stmt.path)

    def resolve_types(self, unit: ast.TranslationUnit):
        """Resolve every struct definition of unit to an actual type."""
        for typename, struct_def in unit.struct_definitions.items():
            self.resolve_type(ast.BongtypeIdentifier([typename], 0), unit,
                              struct_def)

    # Resolve a given BongtypeIdentifier to an actual type. For custom types,
    # this method will not return the bongtypes.Typedef, but the value type
    # instead, i.e. the Typedefs will be unpacked.
    # It can crash whenever an inner type in a struct, a type hint in a
    # function interface or a type hint in a let statement uses a typename
    # that is not defined.
    # Currently, recursive types are prevented. But actually, it would be
    # possible to instantiate a type that refers to itself via an array
    # because that array could be empty. Therefore, it would be nice if we
    # could:
    # 1. Allow recursive types first.
    # 2. Check if there is a recursive circle without arrays
    #    struct T { x : T } is an error because it is infinite
    #    struct T { x : []T } is OK
    def resolve_type(self, identifier: ast.BongtypeIdentifier,
                     unit: ast.TranslationUnit,
                     node: ast.BaseNode) -> bongtypes.ValueType:
        """Resolve identifier, looked up in unit, to a value type.

        node is only used as the location attached to raised
        TypecheckExceptions. Struct types that are resolved for the first
        time are written to unit.symbols_global as bongtypes.Typedef.
        """
        # Arrays are resolved recursively
        if identifier.num_array_levels > 0:
            inner_identifier = ast.BongtypeIdentifier(
                identifier.typename, identifier.num_array_levels - 1)
            return bongtypes.Array(
                self.resolve_type(inner_identifier, unit, node))

        # If a module name is given, propagate to the module
        if len(identifier.typename) > 1:
            modulename = identifier.typename[0]
            remaining_typename = identifier.typename[1:]
            # The following checks are a little bit convoluted to satisfy mypy
            if modulename not in unit.symbols_global:
                raise TypecheckException(
                    f"Module {modulename} not found in"
                    " symbol table.", node)
            module_sym = unit.symbols_global[modulename]
            if not isinstance(module_sym, bongtypes.Module):
                raise TypecheckException(
                    f"Symbol {modulename} is not a module,"
                    f" instead it is {module_sym}.", node)
            modulepath = module_sym.path
            if modulepath not in self.modules:
                raise TypecheckException(
                    f"Module {module_sym} not found"
                    " in module dictionary.", node)
            child_unit = self.modules[modulepath]
            remaining_typeidentifier = ast.BongtypeIdentifier(
                remaining_typename, 0)
            return self.resolve_type(remaining_typeidentifier, child_unit,
                                     node)

        # Otherwise, the typename is the only item in the list
        typename = identifier.typename[0]

        # Check missing type
        if typename not in unit.symbols_global:
            raise TypecheckException(
                f"Type {typename} can not be"
                " resolved.", node)

        # Already known types can be returned
        if not unit.symbols_global[typename].sametype(
                bongtypes.UnknownType()):
            typedef = unit.symbols_global[typename]
            # Prevent recursive types
            if isinstance(typedef, bongtypes.UnfinishedType):
                raise TypecheckException(
                    f"Type {typename} is recursive."
                    " This is currently not allowed for several reasons.",
                    node)
            if not isinstance(typedef, bongtypes.Typedef):
                raise TypecheckException(
                    f"Type {typename} can not be"
                    " resolved.", node)
            return typedef.value_type  # unpack

        # Everything else (structs) will be determined by determining the
        # inner types
        if typename not in unit.struct_definitions:
            raise TypecheckException(
                f"Type {typename} can not be"
                " resolved.", node)
        struct_def = unit.struct_definitions[typename]
        # For recursion prevention, remember that we have started this type
        unit.symbols_global[typename] = bongtypes.UnfinishedType()
        fields: typing.Dict[str, bongtypes.ValueType] = {}
        for name, type_identifier in struct_def.fields.items():
            fields[name] = self.resolve_type(type_identifier, unit,
                                             struct_def)
        value_type = bongtypes.Struct(typename, fields)
        unit.symbols_global[typename] = bongtypes.Typedef(value_type)
        return value_type

    def resolve_function_interfaces(self, unit: ast.TranslationUnit):
        """Resolve the interfaces of all function definitions of unit."""
        for func_definition in unit.function_definitions.values():
            self.resolve_function_interface(func_definition, unit)

    def resolve_function_interface(self, function: ast.FunctionDefinition,
                                   unit: ast.TranslationUnit):
        """Resolve parameter and return types of function.

        Each parameter type is stored under the parameter's name in the
        symbol tree of the function's snapshot; the resulting
        bongtypes.Function is stored in unit.symbols_global under the
        function's name.
        """
        parameters = bongtypes.TypeList([])
        returns = bongtypes.TypeList([])
        for param_name, param_type in zip(function.parameter_names,
                                          function.parameter_types):
            typ = self.resolve_type(param_type, unit, function)
            parameters.append(typ)
            SymbolTree(function.symbol_tree_snapshot)[param_name] = typ
        for ret in function.return_types:
            returns.append(self.resolve_type(ret, unit, function))
        unit.symbols_global[function.name] = bongtypes.Function(
            parameters, returns)

    def is_writable(self, node: ast.BaseNode):
        """Return True if node denotes something that can be assigned to."""
        # Identifiers can describe modules, function names, types, variables.
        # Only variables are writable and only those are stored in the symbol
        # tree, so this is how we can determine the writability of this node.
        if isinstance(node, ast.Identifier):
            return node.name in self.symbol_tree
        # IndexAccess and DotAccess are writable whenever the lhs is
        # writable, e.g.
        # foo().bar not writable
        # foo.bar[0] writable if foo is a writable variable
        # foo.bar()[0].baz not writable because function call's result is
        #                  not writable
        # mod.foo not writable if mod is a module, then mod.foo is a type
        if isinstance(node, (ast.IndexAccess, ast.DotAccess)):
            return self.is_writable(node.lhs)
        if isinstance(node, ast.ExpressionList):
            return all(self.is_writable(n) for n in node.inner_nodes)
        # Everything else shouldn't be writable (function calls, blocks, ...)
        return False

    # Determine the type of the ast node.
    # This method returns the TypeList (0, 1 or N elements) that the node will
    # evaluate to and a return hint that tells us if the node contains a
    # return statement and if it is sure that this return will be invoked.
    # This information is required to check/guarantee the return type of
    # function definitions.
    def check(self, node: ast.BaseNode) -> typing.Tuple[TypeList, Return]:
        """Typecheck node and return (result types, return hint).

        Raises TypecheckException for type errors and a plain Exception
        for ast nodes or operators that are not known to the checker.
        """
        # The order of the isinstance() tests is kept deliberately.
        if isinstance(node, ast.Block):
            return self._check_block(node)
        if isinstance(node, ast.Return):
            return self._check_return(node)
        if isinstance(node, ast.IfElseStatement):
            return self._check_if_else(node)
        if isinstance(node, ast.WhileStatement):
            return self._check_while(node)
        if isinstance(node, ast.AssignOp):
            return self._check_assign(node)
        if isinstance(node, ast.BinOp):
            return self._check_binop(node)
        if isinstance(node, ast.UnaryOp):
            return self._check_unaryop(node)
        if isinstance(node, ast.Integer):
            return TypeList([bongtypes.Integer()]), Return.NO
        if isinstance(node, ast.Float):
            return TypeList([bongtypes.Float()]), Return.NO
        if isinstance(node, ast.String):
            return TypeList([bongtypes.String()]), Return.NO
        if isinstance(node, ast.Bool):
            return TypeList([bongtypes.Boolean()]), Return.NO
        if isinstance(node, ast.SysCall):
            return TypeList([bongtypes.Integer()]), Return.NO
        if isinstance(node, ast.Pipeline):
            return self._check_pipeline(node)
        if isinstance(node, ast.Identifier):
            return self._check_identifier(node)
        if isinstance(node, ast.IndexAccess):
            return self._check_index_access(node)
        if isinstance(node, ast.DotAccess):
            return self._check_dot_access(node)
        if isinstance(node, ast.FunctionDefinition):
            return self._check_function_definition(node)
        if isinstance(node, ast.FunctionCall):
            return self._check_function_call(node)
        if isinstance(node, ast.Print):
            self.check(node.expr)  # We can print anything but don't care
            return TypeList([]), Return.NO
        if isinstance(node, ast.Let):
            return self._check_let(node)
        if isinstance(node, ast.Array):
            return self._check_array(node)
        if isinstance(node, ast.StructValue):
            return self._check_struct_value(node)
        if isinstance(node, ast.ExpressionList):
            return self._check_expression_list(node)
        raise Exception("unknown ast node")

    def _check_block(self, node) -> typing.Tuple[TypeList, Return]:
        """Check all statements of a block inside their own scope."""
        symbol_tree_snapshot = self.symbol_tree.take_snapshot()
        # All return statements in a whole block must match so that the
        # whole block is consistent.
        block_return: typing.Tuple[TypeList, Return] = (TypeList([]),
                                                        Return.NO)
        for stmt in node.stmts:
            stmt_return = self.check(stmt)
            if stmt_return[1] == Return.NO:
                continue
            if block_return[1] == Return.NO:
                # initialize
                block_return = stmt_return
                continue
            # ensure that all return types are the same
            if not block_return[0].sametype(stmt_return[0]):
                raise TypecheckException(
                    "Return type does not match previous return type in block.",
                    stmt)
            # If at least one statement in the block definitely
            # returns, the whole block definitely returns
            # -> a YES overwrites a MAYBE
            if stmt_return[1] == Return.YES:
                block_return = stmt_return
            # Here, we could theoretically break from the loop because
            # subsequent statements will not be executed. But no break has
            # the benefit that the following code is already typechecked.
            # When the return is removed, the typechecker result will not
            # change.
        # Restore scope
        self.symbol_tree.restore_snapshot(symbol_tree_snapshot)
        return block_return

    def _check_return(self, node) -> typing.Tuple[TypeList, Return]:
        """Check a return statement; it always yields Return.YES."""
        if node.result is None:
            return bongtypes.TypeList([]), Return.YES
        res, turn = self.check(node.result)  # turn should be false here
        return res, Return.YES

    def _check_if_else(self, node) -> typing.Tuple[TypeList, Return]:
        """Check condition and branches of an if(-else) statement."""
        cond, turn = self.check(node.cond)
        if len(cond) == 0 or type(cond[0]) != bongtypes.Boolean:
            raise TypecheckException(
                "If statement requires boolean condition.", node.cond)
        a, aturn = self.check(node.thn)
        if isinstance(node.els, ast.BaseNode):
            b, bturn = self.check(node.els)
        else:
            b, bturn = a, Return.NO  # if there is no else, it won't return
        # 1. if { } else { } -> OK
        # 2. if { return } else { } -> OK
        # 3. if { } else { return } -> OK
        # 4. if { return } else { return } -> returns should match!
        # If there is no 'else', this is covered by 1. and 2.
        if aturn != Return.NO and bturn != Return.NO:  # 4
            if not a.sametype(b):
                raise TypecheckException(
                    "'If' and 'Else' branch's return type do not match.",
                    node)
            # Here, only if both are YES, the whole if-else is YES
            if aturn == Return.YES and bturn == Return.YES:
                return a, Return.YES
            return a, Return.MAYBE
        if aturn != Return.NO:  # 2
            return a, Return.MAYBE
        if bturn != Return.NO:  # 3
            return b, Return.MAYBE
        return TypeList([]), Return.NO  # 1

    def _check_while(self, node) -> typing.Tuple[TypeList, Return]:
        """Check condition and body of a while statement."""
        types, turn = self.check(node.cond)
        if len(types) != 1:
            raise TypecheckException(
                "While statement requires a single"
                " boolean value as condition.", node.cond)
        if type(types[0]) != bongtypes.Boolean:
            raise TypecheckException(
                "While statement requires boolean condition.", node.cond)
        types, turn = self.check(node.t)
        # A return inside the loop body is downgraded to MAYBE.
        if turn != Return.NO:
            return types, Return.MAYBE
        return types, turn

    def _check_assign(self, node) -> typing.Tuple[TypeList, Return]:
        """Check that rhs matches lhs and that lhs is writable."""
        rhs, turn = self.check(node.rhs)
        lhs, turn = self.check(node.lhs)
        match_types(lhs, rhs, node,
                    ("Variable and expression types in assignment do"
                     f" not match. Lhs expects '{lhs}' but rhs evaluates"
                     f" to '{rhs}'"))
        if not self.is_writable(node.lhs):
            raise TypecheckException(
                "Lhs of assignment is no writable variable!", node.lhs)
        return lhs, Return.NO

    def _check_binop(self, node) -> typing.Tuple[TypeList, Return]:
        """Determine the result type of a binary operation."""
        op = node.op
        # For BinOps, most bongtypes' operators are overloaded
        # Not overloaded: 'and' and 'or'
        lhslist, turn = self.check(node.lhs)
        rhslist, turn = self.check(node.rhs)
        # Report wrong operand counts as a type error. An assert would be
        # stripped under -O and would otherwise not be reported as a
        # TypecheckError.
        if len(lhslist) != 1 or len(rhslist) != 1:
            raise TypecheckException(
                f"Binary operator '{op}' requires exactly one value on"
                " each side.", node)
        lhstyp = lhslist[0]
        rhstyp = rhslist[0]
        try:  # Catch all BongtypeExceptions ...
            if op == "+":
                # TODO "+" is a valid operator for arrays but we do not do
                # the proper empty-array check with match_types() here.
                # Should we do that?
                return TypeList([lhstyp + rhstyp]), Return.NO
            if op == "-":
                return TypeList([lhstyp - rhstyp]), Return.NO
            if op == "*":
                return TypeList([lhstyp * rhstyp]), Return.NO
            if op == "/":
                return TypeList([lhstyp / rhstyp]), Return.NO
            if op == "%":
                return TypeList([lhstyp % rhstyp]), Return.NO
            if op == "^":
                return TypeList([lhstyp**rhstyp]), Return.NO
            if op == "&&":
                if type(lhstyp) != bongtypes.Boolean:
                    raise TypecheckException(
                        "Logical 'and' expects boolean operands. Left operand is not boolean.",
                        node.lhs)
                if type(rhstyp) != bongtypes.Boolean:
                    raise TypecheckException(
                        "Logical 'and' expects boolean operands. Right operand is not boolean.",
                        node.rhs)
                return TypeList([bongtypes.Boolean()]), Return.NO
            if op == "||":
                if type(lhstyp) != bongtypes.Boolean:
                    raise TypecheckException(
                        "Logical 'or' expects boolean operands. Left operand not boolean.",
                        node.lhs)
                if type(rhstyp) != bongtypes.Boolean:
                    raise TypecheckException(
                        "Logical 'or' expects boolean operands. Right operand is not boolean.",
                        node.rhs)
                return TypeList([bongtypes.Boolean()]), Return.NO
            if op == "==":
                return TypeList([lhstyp.eq(rhstyp)]), Return.NO
            if op == "!=":
                return TypeList([lhstyp.ne(rhstyp)]), Return.NO
            if op == "<":
                return TypeList([lhstyp < rhstyp]), Return.NO
            if op == ">":
                return TypeList([lhstyp > rhstyp]), Return.NO
            if op == "<=":
                return TypeList([lhstyp <= rhstyp]), Return.NO
            if op == ">=":
                return TypeList([lhstyp >= rhstyp]), Return.NO
            raise Exception("unrecognised binary operator: " + str(node.op))
        except BongtypeException as e:  # ... and transform to TypecheckExc
            raise TypecheckException(e.msg, node)

    def _check_unaryop(self, node) -> typing.Tuple[TypeList, Return]:
        """Determine the result type of a unary operation."""
        try:  # Catch all BongtypeExceptions ...
            op = node.op
            if op == "!":
                rhs, turn = self.check(node.rhs)
                if len(rhs) != 1 or type(rhs[0]) != bongtypes.Boolean:
                    raise TypecheckException(
                        "Logical 'not' expects boolean operand.", node)
                return TypeList([bongtypes.Boolean()]), Return.NO
            if op == "-":
                rhstype, turn = self.check(node.rhs)
                if len(rhstype) != 1 or not (
                        type(rhstype[0]) == bongtypes.Integer
                        or type(rhstype[0]) == bongtypes.Float):
                    raise TypecheckException("Negate expects number.", node)
                return rhstype, Return.NO
            raise Exception("unrecognised unary operator: " + str(node.op))
        except BongtypeException as e:  # ... and transform to TypecheckExc
            raise TypecheckException(e.msg, node)

    def _check_pipeline(self, node) -> typing.Tuple[TypeList, Return]:
        """Check input, program calls and output target of a pipeline."""
        # Also see evaluator -> ast.Pipeline, it is very similar
        if len(node.elements) < 2:
            raise TypecheckException(
                "Pipelines should have more than one element. This seems to be a parser bug.",
                node)
        programcalls = []
        # used for checking stdin and stdout
        strtype = TypeList([bongtypes.String()])

        # Check pipeline input types
        first = node.elements[0]
        if isinstance(first, ast.SysCall):
            programcalls.append(first)
        else:
            stdin, turn = self.check(first)  # turn == NO
            if not stdin.sametype(strtype):
                raise TypecheckException(
                    "The input to a pipeline should evaluate to a string, {} was found instead."
                    .format(stdin), first)

        # Collect programcalls
        for elem in node.elements[1:-1]:
            if not isinstance(elem, ast.SysCall):
                raise TypecheckException(
                    "The main part of a pipeline (all"
                    " elements but the first and last) should only consist"
                    f" of program calls, '{elem}' found instead.", elem)
            programcalls.append(elem)

        # Check pipeline output types
        last = node.elements[-1]
        if isinstance(last, ast.SysCall):
            programcalls.append(last)
        elif isinstance(last, ast.PipelineLet):
            # The assignto is a PipelineLet, check it manually.
            self._check_pipeline_let(last)
        else:
            # The assignto is something else, do the same checks as for
            # assignments.
            assignto = last
            output, turn = self.check(assignto)
            writable = self.is_writable(assignto)
            two_strings = TypeList([bongtypes.String(), bongtypes.String()])
            if (not writable
                    or (not output.sametype(strtype)
                        and not output.sametype(two_strings))):
                raise TypecheckException(
                    "The output of a pipeline can only"
                    f" be written to string variables, {assignto} found"
                    " instead.", assignto)

        # Check that everything in between actually is a program call
        for pcall in programcalls:
            if not isinstance(pcall, ast.SysCall):
                raise TypecheckException(
                    "Everything in the center of a pipeline must be a programmcall, '{}' was found instead."
                    .format(pcall), pcall)
        return TypeList([bongtypes.Integer()]), Return.NO

    def _check_pipeline_let(self, assignto):
        """Check a let at the end of a pipeline and declare its variables."""
        names = assignto.names
        if len(names) > 2 or len(names) == 0:
            raise TypecheckException(
                "The output of a pipeline can only be written to one or two string variables, let with {} variables was found instead."
                .format(len(names)), assignto)
        for name, type_identifier in zip(assignto.names, assignto.types):
            if isinstance(type_identifier, ast.BongtypeIdentifier):
                # Type hints are resolved in the unit that contains the
                # let, which is not necessarily the main unit.
                typ = self.resolve_type(type_identifier,
                                        self._unit_in_scope(), assignto)
                if not typ.sametype(bongtypes.String()):
                    raise TypecheckException(
                        "The output of a pipeline"
                        " can only be written to string variables, let"
                        f" with explicit type '{typ}' was found instead.",
                        assignto)
            self.symbol_tree.restore_snapshot(assignto.symbol_tree_snapshot)
            self.symbol_tree[name] = bongtypes.String()

    def _check_identifier(self, node) -> typing.Tuple[TypeList, Return]:
        """Look up a name, local symbols first, then global symbols."""
        if node.name in self.symbol_tree:
            return TypeList([self.symbol_tree[node.name]]), Return.NO
        if node.name in self.symbols_global:
            return TypeList([self.symbols_global[node.name]]), Return.NO
        raise TypecheckException(f"{node.name} is undefined.", node)

    def _check_index_access(self, node) -> typing.Tuple[TypeList, Return]:
        """Check indexing into a string or an array."""
        index, turn = self.check(node.rhs)
        if len(index) != 1 or type(index[0]) != bongtypes.Integer:
            raise TypecheckException("Indexing requires Integer.", node.rhs)
        lhs, turn = self.check(node.lhs)
        if len(lhs) != 1:
            raise TypecheckException("Indexing requires a single variable.",
                                     node.lhs)
        if isinstance(lhs[0], bongtypes.String):  # bong string
            return lhs, Return.NO
        if isinstance(lhs[0], bongtypes.Array):  # bong array
            return TypeList([lhs[0].contained_type]), Return.NO
        raise TypecheckException("IndexAccess with unsupported type.",
                                 node.lhs)

    def _check_dot_access(self, node) -> typing.Tuple[TypeList, Return]:
        """Check access to a struct field or to a name inside a module."""
        lhs, turn = self.check(node.lhs)
        if len(lhs) != 1:
            raise TypecheckException(
                "DotAccess requires a single variable/identifier.", node.lhs)
        if isinstance(lhs[0], bongtypes.Struct):  # bong struct
            if node.rhs not in lhs[0].fields:
                raise TypecheckException(
                    f"Name '{node.rhs}' not found in"
                    f" struct '{node.lhs}'.", node)
            value_type = lhs[0].fields[node.rhs]
            return TypeList([value_type]), Return.NO
        if isinstance(lhs[0], bongtypes.Module):  # module
            modulepath = lhs[0].path
            if modulepath not in self.modules:
                raise TypecheckException(
                    f"Module {node.lhs} can not be"
                    " resolved.", node)
            module = self.modules[modulepath]
            if node.rhs not in module.symbols_global:
                raise TypecheckException(
                    f"Name '{node.rhs}' not found in"
                    f" module '{node.lhs}' which resolved to '{lhs[0]}'.",
                    node)
            return TypeList([module.symbols_global[node.rhs]]), Return.NO
        raise TypecheckException("DotAccess with unsupported type.", node.lhs)

    def _check_function_definition(
            self, node) -> typing.Tuple[TypeList, Return]:
        """Check a function body against the function's declared interface."""
        # The function interface should already be completely in the symbol
        # table. Here, we only check that the function block is valid and
        # that it returns what we expect!
        func = self.symbols_global[node.name]  # bongtypes.Function
        assert (isinstance(func, bongtypes.Function))
        # Before function body checking, save/restore symbol table
        # The current snapshot should be empty here because function
        # definitions are typechecked before the main statements are
        # checked. But anyways, logically, this is the right approach!
        symbol_tree_snapshot = self.symbol_tree.take_snapshot()
        self.symbol_tree.restore_snapshot(node.symbol_tree_snapshot)
        # Compare expected with actual result/return
        expect = func.return_types
        actual, turn = self.check(node.body)
        match_types(
            expect, actual, node, "Function return type does not"
            f" match function declaration. Declared '{expect}' but"
            f" returned '{actual}'.")
        # Enforce that there is a return statement if we require it: the
        # declaration expects return values but a return statement that
        # will definitely be invoked is missing.
        if len(expect) > 0 and turn != Return.YES:
            raise TypecheckException("Point of no return reached!", node)
        # Restore symbol table before function call
        self.symbol_tree.restore_snapshot(symbol_tree_snapshot)
        # FunctionDefinition itself returns nothing
        return TypeList([]), Return.NO

    def _check_function_call(self, node) -> typing.Tuple[TypeList, Return]:
        """Check the arguments of a call against the callee's interface."""
        funcs, turn = self.check(node.name)
        if len(funcs) != 1:
            raise TypecheckException(
                f"'{node.name}' does not resolve to a function.", node.name)
        func = funcs[0]
        if type(func) != bongtypes.Function and type(
                func) != bongtypes.BuiltinFunction:
            raise TypecheckException(f"'{node.name}' is not a function.",
                                     node)
        argtypes, turn = self.check(node.args)
        # Check builtin functions
        if isinstance(func, bongtypes.BuiltinFunction):
            try:
                return func.check(argtypes), Return.NO
            except BongtypeException as e:  # Convert to TypecheckException
                raise TypecheckException(e.msg, node)
        # Otherwise, it is a bong function that has well-defined parameter
        # types
        assert (isinstance(func, bongtypes.Function))
        match_types(
            func.parameter_types, argtypes, node,
            (f"Function '{node.name}' expects parameters of type "
             f"'{func.parameter_types}' but '{argtypes}' were given."))
        # If everything goes fine (function can be called), it returns
        # whatever the function declaration says \o/
        return func.return_types, Return.NO

    def _check_let(self, node) -> typing.Tuple[TypeList, Return]:
        """Check a let statement and store the variables' types."""
        # Check rhs expression
        results, turn = self.check(node.expr)
        if len(node.names) != len(results):
            raise TypecheckException(
                "Number of expressions on rhs of let statement does not match the number of variables.",
                node)
        # Before handling the lhs of the let statement, set the correct
        # scope symbol table. This is necessary so that all symbol table
        # interaction affects the variables that are declared by the
        # let statement
        self.symbol_tree.restore_snapshot(node.symbol_tree_snapshot)
        # Then, check the type information and store to symbol table
        for name, type_identifier, result in zip(node.names, node.types,
                                                 results):
            if isinstance(type_identifier, ast.BongtypeIdentifier):
                # Type hints are resolved in the unit that contains the
                # let, which is not necessarily the main unit.
                typ = self.resolve_type(type_identifier,
                                        self._unit_in_scope(), node)
                result = merge_types(
                    typ, result, node,
                    "Assignment in let statement impossible: '{}' has type '{}' but expression has type '{}'."
                    .format(name, typ, result))
            elif not is_specific_type(result):
                raise TypecheckException(
                    "Automatic type for variable '{}' but rhs is no definitive type either, '{}' found instead."
                    .format(name, result), node)
            self.symbol_tree[name] = result
        return TypeList([]), Return.NO

    def _check_array(self, node) -> typing.Tuple[TypeList, Return]:
        """Check that all elements of an array literal share one type."""
        # Super complicated things can happen here:
        # Imagine the array contains function calls like
        # arr = [foonc(), baar()]
        # and those functions return multiple values.
        # Then, check(ast.Array.elements : ExpressionList) creates a
        # TypeList and the multiple return values from the functions are
        # TypeLists themselves. When those are added to the main TypeList,
        # it is flattened automatically. In the result, we just have a List
        # of types. And here, we just have to check that all those types
        # are equal.
        # I'm fascinated how everything magically works automatically.
        # Isn't that beautiful?
        types, turn = self.check(node.elements)
        inner_type: bongtypes.ValueType = bongtypes.AutoType()
        # Otherwise, all contained types should match
        for typ in types:
            inner_type = merge_types(inner_type, typ, node)
        return TypeList([bongtypes.Array(inner_type)]), Return.NO

    def _check_struct_value(self, node) -> typing.Tuple[TypeList, Return]:
        """Check a struct literal against the struct type it names."""
        struct_types, turn = self.check(node.name)
        if len(struct_types) != 1:
            raise TypecheckException(
                f"'{node.name}' does not resolve to a (single) struct.",
                node.name)
        struct_type = struct_types[0]
        if (type(struct_type) != bongtypes.Typedef
                or type(struct_type.value_type) != bongtypes.Struct):
            raise TypecheckException(f"'{node.name}' is not a struct type.",
                                     node)
        fields: typing.Dict[str, bongtypes.ValueType] = {}
        for name, value in node.fields.items():
            argtypes, turn = self.check(value)
            if len(argtypes) != 1:
                raise TypecheckException(
                    "Expression does not evaluate"
                    " to a single value.", value)
            # Duplicates are caught in the parser, we can just assign here.
            if not isinstance(argtypes[0], bongtypes.ValueType):
                raise TypecheckException("ValueType expected", value)
            fields[name] = argtypes[0]
        # TODO See issue #27: Currently, we only write the resolved struct
        # type's name into the struct value here.
        struct_val = bongtypes.Struct(struct_type.value_type.name, fields)
        typ = merge_types(struct_type.value_type, struct_val, node)
        return TypeList([typ]), Return.NO

    def _check_expression_list(self, node) -> typing.Tuple[TypeList, Return]:
        """Collect the types of all expressions of an expression list."""
        types = bongtypes.TypeList([])
        for exp in node:
            typlist, turn = self.check(exp)
            types.append(typlist)  # TypeLists are automatically flattened
        return types, Return.NO
class Parser:
    """Recursive-descent parser that turns lexer tokens into an ast.TranslationUnit.

    The parser keeps two symbol stores while parsing:

    * ``symbols_global`` -- dict of global names (builtins, basic types,
      imports, structs, functions).
    * ``symbol_tree`` -- a SymbolTree for local names; it is swapped for a
      fresh tree while a function definition is parsed.
    """

    def __init__(self, lexer, snapshot=None, basepath=None):
        """Set up the parser state.

        Args:
            lexer: Token source; stored as ``self.lexer``.
            snapshot: Optional pair ``(globals_dict, symbol_tree_snapshot)``
                as returned by ``take_snapshot()``. When given, the symbol
                tables are restored from it instead of being initialized
                with the builtins.
            basepath: Directory against which relative import paths are
                resolved. Defaults to the current working directory.
        """
        self.lexer = lexer
        self.basepath = basepath if basepath is not None else os.getcwd()
        # NOTE(review): annotated with bongtypes.BaseNode here, while
        # take_snapshot() is annotated with bongtypes.BaseType -- confirm
        # which one is intended.
        self.symbols_global : typing.Dict[str, bongtypes.BaseNode] = {}
        self.symbol_tree = SymbolTree()
        if snapshot is not None:
            # When restoring the global dictionary, we need to copy the dict.
            # Otherwise, we change the snapshot that the caller (the repl)
            # will (most probably) reuse.
            self.symbols_global = snapshot[0].copy()  # overwrite
            self.symbol_tree.restore_snapshot(snapshot[1])  # restore
        else:
            # Only when initializing symbol tables for the first time,
            # register builtin stuff
            for bfuncname, bfunc in bong_builtins.functions.items():
                self.symbols_global[bfuncname] = bongtypes.BuiltinFunction(bfunc[1])
            for btypename, btype in bongtypes.basic_types.items():
                self.symbols_global[btypename] = bongtypes.Typedef(btype())
        # TODO Somehow, the Parser is re-initialized each input round, the
        # evaluator is not. This is somehow the reason why snapshots have to
        # be taken and restored on the parser.
        # I guess this design can be revised, too.

    def take_snapshot(self) -> typing.Tuple[typing.Dict[str, bongtypes.BaseType], SymbolTreeNode]:
        """Return ``(symbols_global, symbol_tree snapshot)``.

        The global dict is returned by reference (not copied); __init__
        copies it when a snapshot is restored.
        """
        return self.symbols_global, self.symbol_tree.take_snapshot()

    def compile(self) -> typing.Optional[ast.TranslationUnit]:
        """Parse the whole input, reporting lexer/parse errors on stderr.

        Returns:
            The parsed translation unit, or None if a TokenizeException or
            ParseException was raised (the error is printed, not re-raised).
        """
        try:
            return self.compile_uncaught()
        except lexer.TokenizeException as e:
            print(f"LexerError in {e.filepath}, line {e.line},"
                  f" column {e.col}: {e.msg}", file=sys.stderr)
        except ParseException as e:
            t = self.peek(e.offset)
            # Tokens without a lexeme are reported by their token type.
            lexeme = t.lexeme if t.lexeme is not None else t.type
            # t.length unused
            print(f"ParseError: Token '{lexeme}' found in {t.filepath},"
                  f" line {t.line}, column {t.col}: {e.msg}", file=sys.stderr)
        return None

    def compile_uncaught(self) -> ast.TranslationUnit:
        """Parse the whole input without catching lexer/parse exceptions.

        Top-level statements are sorted into imports, struct definitions,
        function definitions and ordinary statements.

        Raises:
            Exception: If two struct definitions with the same name were
                produced.
        """
        # init_token_access() can throw EofException so it should not
        # be done in the constructor.
        self.init_token_access()
        imp_stmts : typing.List[ast.Import] = []
        struct_stmts : collections.OrderedDict[str, ast.StructDefinition] = collections.OrderedDict()
        func_stmts : collections.OrderedDict[str, ast.FunctionDefinition] = collections.OrderedDict()
        statements : typing.List[ast.BaseNode] = []
        while self.peek().type != token.EOF:
            stmt = self.top_level_stmt()
            if isinstance(stmt, ast.Import):
                imp_stmts.append(stmt)
            elif isinstance(stmt, ast.StructDefinition):
                if stmt.name in struct_stmts:
                    raise Exception("Struct Definition with same name generated twice.")
                struct_stmts[stmt.name] = stmt
            elif isinstance(stmt, ast.FunctionDefinition):
                func_stmts[stmt.name] = stmt
            else:
                statements.append(stmt)
        return ast.TranslationUnit(imp_stmts, struct_stmts, func_stmts, statements, self.symbols_global)

    def top_level_stmt(self) -> ast.BaseNode:
        """Parse one top-level statement.

        Imports, struct definitions and function definitions are handled
        here; everything else is delegated to stmt().
        """
        next_type = self.peek().type
        if next_type == token.IMPORT:
            return self.parse_import()
        if next_type == token.STRUCT:
            return self.parse_struct_definition()
        if next_type == token.FUNC:
            return self.parse_function_definition()
        return self.stmt()

    def stmt(self) -> ast.BaseNode:
        """Parse one statement, dispatching on the upcoming token(s).

        Raises:
            ParseException: If no statement can start at the current token.
        """
        next_type = self.peek().type
        if next_type == token.PRINT:
            return self.print_stmt()
        if next_type == token.LET:
            return self.let_stmt()
        if next_type == token.IF:
            return self.if_stmt()
        if next_type == token.RETURN:
            return self.return_stmt()
        if next_type == token.WHILE:
            return self.while_stmt()
        if next_type == token.LBRACE:
            return self.block_stmt()
        # Token types that can start an expression statement.
        expression_starts = (
            token.IDENTIFIER,
            token.INT_VALUE,
            token.FLOAT_VALUE,
            token.BOOL_VALUE,
            token.LPAREN,
            token.OP_SUB,
            token.OP_NEG,
            token.LBRACKET,
            token.STRING,
        )
        if next_type in expression_starts:
            return self.expression_stmt()
        # Special cases: Syscalls in current directory like './foo' or with
        # absolute path like '/foo/bar'
        if ((self.peek(0).type == token.OP_DIV
                and self.peek(1).type == token.IDENTIFIER)
                or (self.peek(0).type == token.DOT
                    and self.peek(1).type == token.OP_DIV)
                or (self.peek(0).type == token.DOT
                    and self.peek(1).type == token.DOT
                    and self.peek(2).type == token.OP_DIV)):
            return self.expression_stmt()
        raise ParseException("Unknown statement found.")

    def parse_import(self) -> ast.Import:
        """Parse ``import "<path>" as <name>`` with an optional semicolon.

        Relative paths are joined onto ``self.basepath``. The alias is
        registered in the global symbol table as an UnknownType.

        Raises:
            Exception: If the current token is not IMPORT.
            ParseException: On malformed syntax, or if the alias name
                already exists in the global symbol table.
        """
        toks = TokenList()
        if not toks.add(self.match(token.IMPORT)):
            raise Exception("Expected import statement.")
        if not toks.add(self.match(token.STRING)):
            raise ParseException("Expected module path as string.")
        path = self.peek(-1).lexeme
        if not toks.add(self.match(token.AS)):
            raise ParseException("Expected as")
        if not toks.add(self.match(token.IDENTIFIER)):
            raise ParseException("Expected module alias name.")
        name = self.peek(-1).lexeme
        # The trailing semicolon is optional.
        toks.add(self.match(token.SEMICOLON))
        if not os.path.isabs(path):
            path = os.path.join(self.basepath, path)
        if name in self.symbols_global:
            raise ParseException(f"Name '{name}' already exists in global symbol table. Import impossible.")
        self.symbols_global[name] = bongtypes.UnknownType()
        return ast.Import(toks, name, path)

    def parse_function_definition(self) -> ast.FunctionDefinition:
        """Parse ``func name(params) [: type, ...] { body }``.

        The function name is registered globally (as UnknownType) before
        the parameters are parsed. The body is parsed against a fresh
        SymbolTree that contains only the parameters; a snapshot of that
        tree (taken before the body is parsed) is stored in the result.

        Raises:
            Exception: If the current token is not FUNC.
            ParseException: On malformed syntax, if the function name
                already exists globally, or if a parameter name repeats.
        """
        toks = TokenList()
        # FUNC foo (bar : int) : str { ... }
        if not toks.add(self.match(token.FUNC)):
            raise Exception("Expected function definition.")
        # func FOO (bar : int) : str { ... }
        if not toks.add(self.match(token.IDENTIFIER)):
            raise ParseException("Expected function name.")
        name = self.peek(-1).lexeme
        if name in self.symbols_global:
            raise ParseException(f"Name '{name}' already exists in symbol table. Function definition impossible.")
        # Register function name before parsing parameter names (no
        # parameter name should have the function name!)
        self.symbols_global[name] = bongtypes.UnknownType()
        # (
        if not toks.add(self.match(token.LPAREN)):
            raise ParseException("Expected ( to start the parameter list.")
        # Parameters
        parameter_names, parameter_types = self.parse_parameters()
        # )
        if not toks.add(self.match(token.RPAREN)):
            raise ParseException("Expected ) to end the parameter list.")
        # Return types
        return_types : typing.List[ast.BongtypeIdentifier] = []
        if toks.add(self.match(token.COLON)):
            self.check_eof("Return type list expected.")
            return_types.append(self.parse_type())
            while toks.add(self.match(token.COMMA)):
                return_types.append(self.parse_type())
        # {
        if self.peek().type != token.LBRACE:
            raise ParseException("Expected function body.")
        # New local symbol table (tree) for statement block
        # We could just store the global symbol table in the object because
        # it will always be the same. But remembering the previous symbol
        # table here theoretically allows to parse function definitions
        # inside other functions (the local symbol table would be properly
        # restored then).
        enclosing_symbol_tree = self.symbol_tree
        self.symbol_tree = SymbolTree()
        try:
            # Parameters
            for param in parameter_names:
                if param in self.symbol_tree:
                    raise ParseException(f"Argument name '{param}' appears twice in function definition")
                self.symbol_tree.register(param, bongtypes.UnknownType())
            # Snapshot before block is parsed (this changes the state of
            # the tree)
            func_symbol_tree_snapshot = self.symbol_tree.take_snapshot()
            # Function body
            body = self.block_stmt()
        finally:
            # Restore symbol table/tree, also when parsing failed, so the
            # parser is never left pointing at the function-local tree.
            self.symbol_tree = enclosing_symbol_tree
        return ast.FunctionDefinition(toks, name, parameter_names, parameter_types, return_types, body, func_symbol_tree_snapshot)

    def parse_struct_definition(self) -> ast.StructDefinition:
        """Parse ``struct Name { field : type, ... }``.

        The struct name is registered globally (as UnknownType) only after
        the whole definition was parsed successfully.

        Raises:
            Exception: If the current token is not STRUCT.
            ParseException: On malformed syntax, if the name already exists
                globally, if the struct has no fields, or if a field name
                repeats.
        """
        toks = TokenList()
        # STRUCT foo {bar : int, ...}
        if not toks.add(self.match(token.STRUCT)):
            raise Exception("Expected struct definition.")
        # struct FOO {bar : int, ...}
        if not toks.add(self.match(token.IDENTIFIER)):
            raise ParseException("Expected struct name.")
        name = self.peek(-1).lexeme
        if name in self.symbols_global:
            raise ParseException(f"Name '{name}' already exists in global symbol table. Struct definition impossible.")
        # {
        if not toks.add(self.match(token.LBRACE)):
            raise ParseException("Expected { to start the field list.")
        # Fields
        field_names, field_types = self.parse_parameters()
        if len(field_names) == 0:
            raise ParseException(f"Struct {name} is empty.")
        fields : typing.Dict[str, ast.BongtypeIdentifier] = {}
        for field_name, field_type in zip(field_names, field_types):
            if field_name in fields:
                # Both literals need the f prefix; otherwise '{name}' would
                # be printed verbatim instead of the struct name.
                raise ParseException(f"Field '{field_name}' found multiple times"
                                     f" in struct '{name}'.")
            fields[field_name] = field_type
        # If } occurs on its own line, an implicit semicolon is inserted
        # after the fields
        self.match(token.SEMICOLON)
        # }
        self.check_eof("Expected } to end the field list.")
        if not toks.add(self.match(token.RBRACE)):
            raise ParseException("Expected } to end the field list.")
        # If everything went fine, register the struct name
        self.symbols_global[name] = bongtypes.UnknownType()
        return ast.StructDefinition(toks, name, fields)

    # Used by parse_function_definition() and parse_struct_definition()
    def parse_parameters(self) -> typing.Tuple[typing.List[str],typing.List[ast.BongtypeIdentifier]]:
        """Parse a comma-separated list of ``name : type`` pairs.

        Returns:
            Two parallel lists ``(names, types)``. Both are empty if the
            next token is not an identifier.
        """
        parameter_names : typing.List[str] = []
        parameter_types : typing.List[ast.BongtypeIdentifier] = []
        self.check_eof("Parameter list expected")
        if self.peek().type != token.IDENTIFIER:
            return (parameter_names, parameter_types)
        while True:
            name, typ = self.parse_parameter()
            parameter_names.append(name)
            parameter_types.append(typ)
            if not self.match(token.COMMA):
                break
        return (parameter_names, parameter_types)

    def parse_parameter(self) -> typing.Tuple[str,ast.BongtypeIdentifier]:
        """Parse a single ``name : type`` pair and return ``(name, type)``.

        Raises:
            ParseException: If the identifier or the colon is missing.
        """
        self.check_eof("Another parameter expected")
        if not self.match(token.IDENTIFIER):
            raise ParseException("Expected identifier as parameter name.")
        name = self.peek(-1).lexeme
        if not self.match(token.COLON):
            raise ParseException("Expected type hint for function parameter.")
        typ = self.parse_type()
        return (name, typ)

    # Used by function definition and let statement
    def parse_type(self) -> ast.BongtypeIdentifier:
        """Parse a type specification such as ``[][]module.Type``.

        Each leading ``[]`` pair adds one array level; the dotted name is
        collected as a list of identifier lexemes.

        Raises:
            ParseException: On an unclosed ``[`` or a missing identifier.
        """
        num_array_levels = 0
        while self.match(token.LBRACKET):
            if not self.match(token.RBRACKET):
                raise ParseException("Expected closing bracket ']' in type specification.")
            num_array_levels += 1
        if not self.match(token.IDENTIFIER):
            raise ParseException("Expected identifier as module or type.")
        typename = [self.peek(-1).lexeme]
        while self.match(token.DOT):
            if not self.match(token.IDENTIFIER):
                raise ParseException("Expected identifier as module or type.")
            typename.append(self.peek(-1).lexeme)
        return ast.BongtypeIdentifier(typename, num_array_levels)

    def return_stmt(self) -> ast.Return:
        """Parse ``return [expr, ...]`` with an optional trailing semicolon.

        Raises:
            Exception: If the current token is not RETURN.
        """
        toks = TokenList()
        if not toks.add(self.match(token.RETURN)):
            raise Exception("Expected return statement.")
        # A semicolon right after 'return' means there is no return value.
        if toks.add(self.match(token.SEMICOLON)):
            return ast.Return(toks)
        expr = self.parse_commata_expressions()
        toks.add(self.match(token.SEMICOLON))
        return ast.Return(toks, expr)

    def expression_stmt(self) -> ast.BaseNode:
        """Parse an assignment/expression; attach a trailing semicolon token if present."""
        expr = self.assignment()
        if tok := self.match(token.SEMICOLON):
            expr.tokens.append(tok)
        return expr