def process(tokens, this_file):
    """Process the given tokens and return the preprocessed token list."""
    processed = []
    i = 0
    while i < len(tokens) - 2:
        if (tokens[i].kind == token_kinds.pound
                and tokens[i + 1].kind == token_kinds.identifier
                and tokens[i + 1].content == "include"
                and tokens[i + 2].kind == token_kinds.include_file):

            # Replace tokens[i] -> tokens[i+2] with preprocessed contents of
            # the included file.
            file, filename = read_file(tokens[i + 2].content, this_file)
            if not file:
                error_collector.add(
                    CompilerError("unable to read included file",
                                  tokens[i + 2].r))
            else:
                new_tokens = process(lexer.tokenize(file, filename), filename)
                processed += new_tokens

            i += 3
        else:
            processed.append(tokens[i])
            i += 1

    return processed + tokens[i:]
def tokenize(code, filename):
    """Convert given code into a flat list of Tokens.

    code (str) - Full text of the input program.
    filename (str) - Name of the file from which the code was read.
    return - Tuple of (list of Token objects, dict of #define values).
    """
    # Store tokens as they are generated
    tokens = []

    # Mapping from defined names to their values. NOTE: assumed to be
    # populated by the #define-handling code, which is not shown here.
    defineDict = {}

    lines = split_to_tagged_lines(code, filename)
    join_extended_lines(lines)

    in_comment = False
    for line in lines:
        try:
            line_tokens, in_comment = tokenize_line(line, in_comment)
            tokens += line_tokens
        except CompilerError as e:
            error_collector.add(e)

    return tokens, defineDict
def read_file(file):
    """Return the contents of the given file."""
    try:
        with open(file) as c_file:
            return c_file.read()
    except IOError:
        descrip = f"could not read file: '{file}'"
        error_collector.add(CompilerError(descrip))
def assemble(asm_name, obj_name):
    """Assemble the given assembly file into an object file."""
    try:
        subprocess.check_call(["as", "-64", "-o", obj_name, asm_name])
        return True
    except subprocess.CalledProcessError:
        err = "assembler returned non-zero status"
        error_collector.add(CompilerError(err))
        return False
def process_file(file, args):
    """Process a single file into assembly code and return it as a string."""
    if file[-2:] == ".c":
        return process_c_file(file, args)
    else:
        err = f"unknown file type: '{file}'"
        error_collector.add(CompilerError(err))
        return None
def process_file(file, args):
    """Process a single file into an object file and return its name."""
    if file[-2:] == ".c":
        return process_c_file(file, args)
    elif file[-2:] == ".o":
        return file
    else:
        err = f"unknown file type: '{file}'"
        error_collector.add(CompilerError(err))
        return None
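# Usage sketch (not part of the original source): a driver loop collecting
# object files for the link step. `args.files` is an assumed argparse
# attribute holding the input file names.
objs = []
for f in args.files:
    obj = process_file(f, args)
    if obj:
        objs.append(obj)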
def process(tokens, this_file, defineDict=None, includeList=None):
    """Process the given tokens and return the preprocessed token list."""
    # Use None defaults: mutable default arguments would persist state
    # across top-level calls.
    if defineDict is None:
        defineDict = {}
    if includeList is None:
        includeList = []

    processed = []
    i = 0
    while i < len(tokens) - 2:
        if (tokens[i].kind == token_kinds.pound
                and tokens[i + 1].kind == token_kinds.identifier
                and tokens[i + 1].content == "include"
                and tokens[i + 2].kind == token_kinds.include_file):

            # Replace tokens[i] -> tokens[i+2] with preprocessed contents of
            # the included file.
            try:
                file, filename = read_file(tokens[i + 2].content, this_file)
                # Process each file at most once, so repeated includes of
                # the same header do not recurse forever.
                if filename not in includeList:
                    includeList.append(filename)
                    lexTokens, _ = lexer.tokenize(file, filename)
                    new_tokens = process(lexTokens, filename, defineDict,
                                         includeList)
                    processed += new_tokens
            except IOError:
                error_collector.add(
                    CompilerError("unable to read included file",
                                  tokens[i + 2].r))
            i += 3

        # Skip defines. Currently the define's value is not in the token
        # list, so only the three tokens `#`, `define`, and the name are
        # consumed here.
        elif (tokens[i].kind == token_kinds.pound
                and tokens[i + 1].kind == token_kinds.identifier
                and tokens[i + 1].content == "define"):
            i += 3

        else:
            # Apply the define dictionary: rewrite any token whose text
            # matches a defined name.
            if str(tokens[i]) in defineDict:
                if defineDict[str(tokens[i])].isdigit():
                    tokens[i].kind = token_kinds.number
                else:
                    error_collector.add(
                        CompilerError("define value is not a number",
                                      tokens[i].r))
                tokens[i].content = defineDict[str(tokens[i])]
            processed.append(tokens[i])
            i += 1

    # The loop keeps a two-token lookahead, so the final two tokens are
    # appended unmodified.
    return processed + tokens[i:]
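# Usage sketch (not part of the original source): driving the preprocessor
# entry point above. `preprocess_file` is a hypothetical helper; it assumes
# lexer.tokenize returns (tokens, defineDict) as in the tokenize() variant
# earlier in this file.
def preprocess_file(path):
    with open(path) as f:
        code = f.read()
    tokens, defines = lexer.tokenize(code, path)
    # Start with the collected defines and an empty include list.
    return process(tokens, path, defines, [])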
def find_library_or_err(file):
    """Search for the given library file and return its path if found.

    If not found, add an error to the error collector and return None.
    """
    path = find_library(file)
    if not path:
        err = f"could not find {file}"
        error_collector.add(CompilerError(err))
        return None
    else:
        return path
def write_asm(asm_source, asm_filename):
    """Save the given assembly source to disk at asm_filename.

    asm_source (str) - Full assembly source code.
    asm_filename (str) - Filename to which to save the generated assembly.
    """
    try:
        with open(asm_filename, "w") as s_file:
            s_file.write(asm_source)
    except IOError:
        descrip = f"could not write output file '{asm_filename}'"
        error_collector.add(CompilerError(descrip))
def find_crtnum():
    """Search for the crt0.o, crt1.o, or crt2.o files on the system.

    If one is found, return its path. Else, add an error to the
    error_collector and return None.
    """
    for file in ["crt2.o", "crt1.o", "crt0.o"]:
        crt = find_library(file)
        if crt:
            return crt

    err = "could not find crt0.o, crt1.o, or crt2.o for linking"
    error_collector.add(CompilerError(err))
    return None
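# Usage sketch (not part of the original source): one way the helpers above
# could chain into a backend step. The file names, the library name, and
# the returned link order are illustrative assumptions, not the project's
# actual driver.
def build_objects(asm_source):
    write_asm(asm_source, "out.s")
    if not assemble("out.s", "out.o"):
        return None
    crt = find_crtnum()                       # C runtime startup object
    libc = find_library_or_err("libc.so.6")   # assumed library file name
    if crt and libc:
        return [crt, "out.o", libc]           # objects in link order
    return None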
def parse(tokens_to_parse):
    """Parse the given tokens into an AST.

    Also, as the entry point for the parser, responsible for setting the
    tokens global variable.
    """
    p.best_error = None
    p.tokens = tokens_to_parse

    # If parsing succeeds, we return from inside the with block. If it
    # raises, log_error() suppresses the exception and execution continues
    # below, reporting the best error seen.
    with log_error():
        return parse_root(0)[0]

    error_collector.add(p.best_error)
    return None
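# Usage sketch (not part of the original source): parse() returns None on
# failure and stores the best error in the collector, so callers inspect
# the collector instead of catching exceptions. ok() and show() are assumed
# ErrorCollector helpers.
ast_root = parse(tokens)
if ast_root is None or not error_collector.ok():
    error_collector.show()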
def parse_abstract_declarator(index):
    """Parse an abstract declarator into a decl_nodes.Node.

    This function saves a CompilerError if the parsed entity is a
    declarator, rather than an abstract declarator.
    """
    root, index = parse_declarator(index)
    node = root
    while not isinstance(node, decl_nodes.Identifier):
        node = node.child

    if node.identifier:
        # add error to the error_collector because more of a semantic error
        # than a parsing error
        err = "expected abstract declarator, but identifier name was provided"
        error_collector.add(CompilerError(err, node.identifier.r))

    return root, index
def tokenize(code, filename):
    """Convert given code into a flat list of Tokens.

    code (str) - Full text of the input program.
    filename (str) - Name of the file from which the code was read.
    return - List of Token objects.
    """
    # Store tokens as they are generated
    tokens = []

    lines = split_to_tagged_lines(code, filename)
    join_extended_lines(lines)

    in_comment = False
    for line in lines:
        try:
            line_tokens, in_comment = tokenize_line(line, in_comment)
            tokens += line_tokens
        except CompilerError as e:
            error_collector.add(e)

    return tokens
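# Usage sketch (not part of the original source): tokenizing one line of C.
# The token kinds named in the comment are inferred from the lexer above.
toks = tokenize("int x = 3; // trailing comment", "<test>")
# -> tokens for `int`, `x`, `=`, `3`, `;`; the // comment is discarded.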
def parse_decl_specifiers(index, _spec_qual=False):
    """Parse a declaration specifier list.

    Examples:
        int
        const char
        typedef int

    If _spec_qual=True, produces a CompilerError if given any specifiers
    that are neither type specifier nor type qualifier.

    The returned `specs` list may contain two types of elements: tokens and
    Node objects. A Node object will be included for a struct or union
    declaration, and a token for all other declaration specifiers.
    """
    type_specs = set(ctypes.simple_types.keys())
    type_specs |= {token_kinds.signed_kw, token_kinds.unsigned_kw}

    type_quals = {token_kinds.const_kw}

    storage_specs = {token_kinds.auto_kw, token_kinds.static_kw,
                     token_kinds.extern_kw, token_kinds.typedef_kw}

    specs = []

    # The type specifier class, either SIMPLE, STRUCT, or TYPEDEF,
    # represents the allowed kinds of type specifiers. Once the first
    # specifier is parsed, the type specifier class is set. If the type
    # specifier class is set to STRUCT or TYPEDEF, no further type
    # specifiers are permitted in the type specifier list. If it is set to
    # SIMPLE, more simple type specifiers are permitted. This is important
    # for typedef parsing.
    SIMPLE = 1
    STRUCT = 2
    TYPEDEF = 3
    type_spec_class = None

    while True:
        # Parse a struct specifier if there is one.
        if not type_spec_class and token_is(index, token_kinds.struct_kw):
            node, index = parse_struct_spec(index + 1)
            specs.append(node)
            type_spec_class = STRUCT

        # Parse a union specifier if there is one.
        elif not type_spec_class and token_is(index, token_kinds.union_kw):
            node, index = parse_union_spec(index + 1)
            specs.append(node)
            type_spec_class = STRUCT

        # Match a typedef name
        elif (not type_spec_class
                and token_is(index, token_kinds.identifier)
                and p.symbols.is_typedef(p.tokens[index])):
            specs.append(p.tokens[index])
            index += 1
            type_spec_class = TYPEDEF

        elif (type_spec_class in {None, SIMPLE}
                and token_in(index, type_specs)):
            specs.append(p.tokens[index])
            index += 1
            type_spec_class = SIMPLE

        elif token_in(index, type_quals):
            specs.append(p.tokens[index])
            index += 1

        elif token_in(index, storage_specs):
            if not _spec_qual:
                specs.append(p.tokens[index])
            else:
                err = "storage specifier not permitted here"
                error_collector.add(CompilerError(err, p.tokens[index].r))
            index += 1

        else:
            break

    if specs:
        return specs, index
    else:
        raise_error("expected declaration specifier", index, ParserError.AT)
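# Usage sketch (not part of the original source): with p.tokens set to the
# tokens of `static unsigned int x;`, the call below consumes the three
# specifier tokens and stops at the identifier.
specs, index = parse_decl_specifiers(0)
# specs -> [static token, unsigned token, int token]
# index -> 3 (position of `x`)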
def tokenize_line(line, in_comment):
    """Tokenize the given single line.

    line - List of Tagged objects.
    in_comment - Whether the first character in this line is part of a
    C-style comment body.
    return - List of Token objects, and boolean indicating whether the next
    character is part of a comment body.
    """
    tokens = []

    # line[chunk_start:chunk_end] is the section of the line currently
    # being considered for conversion into a token; this string will be
    # called the 'chunk'. Everything before the chunk has already been
    # tokenized, and everything after has not yet been examined
    chunk_start = 0
    chunk_end = 0

    # Flag that is set True if the line begins with `#` and `include`,
    # perhaps with comments and whitespace in between.
    include_line = False
    # Flag that is set True if the line is an include directive and the
    # filename has been seen and successfully parsed.
    seen_filename = False

    while chunk_end < len(line):
        symbol_kind = match_symbol_kind_at(line, chunk_end)
        next = match_symbol_kind_at(line, chunk_end + 1)

        # Set include_line flag True as soon as a `#include` is detected.
        if match_include_command(tokens):
            include_line = True

        if in_comment:
            # If next characters end the comment...
            if symbol_kind == token_kinds.star and next == token_kinds.slash:
                in_comment = False
                chunk_start = chunk_end + 2
                chunk_end = chunk_start
            # Otherwise, just skip one character.
            else:
                chunk_start = chunk_end + 1
                chunk_end = chunk_start

        # If next characters start a comment, process previous chunk and
        # set in_comment to True.
        elif symbol_kind == token_kinds.slash and next == token_kinds.star:
            add_chunk(line[chunk_start:chunk_end], tokens)
            in_comment = True

        # If next two characters are //, we skip the rest of this line.
        elif symbol_kind == token_kinds.slash and next == token_kinds.slash:
            break

        # Skip spaces and process previous chunk.
        elif line[chunk_end].c.isspace():
            add_chunk(line[chunk_start:chunk_end], tokens)
            chunk_start = chunk_end + 1
            chunk_end = chunk_start

        # If this is an include line, and not a comment or whitespace,
        # expect the line to match an include filename.
        elif include_line:
            # If the filename has already been seen, there should be no
            # more tokens.
            if seen_filename:
                descrip = "extra tokens at end of include directive"
                raise CompilerError(descrip, line[chunk_end].r)

            filename, end = read_include_filename(line, chunk_end)
            tokens.append(Token(
                token_kinds.include_file, filename,
                r=Range(line[chunk_end].p, line[end].p)))

            chunk_start = end + 1
            chunk_end = chunk_start
            seen_filename = True

        # If next character is a quote, we read the whole string as a token.
        elif symbol_kind in {token_kinds.dquote, token_kinds.squote}:
            if symbol_kind == token_kinds.dquote:
                quote_str = '"'
                kind = token_kinds.string
                add_null = True
            else:
                quote_str = "'"
                kind = token_kinds.char_string
                add_null = False

            chars, end = read_string(line, chunk_end + 1, quote_str, add_null)
            rep = chunk_to_str(line[chunk_end:end + 1])
            r = Range(line[chunk_end].p, line[end].p)

            if kind == token_kinds.char_string and len(chars) == 0:
                err = "empty character constant"
                error_collector.add(CompilerError(err, r))
            elif kind == token_kinds.char_string and len(chars) > 1:
                err = "multiple characters in character constant"
                error_collector.add(CompilerError(err, r))

            tokens.append(Token(kind, chars, rep, r=r))

            chunk_start = end + 1
            chunk_end = chunk_start

        # If next character is another symbol, add previous chunk and then
        # add the symbol.
        elif symbol_kind:
            symbol_start_index = chunk_end
            symbol_end_index = chunk_end + len(symbol_kind.text_repr) - 1

            r = Range(line[symbol_start_index].p, line[symbol_end_index].p)
            symbol_token = Token(symbol_kind, r=r)

            add_chunk(line[chunk_start:chunk_end], tokens)
            tokens.append(symbol_token)

            chunk_start = chunk_end + len(symbol_kind.text_repr)
            chunk_end = chunk_start

        # Include another character in the chunk.
        else:
            chunk_end += 1

    # Flush out anything that is left in the chunk to the output
    add_chunk(line[chunk_start:chunk_end], tokens)

    # Catch a `#include` on a line by itself.
    if (include_line or match_include_command(tokens)) and not seen_filename:
        read_include_filename(line, chunk_end)

    return tokens, in_comment
from contextlib import contextmanager


@contextmanager
def report_err():
    """Catch and add any errors to error collector."""
    # The bare `yield` makes this a context manager: the caller's block
    # runs at the yield point, and any CompilerError it raises is caught
    # and recorded rather than propagated.
    try:
        yield
    except CompilerError as e:
        error_collector.add(e)
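# Usage sketch (not part of the original source): report_err() lets the
# driver attempt one phase and continue past a failure. `check_types` is a
# hypothetical phase that may raise CompilerError.
with report_err():
    check_types(ast_root)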