def setUp(self):
    """
    Prepare a fresh transformer for each test

    Resets the TemporaryVariables and Label state, fills a new SymbolTable
    with two INT and two FLOAT symbols and stores a CPLTransformer built
    on that table in self.transformer
    """
    for resettable in (TemporaryVariables, Label):
        resettable.reset()

    table = SymbolTable()
    declarations = (
        ("a_int", Types.INT, 1),
        ("b_int", Types.INT, 1),
        ("c_float", Types.FLOAT, 2),
        ("d_float", Types.FLOAT, 2),
    )
    for declaration in declarations:
        table.add_symbol(*declaration)

    self.transformer = CPLTransformer(table)
class SymbolTableGenerator(object):
    """
    Build a typed symbol table by matching ELF symbols against DWARF info

    Every symbol from the parsed ELF symbol table is looked up in the parsed
    DWARF info; symbols that can be matched get their base type resolved and
    are stored in a new SymbolTable. Struct and union symbols are expanded
    into one entry per (nested) member.
    """

    def __init__(self, elf_file):
        """
        parse symbol table and dwarf info out of elf_file
        elf_file: passed unchanged to ElfParser
        """
        self._elf_parser = ElfParser(elf_file)
        self._symbol_table = self._elf_parser.parse_symbol_table()
        self._dwarf_info = self._elf_parser.parse_dwarf_info()
        # filled in (and cached) by the first successful generate_symbol_table()
        self.valid_symbol_table = None

    # Public methods

    def generate_symbol_table(self):
        """
        build valid symbol table data structure
        compare symbol name and address from symbol table with dwarf info attributes
        valid symbol if address match and symbol name within dwarf attribute
        get the base type for all valid symbols
        return: SymbolTable of valid symbols, the same object on every later call
        raise: ValueError if symbol table or dwarf info is missing,
               or if a symbol type can not be decoded
        """
        if self.valid_symbol_table is not None:
            return self.valid_symbol_table

        if self._symbol_table is None or self._dwarf_info is None:
            raise ValueError(
                "Either Symbol Table or DWARF Info not populated")

        # Build into a local and publish only on success, so that a failed
        # run never leaves an empty or half-filled table in the cache
        valid_symbol_table = SymbolTable()
        for entry in self._symbol_table:
            for abbrev, die in self._dwarf_info.items():
                if self._die_describes_symbol(die, entry):
                    # time to go down rabbit holes...
                    self._add_typed_symbols(valid_symbol_table, entry, abbrev)
                    # first matching DIE wins
                    break

        self.valid_symbol_table = valid_symbol_table
        return self.valid_symbol_table

    # Private methods

    @staticmethod
    def _die_describes_symbol(die, entry):
        """
        check if a dwarf info entry belongs to a symbol table entry
        die: one value of the dwarf info dictionary
        entry: symbol with name and address attributes
        return: True if both name and address match, else False
        """
        symbol_found = False
        addr_found = False
        for attr in die["attr"]:
            desc = attr["desc"]
            if attr["name"] == "DW_AT_name":
                # There may be multiple symbol names so we do not match exact string
                if isinstance(desc, str) and desc in entry.name:
                    symbol_found = True
            elif attr["name"] == "DW_AT_location":
                # We do however, match address since its definitive.
                # Only the location attribute holds an address; other integer
                # attributes (type offset, byte size, member location) could
                # equal the address by coincidence and select the wrong DIE
                if isinstance(desc, int) and desc == entry.address:
                    addr_found = True
        return symbol_found and addr_found

    def _add_typed_symbols(self, symbol_table, entry, abbrev):
        """
        resolve the type of entry and add the resulting symbol(s)
        symbol_table: SymbolTable receiving the new symbols
        entry: matched symbol from the ELF symbol table
        abbrev: offset in dwarf info of the DIE matching entry
        """
        base_offset, base_type, base_size = self._get_type(abbrev)
        if base_type == "struct":
            # collect member info from struct
            members = self._get_struct_or_union_members(base_offset)
            for member in self._decode_struct(members):
                if member.data_type in ("void", "ptr"):
                    # do not include these in symbol table
                    continue
                # append struct info to its member
                member_name = "{0}.{1}".format(entry.name, member.name)
                member_address = entry.address + member.address
                data_type = self._decode_data_type(member.data_type,
                                                   member.size)
                symbol_table.add_symbol(
                    Symbol(member_name, hex(member_address),
                           str(member.size), data_type))
        elif base_type in ("void", "ptr"):
            # do not include these in symbol table
            pass
        else:
            # base types
            entry.data_type = base_type
            data_type = self._decode_data_type(base_type, base_size)
            symbol_table.add_symbol(
                Symbol(entry.name, hex(entry.address), str(base_size),
                       data_type))

    @staticmethod
    def _prefix_members(parent, members):
        """
        re-create members as children of parent
        return: list of new Symbol objects named "parent.member" whose
                address is the member address added to the parent address
        """
        return [
            Symbol("{0}.{1}".format(parent.name, member.name),
                   parent.address + member.address, member.size,
                   member.data_type) for member in members
        ]

    def _decode_struct(self, struct):
        """
        decode all members in struct
        recursive call itself to decode structs within structs, if any exists
        struct: a list of Symbol objects or Symbol object
        return: a flat list of all members in struct, empty for any other input
        """
        struct_list = []
        if isinstance(struct, list):
            for struct_member in struct:
                if isinstance(struct_member.data_type, list):
                    # member is itself a struct, flatten its members
                    for nested in struct_member.data_type:
                        struct_list.extend(
                            self._prefix_members(
                                struct_member, self._decode_struct(nested)))
                else:
                    struct_list.append(struct_member)
        elif isinstance(struct, Symbol):
            struct_list.extend(
                self._prefix_members(struct,
                                     self._decode_struct(struct.data_type)))
            if not isinstance(struct.data_type, list):
                # plain member, keep it as is
                struct_list.append(struct)
        return struct_list

    @staticmethod
    def _decode_data_type(data_type, data_size):
        """
        Given base type and size, translate to a generic set of data type enum
        data_type: name of the base type
        data_size: size of the type, only used for integer types
        return: DataType built from the normalized type name
        raise: ValueError if the type or the integer size is not supported
        """
        sign = ""
        if "char" in data_type:
            # all data_type containing char normalized to char
            data_type = "char"
        elif data_type == "ptr":
            # data size may vary
            data_type = "uintptr_t"
        elif data_type == "_Bool":
            data_type = "bool"
        elif "int" in data_type:
            size_to_data_type_map = {
                1: "int8_t",
                2: "int16_t",
                4: "int32_t",
                8: "int64_t"
            }
            if data_size not in size_to_data_type_map:
                # report like every other undecodable type instead of
                # leaking a KeyError
                raise ValueError(
                    "Unable to decode type: {0} with size: {1}".format(
                        data_type, data_size))
            if "unsigned" in data_type:
                sign = "u"
            data_type = size_to_data_type_map[data_size]
        elif data_type in ("float", "double"):
            # IEEE 754: float: 4 bytes
            # IEEE 754: double: 8 bytes
            # data type unchanged
            pass
        else:
            raise ValueError("Unable to decode type: {0}".format(data_type))
        return DataType("{0}{1}".format(sign, data_type))

    def _get_type(self, offset):
        """
        get base symbol information
        follows the type references to find the base type, if needed
        offset: offset number in dwarf info to start search
        return: tuple (offset of base type, type name, byte size)
                type name is "struct" for struct/union, "ptr" for pointer
                and "void" if the base type has no name
        raise: ValueError if no base type offset can be resolved
        """
        symbol_offset = self._get_type_offset(offset)
        if not symbol_offset:
            raise ValueError(
                "Unable to resolve type at offset: {0}".format(offset))

        # we found the root
        symbol_type = self._get_description(symbol_offset, "DW_AT_name")
        symbol_size = self._get_description(symbol_offset, "DW_AT_byte_size")
        if self._is_struct_or_union(symbol_offset):
            # name the type struct for later processing
            symbol_type = "struct"
        elif self._is_pointer(symbol_offset):
            symbol_type = "ptr"
        elif not symbol_type:
            # name the type void for later processing
            symbol_type = "void"
        return (symbol_offset, symbol_type, symbol_size)

    def _get_type_offset(self, offset):
        """
        follow the DW_AT_type references starting at offset
        offset: offset number in dwarf info
        return: offset location to find base type
        """
        if self._is_pointer(offset):
            # return early if pointer type
            return offset
        symbol_offset = self._get_description(offset, "DW_AT_type")
        if symbol_offset:
            return self._get_type_offset(symbol_offset)
        return offset

    def _get_struct_or_union_members(self, offset):
        """
        get structure or union members starting at offset
        offset: offset number in dwarf info
        return: a list of Symbol object which contain struct member information
                the data type of a Symbol is either a string or a list
                it is a list if that member is a struct, the list then
                contains the members of that struct
                the list is empty if no members follow offset
        """
        struct_members = []
        for key, val in dropwhile(lambda x: x[0] != offset,
                                  self._dwarf_info.items()):
            # we start iterating from offset
            if val["offset"] == offset:
                # we skip it, we are interested in the struct members
                base_die_depth = val["depth"]
                continue
            if not (val["tag"] == "DW_TAG_member"
                    and val["depth"] == base_die_depth + 1):
                # first entry that is not a direct member ends the struct
                break

            member_name = self._get_description(key, "DW_AT_name")
            member_location_offset = self._get_description(
                key, "DW_AT_data_member_location")
            if not member_location_offset:
                # for union, since they do not have "DW_AT_data_member_location" type
                member_location_offset = 0
            else:
                member_location_offset = int(member_location_offset)

            base_member_offset, base_member_type, base_member_size = \
                self._get_type(key)
            if base_member_type == "struct":
                # struct inside struct
                # embed struct members in the data type to be decoded
                base_member_type = self._get_struct_or_union_members(
                    base_member_offset)
            struct_members.append(
                Symbol(member_name, member_location_offset,
                       base_member_size, base_member_type))
        # also reached when the members are the last entries in dwarf info
        return struct_members

    def _is_struct_or_union(self, offset):
        """
        check if struct or union tag exists at offset
        offset: offset number in dwarf info
        return: True if struct or union tag, else False
        """
        dwarf_tag = self._get_dwarf_tag(offset)
        return dwarf_tag in ("DW_TAG_structure_type", "DW_TAG_union_type")

    def _is_pointer(self, offset):
        """
        check if pointer tag exists at offset
        offset: offset number in dwarf info
        return: True if pointer tag, else False
        """
        return self._get_dwarf_tag(offset) == "DW_TAG_pointer_type"

    def _get_dwarf_tag(self, offset):
        """
        get abbrev tag at offset
        offset: offset number in dwarf info
        return: tag name at offset, None if dwarf info is not populated
        raise: ValueError if there is no tag at offset
        """
        if self._dwarf_info is None:
            return None
        tag = ""
        if offset in self._dwarf_info:
            tag = self._dwarf_info[offset]["tag"]
        if not tag:
            raise ValueError("Abbreviation tag not found")
        return tag

    def _get_description(self, offset, attr_type):
        """
        get description for attribute type at offset
        offset: offset number in dwarf info
        attr_type: attribute type of interest
        return: description of the first attribute of that type at offset,
                None if it does not exist
        """
        if self._dwarf_info is None:
            return None
        if offset in self._dwarf_info:
            for attr in self._dwarf_info[offset]["attr"]:
                if attr["name"] == attr_type:
                    return attr["desc"]
        return None
class ElfParser(object):
    """
    Extract the symbol table and the DWARF debug info from an ELF file

    Both parse methods cache their result on the instance and return the
    cached object on later calls.
    """

    # Patterns used to pick the value out of an attribute description.
    # Attributes without a pattern are not of interest and are dropped.
    # Compiled once here instead of on every attribute
    _ATTRIBUTE_PATTERNS = {
        "DW_AT_name": re.compile(r"^([\w ]+\t)|: ([\w ]+\t)$"),
        "DW_AT_type": re.compile(r"^<(0x[\da-fA-F]+)>\t$"),
        "DW_AT_location": re.compile(r".*DW_OP_addr: ([\w]+)"),
        "DW_AT_data_member_location": re.compile(r"^([\d]+\t)$"),
        "DW_AT_byte_size": re.compile(r"^([\d]+\t)$"),
    }

    def __init__(self, elf_file):
        """
        elf_file: passed unchanged to ELFFile
        """
        self._elf = ELFFile(elf_file)
        # caches, filled by parse_symbol_table() / parse_dwarf_info()
        self.symbol_table = None
        self.dwarf_info = None

    # Public methods

    def parse_symbol_table(self):
        """
        build symbol table data structure
        only symbols of type OBJECT with a size greater than zero are kept
        :return: SymbolTable with one Symbol (name, value, size) per kept symbol
        """
        if self.symbol_table is None:
            symbol_table = SymbolTable()
            symbol_tables = [
                section for section in self._elf.iter_sections()
                if isinstance(section, SymbolTableSection)
            ]
            for section in symbol_tables:
                for symbol in section.iter_symbols():
                    if (int(symbol["st_size"]) > 0
                            and describe_symbol_type(
                                symbol["st_info"]["type"]) == "OBJECT"):
                        symbol_table.add_symbol(
                            Symbol(symbol.name, symbol["st_value"],
                                   symbol["st_size"]))
            # publish only once the table is complete
            self.symbol_table = symbol_table
        return self.symbol_table

    def parse_dwarf_info(self):
        """
        build dwarf info data structure
        :return: OrderedDict mapping DIE offset to an OrderedDict with the keys
                 "depth", "offset", "code", "tag" and "attr"; "attr" is a list
                 of OrderedDict with the keys "offset", "name" and "desc".
                 DIEs without any attribute of interest are left out
        :raise: ValueError if the ELF file has no debug information
        """
        if self.dwarf_info is not None:
            return self.dwarf_info

        logging.debug('Parsing DWARF Info...')
        dwarf_info = self._elf.get_dwarf_info()
        if not dwarf_info.has_debug_info:
            # raised before anything is cached, so a later call raises again
            # instead of returning an empty dictionary
            raise ValueError("Debug information not available in ELF file. "
                             "Symbol table will be empty")

        parsed_info = OrderedDict()
        for cu in dwarf_info.iter_CUs():
            die_depth = 0
            for die in cu.iter_DIEs():
                if die.is_null():
                    # a null entry closes the current list of children
                    die_depth -= 1
                    continue

                # abbreviation property of interest
                abbreviation = OrderedDict()
                abbreviation["depth"] = die_depth
                abbreviation["offset"] = die.offset
                abbreviation["code"] = die.abbrev_code
                abbreviation["tag"] = die.tag
                abbreviation["attr"] = []
                logging.debug(" <%s><%s>: Abbrev Number: %s (%s)", die_depth,
                              hex(die.offset), die.abbrev_code, die.tag)

                for attr in itervalues(die.attributes):
                    description = self._get_attribute_description(attr, die)
                    if description is None:
                        continue
                    attr_dict = OrderedDict()
                    attr_dict["offset"] = attr.offset
                    attr_dict["name"] = attr.name
                    attr_dict["desc"] = description
                    abbreviation["attr"].append(attr_dict)
                    log_description = hex(description) if isinstance(
                        description, int) else description
                    logging.debug(" <%s> %s: %s", hex(attr.offset),
                                  attr.name, log_description)

                if abbreviation["attr"]:
                    parsed_info[die.offset] = abbreviation
                if die.has_children:
                    die_depth += 1

        self.dwarf_info = parsed_info
        return self.dwarf_info

    # Private methods

    def _get_attribute_description(self, attr, die):
        """
        Use regex to parse attribute description (value)
        :return: parsed value, None if the attribute is not of interest
        """
        return self._parse_attribute_description(
            attr.name, describe_attr_value(attr, die, 0))

    @classmethod
    def _parse_attribute_description(cls, attr_name, description):
        """
        Extract the value from the description text of an attribute
        :param attr_name: attribute name, e.g. "DW_AT_type"
        :param description: description text of the attribute
        :return: int for DW_AT_type and DW_AT_location (parsed as hex) and for
                 DW_AT_data_member_location and DW_AT_byte_size (parsed as
                 decimal), str for DW_AT_name, the right-stripped description
                 if it does not match the expected pattern and None for every
                 other attribute
        """
        pattern = cls._ATTRIBUTE_PATTERNS.get(attr_name)
        if pattern is None:
            return None

        match = pattern.search(description)
        if not match:
            return description.rstrip()

        groups = match.groups()
        if attr_name in ("DW_AT_type", "DW_AT_location"):
            return int(groups[0].rstrip(), 16)
        if attr_name in ("DW_AT_data_member_location", "DW_AT_byte_size"):
            return int(groups[0].rstrip())
        # DW_AT_name: the pattern has two alternatives, use the one that matched
        return next(group for group in groups if group is not None).rstrip()
# Second run over the assembly program:
# - every @symbol not yet known is added to the symbol table
#   (addresses start at 16; if an address is taken it is incremented
#   by 1 until a free spot in RAM is found)
# - the A and C-command mnemonics and binaries are generated
while not assembly_program.is_parsed():
    current_command = assembly_program.advance()

    if assembly_program.get_command_type(current_command) != 'A_COMMAND':
        # It's a C_COMMAND: build a Translator object, which contains
        # the command mnemonics converted to bits
        dest_mnemonics = assembly_program.get_dest_mnemonics(current_command)
        comp_mnemonics = assembly_program.get_comp_mnemonics(current_command)
        jump_mnemonics = assembly_program.get_jump_mnemonics(current_command)
        mnemonics_to_bin = Translator(dest_mnemonics, comp_mnemonics,
                                      jump_mnemonics)
        continue

    # A_COMMAND: everything after the leading character is either a
    # decimal number or a @symbol (variable)
    symbol = current_command[1:]
    if assembly_program.is_decimal(symbol):
        translated_program.append(convert_to_bin(symbol))
        continue

    # register the variable on first use, then emit its address
    if not symbol_table.contains_symbol(symbol):
        symbol_table.add_symbol(symbol)
    translated_program.append(
        convert_to_bin(symbol_table.get_address(symbol)))