def loadAndMatchAnchors(self, anchors_config, manual_anchors_config):
    """Load the list of anchor functions, and try to match them with the binary.

    Args:
        anchors_config (list): list of anchor src indices
        manual_anchors_config (list): list of user defined matches (Manual Anchors): (src index, bin_ea)

    Side effects:
        Populates self._matched_anchors_ea, self._bin_anchor_list, sorts
        self._src_anchor_list by binary order, and rebuilds self._src_file_names.

    Raises:
        KartaException: if no anchor could be matched, or if the matched
            anchors imply that source files are interleaved in the binary.
    """
    # Parse the anchors file
    self.logger.info("Loading the list of Anchor functions")
    self._src_anchor_list = anchors_config

    # Locate the anchor functions
    self.logger.info("Searching for the Anchor functions in the binary")
    self.logger.addIndent()
    all_bin_functions = self.disas.functions()
    # range narrowing variables:
    # [lower_match_ea, upper_match_ea] is the span of confirmed matches,
    # [lower_border_ea, upper_border_ea] is the widest region where future
    # matches may still legally appear (derived from the locked gap below)
    lower_match_ea = None
    upper_match_ea = None
    lower_match_index = None
    upper_match_index = None
    lower_border_ea = 0
    upper_border_ea = 2**64 - 1
    lower_border_index = None
    upper_border_index = None
    function_range = None
    overall_num_functions = len(self._src_functions_list)
    multiple_option_candidates = []
    anchor_eas = []
    first_const_anchor = True
    efficient_const_search = False
    # pre-scan (for optimization reasons): collect every clue up front so the
    # heavy binary traversals below can be done once for all anchors
    anchor_stats = []
    num_const_clues = 0
    all_const_clues = set()
    all_string_clues = set()
    seen_strings, seen_consts, function_list = getContextsStats()
    # iterate a copy, because failed candidates are removed from the list
    for src_anchor_index in list(self._src_anchor_list):
        src_func_ctx = self.src_functions_ctx[src_anchor_index]
        is_str, threshold, anchor_clues = anchor.isAnchor(src_func_ctx, seen_strings, seen_consts, function_list, self.logger)
        # sanity check
        if anchor_clues is None:
            self._src_anchor_list.remove(src_anchor_index)
            self.logger.warning("Anchor candidate %s (%d) failed as an anchor function", src_func_ctx.name, src_anchor_index)
            continue
        anchor_stats.append((src_anchor_index, src_func_ctx, is_str, threshold, anchor_clues))
        if is_str:
            all_string_clues = all_string_clues.union(anchor_clues)
        else:
            num_const_clues += len(anchor_clues)
            all_const_clues = all_const_clues.union(anchor_clues)

    # Traverse all of the strings only once, it is heavy
    anchor_bin_strs = defaultdict(list)
    # Scanning the entire string list and checking against each anchor string - O(kN) - efficient in memory
    if len(all_string_clues) > 0:
        for bin_str_ctx in self.disas.strings():
            bin_str = str(bin_str_ctx)
            if bin_str in all_string_clues:
                anchor_bin_strs[bin_str].append(bin_str_ctx)

    # full scan (maybe only string scan)
    for src_anchor_index, src_func_ctx, is_str, threshold, anchor_clues in anchor_stats:
        candidates = None
        candidate_sets = []
        # scan the full clue list
        for clue_idx, clue in enumerate(anchor_clues):
            # strings
            if is_str:
                current_set = set()
                # found the string clue in the binary
                if clue in anchor_bin_strs:
                    for bin_str in anchor_bin_strs[clue]:
                        for ref in self.disas.drefsTo(bin_str.ea):
                            caller_func = self.disas.funcAt(ref)
                            if caller_func is None:
                                continue
                            callar_func_start = self.disas.funcStart(caller_func)
                            # only accept referencing functions inside the current legal search borders
                            if lower_border_ea <= callar_func_start and callar_func_start <= upper_border_ea:
                                current_set.add(callar_func_start)
            # consts
            else:
                # measure some times (for the first one only)
                if first_const_anchor:
                    start_time = time.time()
                # scanning the entire firmware per anchor const - O(kN)
                current_set = set()
                # search for it in the binary (non efficient)
                if lower_match_index is None or not efficient_const_search:
                    # the very first const search scans everything, so we can time it
                    search_start = lower_border_ea if not first_const_anchor else 0
                    search_end = upper_border_ea if not first_const_anchor else (2**64 - 1)
                    # start our search
                    for match_ea in self.disas.findImmediate(search_start, search_end, clue):
                        # Filter out matches that are not inside functions
                        caller_func = self.disas.funcAt(match_ea)
                        if caller_func is not None:
                            current_set.add(self.disas.funcStart(caller_func))
                    # measure the end time too; decide whether a per-function
                    # const pre-scan (efficient mode) would beat repeated raw scans
                    if first_const_anchor:
                        end_time = time.time()
                        overall_search_time = (end_time - start_time) * num_const_clues
                        if lower_match_index is None:
                            efficient_const_search = anchor.MAXIMAL_CONST_SEARCH_TIME <= overall_search_time
                        else:
                            efficient_const_search = anchor.MAXIMAL_CONST_SEARCH_RATE <= overall_search_time * 1.0 / (upper_match_index - lower_match_index + 1)
                        # no longer the first const
                        first_const_anchor = False
                # efficient search
                else:
                    if function_range is None:
                        self.logger.info("Anchor search - switching to efficient const search mode")
                        # build the fast mapping, and then continue as before
                        function_range = []
                        for function_ea in all_bin_functions[lower_border_index:upper_border_index]:
                            function_range.append((function_ea, self.disas.locateAnchorConsts(function_ea, all_const_clues)))
                    # Now actually search for the wanted const value in the result sets
                    for function_ea, const_set in function_range:
                        if clue in const_set:
                            current_set.add(function_ea)
            # Same merging logic, for strings and consts
            # simply add this option (only if relevant)
            if len(current_set) > 0:
                candidate_sets.append(current_set)
            # check if reached the limit
            if len(candidate_sets) >= threshold:
                # start checking for a match: count how many clue sets voted for each candidate
                candidate_attempt = defaultdict(int)
                for candidate_set in candidate_sets:
                    for candidate in candidate_set:
                        candidate_attempt[candidate] += 1
                candidates = list(filter(lambda x: candidate_attempt[x] >= threshold, candidate_attempt.keys()))
                # candidates that could still reach the threshold with the remaining, unscanned clues
                future_candidates = list(filter(lambda x: candidate_attempt[x] >= threshold - (len(anchor_clues) - (clue_idx + 1)), candidate_attempt.keys()))
                # stop condition: a unique winner that no future clue can challenge
                if len(candidates) == 1 and len(future_candidates) == 0:
                    break
        # check if needs to decide between multiple options
        if candidates is not None and len(candidates) > 1:
            sorted_candidates = list(candidate_attempt.keys())
            sorted_candidates.sort(key=lambda x: candidate_attempt[x], reverse=True)
            # if we have an absolute winner, then pick it (safe to access both cells because len() > 1)
            if candidate_attempt[sorted_candidates[0]] > candidate_attempt[sorted_candidates[1]]:
                candidates = [sorted_candidates[0]]
        # check if we have any candidate left
        if candidates is None or len(candidates) == 0:
            self.logger.warning("Anchor function - %s: Failed to find a match", self._src_functions_list[src_anchor_index])
            self._src_anchor_list.remove(src_anchor_index)
        elif len(candidates) == 1:
            caller_func = self.disas.funcAt(candidates.pop())
            caller_func_start = self.disas.funcStart(caller_func)
            self.logger.info("Anchor function - %s: Matched at 0x%x (%s)", self._src_functions_list[src_anchor_index], caller_func_start, self.disas.funcName(caller_func))
            self._matched_anchors_ea[src_anchor_index] = caller_func_start
            anchor_eas.append(caller_func_start)
            self.declareMatch(src_anchor_index, caller_func_start, REASON_ANCHOR)
            # use the match to improve our search range
            # first anchor
            if len(self._matched_anchors_ea.keys()) == 1:
                lower_match_ea = caller_func_start
                upper_match_ea = lower_match_ea
                lower_match_index = all_bin_functions.index(caller_func_start)
                upper_match_index = lower_match_index
                change = True
            else:
                # try to improve the lower border
                if caller_func_start < lower_match_ea:
                    lower_match_ea = caller_func_start
                    new_lower_index = all_bin_functions.index(caller_func_start)
                    if function_range is not None:
                        # keep the cached const mapping aligned with the new range
                        function_range = function_range[new_lower_index - lower_match_index:]
                    lower_match_index = new_lower_index
                    change = True
                # try to improve the upper border
                elif upper_match_ea < caller_func_start:
                    upper_match_ea = caller_func_start
                    new_upper_index = all_bin_functions.index(caller_func_start)
                    if function_range is not None:
                        # keep the cached const mapping aligned with the new range
                        function_range = function_range[:new_upper_index - upper_match_index]
                    upper_match_index = new_upper_index
                    change = True
                else:
                    change = False
            # adjust the borders accordingly: all remaining src functions must
            # fit around the locked gap of already-matched binary functions
            if change:
                locked_gap = upper_match_index - lower_match_index + 1
                lower_border_index = lower_match_index - (overall_num_functions - locked_gap)
                upper_border_index = upper_match_index + (overall_num_functions - locked_gap)
                lower_border_ea = all_bin_functions[max(lower_match_index - (overall_num_functions - locked_gap), 0)]
                upper_border_ea = all_bin_functions[min(upper_match_index + (overall_num_functions - locked_gap), len(all_bin_functions) - 1)]
        else:
            self.logger.warning("Anchor function - %s: Found several matches (%d), will check it again later", self._src_functions_list[src_anchor_index], len(candidates))
            multiple_option_candidates.append((src_anchor_index, candidates))
    self.logger.removeIndent()

    # good time to match the user declared functions
    for src_index, bin_ea in manual_anchors_config:
        # check for user errors
        func_ctx = self.disas.funcAt(bin_ea)
        if func_ctx is None or self.disas.funcStart(func_ctx) != bin_ea:
            self.logger.warning("User defined anchor function %s should be matched to a *start* of a function, not to 0x%x (%s)", self._src_functions_list[src_index], bin_ea, self.disas.funcNameEA(bin_ea))
            continue
        # check for duplicates
        if src_index in self._matched_anchors_ea:
            # contradiction
            if bin_ea != self._matched_anchors_ea[src_index]:
                actual_ea = self._matched_anchors_ea[src_index]
                self.logger.warning("User defined anchor function %s contradicts match at 0x%x (%s), ignoring user definition", self._src_functions_list[src_index], actual_ea, self.disas.funcNameEA(actual_ea))
                continue
            # duplicate
            else:
                continue
        # duplicate at this point could only be a contradiction
        if bin_ea in anchor_eas and src_index not in self._matched_anchors_ea:
            self.logger.warning("User defined anchor function %s contradicts match at 0x%x (%s), ignoring user definition", self._src_functions_list[src_index], bin_ea, self.disas.funcNameEA(bin_ea))
            continue
        # can now safely declare this match
        self.logger.info("User defined anchor function - %s: Matched at 0x%x (%s)", self._src_functions_list[src_index], bin_ea, self.disas.funcNameEA(bin_ea))
        self._matched_anchors_ea[src_index] = bin_ea
        anchor_eas.append(bin_ea)
        self._src_anchor_list.append(src_index)
        self.declareMatch(src_index, bin_ea, REASON_MANUAL_ANCHOR)
        # use the match to improve our search range
        # first anchor
        if len(self._matched_anchors_ea.keys()) == 1:
            lower_match_ea = bin_ea
            upper_match_ea = lower_match_ea
            lower_match_index = all_bin_functions.index(bin_ea)
            upper_match_index = lower_match_index
            change = True
        else:
            # try to improve the lower border
            if bin_ea < lower_match_ea:
                lower_match_ea = bin_ea
                new_lower_index = all_bin_functions.index(bin_ea)
                if function_range is not None:
                    # keep the cached const mapping aligned with the new range
                    function_range = function_range[new_lower_index - lower_match_index:]
                lower_match_index = new_lower_index
                change = True
            # try to improve the upper border
            elif upper_match_ea < bin_ea:
                upper_match_ea = bin_ea
                new_upper_index = all_bin_functions.index(bin_ea)
                if function_range is not None:
                    # keep the cached const mapping aligned with the new range
                    function_range = function_range[:new_upper_index - upper_match_index]
                upper_match_index = new_upper_index
                change = True
            else:
                change = False
        # adjust the borders accordingly
        if change:
            locked_gap = upper_match_index - lower_match_index + 1
            lower_border_index = lower_match_index - (overall_num_functions - locked_gap)
            upper_border_index = upper_match_index + (overall_num_functions - locked_gap)
            lower_border_ea = all_bin_functions[max(lower_match_index - (overall_num_functions - locked_gap), 0)]
            upper_border_ea = all_bin_functions[min(upper_match_index + (overall_num_functions - locked_gap), len(all_bin_functions) - 1)]

    # double check the candidates which had multiple options (if narrowed the search space)
    if lower_match_ea is not None:
        for src_anchor_index, candidates in multiple_option_candidates:
            # check if the manual definitions already defined this one
            if src_anchor_index in self._matched_anchors_ea:
                continue
            filterred_candidates = list(filter(lambda x: lower_match_ea <= x and x <= upper_match_ea, candidates))
            # matched
            if len(filterred_candidates) == 1:
                bin_ea = filterred_candidates.pop()
                # a user-defined anchor already claimed this address - drop ours
                if bin_ea in anchor_eas:
                    self.logger.warning("User defined anchor function at 0x%x (%s), blocked revived anchor: %s, dropped the anchor", bin_ea, self.disas.funcNameEA(bin_ea), self._src_functions_list[src_anchor_index])
                    self._src_anchor_list.remove(src_anchor_index)
                    continue
                caller_func = self.disas.funcAt(bin_ea)
                caller_func_start = self.disas.funcStart(caller_func)
                self.logger.info("Anchor function (revived) - %s: Matched at 0x%x (%s)", self._src_functions_list[src_anchor_index], caller_func_start, self.disas.funcName(caller_func))
                self._matched_anchors_ea[src_anchor_index] = caller_func_start
                anchor_eas.append(caller_func_start)
                self.declareMatch(src_anchor_index, caller_func_start, REASON_ANCHOR)
            # still not found
            else:
                self._src_anchor_list.remove(src_anchor_index)

    # make sure we found at least one anchor function
    if len(self._src_anchor_list) == 0:
        self.logger.error("Failed to match even a single Anchor function")
        raise KartaException

    # Create a binary anchor list for future use
    self._bin_anchor_list = []
    for src_anchor_index in self._src_anchor_list:
        self._bin_anchor_list.append(all_bin_functions.index(self.function_matches[src_anchor_index]))

    # Sort the file list according to the (bin) order of the anchors
    old_anchor_list = list(self._src_anchor_list)
    self._src_anchor_list.sort(key=lambda x: self._bin_anchor_list[old_anchor_list.index(x)])

    # Sanity Check: make sure that the files are not mixed up
    # (walking the anchors in binary order, a file may never re-appear after
    # we already moved past it to a different file)
    anchor_files = []
    started = True
    for src_anchor_index in self._src_anchor_list:
        if not started and self.src_functions_ctx[src_anchor_index].file != anchor_files[-1]:
            if self.src_functions_ctx[src_anchor_index].file in anchor_files:
                self.logger.error("Sanity check failed: the matched anchor functions are tangled between files...")
                raise KartaException
        if self.src_functions_ctx[src_anchor_index].file not in anchor_files:
            anchor_files.append(self.src_functions_ctx[src_anchor_index].file)
        started = False

    # remove empty files (weird edge case)
    self._src_file_names = list(filter(lambda x: len(self._src_file_mappings[x]) != 0, self._src_file_mappings.keys()))
    removed_names = list(filter(lambda x: len(self._src_file_mappings[x]) == 0, self._src_file_mappings.keys()))
    for name in removed_names:
        self._src_file_mappings.pop(name)

    # Now sort the src file names list according to the sorted anchors
    self._src_file_names = anchor_files + list(set(self._src_file_names).difference(anchor_files))
def analyzeLibrary(config_name, bin_dirs, compiled_ars, prompter):
    """Analyze the open source library, file-by-file and merge the results.

    Args:
        config_name (str): name of the final JSON config file
        bin_dirs (list): list of paths to the binary folders containing the compiled *.o files
        compiled_ars (list): list of paths to the compiled *.ar files
        prompter (prompter): prompter instance

    Note:
        Exits the process (exit codes 1/2) on fatal errors, and returns early
        when IDA path limitations are detected.
    """
    prompter.info("Starting to analyze the library")
    prompter.addIndent()
    ignore_archive = len(compiled_ars) == 0
    finished_scan = False
    # workaround the enumerate in the next loop
    if ignore_archive:
        compiled_ars = range(len(bin_dirs))

    # ida has severe bugs, make sure to warn the user in advance
    if disas_cmd.name() == "IDA" and ' ' in SCRIPT_PATH:
        prompter.error("IDA does not support spaces (' ') in the script's path. Please move %s's directory accordingly (I feel your pain)", (LIBRARY_NAME))
        prompter.removeIndent()
        return

    # We could have 2 iteration rounds here
    # (round 2 happens when the archive turned out to be empty and the user
    #  agreed to scan the raw bin directory instead)
    while not finished_scan:
        # Prepare & load the stats from each file
        for index, compiled_ar in enumerate(compiled_ars):
            # check if this is a windows archive
            is_windows = isWindows()
            bin_dir = bin_dirs[index]
            bin_suffix = "o" if not is_windows else "obj"
            if not ignore_archive:
                prompter.info("Analyzing each of the files in the archive - %s", compiled_ar)
            else:
                prompter.info("Analyzing each of the *.%s files in the bin directory" % (bin_suffix))
            prompter.addIndent()
            archive_files = list(locateFiles(bin_dir, filter(lambda x: x.endswith("." + bin_suffix), getArchiveFiles(compiled_ar)) if not ignore_archive else None, bin_suffix))
            # check if we need a progress bar
            if len(archive_files) >= PROGRESS_BAR_THRESHOLD and prompter._min_level > logging.DEBUG:
                progress_bar = ProgressBar('Analyzed %d/%d files - %d%% Completed', len(archive_files), 20, True, time_format="Elapsed %M:%S -")
                progress_bar.start()
            else:
                progress_bar = None
            # start the work itself
            for full_file_path, compiled_file in archive_files:
                # ida has severe bugs, make sure to warn the user in advance
                if disas_cmd.name() == "IDA" and ' ' in full_file_path:
                    prompter.error("IDA does not support spaces (' ') in the file's path (in script mode). Please move the binary directory accordingly (I feel your pain)")
                    prompter.removeIndent()
                    return
                prompter.debug("%s - %s", full_file_path, compiled_file)
                if progress_bar is None:
                    prompter.info("%s - %s", compiled_file, full_file_path)
                # analyze the file
                analyzeFile(full_file_path, is_windows)
                # load the JSON data from it
                try:
                    fd = open(full_file_path + STATE_FILE_SUFFIX, 'r')
                except IOError:
                    prompter.error("Failed to create the .JSON file for file: %s" % (compiled_file))
                    prompter.error("Read the log file for more information: %s" % (constructLogPath(full_file_path)))
                    prompter.removeIndent()
                    prompter.removeIndent()
                    prompter.error("Encountered an error, exiting")
                    exit(1)
                # all was OK, can continue ("with" ensures the fd is closed even on a parse error)
                with fd:
                    parseFileStats(full_file_path, json.load(fd, object_pairs_hook=collections.OrderedDict))
                if progress_bar is not None:
                    progress_bar.advance(1)
            # wrap it up
            if progress_bar is not None:
                progress_bar.finish()
            prompter.removeIndent()

        # Resolve several unknowns refs as code refs
        prompter.info("Resolving cross-references between different files")
        resolveUnknowns()

        # check if we have any files in the list
        if len(src_file_mappings) == 0 and not ignore_archive:
            prompter.error("No files found in the archive :(")
            prompter.removeIndent()
            new_path = prompter.input("Do you want to analyze all of the *.%s files in the bin directory? <Y/N>: " % (bin_suffix)).lower()
            if new_path != 'y':
                prompter.error("Finished with errors!")
                exit(2)
            # run again, and ignore the archive this time
            ignore_archive = True
            prompter.addIndent()
        else:
            finished_scan = True

    # Remove empty files
    prompter.info("Filtering out empty files")
    # materialize the filter into a list - popping from the dict while lazily
    # iterating over it raises a RuntimeError on Python 3
    for file_name in list(filter(lambda x: len(src_file_mappings[x]) == 0, src_file_mappings)):
        src_file_mappings.pop(file_name)

    # Create the list of anchors
    str_anchors = []
    const_anchors = []
    anchors_files = set()
    prompter.info("Identifying possible Anchor functions")
    prompter.addIndent()
    seen_strings, seen_consts, function_list = getContextsStats()
    for src_func_index, src_func_ctx in enumerate(src_functions_ctx):
        is_str, threshold, candidates = anchor.isAnchor(src_func_ctx, seen_strings, seen_consts, function_list, prompter)
        # not an anchor candidate
        if candidates is None:
            continue
        if is_str:
            str_anchors.append(src_func_index)
        else:
            const_anchors.append(src_func_index)
        anchors_files.add(src_func_ctx.file)
    prompter.removeIndent()

    # strings before const, because they are faster to search for
    anchors_list = str_anchors + const_anchors

    # check if we have any files left
    if len(src_file_mappings) == 0:
        prompter.error("All files were empty :(")
        prompter.removeIndent()
        prompter.error("Finished with errors!")
        exit(2)

    # Check for an error
    if len(anchors_list) == 0:
        prompter.warning("Failed to find Anchor functions in the library :(")
        prompter.warning("You should define manual anchors instead")

    # Create the anchors file
    prompter.info("Generating the full JSON file: %s", config_name)
    prompter.addIndent()
    full_json = {}

    # Serialize the anchor list
    prompter.info("Writing the anchor list")
    full_json[JSON_TAG_ANCHORS] = anchors_list

    # Serialize the functions of each files
    prompter.info("Writing the function list for each of the files")
    file_dict = collections.OrderedDict()
    # find a common file prefix, and remove it from the file path
    if len(src_file_mappings) > 1:
        # list() the keys view - it is not subscriptable on Python 3
        file_names = list(src_file_mappings.keys())
        base_value = file_names[0].split(os.path.sep)
        comparison_value = file_names[-1].split(os.path.sep)
        for index in range(min(len(comparison_value), len(base_value))):
            if base_value[index] != comparison_value[index]:
                break
        common_path_len = len(os.path.sep.join(base_value[:index])) + 1
    else:
        common_path_len = len(bin_dirs[0]) + 1

    for src_file_name in src_file_mappings:
        # a list comprehension (not a lazy map object) so json.dump can serialize it
        file_dict[src_file_name[common_path_len:]] = [c.serialize() for c in src_file_mappings[src_file_name]]
    full_json[JSON_TAG_FILES] = file_dict

    # actually dump it
    with open(config_name, "w") as fd:
        json.dump(full_json, fd)
    prompter.removeIndent()

    prompter.info("Anchor to file ratio is: %d/%d", len(anchors_files), len(src_file_mappings))
    prompter.info("Anchor to function ratio is: %d/%d", len(anchors_list), len(src_functions_list))
    prompter.removeIndent()