Example #1
0
    def loadAndMatchAnchors(self, anchors_config, manual_anchors_config):
        """Load the list of anchor functions, and try to match them with the binary.

        The work is done in several stages:
            1. Pre-scan: ask anchor.isAnchor() for the clues (strings / consts) of each candidate
            2. Full scan: search the clues in the binary, and declare a match for every anchor
               with a single winning candidate. Each match narrows the search window.
            3. Manual anchors: validate and declare the user defined matches
            4. Revival: re-check the anchors that had several candidates
            5. Wrap up: build the binary anchor list, sort the anchors, and order the source files

        Note:
            anchors_config is stored by reference (as self._src_anchor_list), and it is
            pruned / extended / sorted in place.

        Args:
            anchors_config (list): list of anchor src indices
            manual_anchors_config (list): list of user defined matches (Manual Anchors): (src index, bin_ea)

        Raises:
            KartaException: no anchor function was matched, or the matched anchors are tangled between files
        """
        # Parse the anchors file
        self.logger.info("Loading the list of Anchor functions")
        self._src_anchor_list = anchors_config

        # Locate the anchor functions
        self.logger.info("Searching for the Anchor functions in the binary")
        self.logger.addIndent()
        # NOTE(review): used as an address-ordered list of function start addresses - confirm
        all_bin_functions = self.disas.functions()
        # range narrowing variables
        lower_match_ea = None
        upper_match_ea = None
        lower_match_index = None
        upper_match_index = None
        lower_border_ea = 0
        upper_border_ea = 2**64 - 1
        lower_border_index = None
        upper_border_index = None
        # cache for the efficient const search: list of (function ea, set of found consts)
        function_range = None
        # index (in all_bin_functions) of the first function in function_range
        function_range_start = 0
        overall_num_functions = len(self._src_functions_list)
        multiple_option_candidates = []
        anchor_eas = []
        first_const_anchor = True
        efficient_const_search = False
        # pre-scan (for optimization reasons)
        anchor_stats = []
        num_const_clues = 0
        all_const_clues = set()
        all_string_clues = set()
        seen_strings, seen_consts, function_list = getContextsStats()
        # iterate over a copy, because failed candidates are removed from the list
        for src_anchor_index in list(self._src_anchor_list):
            src_func_ctx = self.src_functions_ctx[src_anchor_index]
            is_str, threshold, anchor_clues = anchor.isAnchor(
                src_func_ctx, seen_strings, seen_consts, function_list,
                self.logger)
            # sanity check
            if anchor_clues is None:
                self._src_anchor_list.remove(src_anchor_index)
                self.logger.warning(
                    "Anchor candidate %s (%d) failed as an anchor function",
                    src_func_ctx.name, src_anchor_index)
                continue
            anchor_stats.append((src_anchor_index, src_func_ctx, is_str,
                                 threshold, anchor_clues))
            if is_str:
                all_string_clues.update(anchor_clues)
            else:
                num_const_clues += len(anchor_clues)
                all_const_clues.update(anchor_clues)

        # Traverse all of the strings only once, it is heavy
        anchor_bin_strs = defaultdict(list)
        # Scanning the entire string list and checking against each anchor string - O(kN) - efficient in memory
        if len(all_string_clues) > 0:
            for bin_str_ctx in self.disas.strings():
                bin_str = str(bin_str_ctx)
                if bin_str in all_string_clues:
                    anchor_bin_strs[bin_str].append(bin_str_ctx)

        # full scan (maybe only string scan)
        for src_anchor_index, src_func_ctx, is_str, threshold, anchor_clues in anchor_stats:
            candidates = None
            candidate_sets = []
            # scan the full clue list
            for clue_idx, clue in enumerate(anchor_clues):
                # functions (start ea) that contain the current clue
                current_set = set()
                # strings
                if is_str:
                    # found the string clue in the binary
                    if clue in anchor_bin_strs:
                        for bin_str in anchor_bin_strs[clue]:
                            for ref in self.disas.drefsTo(bin_str.ea):
                                caller_func = self.disas.funcAt(ref)
                                if caller_func is None:
                                    continue
                                caller_func_start = self.disas.funcStart(
                                    caller_func)
                                # only functions inside the current search window are relevant
                                if lower_border_ea <= caller_func_start <= upper_border_ea:
                                    current_set.add(caller_func_start)
                # consts - search for it in the binary (non efficient)
                elif lower_match_index is None or not efficient_const_search:
                    # measure some times (for the first one only)
                    if first_const_anchor:
                        start_time = time.time()
                    # scanning the entire firmware per anchor const - O(kN)
                    search_start = lower_border_ea if not first_const_anchor else 0
                    search_end = upper_border_ea if not first_const_anchor else (
                        2**64 - 1)
                    # start our search
                    for match_ea in self.disas.findImmediate(
                            search_start, search_end, clue):
                        # Filter out matches that are not inside functions
                        caller_func = self.disas.funcAt(match_ea)
                        if caller_func is not None:
                            current_set.add(self.disas.funcStart(caller_func))
                    # measure the end time too
                    if first_const_anchor:
                        end_time = time.time()
                        # estimated time for searching all of the const clues this way
                        overall_search_time = (
                            end_time - start_time) * num_const_clues
                        if lower_match_index is None:
                            efficient_const_search = anchor.MAXIMAL_CONST_SEARCH_TIME <= overall_search_time
                        else:
                            efficient_const_search = anchor.MAXIMAL_CONST_SEARCH_RATE <= overall_search_time * 1.0 / (
                                upper_match_index - lower_match_index + 1)
                        # no longer the first const
                        first_const_anchor = False
                # consts - efficient search
                else:
                    if function_range is None:
                        self.logger.info(
                            "Anchor search - switching to efficient const search mode"
                        )
                        # build the fast mapping, and then continue as before.
                        # The lower border index can be negative (window wider than the
                        # start of the binary), and a negative slice start would wrap around.
                        function_range_start = max(lower_border_index, 0)
                        function_range = [
                            (function_ea,
                             self.disas.locateAnchorConsts(
                                 function_ea, all_const_clues))
                            for function_ea in all_bin_functions[
                                function_range_start:max(upper_border_index, 0)]
                        ]
                    # Now actually search for the wanted const value in the result sets
                    for function_ea, const_set in function_range:
                        if clue in const_set:
                            current_set.add(function_ea)

                # Same merging logic, for strings and consts
                # simply add this option (only if relevant)
                if len(current_set) > 0:
                    candidate_sets.append(current_set)
                # check if reached the limit
                if len(candidate_sets) >= threshold:
                    # start checking for a match: count the clue hits per function
                    candidate_attempt = defaultdict(int)
                    for candidate_set in candidate_sets:
                        for candidate in candidate_set:
                            candidate_attempt[candidate] += 1
                    candidates = [
                        candidate_ea
                        for candidate_ea, hits in candidate_attempt.items()
                        if hits >= threshold
                    ]
                    # stop condition: a single candidate, and no other function can
                    # still reach the threshold using the clues that are left.
                    # (a function without any hit needs at least "threshold" more clues)
                    remaining_clues = len(anchor_clues) - (clue_idx + 1)
                    rival_possible = remaining_clues >= threshold or any(
                        hits < threshold <= hits + remaining_clues
                        for hits in candidate_attempt.values())
                    if len(candidates) == 1 and not rival_possible:
                        break

            # check if needs to decide between multiple options
            if candidates is not None and len(candidates) > 1:
                sorted_candidates = sorted(candidate_attempt.keys(),
                                           key=lambda x: candidate_attempt[x],
                                           reverse=True)
                # if we have an absolute winner, then pick it (safe to access both cells because len() > 1)
                if candidate_attempt[sorted_candidates[0]] > candidate_attempt[
                        sorted_candidates[1]]:
                    candidates = [sorted_candidates[0]]

            # check if we have any candidate left
            if candidates is None or len(candidates) == 0:
                self.logger.warning(
                    "Anchor function - %s: Failed to find a match",
                    self._src_functions_list[src_anchor_index])
                self._src_anchor_list.remove(src_anchor_index)
            elif len(candidates) == 1:
                caller_func = self.disas.funcAt(candidates.pop())
                caller_func_start = self.disas.funcStart(caller_func)
                self.logger.info("Anchor function - %s: Matched at 0x%x (%s)",
                                 self._src_functions_list[src_anchor_index],
                                 caller_func_start,
                                 self.disas.funcName(caller_func))
                self._matched_anchors_ea[src_anchor_index] = caller_func_start
                anchor_eas.append(caller_func_start)
                self.declareMatch(src_anchor_index, caller_func_start,
                                  REASON_ANCHOR)
                # use the match to improve our search range
                if lower_match_ea is None:
                    # first anchor
                    lower_match_ea = caller_func_start
                    upper_match_ea = caller_func_start
                    lower_match_index = all_bin_functions.index(
                        caller_func_start)
                    upper_match_index = lower_match_index
                    change = True
                elif caller_func_start < lower_match_ea:
                    # improved the lower border
                    lower_match_ea = caller_func_start
                    lower_match_index = all_bin_functions.index(
                        caller_func_start)
                    change = True
                elif upper_match_ea < caller_func_start:
                    # improved the upper border
                    upper_match_ea = caller_func_start
                    upper_match_index = all_bin_functions.index(
                        caller_func_start)
                    change = True
                else:
                    change = False
                # adjust the borders accordingly
                if change:
                    # the matched anchors "lock" a range of binary functions, and the
                    # rest of the source functions can be on either side of that range
                    locked_gap = upper_match_index - lower_match_index + 1
                    slack = overall_num_functions - locked_gap
                    lower_border_index = lower_match_index - slack
                    upper_border_index = upper_match_index + slack
                    lower_border_ea = all_bin_functions[max(
                        lower_border_index, 0)]
                    upper_border_ea = all_bin_functions[min(
                        upper_border_index,
                        len(all_bin_functions) - 1)]
                    # keep the const cache in sync with the (narrowed) window
                    if function_range is not None:
                        new_start = max(lower_border_index, 0)
                        function_range = function_range[
                            max(new_start - function_range_start, 0):
                            max(upper_border_index - function_range_start, 0)]
                        function_range_start = max(new_start,
                                                   function_range_start)
            else:
                self.logger.warning(
                    "Anchor function - %s: Found several matches (%d), will check it again later",
                    self._src_functions_list[src_anchor_index],
                    len(candidates))
                multiple_option_candidates.append(
                    (src_anchor_index, candidates))
        self.logger.removeIndent()

        # good time to match the user declared functions
        for src_index, bin_ea in manual_anchors_config:
            # check for user errors
            func_ctx = self.disas.funcAt(bin_ea)
            if func_ctx is None or self.disas.funcStart(func_ctx) != bin_ea:
                self.logger.warning(
                    "User defined anchor function %s should be matched to a *start* of a function, not to 0x%x (%s)",
                    self._src_functions_list[src_index], bin_ea,
                    self.disas.funcNameEA(bin_ea))
                continue
            # check for duplicates
            if src_index in self._matched_anchors_ea:
                # contradiction
                if bin_ea != self._matched_anchors_ea[src_index]:
                    actual_ea = self._matched_anchors_ea[src_index]
                    self.logger.warning(
                        "User defined anchor function %s contradicts match at 0x%x (%s), ignoring user definition",
                        self._src_functions_list[src_index], actual_ea,
                        self.disas.funcNameEA(actual_ea))
                # contradiction or duplicate, nothing more to do in both cases
                continue
            # duplicate at this point could only be a contradiction
            if bin_ea in anchor_eas:
                self.logger.warning(
                    "User defined anchor function %s contradicts match at 0x%x (%s), ignoring user definition",
                    self._src_functions_list[src_index], bin_ea,
                    self.disas.funcNameEA(bin_ea))
                continue
            # can now safely declare this match
            self.logger.info(
                "User defined anchor function - %s: Matched at 0x%x (%s)",
                self._src_functions_list[src_index], bin_ea,
                self.disas.funcNameEA(bin_ea))
            self._matched_anchors_ea[src_index] = bin_ea
            anchor_eas.append(bin_ea)
            # an anchor with several candidates is still in the list, don't add it twice
            if src_index not in self._src_anchor_list:
                self._src_anchor_list.append(src_index)
            self.declareMatch(src_index, bin_ea, REASON_MANUAL_ANCHOR)
            # use the match to widen the range of the matched anchors.
            # The clue search is over, so only the matched range itself is used
            # from now on (the search borders and the const cache are not needed).
            if lower_match_ea is None:
                lower_match_ea = bin_ea
                upper_match_ea = bin_ea
            elif bin_ea < lower_match_ea:
                lower_match_ea = bin_ea
            elif upper_match_ea < bin_ea:
                upper_match_ea = bin_ea

        # double check the candidates which had multiple options (if narrowed the search space)
        for src_anchor_index, candidates in multiple_option_candidates:
            # check if the manual definitions already defined this one
            if src_anchor_index in self._matched_anchors_ea:
                continue
            # without any matched anchor there is no range to filter with, and the
            # anchor stays unresolved
            if lower_match_ea is None:
                filtered_candidates = []
            else:
                filtered_candidates = [
                    candidate_ea for candidate_ea in candidates
                    if lower_match_ea <= candidate_ea <= upper_match_ea
                ]
            # still not found - an unmatched anchor must not stay in the anchor list
            if len(filtered_candidates) != 1:
                self._src_anchor_list.remove(src_anchor_index)
                continue
            # matched
            bin_ea = filtered_candidates.pop()
            if bin_ea in anchor_eas:
                self.logger.warning(
                    "User defined anchor function at 0x%x (%s), blocked revived anchor: %s, dropped the anchor",
                    bin_ea, self.disas.funcNameEA(bin_ea),
                    self._src_functions_list[src_anchor_index])
                self._src_anchor_list.remove(src_anchor_index)
                continue
            caller_func = self.disas.funcAt(bin_ea)
            caller_func_start = self.disas.funcStart(caller_func)
            self.logger.info(
                "Anchor function (revived) - %s: Matched at 0x%x (%s)",
                self._src_functions_list[src_anchor_index],
                caller_func_start, self.disas.funcName(caller_func))
            self._matched_anchors_ea[src_anchor_index] = caller_func_start
            anchor_eas.append(caller_func_start)
            self.declareMatch(src_anchor_index, caller_func_start,
                              REASON_ANCHOR)

        # make sure we found at least one anchor function
        if len(self._src_anchor_list) == 0:
            self.logger.error("Failed to match even a single Anchor function")
            raise KartaException

        # Create a binary anchor list for future use
        self._bin_anchor_list = [
            all_bin_functions.index(self.function_matches[src_anchor_index])
            for src_anchor_index in self._src_anchor_list
        ]

        # Sort the anchor list (in place) according to the (bin) order of the anchors
        anchor_bin_index = dict(
            zip(self._src_anchor_list, self._bin_anchor_list))
        self._src_anchor_list.sort(key=lambda x: anchor_bin_index[x])

        # Sanity Check: make sure that the files are not mixed up
        # (going back to an already seen file means that the files are interleaved)
        anchor_files = []
        for src_anchor_index in self._src_anchor_list:
            anchor_file = self.src_functions_ctx[src_anchor_index].file
            if len(anchor_files) > 0 and anchor_file != anchor_files[-1] and anchor_file in anchor_files:
                self.logger.error(
                    "Sanity check failed: the matched anchor functions are tangled between files..."
                )
                raise KartaException
            if anchor_file not in anchor_files:
                anchor_files.append(anchor_file)

        # remove empty files (weird edge case)
        removed_names = [
            name for name in self._src_file_mappings
            if len(self._src_file_mappings[name]) == 0
        ]
        for name in removed_names:
            self._src_file_mappings.pop(name)

        # Now sort the src file names list according to the sorted anchors:
        # the anchor files come first, followed by the rest of the files (in their mapping order)
        seen_anchor_files = set(anchor_files)
        self._src_file_names = anchor_files + [
            name for name in self._src_file_mappings
            if name not in seen_anchor_files
        ]
Example #2
0
def _commonPathLength(file_paths):
    """Count the characters of the directory prefix that is shared by all of the given paths.

    Args:
        file_paths (list): list of (at least one) file paths

    Return Value:
        Length of the shared prefix (including its trailing separator), 0 if nothing is shared
    """
    split_paths = [path.split(os.path.sep) for path in file_paths]
    base_parts = split_paths[0]
    # the last component is the file name itself, never treat it as a part of the prefix
    max_depth = min(len(parts) for parts in split_paths) - 1
    depth = 0
    while depth < max_depth and all(
            parts[depth] == base_parts[depth] for parts in split_paths):
        depth += 1
    # nothing in common, nothing to strip
    if depth == 0:
        return 0
    # +1 for the separator that follows the shared prefix
    return len(os.path.sep.join(base_parts[:depth])) + 1


def analyzeLibrary(config_name, bin_dirs, compiled_ars, prompter):
    """Analyze the open source library, file-by-file and merge the results.

    Each compiled file is analyzed (analyzeFile), and its generated state file is loaded and
    parsed. If no file was found using the archives, the user is asked whether to fall back
    to scanning the bin directories. Afterwards the anchor candidates are identified, and
    the anchors + per-file function lists are written to the JSON config file.

    Note:
        Exits the process with code 1 if the state file of an analyzed file is missing, and
        with code 2 if no files are left to work with.

    Args:
        config_name (str): name of the final JSON config file
        bin_dirs (list): list of paths to the binary folders containing the compiled *.o files
        compiled_ars (list): list of paths to the compiled *.ar files
        prompter (prompter): prompter instance
    """
    prompter.info("Starting to analyze the library")
    prompter.addIndent()
    ignore_archive = len(compiled_ars) == 0
    finished_scan = False

    # workaround the enumerate in the next loop
    if ignore_archive:
        compiled_ars = range(len(bin_dirs))

    # ida has severe bugs, make sure to warn the user in advance
    if disas_cmd.name() == "IDA" and ' ' in SCRIPT_PATH:
        prompter.error(
            "IDA does not support spaces (' ') in the script's path. Please move %s's directory accordingly (I feel your pain)",
            LIBRARY_NAME)
        prompter.removeIndent()
        return

    # We could have 2 iteration rounds here
    while not finished_scan:
        # Prepare & load the stats from each file
        for index, compiled_ar in enumerate(compiled_ars):
            # check if this is a windows archive
            is_windows = isWindows()
            bin_dir = bin_dirs[index]
            bin_suffix = "o" if not is_windows else "obj"
            if not ignore_archive:
                prompter.info(
                    "Analyzing each of the files in the archive - %s",
                    compiled_ar)
                # only the compiled files of the archive are relevant
                archive_members = [
                    member for member in getArchiveFiles(compiled_ar)
                    if member.endswith("." + bin_suffix)
                ]
            else:
                prompter.info(
                    "Analyzing each of the *.%s files in the bin directory" %
                    (bin_suffix))
                archive_members = None
            prompter.addIndent()
            archive_files = list(
                locateFiles(bin_dir, archive_members, bin_suffix))
            # check if we need a progress bar
            if len(archive_files) >= PROGRESS_BAR_THRESHOLD and prompter._min_level > logging.DEBUG:
                progress_bar = ProgressBar(
                    'Analyzed %d/%d files - %d%% Completed',
                    len(archive_files),
                    20,
                    True,
                    time_format="Elapsed %M:%S -")
                progress_bar.start()
            else:
                progress_bar = None
            # start the work itself
            for full_file_path, compiled_file in archive_files:
                # ida has severe bugs, make sure to warn the user in advance
                if disas_cmd.name() == "IDA" and ' ' in full_file_path:
                    prompter.error(
                        "IDA does not support spaces (' ') in the file's path (in script mode). Please move the binary directory accordingly (I feel your pain)"
                    )
                    # two indentation levels are active at this point
                    prompter.removeIndent()
                    prompter.removeIndent()
                    return
                prompter.debug("%s - %s", full_file_path, compiled_file)
                if progress_bar is None:
                    prompter.info("%s - %s", compiled_file, full_file_path)
                # analyze the file
                analyzeFile(full_file_path, is_windows)
                # load the JSON data from it
                try:
                    fd = open(full_file_path + STATE_FILE_SUFFIX, 'r')
                except IOError:
                    prompter.error(
                        "Failed to create the .JSON file for file: %s" %
                        (compiled_file))
                    prompter.error(
                        "Read the log file for more information: %s" %
                        (constructLogPath(full_file_path)))
                    prompter.removeIndent()
                    prompter.removeIndent()
                    prompter.error("Encountered an error, exiting")
                    exit(1)
                # all was OK, can continue (the file is closed even if the parsing fails)
                with fd:
                    file_stats = json.load(
                        fd, object_pairs_hook=collections.OrderedDict)
                parseFileStats(full_file_path, file_stats)
                if progress_bar is not None:
                    progress_bar.advance(1)
            # wrap it up
            if progress_bar is not None:
                progress_bar.finish()
            prompter.removeIndent()

        # Resolve several unknowns refs as code refs
        prompter.info("Resolving cross-references between different files")
        resolveUnknowns()

        # check if we have any files in the list
        if len(src_file_mappings) == 0 and not ignore_archive:
            prompter.error("No files found in the archive :(")
            prompter.removeIndent()
            new_path = prompter.input(
                "Do you want to analyze all of the *.%s files in the bin directory? <Y/N>: "
                % (bin_suffix)).lower()
            if new_path != 'y':
                prompter.error("Finished with errors!")
                exit(2)
            # run again, and ignore the archive this time
            ignore_archive = True
            prompter.addIndent()
        else:
            finished_scan = True

    # Remove empty files
    prompter.info("Filtering out empty files")
    # collect the names first, the mapping can't be modified while it is being iterated
    empty_file_names = [
        file_name for file_name in src_file_mappings
        if len(src_file_mappings[file_name]) == 0
    ]
    for file_name in empty_file_names:
        src_file_mappings.pop(file_name)

    # Create the list of anchors
    str_anchors = []
    const_anchors = []
    anchors_files = set()
    prompter.info("Identifying possible Anchor functions")
    prompter.addIndent()
    seen_strings, seen_consts, function_list = getContextsStats()
    for src_func_index, src_func_ctx in enumerate(src_functions_ctx):
        is_str, threshold, candidates = anchor.isAnchor(
            src_func_ctx, seen_strings, seen_consts, function_list, prompter)
        # not an anchor
        if candidates is None:
            continue
        if is_str:
            str_anchors.append(src_func_index)
        else:
            const_anchors.append(src_func_index)
        anchors_files.add(src_func_ctx.file)
    prompter.removeIndent()

    # strings before const, because they are faster to search for
    anchors_list = str_anchors + const_anchors

    # check if we have any files left
    if len(src_file_mappings) == 0:
        prompter.error("All files were empty :(")
        prompter.removeIndent()
        prompter.error("Finished with errors!")
        exit(2)

    # Check for an error
    if len(anchors_list) == 0:
        prompter.warning("Failed to find Anchor functions in the library :(")
        prompter.warning("You should define manual anchors instead")

    # Create the anchors file
    prompter.info("Generating the full JSON file: %s", config_name)
    prompter.addIndent()
    full_json = {}

    # Serialize the anchor list
    prompter.info("Writing the anchor list")
    full_json[JSON_TAG_ANCHORS] = anchors_list

    # Serialize the functions of each files
    prompter.info("Writing the function list for each of the files")
    file_dict = collections.OrderedDict()
    # find a common file prefix, and remove it from the file path
    if len(src_file_mappings) > 1:
        common_path_len = _commonPathLength(list(src_file_mappings))
    else:
        # a single file: strip the path of the (first) bin directory, and its separator
        common_path_len = len(bin_dirs[0]) + 1

    for src_file_name in src_file_mappings:
        # a real list is needed, so that it could be serialized to JSON
        file_dict[src_file_name[common_path_len:]] = [
            func_ctx.serialize()
            for func_ctx in src_file_mappings[src_file_name]
        ]
    full_json[JSON_TAG_FILES] = file_dict

    # actually dump it
    with open(config_name, "w") as fd:
        json.dump(full_json, fd)
    prompter.removeIndent()

    prompter.info("Anchor to file ratio is: %d/%d", len(anchors_files),
                  len(src_file_mappings))
    prompter.info("Anchor to function ratio is: %d/%d", len(anchors_list),
                  len(src_functions_list))
    prompter.removeIndent()