Exemple #1
0
class Plugin:
    """Binary Ninja front end for a FunctionSimSearch SimHash index.

    The methods hash the control-flow graph of functions in a binary view,
    store those hashes in an on-disk search index, and query the index for
    similar functions, presenting the matches as a markdown report.
    """

    def __init__(self):
        # Path of the SimHash index file; stays None until init_db() runs.
        self.sim_hash_location = None
        # Metadata store created by init_db() alongside the index file.
        self.metadata = None
        # filename -> executable id, so each file is hashed only once.
        self.exec_id_cache = {}

    def init_db(self):
        """Ask the user for the SimHash database and set up its metadata.

        Falls back to ``default_sim_hash_location`` when the file dialog
        yields no path. The metadata store is created from the chosen path
        with ``.meta`` appended.
        """
        location = bn.interaction.get_open_filename_input(
            "Load SimHash database\n> ", ".simhash")
        if not location:
            bn.log_info(
                "[*] Using default location for SimHash database: {}".format(
                    default_sim_hash_location))
            location = default_sim_hash_location

        self.sim_hash_location = location
        self.metadata = Metadata(location + '.meta')

    @staticmethod
    def parse_instruction(instruction):
        """Split a tokenized instruction into a mnemonic and its operands.

        Args:
            instruction: iterable of tokens, each exposing a ``text``
                attribute; the texts are concatenated to form the
                disassembly line.

        Returns:
            ``(mnemonic, operands)`` where ``mnemonic`` is the text before
            the first space and ``operands`` is a tuple of the
            comma-separated pieces that follow it. Instructions without any
            operand text (no space at all) yield an empty operand tuple.
        """
        text = ''.join(token.text for token in instruction)
        mnemonic, separator, rest = text.partition(' ')
        if not separator:
            # Operand-less instruction: unpacking a two-way split would
            # raise ValueError here.
            return mnemonic, ()

        return mnemonic, tuple(rest.lstrip().split(','))

    def extract_flowgraph_hash(self, function, minimum_size=5):
        """Build a FunctionSimSearch flowgraph for a function and hash it.

        Every basic block becomes one or more nodes: a block is additionally
        split after each ``call`` instruction, and the pieces are chained
        with an edge.

        Args:
            function: iterable of basic blocks. Each block exposes
                ``start`` and ``outgoing_edges`` and iterates to items whose
                first element is the instruction tokens and whose second is
                the instruction length.
            minimum_size: minimum number of branching nodes the flowgraph
                must have to be hashed.

        Returns:
            The result of ``SimHasher.calculate_hash`` for the flowgraph, or
            ``(None, None)`` when the flowgraph is too small.
        """
        nodes = []
        edges = []

        # Retrieve CFG data
        for block in function:
            local_node = []
            shift = 0
            position = block.start

            for instruction in block:
                tokens, length = instruction[0], instruction[1]
                mnemonic, operands = Plugin.parse_instruction(tokens)
                local_node.append((mnemonic, operands))
                shift += length

                # Split on call with assumption that we only care about
                # x86/64 for now.
                if mnemonic == 'call':
                    next_position = block.start + shift
                    nodes.append((position, local_node))
                    local_node = []
                    edges.append((position, next_position))
                    position = next_position

            for edge in block.outgoing_edges:
                edges.append((position, edge.target.start))

            if local_node:
                nodes.append((position, local_node))
            elif edges:
                # The block produced no trailing node (it was empty or ended
                # on a call), so the most recently added edge is dropped.
                # NOTE(review): only one edge is removed even if several
                # were added for this block -- confirm this is intended.
                edges.pop()

        # Generate flowgraph
        flowgraph = fss.FlowgraphWithInstructions()

        for address, instructions in nodes:
            flowgraph.add_node(address)
            # The instruction list is handed over as a tuple.
            flowgraph.add_instructions(address, tuple(instructions))

        for source, target in edges:
            flowgraph.add_edge(source, target)

        if flowgraph.number_of_branching_nodes() < minimum_size:
            return (None, None)

        return fss.SimHasher().calculate_hash(flowgraph)

    def get_exec_id(self, filename):
        """Return a numeric id derived from the contents of ``filename``.

        The id is the first 16 hexadecimal digits of the file's SHA-256
        digest interpreted as an integer. Results are cached per filename in
        ``self.exec_id_cache``.
        """
        try:
            return self.exec_id_cache[filename]
        except KeyError:
            pass

        digest = hashlib.sha256()
        # Binary mode: the hash must cover the exact bytes on disk.
        with open(filename, 'rb') as fh:
            for chunk in iter(lambda: fh.read(1 << 16), b''):
                digest.update(chunk)

        exec_id = int(digest.hexdigest()[0:16], 16)
        self.exec_id_cache[filename] = exec_id
        return exec_id

    def save_single_function_hash(self,
                                  bv,
                                  search_index,
                                  function,
                                  write_meta=True):
        """Save the hash of a given function into a given search index.

        Args:
            bv: binary view; ``bv.file.filename`` identifies the executable.
            search_index: index object providing ``add_function``.
            function: function to hash and store.
            write_meta: when true, the metadata store is saved right after
                the function is added.
        """
        # TODO: detect if we are opening database instead of binary
        exec_id = self.get_exec_id(bv.file.filename)
        h1, h2 = self.extract_flowgraph_hash(function)

        # Compare against None explicitly so a hash half equal to 0 is not
        # mistaken for "no hash".
        if h1 is None or h2 is None:
            bn.log_info(
                '[-] Did not add function <{:x}:0x{:x}> to search index.'.
                format(exec_id, function.start))
            return

        search_index.add_function(h1, h2, exec_id, function.start)
        bn.log_info(
            '[+] Added function <{:x}:0x{:x} {:x}-{:x}> to search index.'.
            format(exec_id, function.start, h1, h2))
        self.metadata.add(exec_id, function.start, bv.file.filename,
                          function.name)
        if write_meta:
            self.metadata.__save__()

    def _open_index(self, bv):
        """Return the search index for ``bv``, or None if unsupported."""
        # Check the platform first so the user is not asked for a database
        # that cannot be used anyway.
        if bv.platform.name not in supported_arch:
            bn.log_error(
                '[!] Right now this plugin supports only the following architectures: '
                + str(supported_arch))
            return None

        if not self.sim_hash_location:
            self.init_db()

        # Ask for a fresh index only when the file does not exist yet.
        create_index = not os.path.isfile(self.sim_hash_location)

        # NOTE(review): the meaning of the third argument (50) is defined by
        # the fss bindings and is passed through unchanged.
        return fss.SimHashSearchIndex(self.sim_hash_location, create_index,
                                      50)

    def init_index(self, bv):
        """Open (or create) the search index for ``bv``.

        Returns:
            The search index, or ``-1`` when the platform of ``bv`` is not
            listed in ``supported_arch`` (an error is logged in that case).
        """
        search_index = self._open_index(bv)
        if search_index is None:
            return -1
        return search_index

    def save_hash(self, bv, current_function):
        """Save hash of current function into search index.

        Does nothing beyond logging when the platform is unsupported.
        """
        search_index = self._open_index(bv)
        if search_index is None:
            return
        self.save_single_function_hash(bv, search_index, current_function)

    def save_all_functions(self, bv, _):
        """Walk through all functions and save them into the index.

        The metadata store is written once after all functions have been
        processed. Does nothing beyond logging when the platform is
        unsupported.
        """
        search_index = self._open_index(bv)
        if search_index is None:
            return
        for function in bv.functions:
            self.save_single_function_hash(bv, search_index, function, False)
        self.metadata.__save__()

    def add_report_from_result(self,
                               results,
                               report,
                               address,
                               minimal_match=100):
        """Append the matches for one queried function to a report.

        Args:
            results: sequence of items indexed as ``[0]`` score (printed
                out of 128), ``[1]`` executable id, ``[2]`` function
                address.
            report: markdown text accumulated so far.
            address: address of the function that was queried.
            minimal_match: only results whose score is strictly greater
                than this value are reported.

        Returns:
            ``report`` extended by a heading and one bullet per retained
            result, or ``report`` unchanged when no result qualifies.
        """
        matches = [r for r in results if r[0] > minimal_match]
        if not matches:
            return report

        lines = ["## Best match results for 0x{:x}".format(address)]
        for match in matches:
            score, exec_id, func_address = match[0], match[1], match[2]
            # Rescale score/128 so that 0.5 maps to 0.0 and 1.0 maps to 1.0.
            similarity = max(float(score) / 128.0 - 0.5, 0.0) * 2
            line = "- {:d}/128 - {:f} - {:x}:0x{:x}".format(
                int(score), similarity, exec_id, func_address)

            meta = self.metadata.get(exec_id, func_address)
            if meta:
                # meta[0] is the file name, meta[1] the function name.
                line += " {} '{}'".format(meta[0], meta[1])
            lines.append(line)

        return report + "\n".join(lines) + "\n"

    def find_function_hash(self, bv, h1, h2, address, search_index, report):
        """Query the index for the top 5 matches of a hash and report them.

        ``bv`` is accepted for signature compatibility and is not used.
        Returns the extended report text.
        """
        results = search_index.query_top_N(h1, h2, 5)
        return self.add_report_from_result(results, report, address)

    def find_hash(self, bv, current_function):
        """Find functions similar to the current one and show a report.

        Does nothing beyond logging when the platform is unsupported or the
        function could not be hashed.
        """
        search_index = self._open_index(bv)
        if search_index is None:
            return

        h1, h2 = self.extract_flowgraph_hash(current_function)
        if h1 is None or h2 is None:
            bn.log_info(
                '[-] Did not search for function <{:x}:0x{:x}> to search index.'
                .format(self.get_exec_id(bv.file.filename),
                        current_function.start))
            return

        report = self.find_function_hash(bv, h1, h2, current_function.start,
                                         search_index, "")
        # The same text is passed as both the markdown and plaintext body.
        bn.interaction.show_markdown_report(
            'Function Similarity Search Report\n', report, plaintext=report)

    def find_all_hashes(self, bv, _):
        """Run a similarity search for each function and show one report.

        Functions that could not be hashed are logged and skipped. Does
        nothing beyond logging when the platform is unsupported.
        """
        search_index = self._open_index(bv)
        if search_index is None:
            return

        report = ""
        for function in bv.functions:
            h1, h2 = self.extract_flowgraph_hash(function)

            if h1 is None or h2 is None:
                bn.log_info('[-] Did not search for function 0x{:x}.'.format(
                    function.start))
                continue

            report = self.find_function_hash(bv, h1, h2, function.start,
                                             search_index, report)

        # The same text is passed as both the markdown and plaintext body.
        bn.interaction.show_markdown_report(
            'Function Similarity Search Report\n', report, plaintext=report)