Ejemplo n.º 1
0
class CryptoIdentifier():
    """
    This class contains the logic to perform Crypto identification.
    Two techniques are currently supported:
    1. A heuristic approach that identifies functions and basic blocks
    based on the ratio of arithmetic/logic instructions to all instructions
    2. A signature-based approach, using the signatures defined in PatternManager
    """
    def __init__(self):
        self.name = "CryptoIdentifier"
        print("[*] loading CryptoIdentifier")
        self.time = time
        self.re = re
        self.GraphHelper = GraphHelper
        self.CryptoSignatureHit = CryptoSignatureHit
        self.AritlogBasicBlock = AritlogBasicBlock
        self.Segment = Segment
        self.pm = PatternManager()
        self.low_rating_threshold = 0.4
        self.high_rating_threshold = 1.0
        self.low_instruction_threshold = 8
        self.high_instruction_threshold = 100
        # if the threshold is set to this value, it is automatically expanded to infinite.
        self.max_instruction_threshold = 100
        self.low_call_threshold = 0
        self.high_call_threshold = 1
        # if the threshold is set to this value, it is automatically expanded to infinite.
        self.max_call_threshold = 10
        # if at least this fraction of a signature's length' has been identified
        # consecutively, the location is marked as a signature hit.
        self.match_filter_factor = 0.5
        self.aritlog_blocks = []
        self.signature_hits = []
        self.ida_proxy = IdaProxy()
        return

    def scan(self):
        """
        Scan the whole IDB with all available techniques.
        """
        self.scanAritlog()
        self.scanCryptoPatterns()

################################################################################
# Aritlog scanning
################################################################################

    def scanAritlog(self):
        """
        scan with the arithmetic/logic heuristic
        @return: a list of AritLogBasicBlock data objects that fulfill the parameters as specified
        """
        print("[*] CryptoIdentifier: Starting aritlog heuristic analysis.")
        self.aritlog_blocks = []
        time_before = self.time.time()
        for function_ea in self.ida_proxy.Functions():
            function_chart = self.ida_proxy.FlowChart(
                self.ida_proxy.get_func(function_ea))
            calls_in_function = 0
            function_blocks = []
            function_dgraph = {}
            blocks_in_loops = set()
            for current_block in function_chart:
                block = self.AritlogBasicBlock(current_block.startEA,
                                               current_block.endEA)
                for instruction in self.ida_proxy.Heads(
                        block.start_ea, block.end_ea):
                    if self.ida_proxy.isCode(
                            self.ida_proxy.GetFlags(instruction)):
                        mnemonic = self.ida_proxy.GetMnem(instruction)
                        has_identical_operands = self.ida_proxy.GetOperandValue(instruction, 0) == \
                            self.ida_proxy.GetOperandValue(instruction, 1)
                        block.updateInstructionCount(mnemonic,
                                                     has_identical_operands)
                        if mnemonic == "call":
                            calls_in_function += 1
                function_blocks.append(block)
                # prepare graph for Tarjan's algorithm
                succeeding_blocks = [
                    succ.startEA for succ in current_block.succs()
                ]
                function_dgraph[current_block.startEA] = succeeding_blocks
                # add trivial loops
                if current_block.startEA in succeeding_blocks:
                    block.is_contained_in_trivial_loop = True
                    blocks_in_loops.update([current_block.startEA])
            # perform Tarjan's algorithm to identify strongly connected components (= loops) in the function graph
            graph_helper = self.GraphHelper()
            strongly_connected = graph_helper.calculateStronglyConnectedComponents(
                function_dgraph)
            non_trivial_loops = [
                component for component in strongly_connected
                if len(component) > 1
            ]
            for component in non_trivial_loops:
                for block in component:
                    blocks_in_loops.update([block])
            for block in function_blocks:
                if block.start_ea in blocks_in_loops:
                    block.is_contained_in_loop = True
                block.num_calls_in_function = calls_in_function
            self.aritlog_blocks.extend(function_blocks)
        print("[*] Heuristics analysis took %3.2f seconds." %
              (self.time.time() - time_before))

        return self.getAritlogBlocks(
            self.low_rating_threshold, self.high_rating_threshold,
            self.low_instruction_threshold, self.high_instruction_threshold,
            self.low_call_threshold, self.high_call_threshold, False, False,
            False)

    def _updateThresholds(self, min_rating, max_rating, min_instr, max_instr,
                          min_call, max_call):
        """
        update all six threshold bounds
        @param min_rating: the minimum arit/log ratio a basic block must have
        @type min_rating: float
        @param max_rating: the maximum arit/log ratio a basic block can have
        @type max_rating: float
        @param min_instr: the minimum number of instructions a basic block must have
        @type min_instr: int
        @param max_instr: the minimum number of instructions a basic block can have
        @type max_instr: int
        @param min_call: the minimum number of calls a basic block must have
        @type min_call: int
        @param max_call: the minimum number of calls a basic block can have
        @type max_call: int
        """
        self.low_rating_threshold = max(0.0, min_rating)
        self.high_rating_threshold = min(1.0, max_rating)
        self.low_instruction_threshold = max(0, min_instr)
        if max_instr >= self.max_instruction_threshold:
            # we cap the value here and safely assume there is no block with more than 1000000 instructions
            self.high_instruction_threshold = 1000000
        else:
            self.high_instruction_threshold = max_instr
        self.low_call_threshold = max(0, min_call)
        if max_call >= self.max_call_threshold:
            # we cap the value here and safely assume there is no block with more than 1000000 instructions
            self.high_call_threshold = 1000000
        else:
            self.high_call_threshold = max_call

    def getAritlogBlocks(self, min_rating, max_rating, min_instr, max_instr, min_api, max_api, is_nonzero, \
        is_looped, is_trivially_looped):
        """
        get all blocks that are within the limits specified by the heuristic parameters.
        parameters are the same as in function "_updateThresholds" except
        param is_nonzero: defines whether zeroing instructions (like xor eax, eax) shall be counted or not.
        type is_nonzero: boolean
        param is_looped: defines whether only basic blocks in loops shall be selected
        type is_looped: boolean
        @return: a list of AritlogBasicBlock data objects, according to the parameters
        """
        self._updateThresholds(min_rating, max_rating, min_instr, max_instr,
                               min_api, max_api)
        return [
            block for block in self.aritlog_blocks
            if (self.high_rating_threshold >= block.getAritlogRating(
                is_nonzero) >= self.low_rating_threshold) and (
                    self.high_instruction_threshold >= block.num_instructions
                    >= self.low_instruction_threshold) and (
                        self.high_call_threshold >= block.num_calls_in_function
                        >= self.low_call_threshold) and
            (not is_looped or block.is_contained_in_loop) and (
                not is_trivially_looped or block.is_contained_in_trivial_loop)
        ]

    def getUnfilteredBlockCount(self):
        """
        returns the number of basic blocks that have been analyzed.
        @return: (int) number of basic blocks
        """
        return len(self.aritlog_blocks)

################################################################################
# Signature scanning
################################################################################

    def getSegmentData(self):
        """
        returns the raw bytes of the segments as stored by IDA
        @return: a list of Segment data objects.
        """
        segments = []
        for segment_ea in self.ida_proxy.Segments():
            try:
                segment = self.Segment()
                segment.start_ea = segment_ea
                segment.end_ea = self.ida_proxy.SegEnd(segment_ea)
                segment.name = self.ida_proxy.SegName(segment_ea)
                buf = ""
                for ea in helpers.Misc.lrange(
                        segment_ea, self.ida_proxy.SegEnd(segment_ea)):
                    buf += chr(self.ida_proxy.get_byte(ea))
                segment.data = buf
                segments.append(segment)
            except:
                print(
                    "[!] Tried to access invalid segment data. An error has occurred while address conversion"
                )
        return segments

    def scanCryptoPatterns(self, pattern_size=32):
        crypt_results = []
        print("[*] CryptoIdentifier: Starting crypto signature scanning.")
        time_before_matching = self.time.time()
        segments = self.getSegmentData()
        keywords = self.pm.getTokenizedSignatures(pattern_size)
        for keyword in keywords.keys():
            for segment in segments:
                crypt_results.extend([
                    self.CryptoSignatureHit(segment.start_ea + match.start(),
                                            keywords[keyword], keyword)
                    for match in self.re.finditer(self.re.escape(keyword),
                                                  segment.data)
                ])
        variable_matches = self.scanVariablePatterns()
        crypt_results.extend(variable_matches)
        print("[*] Full matching took %3.2f seconds and resulted in %d hits." %
              (self.time.time() - time_before_matching, len(crypt_results)))
        self.signature_hits = crypt_results
        return crypt_results

    def scanVariablePatterns(self):
        # the scanning code is roughly based on kyprizel's signature scan, see credtis above for more information
        crypt_results = []
        variable_signatures = self.pm.getVariableSignatures()
        for var_sig in variable_signatures.keys():
            current_seg = self.ida_proxy.FirstSeg()
            seg_end = self.ida_proxy.SegEnd(current_seg)
            while current_seg != self.ida_proxy.BAD_ADDR:
                signature_hit = self.ida_proxy.find_binary(
                    current_seg, seg_end, variable_signatures[var_sig], 16, 1)
                if signature_hit != self.ida_proxy.BAD_ADDR:
                    crypt_results.append(
                        self.CryptoSignatureHit(signature_hit, [var_sig],
                                                variable_signatures[var_sig]))
                    current_seg = signature_hit + variable_signatures[
                        var_sig].count(" ") + 1
                else:
                    current_seg = self.ida_proxy.NextSeg(seg_end)
                    if not current_seg == self.ida_proxy.BAD_ADDR:
                        seg_end = self.ida_proxy.SegEnd(current_seg)
        return crypt_results

    def getSignatureLength(self, signature_name):
        """
        returns the length for a signature, identified by its name
        @param signature_name: name for a signature, e.g. "ADLER 32"
        @type signature_name: str
        @return: (int) length of the signature.
        """
        for item in self.pm.signatures.items():
            if item[1] == signature_name:
                return len(item[0])
        return 0

    def getSignatureHits(self):
        """
        Get all signature hits that have a length of at least match_filter_factor percent
        of the signature they triggered.
        Hits are grouped by signature names.
        @return: a dictionary  with key/value entries of the following form: ("signature name", [CryptoSignatureHit])
        """
        sorted_hits = sorted(self.signature_hits)
        unified_hits = []

        previous_signature_names = []
        for hit in sorted_hits:
            hit_intersection = [
                element for element in hit.signature_names
                if element in previous_signature_names
            ]
            if len(hit_intersection) == 0:
                previous_signature_names = hit.signature_names
                unified_hits.append(self.CryptoSignatureHit(hit.start_address, hit.signature_names, \
                    hit.matched_signature))
            else:
                previous_signature_names = hit_intersection
                previous_hit = unified_hits[-1]
                if hit.start_address == previous_hit.start_address + len(
                        previous_hit.matched_signature):
                    previous_hit.matched_signature += hit.matched_signature
                    previous_hit.signature_names = hit_intersection
                else:
                    unified_hits.append(self.CryptoSignatureHit(hit.start_address, hit.signature_names, \
                        hit.matched_signature))

        filtered_hits = []
        for hit in unified_hits:
            if len(hit.matched_signature) >= max([
                    self.match_filter_factor * self.getSignatureLength(name)
                    for name in hit.signature_names
            ]):
                hit.code_refs_to = self.getXrefsToAddress(hit.start_address)
                filtered_hits.append(hit)

        grouped_hits = {}
        for hit in filtered_hits:
            for name in hit.signature_names:
                if name not in grouped_hits:
                    grouped_hits[name] = [hit]
                else:
                    grouped_hits[name].append(hit)

        return grouped_hits

    def getXrefsToAddress(self, address):
        """
        get all references to a certain address.
        These are no xrefs in IDA sense but references to the crypto signatures.
        If the signature points to an instruction, e.g. if a constant is moved to a register, the return is flagged as
        "True", meaning it is an in-code reference.
        @param address: an arbitrary address
        @type address: int
        @return: a list of tuples (int, boolean)
        """
        xrefs = []
        head_to_address = self.ida_proxy.PrevHead(address, address - 14)
        if head_to_address != 0xFFFFFFFF:
            flags = self.ida_proxy.GetFlags(head_to_address)
            if self.ida_proxy.isCode(flags):
                xrefs.append((head_to_address, True))
        for x in self.ida_proxy.XrefsTo(address):
            flags = self.ida_proxy.GetFlags(x.frm)
            if self.ida_proxy.isCode(flags):
                xrefs.append((x.frm, False))
        return xrefs
Ejemplo n.º 2
0
class DocumentationHelper():
    """
    This class handles instruction coloring.
    """

    # data layout of color maps
    layout_color_map = {
        "tag": {
            "base_color": 0x112233,
            "highlight_color": 0x445566
        }
    }

    def __init__(self, idascope_config):
        print("[|] loading DocumentationHelper")
        self.ida_proxy = IdaProxy()
        # default colors are grey / light red / red
        self.default_neutral_color = 0xCCCCCC
        self.default_base_color = 0xB3B3FF
        self.default_highlight_color = 0x3333FF
        self.color_state = "unknown"
        self.idascope_config = idascope_config
        self._loadConfig(self.idascope_config.semantics_file)
        return

    def _loadConfig(self, config_filename):
        """
        Loads a semantic configuration file and generates a color map from the contained information.
        @param config_filename: filename of a semantic configuration file
        @type config_filename: str
        """
        config_file = open(config_filename, "r")
        config = config_file.read()
        parsed_config = json.loads(config, object_hook=JsonHelper.decode_dict)
        self.default_neutral_color = int(
            parsed_config["default_neutral_color"], 16)
        self.default_base_color = int(parsed_config["default_base_color"], 16)
        self.default_highlight_color = int(
            parsed_config["default_highlight_color"], 16)
        self.color_map = self._generateColorMapFromDefinitions(parsed_config)
        return

    def _generateColorMapFromDefinitions(self, config):
        """
        Internal function to generate a color map from a semantic definitions config file.
        @param definitions: the defintions part of a semantic definitions config file.
        @type definitions: dict
        @return: a dictionary of a color map, see I{layout_color_map} for a reference
        """
        color_map = {}
        for definition in config["semantic_definitions"]:
            # convert text representation of color codes to numbers
            group_colors = self._getColorsForGroup(definition["group"], config)
            color_map[definition["tag"]] = {"base_color": int(group_colors[0], 16), \
                "highlight_color": int(group_colors[1], 16)}
        return color_map

    def _getColorsForGroup(self, target_group, config):
        for group in config["semantic_groups"]:
            if group["tag"] == target_group:
                return (group["base_color"], group["highlight_color"])
        print "[-] Failed to get colors for group \"%s\" - you might want to check your semantics file." % target_group
        return (self.default_base_color, self.default_highlight_color)

    def uncolorAll(self):
        """
        Uncolors all instructions of all segments by changing their color to white.
        """
        for seg_ea in self.ida_proxy.Segments():
            for function_address in self.ida_proxy.Functions(self.ida_proxy.SegStart(seg_ea), \
                self.ida_proxy.SegEnd(seg_ea)):
                for block in self.ida_proxy.FlowChart(
                        self.ida_proxy.get_func(function_address)):
                    for head in self.ida_proxy.Heads(block.startEA,
                                                     block.endEA):
                        self.colorInstruction(head, 0xFFFFFF, refresh=False)
        self.ida_proxy.refresh_idaview_anyway()

    def colorInstruction(self, address, color, refresh=True):
        """
        Colors the instruction at an address with the given color code.
        @param address: address of the instruction to color
        @type address: int
        @param color: color-code to set for the instruction
        @type color: int (0xBBGGRR)
        @param refresh: refresh IDA view to ensure the color shows directly, can be omitted for performance.
        @type refresh: boolean
        """
        self.ida_proxy.SetColor(address, self.ida_proxy.CIC_ITEM, color)
        if refresh:
            self.ida_proxy.refresh_idaview_anyway()

    def colorBasicBlock(self, address, color, refresh=True):
        """
        Colors the basic block containing a target address with the given color code.
        @param address: address an instruction in the basic block to color
        @type address: int
        @param color: color-code to set for the instruction
        @type color: int (0xBBGGRR)
        @param refresh: refresh IDA view to ensure the color shows directly, can be omitted for performance.
        @type refresh: boolean
        """
        function_chart = self.ida_proxy.FlowChart(
            self.ida_proxy.get_func(address))
        for block in function_chart:
            if block.startEA <= address < block.endEA:
                for head in self.ida_proxy.Heads(block.startEA, block.endEA):
                    self.colorInstruction(head, color, refresh)

    def getNextColorScheme(self):
        """
        get the next color scheme in the three-cycle "individual/mono/uncolored", where individual is semantic coloring
        @return: next state
        """
        if self.color_state == "individual":
            return "mono"
        elif self.color_state == "mono":
            return "uncolored"
        elif self.color_state == "uncolored":
            return "individual"
        else:
            return "individual"

    def selectHighlightColor(self, tag):
        """
        automatically chooses the highlight color for a tag based on the current color scheme
        @return: (int) a color code
        """
        if self.getNextColorScheme() == "uncolored":
            return 0xFFFFFF
        elif self.getNextColorScheme() == "mono":
            return self.default_highlight_color
        else:
            return self.color_map[tag]["highlight_color"]

    def selectBaseColor(self, tagged_addresses_in_block):
        """
        automatically chooses the base color for a block based on the current color scheme
        @param tagged_addresses_in_block: all tagged addresses in a basic block for which the color shall be chosen
        @type tagged_addresses_in_block: a list of tuples (int, str) containing pairs of instruction addresses and tags
        @return: (int) a color code
        """
        if self.getNextColorScheme() == "uncolored":
            return 0xFFFFFF
        elif self.getNextColorScheme() == "mono":
            return self.default_base_color
        else:
            tags_in_block = [item[1] for item in tagged_addresses_in_block]
            colors_in_block = set([self.color_map[tags_in_block[index]]["base_color"] \
                for index in xrange(len(tags_in_block))])
            if len(colors_in_block) == 1:
                return colors_in_block.pop()
            else:
                return self.default_neutral_color

    def colorize(self, scan_result):
        """
        perform coloring on the IDB, based on a scan performed by SemanticIdentifier
        @param scan_result: result of a scan as performed by SemanticIdentifier
        @type scan_result: a dictionary with key/value entries of the following form: (address, [FunctionContext])
        """
        for function_address in scan_result.keys():
            tagged_addresses_in_function = scan_result[
                function_address].getAllTaggedAddresses()
            function_chart = self.ida_proxy.FlowChart(
                self.ida_proxy.get_func(function_address))
            for basic_block in function_chart:
                tagged_addresses_in_block = [(addr, tagged_addresses_in_function[addr]) for addr in \
                    tagged_addresses_in_function.keys() if addr in xrange(basic_block.startEA, basic_block.endEA)]
                if len(tagged_addresses_in_block) > 0:
                    base_color = self.selectBaseColor(
                        tagged_addresses_in_block)
                    self.colorBasicBlock(basic_block.startEA,
                                         base_color,
                                         refresh=False)
                    for tagged_address in tagged_addresses_in_block:
                        highlight_color = self.selectHighlightColor(
                            tagged_address[1])
                        self.colorInstruction(tagged_address[0],
                                              highlight_color,
                                              refresh=False)
        self.color_state = self.getNextColorScheme()
        self.ida_proxy.refresh_idaview_anyway()

    def getNextNonFuncInstruction(self, addr):
        next_instruction = addr
        while next_instruction != self.ida_proxy.BAD_ADDR:
            next_instruction = self.ida_proxy.find_not_func(
                next_instruction, self.ida_proxy.SEARCH_DOWN)
            flags = self.ida_proxy.GetFlags(next_instruction)
            if self.ida_proxy.isCode(flags):
                return next_instruction
        return self.ida_proxy.BAD_ADDR

    def convertNonFunctionCode(self):
        self.convertAnyProloguesToFunctions()
        # do a second run to define the rest
        next_instruction = self.ida_proxy.minEA()
        while next_instruction != self.ida_proxy.BAD_ADDR:
            next_instruction = self.getNextNonFuncInstruction(next_instruction)
            print("[+] Fixed undefined code to function @ [%08x]" % \
                (next_instruction))
            self.ida_proxy.MakeFunction(next_instruction)
        return

    def convertAnyProloguesToFunctions(self):
        self.convertDataWithPrologueToCode()
        self.convertNonFunctionCodeWithPrologues()

    def convertNonFunctionCodeWithPrologues(self):
        next_instruction = self.ida_proxy.minEA()
        while next_instruction != self.ida_proxy.BAD_ADDR:
            next_instruction = self.getNextNonFuncInstruction(next_instruction)
            if self.ida_proxy.GetMnem(next_instruction).startswith("push") and \
                self.ida_proxy.GetOpType(next_instruction, 0) == 1 and \
                self.ida_proxy.GetOperandValue(next_instruction, 0) == 5:
                instruction_after_push = self.getNextNonFuncInstruction(
                    next_instruction)
                if self.ida_proxy.GetMnem(instruction_after_push).startswith("mov") and \
                    self.ida_proxy.GetOpType(instruction_after_push, 0) == 1 and \
                    self.ida_proxy.GetOperandValue(instruction_after_push, 0) == 5 and \
                    self.ida_proxy.GetOpType(instruction_after_push, 1) == 1 and \
                    self.ida_proxy.GetOperandValue(instruction_after_push, 1) == 4:
                    print("[+] Fixed undefined code with function prologue (push ebp; mov ebp, esp) to function " \
                        + "@ [%08x]" % (next_instruction))
                    self.ida_proxy.MakeFunction(next_instruction)

    def convertDataWithPrologueToCode(self):
        current_seg = self.ida_proxy.FirstSeg()
        seg_end = self.ida_proxy.SegEnd(current_seg)
        while current_seg != self.ida_proxy.BAD_ADDR:
            signature_hit = self.ida_proxy.find_binary(current_seg, seg_end,
                                                       "55 8B EC", 16, 1)
            if signature_hit != self.ida_proxy.BAD_ADDR:
                flags = self.ida_proxy.GetFlags(signature_hit)
                if not self.ida_proxy.isCode(flags):
                    self.ida_proxy.MakeFunction(signature_hit)
                    print("[+] Fixed undefined data with potential function prologue (push ebp; mov ebp, esp) to function " \
                            + "@ [%08x]" % (signature_hit))
                current_seg = signature_hit + 3 + 1
            else:
                current_seg = self.ida_proxy.NextSeg(seg_end)
                if not current_seg == self.ida_proxy.BAD_ADDR:
                    seg_end = self.ida_proxy.SegEnd(current_seg)