Ejemplo n.º 1
0
 def dissemble_code(self, code, baseaddr):
     """Disassemble ``code`` and print one line per decoded instruction.

     A Capstone engine is created for PowerPC in 32-bit big-endian mode and
     every instruction is printed as ``0x<address>:<TAB>mnemonic<TAB>operands``.

     Args:
         code: buffer handed to Capstone's ``disasm_lite``.
         baseaddr: address assigned to the first byte of ``code``.
     """
     md = Cs(capstone.CS_ARCH_PPC,
             capstone.CS_MODE_32 | capstone.CS_MODE_BIG_ENDIAN)
     md.syntax = capstone.CS_OPT_SYNTAX_INTEL
     for (address, size, mnemonic,
          op_str) in md.disasm_lite(code, baseaddr):
         # A single pre-formatted argument prints the same text on Python 2
         # and Python 3 (the former print statement was Python 2 only).
         print("0x%x:\t%s\t%s" % (address, mnemonic, op_str))
Ejemplo n.º 2
0
class GenericDisassembler:
    """Capstone-backed disassembler helper that pre-decodes instructions into a cache."""

    def __init__(self, arch, mode):
        """Create the Capstone engine and the lookup tables used while disassembling.

        Args:
            arch: Capstone architecture constant passed to ``Cs``.
            mode: Capstone mode constant; must be ``CS_MODE_32`` or
                ``CS_MODE_64`` (any other value raises ``KeyError``).
        """
        self.arch = arch
        self.mode = mode
        self.capstone = Cs(self.arch, self.mode)

        self.prologues = {
            # Triple backslash (\\\) are needed to escape bytes in the compiled regex
            CS_MODE_32: [
                "\\\x55\\\x89\\\xE5",  # push ebp & mov ebp, esp
                "\\\x55\\\x8B\\\xEC",  # push ebp & mov ebp, esp
                "\\\x55\\\x8b\\\x6c\\\x24",  # push ebp & mov ebp, [esp+?]
            ],
            CS_MODE_64: [
                "\\\x55\\\x48\\\x89\\\xE5",  # push rbp & mov rbp, rsp
            ]
        }[mode]

        self.conditional_jmp_mnemonics = {
            'jz', 'je', 'jcxz', 'jecxz', 'jrcxz', 'jnz', 'jp', 'jpe', 'jnp',
            'ja', 'jae', 'jb', 'jbe', 'jg', 'jge', 'jl', 'jle', 'js', 'jns',
            'jo', 'jno', 'loop', 'loopne', 'loope', 'jne'
        }
        self.x86_32_registers = {
            'eax', 'ebx', 'ecx', 'edx', 'esi', 'edi', 'esp', 'ebp'
        }
        self.max_instruction_size = 16

    def linear_sweep_cache(self,
                           data,
                           offset,
                           insts,
                           bin_instance,
                           verbose=False):
        """Linearly disassemble ``data`` from ``offset`` and cache every instruction.

        Each decoded instruction is stored in ``insts`` under its offset in
        ``data`` as an ``Instruction`` flagged ``cache_only=True``.

        Args:
            data: raw buffer to disassemble.
            offset: offset in ``data`` where the sweep starts.
            insts: dict mapping offsets to ``Instruction`` objects; updated in place.
            bin_instance: forwarded to ``self.get_va_from_offset``.
            verbose: accepted for signature compatibility; not used here.

        Returns:
            ``insts``. Any exception raised during the sweep is reported as a
            warning on stdout and whatever was cached so far is returned.
        """
        curr_offset = offset
        try:
            inst_va = self.get_va_from_offset(bin_instance, curr_offset)
            instructions = self.capstone.disasm_lite(data[offset:], inst_va)

            for (address, size, mnemonic, op_str) in instructions:
                inst = Instruction(
                    offset=curr_offset,
                    va=inst_va,
                    address=address,
                    mnemonic=mnemonic,
                    op_str=op_str,
                    size=size,
                    bytes=data[curr_offset:curr_offset + size],
                    cache_only=True,
                )

                insts[curr_offset] = inst
                curr_offset += size
                inst_va += size
        except Exception as e:
            # "except ... as" and a single-argument print work on both
            # Python 2.6+ and Python 3 and emit the same text as before.
            print("WARNING: %s" % repr(e))
        return insts
Ejemplo n.º 3
0
    def __gadgetsFinding(self, section, gadgets, arch, mode):
        """Collect the gadgets of one section that end in one of the given patterns.

        Args:
            section: mapping with the raw bytes under ``"opcodes"`` and the
                section's virtual address under ``"vaddr"``.
            gadgets: iterable of ``(pattern, size, alignment)`` triples; the
                pattern is searched in the opcodes with ``re.finditer``.
            arch: Capstone architecture constant passed to ``Cs``.
            mode: Capstone mode constant passed to ``Cs``.

        Returns:
            A list of dicts with the keys ``"vaddr"``, ``"gadget"`` (the
            instructions joined by ``" ; "``) and ``"bytes"``, plus ``"prev"``
            when the ``callPreceded`` option is set.
        """

        PREV_BYTES = 9  # Number of bytes prior to the gadget to store.

        opcodes = section["opcodes"]
        sec_vaddr = section["vaddr"]

        ret = []
        md = Cs(arch, mode)
        for gad_op, gad_size, gad_align in gadgets:
            for match in re.finditer(gad_op, opcodes):
                ref = match.start()
                end = ref + gad_size
                for i in range(self.__options.depth):
                    start = ref - (i * gad_align)
                    if start < 0:
                        # Before the beginning of the section: a negative
                        # index would silently slice from the end of opcodes.
                        break
                    if (sec_vaddr + start) % gad_align != 0:
                        continue
                    code = opcodes[start:end]
                    # The slice begins at `start`, so that is its address;
                    # using the address of `ref` shifts every relative
                    # branch target shown in the operands.
                    decodes = list(md.disasm_lite(code, sec_vaddr + start))
                    if sum(size for _, size, _, _ in decodes) != i * gad_align + gad_size:
                        # We've read less instructions than planned so something went wrong
                        continue
                    if self.passClean(decodes):
                        continue
                    off = self.__offset
                    vaddr = off + sec_vaddr + start
                    g = {"vaddr": vaddr}
                    # The instruction text is stored unconditionally.
                    g["gadget"] = " ; ".join("{}{}{}".format(mnemonic, " " if op_str else "", op_str)
                                             for _, _, mnemonic, op_str in decodes).replace("  ", " ")
                    if self.__options.callPreceded:
                        prevBytesAddr = max(sec_vaddr, vaddr - PREV_BYTES)
                        g["prev"] = opcodes[prevBytesAddr - sec_vaddr:vaddr - sec_vaddr]
                    # The raw bytes are stored unconditionally.
                    g["bytes"] = code
                    ret.append(g)
        return ret
Ejemplo n.º 4
0
class GenericDisassembler:
    def __init__(self, arch, mode):
        '''
            Create the Capstone engine and the lookup tables used while
            disassembling.

            arch: Capstone architecture constant passed to Cs
            mode: CS_MODE_32 or CS_MODE_64 (anything else raises KeyError)
        '''
        self.arch = arch
        self.mode = mode
        self.capstone = Cs(self.arch, self.mode)

        # Raw opcode bytes of common function prologues, per Capstone mode.
        # They are stored as plain, unescaped bytes; dis_prologues() joins
        # them into a regular expression.
        prologues_by_mode = {
            CS_MODE_32: [
                b"\x55\x89\xE5",  # push ebp & mov ebp, esp
                b"\x55\x8B\xEC",  # push ebp & mov ebp, esp
                b"\x55\x8b\x6c\x24",  # push ebp & mov ebp, [esp+?]
            ],
            CS_MODE_64: [
                b"\x55\x48\x89\xE5",  # push rbp & mov rbp, rsp
            ],
        }
        self.prologues = prologues_by_mode[mode]

        # Mnemonics handled as "two children" branches by _dis().
        self.conditional_jmp_mnemonics = {
            'ja', 'jae', 'jb', 'jbe', 'jg', 'jge', 'jl', 'jle',
            'je', 'jne', 'jz', 'jnz', 'js', 'jns', 'jo', 'jno',
            'jp', 'jpe', 'jnp', 'jcxz', 'jecxz', 'jrcxz',
            'loop', 'loope', 'loopne',
        }
        self.x86_32_registers = {
            'eax', 'ebx', 'ecx', 'edx', 'esi', 'edi', 'esp', 'ebp',
        }
        # Number of bytes handed to Capstone when decoding one instruction.
        self.max_instruction_size = 16

    def linear_sweep_cache(self,
                           data,
                           offset,
                           insts,
                           bin_instance,
                           verbose=False):
        '''
            Linearly disassemble <data> from <offset> and store every decoded
            instruction in <insts> (keyed by its offset in <data>) with
            cache_only=True. The sweep stops once the running offset passes
            the value returned by get_section_offset_last(), when that value
            is not None.

            Any exception raised during the sweep is printed as a warning and
            the instructions cached so far are returned. <verbose> is not
            used by this method.
        '''
        last_offset = self.get_section_offset_last(bin_instance, offset)
        cursor = offset
        try:
            va = self.get_va_from_offset(bin_instance, cursor)
            decoded = self.capstone.disasm_lite(data[offset:], va)

            for address, size, mnemonic, op_str in decoded:
                insts[cursor] = Instruction(
                    offset=cursor,
                    va=va,
                    address=address,
                    mnemonic=mnemonic,
                    op_str=op_str,
                    size=size,
                    bytes=data[cursor:cursor + size],
                    cache_only=True,
                )
                cursor += size
                va += size

                past_section = (last_offset is not None
                                and cursor > last_offset)
                if past_section:
                    break

        except Exception as e:
            print("WARNING:", repr(e))
        return insts

    def _dis(self,
             data,
             offset,
             insts,
             bin_instance,
             iat_api=dict(),
             verbose=False,
             ifrom=None,
             from_pred=True,
             is_rva=False):
        '''
            <insts> is a dict like {'offset': <Instruction>}
        '''

        args_queue = []
        args_queue.append((offset, ifrom, from_pred))

        while args_queue != []:
            offset, ifrom, from_pred = args_queue.pop(0)

            if offset is None:
                continue

            inst = None
            if offset in insts:
                inst = insts[offset]
                if inst.cache_only:
                    inst.cache_only = False
                else:
                    if ifrom:
                        inst.add_ifrom(ifrom.offset)
                        insts[ifrom.offset].add_ito(inst.offset, from_pred)
                    continue

            if inst is None:
                try:
                    inst_va = self.get_va_from_offset(bin_instance, offset)
                    (address, size, mnemonic, op_str) = next(
                        self.capstone.disasm_lite(
                            data[offset:offset + self.max_instruction_size],
                            inst_va,
                            count=1))

                    inst = Instruction(
                        offset=offset,
                        va=inst_va,
                        address=address,
                        mnemonic=mnemonic,
                        op_str=op_str,
                        size=size,
                        bytes=data[offset:offset + size],
                        cache_only=False,
                    )
                    insts[inst.offset] = inst
                except Exception as e:
                    if verbose:
                        print("WARNING:", repr(e))
                    continue

            if ifrom:
                insts[inst.offset].add_ifrom(ifrom.offset)
                insts[ifrom.offset].add_ito(inst.offset, from_pred)

            # No child
            if inst.mnemonic in ['ret', 'retf']:
                pass

            # 1 remote child
            elif inst.mnemonic in ['jmp', 'jmpf']:
                if "word ptr [0x" in inst.op_str:
                    iat_va = int(inst.op_str.split('[')[1].split(']')[0], 16)

                    if iat_va in iat_api:
                        inst.op_str = iat_api[iat_va]
                else:
                    try:
                        remote_offset = self.get_offset_from_va(
                            bin_instance, int(inst.op_str, 16))
                        if remote_offset is not None:
                            args_queue.insert(
                                0, (remote_offset, insts[inst.offset], False))
                    except Exception as e:
                        if verbose:
                            print("WARNING:", repr(e))
                        pass

            # 2 children (next, then remote) - except call
            elif inst.mnemonic in self.conditional_jmp_mnemonics:
                next_offset = inst.offset + inst.size

                args_queue.insert(0, (next_offset, insts[inst.offset], True))

                # Call to Imported API (in IAT)
                # dword ptr [0x........] or qword ptr [0x........]
                if "word ptr [0x" in inst.op_str:
                    iat_va = int(inst.op_str.split('[')[1].split(']')[0], 16)

                    if iat_va in iat_api:
                        inst.op_str = iat_api[iat_va]
                elif inst.op_str in [
                        'eax', 'ebx', 'ecx', 'edx', 'esi', 'edi', 'esp', 'ebp'
                ]:
                    pass
                else:
                    try:
                        remote_offset = self.get_offset_from_va(
                            bin_instance, int(inst.op_str, 16))
                    except Exception as e:
                        if verbose:
                            print("WARNING:", repr(e))
                        continue

                args_queue.insert(1,
                                  (remote_offset, insts[inst.offset], False))

            # 2 children (next, then remote) - call
            elif inst.mnemonic in ['call']:

                next_offset = inst.offset + inst.size
                remote_offset = None

                args_queue.insert(0, (next_offset, insts[inst.offset], True))

                # Call to Imported API (in IAT)
                # dword ptr [0x........] or qword ptr [0x........]
                if "word ptr [0x" in inst.op_str:
                    iat_va = int(inst.op_str.split('[')[1].split(']')[0], 16)

                    if iat_va in iat_api:
                        inst.op_str = iat_api[iat_va]
                elif inst.op_str in self.x86_32_registers:
                    pass
                else:
                    try:
                        remote_offset = self.get_offset_from_va(
                            bin_instance, int(inst.op_str, 16))
                    except Exception as e:
                        if verbose:
                            print("WARNING:", repr(e))
                        pass

                if remote_offset:
                    args_queue.insert(
                        1, (remote_offset, insts[inst.offset], False))

            # 1 child (next) - basic instruction
            else:
                next_offset = inst.offset + inst.size
                args_queue.insert(0, (next_offset, insts[inst.offset], True))

        return insts

    def dis_prologues(self, data, bin_instance, iat_api, insts, verbose):
        prologues_re = "|".encode().join(self.prologues)
        compiled_re = re.compile(prologues_re)
        for m in compiled_re.finditer(data):
            function_offset = m.start()

            inst = insts.get(function_offset, None)
            if inst is None or inst.cache_only:
                insts = self._dis(data=data,
                                  offset=function_offset,
                                  iat_api=iat_api,
                                  bin_instance=bin_instance,
                                  insts=insts,
                                  verbose=verbose)
        return insts

    def dis(self, data, offset, iat_api, bin_instance, verbose=False):
        '''
            Run the whole disassembly pipeline and return the resulting
            {offset: <Instruction>} dict.

            data: raw binary of full PE
            offset: offset in <data> where the exploration starts
            iat_api: dict of imported API like {VA_IN_IAT: API_NAME}
            bin_instance: forwarded to every step
            verbose: forwarded to every step
        '''
        shared = dict(data=data, bin_instance=bin_instance, verbose=verbose)

        # 1. Linear sweep: pre-decode instructions as cache-only entries.
        insts = self.linear_sweep_cache(offset=offset, insts=dict(), **shared)

        # 2. Control-flow exploration from the requested offset.
        insts = self._dis(offset=offset, iat_api=iat_api, insts=insts,
                          **shared)

        # 3. Exploration of the exported functions
        self._dis_exported_funcs(insts=insts, iat_api=iat_api, **shared)

        # 4. Search for unrecognized functions from their prolog function
        return self.dis_prologues(iat_api=iat_api, insts=insts, **shared)

    def display(self, insts, offset_from=0):
        '''
            Print, in ascending offset order, every instruction of <insts>
            whose offset is at least <offset_from>.
        '''
        for offset in sorted(insts):
            if offset < offset_from:
                continue
            print(insts[offset])

    def export_to_dot(self, insts, oep_offset, displayable=True):
        '''
            Export the instruction graph to DOT format.

            insts: dict like {offset: <Instruction>}; cache_only entries are
                   left out
            oep_offset: offset of the instruction flagged with root=true
            displayable: when True, nodes and edges also carry rendering
                         attributes (label, shape, colors)

            Returns the DOT document as a string: all node lines first, then
            all edge lines.
        '''
        node_lines = []
        edge_lines = []

        for offset, inst in sorted(insts.items()):
            if inst.cache_only:
                continue

            if inst.op_str == "":
                inst_str = "%s" % inst.mnemonic
            else:
                inst_str = "%s %s" % (inst.mnemonic, inst.op_str)
            is_root = offset == oep_offset

            if displayable:
                label = "%016X: %s %s" % (inst.va, inst.mnemonic, inst.op_str)
                if is_root:
                    node_fmt = (
                        '"%X" [label="%s", address="0x%X", inst="%s", '
                        'style="", shape=box, fillcolor="white", root=true]\n'
                    )
                else:
                    node_fmt = (
                        '"%X" [label="%s", address="0x%X", inst="%s", '
                        'style="", shape=box, fillcolor="white"]\n')
                node_lines.append(node_fmt %
                                  (inst.va, label, inst.va, inst_str))

                if inst.to_succ is not None:
                    edge_lines.append(
                        '"%X" -> "%X" [label=0, color=%s, child_number=1]\n' %
                        (inst.va, insts[inst.to_succ].va, "black"))
                if inst.to_other is not None:
                    edge_lines.append(
                        '"%X" -> "%X" [label=1, color=%s, child_number=2]\n' %
                        (inst.va, insts[inst.to_other].va, "red"))
            else:
                if is_root:
                    node_fmt = '"%X" [inst="%s", address="0x%X", root=true]\n'
                else:
                    node_fmt = '"%X" [inst="%s", address="0x%X"]\n'
                node_lines.append(node_fmt % (inst.va, inst_str, inst.va))

                if inst.to_succ is not None:
                    edge_lines.append('"%X" -> "%X" [child_number=1]\n' %
                                      (inst.va, insts[inst.to_succ].va))
                if inst.to_other is not None:
                    edge_lines.append('"%X" -> "%X" [child_number=2]\n' %
                                      (inst.va, insts[inst.to_other].va))

        header = "digraph G {\n"
        footer = "}"
        return header + "".join(node_lines) + "".join(edge_lines) + footer
class FunctionCandidateManager(object):
    def __init__(self, config):
        self.config = config
        self.lang_analyzer = None
        self.disassembly = None
        self.bitness = None
        self._code_areas = []
        self.candidates = {}
        self.candidate_queue = []
        self.cached_candidates = None
        self._candidate_offsets = []
        self.candidate_index = 0
        self._all_call_refs = {}
        self.symbol_addresses = []
        self.identified_alignment = 0
        # gap filling
        self.function_gaps = None
        self.max_function_addr = 0
        self.gap_pointer = None
        self.previously_analyzed_gap = 0
        self.capstone = None

    def init(self, disassembly):
        """Bind the manager to ``disassembly`` and build the candidate queue.

        Adopts the code areas of the binary (when there are any), identifies
        the language, creates a 32- or 64-bit x86 Capstone engine according
        to the binary's bitness, then locates the candidates and builds the
        queue.
        """
        binary_info = disassembly.binary_info
        if binary_info.code_areas:
            self._code_areas = binary_info.code_areas
        self.disassembly = disassembly
        self.lang_analyzer = LanguageAnalyzer(disassembly)
        self.disassembly.language = self.lang_analyzer.identify()
        self.bitness = disassembly.binary_info.bitness
        # Anything other than 64 falls back to the 32-bit engine.
        capstone_mode = CS_MODE_64 if self.bitness == 64 else CS_MODE_32
        self.capstone = Cs(CS_ARCH_X86, capstone_mode)
        self.locateCandidates()
        self.disassembly.identified_alignment = self.identified_alignment
        self._buildQueue()

    def _passesCodeFilter(self, addr):
        if addr is None:
            return False
        if self._code_areas:
            for area in self._code_areas:
                if area[0] <= addr < area[1]:
                    return True
            return False
        return True

    def getBitMask(self):
        if self.bitness == 64:
            return 0xFFFFFFFFFFFFFFFF
        return 0xFFFFFFFF

    def setInitialCandidate(self, addr):
        if addr in self.candidates:
            self.candidates[addr].setInitialCandidate(True)

    def isFunctionCandidate(self, addr):
        return addr in self.candidates

    def getFunctionCandidate(self, addr):
        if addr in self.candidates:
            return self.candidates[addr]
        return None

    def getAbortedCandidates(self):
        aborted = []
        for addr, candidate in self.candidates.items():
            if candidate.analysis_aborted:
                aborted.append(addr)
        return aborted

    def updateAnalysisAborted(self, addr, reason):
        """Log the abort and record ``reason`` on the candidate at ``addr``, if there is one."""
        LOGGER.debug("function analysis of 0x%08x aborted: %s", addr, reason)
        if addr not in self.candidates:
            return
        self.candidates[addr].setAnalysisAborted(reason)

    def updateAnalysisFinished(self, addr):
        """Log the completion and mark the candidate at ``addr`` as completed, if there is one."""
        LOGGER.debug("function analysis of 0x%08x successfully completed.",
                     addr)
        if addr not in self.candidates:
            return
        self.candidates[addr].setAnalysisCompleted()

    def updateCandidates(self, state):
        if self.config.HIGH_ACCURACY:
            conflicts = state.identifyCallConflicts(self._all_call_refs)
            if conflicts:
                for candidate_addr, conflict in conflicts.items():
                    self.candidates[candidate_addr].removeCallRefs(conflict)
                self.candidate_queue.update()

    def addCandidate(self, addr, is_gap=False, reference_source=None):
        if not self._passesCodeFilter(addr):
            return False
        self.ensureCandidate(addr)
        self.candidates[addr].setIsGapCandidate(is_gap)
        if reference_source:
            self.candidates[addr].addCallRef(reference_source)
        self.candidate_queue.add(self.candidates[addr])
        self.candidate_queue.update()

    def getNextFunctionStartCandidate(self):
        for candidate in self.candidate_queue:
            if not (candidate.isFinished() or candidate.getScore() == 0):
                if self.identified_alignment and candidate.alignment < self.identified_alignment:
                    continue
                yield candidate

    def _logCandidateStats(self):
        """Log (debug level) the score range and how many candidates score 2, 1 and 0.

        Logging is best effort: when the statistics cannot be computed, e.g.
        because there are no candidates, a "No candidates found." line is
        logged instead.
        """
        LOGGER.debug("Candidate Statistics:")
        try:
            # Query every candidate once instead of once per statistic.
            scores = [c.getScore() for c in self.candidates.values()]
            # max()/min() raise ValueError on an empty list, before anything
            # is logged.
            LOGGER.debug("  Max: %f, Min: %f", max(scores), min(scores))
            LOGGER.debug("  2: %d, 1: %d, 0: %d", scores.count(2),
                         scores.count(1), scores.count(0))
        except Exception:
            # Not a bare "except:", so KeyboardInterrupt and SystemExit are
            # no longer swallowed here.
            LOGGER.debug("  No candidates found.")

    def getFunctionStartCandidates(self):
        return self._candidate_offsets

    def updateFunctionGaps(self):
        gaps = []
        prev_ins = 0
        min_code = min(self.disassembly.code_map
                       ) if self.disassembly.code_map else self.getBitMask()
        max_code = max(
            self.disassembly.code_map) if self.disassembly.code_map else 0
        for code_area in self._code_areas:
            if code_area[0] < min_code < code_area[
                    1] and min_code != code_area[0]:
                gaps.append([code_area[0], min_code, min_code - code_area[0]])
            if code_area[0] < max_code < code_area[
                    1] and max_code != code_area[1]:
                gaps.append([max_code, code_area[1], code_area[1] - max_code])
        for ins in sorted(self.disassembly.code_map.keys()):
            if prev_ins != 0:
                if ins - prev_ins > 1:
                    gaps.append([prev_ins + 1, ins, ins - prev_ins])
            prev_ins = ins
        self.function_gaps = sorted(gaps)

    def initGapSearch(self):
        """Initialise the gap pointer on first use and log the known gaps.

        When no gap pointer is set yet, the gaps are recomputed and the
        pointer is placed on the start of the first gap, or on the bit mask
        value when there is no gap.
        """
        if self.gap_pointer is None:
            LOGGER.debug("initGapSearch()")
            self.gap_pointer = self.getBitMask()
            self.updateFunctionGaps()
            if self.function_gaps:
                first_gap = self.function_gaps[0]
                self.gap_pointer = first_gap[0]
        LOGGER.debug("initGapSearch() gaps are:")
        for gap in self.function_gaps:
            gap_start, gap_end, gap_size = gap[0], gap[1], gap[2]
            LOGGER.debug("initGapSearch() 0x%08x - 0x%08x == %d", gap_start,
                         gap_end, gap_size)
        return

    def getNextGap(self, dont_skip=False):
        """Return the start of the first gap located after the current gap pointer.

        When no such gap exists the bit mask value is returned. With
        ``dont_skip`` set and the gap pointer inside known code, the result is
        additionally capped at the end border of the function containing the
        gap pointer.
        """
        fallback = self.getBitMask()
        next_gap = next(
            (gap[0] for gap in self.function_gaps
             if gap[0] > self.gap_pointer),
            fallback)
        LOGGER.debug("getNextGap(%s) for 0x%08x based on gap_map: 0x%08x",
                     dont_skip, self.gap_pointer, next_gap)
        # we potentially just disassembled a function and want to continue directly behind it in case we would otherwise miss more
        if dont_skip and self.gap_pointer in self.disassembly.code_map:
            function = self.disassembly.ins2fn[self.gap_pointer]
            function_end = self.disassembly.function_borders[function][1]
            next_gap = min(next_gap, function_end)
            LOGGER.debug(
                "getNextGap(%s) without skip => after checking versus code map: 0x%08x",
                dont_skip, next_gap)
        LOGGER.debug("getNextGap(%s) final gap_ptr: 0x%08x", dont_skip,
                     next_gap)
        return next_gap

    def isEffectiveNop(self, byte_sequence):
        """Return True when ``byte_sequence`` is listed in GAP_SEQUENCES for its length."""
        known_sequences = GAP_SEQUENCES[len(byte_sequence)]
        return byte_sequence in known_sequences

    def isAlignmentSequence(self, instruction_sequence):
        """Return True when the leading gap instructions end on a 16-byte boundary.

        Starting at the address of the first instruction, instructions are
        consumed for as long as their bytes are listed in GAP_SEQUENCES; the
        result is True as soon as the running address becomes a multiple of
        16, and False when a non-gap instruction or the end of the sequence
        is reached first (or the sequence is empty).
        """
        if len(instruction_sequence) == 0:
            return False
        position = instruction_sequence[0].address
        for instruction in instruction_sequence:
            raw = instruction.bytes
            if raw not in GAP_SEQUENCES[len(raw)]:
                return False
            position += len(raw)
            if position % 16 == 0:
                return True
        return False

    def nextGapCandidate(self, start_gap_pointer=None):
        """Advance the gap pointer to the next address worth trying as a function start.

        Padding bytes, ``nop`` instructions, multi-byte effective NOPs and
        addresses already known as data or code are skipped. The first
        remaining address is registered as a gap candidate and returned.

        Args:
            start_gap_pointer: when truthy, the search restarts from this
                address instead of the stored gap pointer.

        Returns:
            The candidate address, or None when the pointer has left the
            binary or the byte at the pointer cannot be read.
        """
        if self.gap_pointer is None:
            self.initGapSearch()
        if start_gap_pointer:
            self.gap_pointer = start_gap_pointer
        LOGGER.debug(
            "nextGapCandidate() finding new gap candidate, current gap_ptr: 0x%08x",
            self.gap_pointer)
        while True:
            if self.disassembly.binary_info.base_addr + self.disassembly.binary_info.binary_size < self.gap_pointer:
                LOGGER.debug("nextGapCandidate() gap_ptr: 0x%08x - finishing",
                             self.gap_pointer)
                return None
            gap_offset = self.gap_pointer - self.disassembly.binary_info.base_addr
            if gap_offset >= self.disassembly.binary_info.binary_size:
                return None
            try:
                byte = self.disassembly.getRawByte(gap_offset)
            except Exception:
                print("0x%08x" % self.disassembly.binary_info.base_addr,
                      "0x%08x" % self.disassembly.binary_info.binary_size,
                      "0x%08x" % self.gap_pointer, "0x%08x" % gap_offset)
                # Without the byte the checks below would run on an unbound
                # or stale value, so the search ends here.
                return None
            # try to find padding symbols and skip them
            # compatibility with python2/3: indexing bytes yields an int on
            # Python 3, while GAP_SEQUENCES is compared against byte strings.
            if isinstance(byte, int):
                byte = struct.pack("B", byte)
            if byte in GAP_SEQUENCES[1]:
                LOGGER.debug(
                    "nextGapCandidate() found 0xCC / 0x00 - gap_ptr += 1: 0x%08x",
                    self.gap_pointer)
                self.gap_pointer += 1
                continue
            # try to find instructions that directly encode as NOP and skip them
            # Only the first instruction is inspected, so decode just one.
            first_ins = next(
                iter(
                    self.capstone.disasm_lite(
                        self.disassembly.getRawBytes(gap_offset, 15),
                        gap_offset,
                        count=1)), None)
            if first_ins is not None:
                i_address, i_size, i_mnemonic, i_op_str = first_ins
                if i_mnemonic == "nop":
                    nop_instruction = i_mnemonic + " " + i_op_str
                    nop_length = i_size
                    LOGGER.debug(
                        "nextGapCandidate() found nop instruction (%s) - gap_ptr += %d: 0x%08x",
                        nop_instruction, nop_length, self.gap_pointer)
                    self.gap_pointer += nop_length
                    continue
            # try to find effective NOPs and skip them.
            found_multi_byte_nop = False
            for gap_length in range(max(GAP_SEQUENCES.keys()), 1, -1):
                if self.disassembly.getRawBytes(
                        gap_offset, gap_length) in GAP_SEQUENCES[gap_length]:
                    LOGGER.debug(
                        "nextGapCandidate() found %d byte effective nop - gap_ptr += %d: 0x%08x",
                        gap_length, gap_length, self.gap_pointer)
                    self.gap_pointer += gap_length
                    found_multi_byte_nop = True
                    break
            if found_multi_byte_nop:
                continue
            # we know this place from data already
            if self.gap_pointer in self.disassembly.data_map:
                LOGGER.debug(
                    "nextGapCandidate() gap_ptr is already inside data map: 0x%08x",
                    self.gap_pointer)
                self.gap_pointer += 1
                continue
            if self.gap_pointer in self.disassembly.code_map:
                LOGGER.debug(
                    "nextGapCandidate() gap_ptr is already inside code map: 0x%08x",
                    self.gap_pointer)
                self.gap_pointer = self.getNextGap()
                continue
            # we may have a candidate here
            LOGGER.debug("nextGapCandidate() using 0x%08x as candidate",
                         self.gap_pointer)
            start_byte = self.disassembly.getRawByte(gap_offset)
            # NOTE(review): the prologue check is disabled (always True), so
            # the "no common prologue" branch below is never taken.
            has_common_prologue = True  # start_byte in FunctionCandidate(self.gap_pointer, start_byte, self.bitness).common_gap_starts[self.bitness]
            if self.previously_analyzed_gap == self.gap_pointer:
                LOGGER.debug(
                    "--- HRM, nextGapCandidate() gap_ptr at: 0x%08x was previously analyzed",
                    self.gap_pointer)
                self.gap_pointer = self.getNextGap(dont_skip=True)
            elif not has_common_prologue:
                LOGGER.debug(
                    "--- HRM, nextGapCandidate() gap_ptr at: 0x%08x has no common prologue (0x%08x)",
                    self.gap_pointer, ord(start_byte))
                self.gap_pointer = self.getNextGap(dont_skip=True)
            else:
                self.previously_analyzed_gap = self.gap_pointer
                self.addGapCandidate(self.gap_pointer)
                return self.gap_pointer

    def checkFunctionOverlap(self):
        function_boundaries = []
        for function in self.disassembly.functions:
            min_addr = self.getBitMask()
            max_addr = 0
            for block in self.disassembly.functions[function]:
                min_addr = min(min_addr,
                               min([instruction[0] for instruction in block]))
                max_addr = max(
                    max_addr,
                    max([
                        instruction[0] + instruction[1]
                        for instruction in block
                    ]))
            function_boundaries.append((min_addr, max_addr))
        current_entry = (0, 0)
        for entry in sorted(function_boundaries):
            if current_entry[1] > entry[0]:
                return True
            current_entry = entry
        return False

    def checkCodePadding(self):
        """Scan the binary for padding runs of two or more 0xCC or 0x90 bytes.

        For each run the value ``end offset of the run + 1`` is collected and
        the runs are counted.

        NOTE(review): both results are local and discarded, so the method
        currently has no observable effect and returns None - confirm whether
        callers were meant to receive them.
        """
        pattern_count = 0
        pattern_functions = []
        # The binary is a bytes buffer (it is searched with bytes patterns
        # everywhere else), so the pattern has to be bytes as well; a str
        # pattern raises TypeError on Python 3.
        for pattern in re.finditer(b"((\xCC){2,}|(\x90){2,})",
                                   self.disassembly.binary_info.binary):
            pattern_count += 1
            pattern_functions.append(pattern.span()[1] + 1)

    def ensureCandidate(self, addr):
        """Make sure a FunctionCandidate is registered for addr.

        Returns True when a new candidate had to be created, False when one
        was already present (in which case nothing is modified).
        """
        if addr in self.candidates:
            return False
        self.candidates[addr] = FunctionCandidate(self.disassembly.binary_info,
                                                  addr)
        return True

    def addGapCandidate(self, addr):
        """Register addr as a candidate and flag it as found in a gap.

        Returns False when addr is rejected by the code filter; otherwise the
        candidate is created if needed, flagged, and None is returned.
        """
        if self._passesCodeFilter(addr):
            self.ensureCandidate(addr)
            candidate = self.candidates[addr]
            candidate.setIsGapCandidate(True)
        else:
            return False

    def addTailcallCandidate(self, addr):
        """Register addr as a candidate and flag it as a tail-call target.

        Returns False when addr is rejected by the code filter; otherwise the
        candidate is created if needed, flagged, and None is returned.
        """
        if self._passesCodeFilter(addr):
            self.ensureCandidate(addr)
            candidate = self.candidates[addr]
            candidate.setIsTailcallCandidate(True)
        else:
            return False

    def addReferenceCandidate(self, addr, source_ref):
        """Register addr as a candidate that is referenced from source_ref.

        Returns False when addr is rejected by the code filter. Otherwise the
        reference is attached to the candidate and None is returned; the
        source_ref -> addr mapping in _all_call_refs is only written when the
        candidate was newly created by this call.
        """
        if self._passesCodeFilter(addr):
            newly_created = self.ensureCandidate(addr)
            if newly_created:
                self._all_call_refs[source_ref] = addr
            self.candidates[addr].addCallRef(source_ref)
        else:
            return False

    def addLanguageSpecCandidate(self, addr, lang_spec):
        """Register addr as a candidate and tag it with lang_spec.

        Returns False when addr is rejected by the code filter; otherwise the
        candidate is created if needed, tagged, and None is returned.
        """
        if self._passesCodeFilter(addr):
            self.ensureCandidate(addr)
            candidate = self.candidates[addr]
            candidate.setLanguageSpec(lang_spec)
        else:
            return False

    def addPrologueCandidate(self, addr):
        """Register addr as a candidate found via a prologue match.

        Returns False when addr is rejected by the code filter, otherwise
        whatever ensureCandidate() reports for addr.
        """
        return self.ensureCandidate(addr) if self._passesCodeFilter(addr) else False

    def addSymbolCandidate(self, addr):
        """Register addr as a candidate backed by a symbol.

        Returns False when addr is rejected by the code filter; otherwise the
        candidate is created if needed, marked as symbol and as initial
        candidate, and None is returned.
        """
        if self._passesCodeFilter(addr):
            self.ensureCandidate(addr)
            candidate = self.candidates[addr]
            candidate.setIsSymbol(True)
            candidate.setInitialCandidate(True)
        else:
            return False

    def addExceptionCandidate(self, addr):
        """Register addr as a candidate taken from exception handler data.

        Returns False when addr is rejected by the code filter; otherwise the
        candidate is created if needed, marked as exception handler and as
        initial candidate, and None is returned.
        """
        if self._passesCodeFilter(addr):
            self.ensureCandidate(addr)
            candidate = self.candidates[addr]
            candidate.setIsExceptionHandler(True)
            candidate.setInitialCandidate(True)
        else:
            return False

    def resolvePointerReference(self, offset):
        """Resolve the target of an indirect ``jmp``/``call`` (FF 25 / FF 15) at offset.

        Args:
            offset: offset of the two opcode bytes inside the binary; the
                4-byte operand is read from offset + 2.

        Returns:
            32 bit: the result of dereferencing the absolute operand via
            ``disassembly.dereferenceDword``.
            64 bit: ``base_addr`` plus the operand taken as a signed
            displacement relative to offset + 7 (FF 25) or offset + 6 (FF 15).

        Raises:
            Exception: on 64 bit when the bytes at offset are neither FF 25
                nor FF 15, or when the bitness is neither 32 nor 64.
        """
        if self.bitness == 32:
            addr_block = self.disassembly.getRawBytes(offset + 2, 4)
            # x86 operands are little endian regardless of the analysis host
            function_pointer = struct.unpack("<I", addr_block)[0]
            return self.disassembly.dereferenceDword(function_pointer)
        if self.bitness == 64:
            addr_block = self.disassembly.getRawBytes(offset + 2, 4)
            function_pointer = struct.unpack("<i", addr_block)[0]
            # Compare against bytes literals: the raw bytes are a bytes
            # object, which never equals a str on Python 3.
            opcode = self.disassembly.getRawBytes(offset, 2)
            # we need to calculate RIP + offset + 7 (48 ff 25 ** ** ** **)
            if opcode == b"\xFF\x25":
                function_pointer += offset + 7
            elif opcode == b"\xFF\x15":
                function_pointer += offset + 6
            else:
                raise Exception(
                    "resolvePointerReference: should only be used on call/jmp * ptr"
                )
            return self.disassembly.binary_info.base_addr + function_pointer
        raise Exception("resolvePointerReference: undefined bitness")

    def _identifyAlignment(self):
        """Guess the function alignment from candidates referenced more than once.

        Returns 0 when config.USE_ALIGNMENT is off or when there are not more
        than 20 such candidates. Otherwise returns 16 if more than 95% of them
        have an alignment of exactly 16, else 4 if more than 95% have an
        alignment of at least 4, else 0.
        """
        if not self.config.USE_ALIGNMENT:
            return 0
        multi_referenced = [
            candidate for candidate in self.candidates.values()
            if len(candidate.call_ref_sources) > 1
        ]
        total = len(multi_referenced)
        if not total:
            return 0
        aligned_16 = sum(1 for candidate in multi_referenced
                         if candidate.alignment == 16)
        aligned_4 = sum(1 for candidate in multi_referenced
                        if candidate.alignment >= 4)
        result = 0
        if total > 20:
            if 1.0 * aligned_4 / total > 0.95:
                result = 4
            # the stricter alignment wins when both ratios qualify
            if 1.0 * aligned_16 / total > 0.95:
                result = 16
        return result

    def locateCandidates(self):
        """Run every candidate locator in its fixed order, then derive the alignment.

        The result of _identifyAlignment() is stored in identified_alignment.
        """
        locators = (
            self.locateSymbolCandidates,
            self.locateReferenceCandidates,
            self.locatePrologueCandidates,
            self.locateLangSpecCandidates,
            self.locateStubChainCandidates,
            self.locateExceptionHandlerCandidates,
        )
        for locate in locators:
            locate()
        self.identified_alignment = self._identifyAlignment()

    def _buildQueue(self):
        """Snapshot the current candidates into static lists and a priority queue.

        Fills cached_candidates (list of candidate objects), _candidate_offsets
        (their addr attributes, in the same order) and candidate_queue.
        """
        LOGGER.debug("Located %d function candidates", len(self.candidates))
        # static lists are kept for faster lookups than going through the dict
        snapshot = list(self.candidates.values())
        self._candidate_offsets = [candidate.addr for candidate in snapshot]
        self.cached_candidates = snapshot
        self.candidate_queue = PriorityQueue(content=self.cached_candidates)

    def locateSymbolCandidates(self):
        """Feed every address in symbol_addresses to addSymbolCandidate()."""
        register = self.addSymbolCandidate
        for address in self.symbol_addresses:
            register(address)

    def locateReferenceCandidates(self):
        """Collect candidates from call and indirect jmp/call byte patterns.

        Every 0xE8 byte is treated as a potential relative call: if the byte
        passes the code filter, more than five bytes remain in the binary, the
        relative offset is non-zero and the destination lies within the memory
        image, the destination is added as reference candidate and marked as
        initial candidate.

        On 32 bit, every FF 25 and then every FF 15 occurrence is additionally
        resolved through resolvePointerReference() and handled the same way.
        """
        image = self.disassembly.binary_info.binary
        base = self.disassembly.binary_info.base_addr
        for call_match in re.finditer(b"\xE8", image):
            call_offset = call_match.start()
            call_addr = base + call_offset
            if not self._passesCodeFilter(call_addr):
                continue
            if len(image) - call_offset <= 5:
                continue
            packed_call = self.disassembly.getRawBytes(call_offset + 1, 4)
            rel_call_offset = struct.unpack("i", packed_call)[0]
            # zero offset calls are typically used for positioning in
            # shellcode rather than for reaching a function
            if rel_call_offset == 0:
                continue
            call_destination = (base + rel_call_offset + call_offset +
                                5) & self.getBitMask()
            if not self.disassembly.isAddrWithinMemoryImage(call_destination):
                continue
            self.addReferenceCandidate(call_destination, call_addr)
            self.setInitialCandidate(call_destination)
        if self.bitness != 32:
            return
        # "jmp dword ptr <offset>" first, then "call dword ptr <offset>":
        # both sometimes point to local (i.e. non-API) functions
        for opcode in (b"\xFF\x25", b"\xFF\x15"):
            for match in re.finditer(opcode, image):
                function_addr = self.resolvePointerReference(match.start())
                if not self._passesCodeFilter(function_addr):
                    continue
                if self.disassembly.isAddrWithinMemoryImage(function_addr):
                    self.addReferenceCandidate(function_addr,
                                               base + match.start())
                    self.setInitialCandidate(function_addr)

    def locatePrologueCandidates(self):
        """Add a candidate for every DEFAULT_PROLOGUES match, regardless of references.

        A match is skipped when its (unmasked) address fails the code filter;
        otherwise the address is masked with getBitMask(), added as prologue
        candidate and marked as initial candidate.
        """
        base = self.disassembly.binary_info.base_addr
        for re_prologue in DEFAULT_PROLOGUES:
            for hit in re.finditer(re_prologue,
                                   self.disassembly.binary_info.binary):
                raw_addr = base + hit.start()
                if self._passesCodeFilter(raw_addr):
                    candidate_addr = raw_addr & self.getBitMask()
                    self.addPrologueCandidate(candidate_addr)
                    self.setInitialCandidate(candidate_addr)

    def locateLangSpecCandidates(self):
        """Add language specific candidates; currently only Delphi is handled.

        When the language analyzer recognizes Delphi, all addresses listed in
        its Delphi objects are merged into one set and each is added as a
        language spec candidate tagged "delphi". Otherwise nothing happens.
        """
        if not self.lang_analyzer.checkDelphi():
            return
        LOGGER.debug(
            "Programming language recognized as Delphi, adding function start addresses from TObjects"
        )
        t_objects = self.lang_analyzer.getDelphiObjects()
        delphi_candidates = set()
        for t_string in t_objects:
            delphi_candidates |= set(t_objects[t_string])
        LOGGER.debug("delphi candidates based on TObject analysis: %d",
                     len(delphi_candidates))
        for function_addr in delphi_candidates:
            self.addLanguageSpecCandidate(function_addr, "delphi")

    def locateStubChainCandidates(self):
        """Add candidates for chains of ``jmp dword ptr <offset>`` (FF 25) stubs.

        Two layouts are searched in the binary:

        * two or more directly consecutive 6-byte FF 25 stubs, and
        * two or more PLT-like entries, i.e. an FF 25 stub followed by
          0x68 + 4 bytes and 0xE9 + 4 bytes; for these the 10 bytes after
          each stub are additionally recorded in disassembly.data_map.

        Every stub address that passes the code filter is added as prologue
        candidate, marked as initial candidate and flagged as stub.
        """
        binary_info = self.disassembly.binary_info
        # Backslashes meant for the regex engine are doubled so the bytes
        # literals contain no invalid escape sequences; the pattern bytes are
        # unchanged.
        stub_pattern = b"\xFF\x25(?P<function>[\\S\\s]{4})"
        # binaries often contain long sequences of stubs, consisting only of jmp dword ptr <offset>, add such chains as candidates
        for block in re.finditer(b"(?P<block>(\xFF\x25[\\S\\s]{4}){2,})",
                                 binary_info.binary):
            for match in re.finditer(stub_pattern, block.group("block")):
                # Mask once and use the same address everywhere, so the
                # candidate is looked up under the key it was stored with.
                stub_addr = (binary_info.base_addr + block.start() +
                             match.start()) & self.getBitMask()
                if not self._passesCodeFilter(stub_addr):
                    continue
                self.addPrologueCandidate(stub_addr)
                self.setInitialCandidate(stub_addr)
                self.candidates[stub_addr].setIsStub(True)
        # structure for plt entries is similar but interleaved with additional code not considered functions
        for block in re.finditer(
                b"(?P<block>(\xFF\x25[\\S\\s]{4}\x68[\\S\\s]{4}\xE9[\\S\\s]{4}){2,})",
                binary_info.binary):
            for match in re.finditer(stub_pattern, block.group("block")):
                stub_addr = (binary_info.base_addr + block.start() +
                             match.start()) & self.getBitMask()
                if not self._passesCodeFilter(stub_addr):
                    continue
                self.addPrologueCandidate(stub_addr)
                self.setInitialCandidate(stub_addr)
                self.candidates[stub_addr].setIsStub(True)
                # define data bytes inbetween
                for offset in range(10):
                    self.disassembly.data_map.add(stub_addr + 6 + offset)

    def locateExceptionHandlerCandidates(self):
        """Add candidates for the function starts listed in a 64 bit ``.pdata`` section.

        The section is walked in 12-byte records; the first dword of each
        record is read as the RVA of a function start and registered through
        addExceptionCandidate(). Walking a section stops at the first record
        whose RVA is zero or that can no longer be read completely. Nothing is
        done for binaries that are not 64 bit.
        """
        # 64bit only - if we have a .pdata section describing exception handlers, we extract entries of guaranteed function starts from it.
        # TODO 2020-10-29 continue here and extract function start candidates
        binary_info = self.disassembly.binary_info
        if binary_info.bitness != 64:
            return
        for section_name, section_va_start, section_va_end in binary_info.getSections():
            if section_name != ".pdata":
                continue
            rva_start = section_va_start - binary_info.base_addr
            rva_end = section_va_end - binary_info.base_addr
            # a record starting at rva_end lies outside the section
            for offset in range(rva_start, rva_end, 12):
                packed_rva = binary_info.binary[offset:offset + 4]
                if len(packed_rva) < 4:
                    # truncated data, nothing more can be decoded
                    break
                rva_function_candidate = struct.unpack("<I", packed_rva)[0]
                # a zero RVA ends the table; check it before registering so
                # that base_addr itself is never added as a function start
                if not rva_function_candidate:
                    break
                self.addExceptionCandidate(binary_info.base_addr +
                                           rva_function_candidate)
Ejemplo n.º 6
0
class IdaExporter(object):
    """Builds a disassembly report from data IDA already holds instead of analyzing the buffer itself.

    Instructions are taken from IDA function by function and re-decoded with
    capstone so that they are stored in the
    (address, size, mnemonic, op_str, bytes) tuple layout.
    """

    def __init__(self, config, bitness=None):
        """Set up the IDA interface, an empty result and the capstone decoder.

        Args:
            config: configuration object; its VERSION is copied into the result.
            bitness: 32 or 64; when falsy the value reported by IDA is used.
        """
        self.config = config
        self.ida_interface = IdaInterface()
        self.bitness = bitness if bitness else self.ida_interface.getBitness()
        self.capstone = None
        self.disassembly = DisassemblyResult()
        self.disassembly.smda_version = config.VERSION
        self._initCapstone()

    def _initCapstone(self):
        """Create the x86 capstone instance: 64 bit mode for bitness 64, else 32 bit mode."""
        mode = CS_MODE_64 if self.bitness == 64 else CS_MODE_32
        self.capstone = Cs(CS_ARCH_X86, mode)

    def _convertIdaInsToSmda(self, offset, instruction_bytes):
        """Decode one instruction with capstone and return it as instruction tuple.

        Args:
            offset: address of the instruction.
            instruction_bytes: raw bytes of the instruction as delivered by IDA.

        Returns:
            (address, size, mnemonic, op_str, instruction_bytes) for the first
            instruction capstone decodes. When capstone decodes nothing, the
            failure is recorded in disassembly.errors and the placeholder
            (offset, len(instruction_bytes), "error", "error", bytearray) is
            returned instead.
        """
        cache = list(self.capstone.disasm_lite(instruction_bytes, offset))
        if cache:
            # only the first decoded instruction belongs to this offset
            i_address, i_size, i_mnemonic, i_op_str = cache[0]
            smda_ins = (i_address, i_size, i_mnemonic, i_op_str,
                        instruction_bytes)
        else:
            # record error and emit placeholder instruction
            bytes_as_hex = "".join(
                ["%02x" % c for c in bytearray(instruction_bytes)])
            print("missing capstone disassembly output at 0x%x (%s)" %
                  (offset, bytes_as_hex))
            self.disassembly.errors[offset] = {
                "type": "capstone disassembly failure",
                "instruction_bytes": bytes_as_hex
            }
            smda_ins = (offset, len(instruction_bytes), "error", "error",
                        bytearray(instruction_bytes))
        return smda_ins

    def analyzeBuffer(self, binary_info, cb_analysis_timeout=None):
        """ instead of performing a full analysis, simply collect all data from IDA and convert it into a report

        Args:
            binary_info: description of the binary; base_addr, binary and
                bitness are filled in from IDA when they are falsy, and
                architecture is always overwritten with IDA's value.
            cb_analysis_timeout: accepted but not used by this method.

        Returns:
            The populated disassembly result held by this exporter.
        """
        self.disassembly.analysis_start_ts = datetime.datetime.utcnow()
        self.disassembly.binary_info = binary_info
        self.disassembly.binary_info.architecture = self.ida_interface.getArchitecture(
        )
        if not self.disassembly.binary_info.base_addr:
            self.disassembly.binary_info.base_addr = self.ida_interface.getBaseAddr(
            )
        if not self.disassembly.binary_info.binary:
            self.disassembly.binary_info.binary = self.ida_interface.getBinary(
            )
        if not self.disassembly.binary_info.bitness:
            self.disassembly.binary_info.bitness = self.bitness
        self.disassembly.function_symbols = self.ida_interface.getFunctionSymbols(
        )
        api_map = self.ida_interface.getApiMap()
        for function_offset in self.ida_interface.getFunctions():
            if self.ida_interface.isExternalFunction(function_offset):
                continue
            converted_function = []
            for block in self.ida_interface.getBlocks(function_offset):
                converted_block = []
                for instruction_offset in block:
                    instruction_bytes = self.ida_interface.getInstructionBytes(
                        instruction_offset)
                    smda_instruction = self._convertIdaInsToSmda(
                        instruction_offset, instruction_bytes)
                    converted_block.append(smda_instruction)
                    # instruction index: address -> (mnemonic, size)
                    self.disassembly.instructions[smda_instruction[0]] = (
                        smda_instruction[2], smda_instruction[1])
                    in_refs = self.ida_interface.getCodeInRefs(
                        smda_instruction[0])
                    for in_ref in in_refs:
                        self.disassembly.addCodeRefs(in_ref[0], in_ref[1])
                    out_refs = self.ida_interface.getCodeOutRefs(
                        smda_instruction[0])
                    for out_ref in out_refs:
                        self.disassembly.addCodeRefs(out_ref[0], out_ref[1])
                        # remember the API when the reference target is one
                        if out_ref[1] in api_map:
                            self.disassembly.addr_to_api[
                                instruction_offset] = api_map[out_ref[1]]
                converted_function.append(converted_block)
            self.disassembly.functions[function_offset] = converted_function
            if self.disassembly.isRecursiveFunction(function_offset):
                self.disassembly.recursive_functions.add(function_offset)
            if self.disassembly.isLeafFunction(function_offset):
                self.disassembly.leaf_functions.add(function_offset)
        self.disassembly.analysis_end_ts = datetime.datetime.utcnow()
        return self.disassembly
Ejemplo n.º 7
0
def open_x64dbg_trace(filename):
    """Opens x64dbg trace file

    The file is expected to start with the magic b"TRAC", followed by a 4-byte
    little-endian length and a JSON blob of that length whose "arch" entry
    selects x64 or 32 bit handling. After that, one block per executed
    instruction is read for as long as the next byte is 0x00.

    Args:
        filename: name of trace file
    Returns:
        TraceData object
    Raises:
        ValueError: if the file does not start with the expected magic
    """
    with open(filename, "rb") as f:
        # check first 4 bytes before anything else is set up
        if f.read(4) != b"TRAC":
            raise ValueError("Error, wrong file format.")

        def read_uint(size):
            """Read size bytes and decode them as little-endian unsigned integer."""
            return int.from_bytes(f.read(size), "little")

        trace_data = TraceData()
        trace_data.filename = filename

        # read JSON blob
        json_length = read_uint(4)
        json_blob = f.read(json_length)
        arch = json.loads(str(json_blob, "utf-8"))["arch"]

        if arch == "x64":
            regs = prefs.X64_REGS
            ip_reg = "rip"
            capstone_mode = CS_MODE_64
            pointer_size = 8  # qword
        else:
            regs = prefs.X32_REGS
            ip_reg = "eip"
            capstone_mode = CS_MODE_32
            pointer_size = 4  # dword

        # register name -> position in the per-row register value list
        reg_indexes = {reg: i for i, reg in enumerate(regs)}

        trace_data.arch = arch
        trace_data.ip_reg = ip_reg
        trace_data.regs = reg_indexes
        trace_data.pointer_size = pointer_size

        md = Cs(CS_ARCH_X86, capstone_mode)
        # register values carry over from row to row; only changes are stored
        reg_values = [None] * len(reg_indexes)
        trace = []
        row_id = 0
        while f.read(1) == b"\x00":
            register_changes = read_uint(1)
            memory_accesses = read_uint(1)
            flags_and_opcode_size = read_uint(1)  # Bitfield
            thread_id_bit = (flags_and_opcode_size >> 7) & 1  # msb
            opcode_size = flags_and_opcode_size & 15  # 4 lsbs

            if thread_id_bit > 0:
                f.read(4)  # thread id, present in the file but not used here

            opcodes = f.read(opcode_size)

            register_change_position = [
                read_uint(1) for _ in range(register_changes)
            ]
            register_change_new_data = [
                read_uint(pointer_size) for _ in range(register_changes)
            ]
            memory_access_flags = [
                read_uint(1) for _ in range(memory_accesses)
            ]
            memory_access_addresses = [
                read_uint(pointer_size) for _ in range(memory_accesses)
            ]
            memory_access_old_data = [
                read_uint(pointer_size) for _ in range(memory_accesses)
            ]
            # new data is only stored for accesses whose lowest flag bit is 0
            memory_access_new_data = [
                read_uint(pointer_size) for flag in memory_access_flags
                if flag & 1 == 0
            ]

            # positions are stored relative to the previously changed register
            reg_id = 0
            for i, change in enumerate(register_change_position):
                reg_id += change
                if reg_id + i < len(reg_indexes):
                    reg_values[reg_id + i] = register_change_new_data[i]

            # disassemble; start from an empty text so that a row capstone
            # cannot decode neither fails nor inherits the previous row's text
            ip_value = reg_values[reg_indexes[ip_reg]]
            disasm = ""
            for (_address, _size, mnemonic,
                 op_str) in md.disasm_lite(opcodes, ip_value):
                disasm = mnemonic
                if op_str:
                    disasm += " " + op_str

            mems = []
            new_data_counter = 0
            for i in range(memory_accesses):
                access = "READ"
                value = memory_access_old_data[i]
                if memory_access_flags[i] & 1 == 0:
                    value = memory_access_new_data[new_data_counter]
                    access = "WRITE"
                    new_data_counter += 1
                # else: memory value didn't change
                # (it is read or overwritten with identical value)
                # this has to be fixed somehow in x64dbg

                # fix value (x64dbg saves all values as qwords)
                if "qword" in disasm:
                    pass
                elif "dword" in disasm:
                    value &= 0xFFFFFFFF
                elif "word" in disasm:
                    value &= 0xFFFF
                elif "byte" in disasm:
                    value &= 0xFF
                mems.append({
                    "access": access,
                    "addr": memory_access_addresses[i],
                    "value": value,
                })

            trace.append({
                "id": row_id,
                "ip": ip_value,
                "disasm": disasm,
                "regs": reg_values.copy(),
                "opcodes": opcodes.hex(),
                "mem": mems,
                "comment": "",
            })
            row_id += 1

        trace_data.trace = trace
        return trace_data
Ejemplo n.º 8
0
# header facts that a signature may constrain, with the label printed for each
_PE_FACT_LABELS = (
    ('minor_linker', 'Minor Linker Version Match'),
    ('major_linker', 'Major Linker Version Match'),
    ('numberofsections', 'Number Of Sections Match'),
)


def _print_signature_matches(path, match, signatures, num_mnem, threshold,
                             dir_processing, verbose, header_facts=None):
    """Compare the mnemonics in match against every signature and print the hits.

    Args:
        path: file the mnemonics come from (printed in directory mode only).
        match: list of mnemonics disassembled from the entry point.
        signatures: signature name -> dict as loaded from the signature file.
        num_mnem: number of mnemonics compared when a signature has no
            'num_mnemonics' of its own.
        threshold: only similarities above this value are printed.
        dir_processing: prefix each hit with path when True.
        verbose: additionally print both mnemonic lists of each hit.
        header_facts: optional dict with the file's 'minor_linker',
            'major_linker' and 'numberofsections'; when given, each of these
            that a signature defines is reported as matching True/False.
    """
    for s in signatures:
        sig = signatures[s]['mnemonics']
        # cheap pre-filter: the first mnemonic has to agree unless the user
        # asked for a very low threshold
        if not ((match and match[0] == sig[0]) or threshold < .7):
            continue
        additional_info = []
        if header_facts is not None:
            for key, label in _PE_FACT_LABELS:
                if key in signatures[s]:
                    additional_info.append(
                        '%s: %s' % (label, header_facts[key] == signatures[s][key]))

        nm = signatures[s].get('num_mnemonics', num_mnem)
        m = match[:nm]
        sig = sig[:nm]
        longest = max(len(sig), len(m))
        if not longest:
            # nothing to compare, and the ratio below would divide by zero
            continue
        distance = tapered_levenshtein(sig, m)
        similarity = 1.0 - distance/float(longest)
        if similarity > threshold:
            if dir_processing:
                print("[%s] [%s] (Edits: %s | Similarity: %0.3f) (%s)" % (path, s, distance, similarity, ' | '.join(additional_info)))
            else:
                print("[%s] (Edits: %s | Similarity: %0.3f) (%s)" % (s, distance, similarity, ' | '.join(additional_info)))
            if verbose:
                print("%s\n%s\n" % (sig, m))


def main():
    """Match the entry point mnemonics of PE / Mach-O files against a signature file.

    Command line options select the signature file, the number of mnemonics
    and bytes to use and the similarity threshold. The positional argument is
    a single file or a directory whose direct entries are all processed.
    Exits with status 1 when the signature file yields no sections.
    """
    BYTES = 500
    NUM_MNEM = 30
    SIG_FILE = "./mpesm.sig"
    THRESHOLD = .85
    DIR_PROCESSING = False
    signatures = {}
    file_list = []

    parser = ArgumentParser(description="Mnemonic PE Signature Matching")
    parser.add_argument("-n", "--num-mnem",
                        dest="num_mnem", help="Use a lenght of 'n' mnemonics (default: " + str(NUM_MNEM) + ')')
    parser.add_argument("-s", "--signatures",
                        dest="sig_file", help="signature file to use (default: " + SIG_FILE + ')')
    parser.add_argument("-b", "--bytes",
                        dest="bytes", help="Grab and disassemble x bytes from EP, you should only need to change this if you give a super large number for -n (default: " + str(BYTES) + ')')
    parser.add_argument("-t", "--threshold",
                        dest="threshold", help="Display all matches greater than -t supplied similarity (default: " + str(THRESHOLD) + ')')
    parser.add_argument("-v", "--verbose",
                        dest="verbose", help="Verbose output", action='store_true')
    parser.add_argument("file", nargs=1, help='File to analyze')
    args = parser.parse_args()

    if args.sig_file:
        SIG_FILE = args.sig_file
    if args.threshold:
        THRESHOLD = float(args.threshold)
    # argparse delivers strings; both values are used in slices / arithmetic
    if args.bytes:
        BYTES = int(args.bytes)
    if args.num_mnem:
        NUM_MNEM = int(args.num_mnem)
    VERBOSE = bool(args.verbose)

    config = ConfigParser.RawConfigParser()
    config.read(SIG_FILE)

    if len(config.sections()) == 0:
        print("Error Reading from config file: %s, it's either empty or not present" % (SIG_FILE))
        sys.exit(1)
    for s in config.sections():
        signatures[s] = {}
        signatures[s]['mnemonics'] = config.get(s, 'mnemonics').split(',')
        for option in ('num_mnemonics', 'major_linker', 'minor_linker', 'numberofsections'):
            if config.has_option(s, option):
                signatures[s][option] = config.getint(s, option)

    if os.path.isdir(args.file[0]):
        file_list = glob.glob(args.file[0]+'/*')
        DIR_PROCESSING = True
    else:
        file_list.append(args.file[0])

    for f in file_list:
        file_type = None
        if VERBOSE:
            print('[*] Processing: ' + f)
        try:
            fe = pefile.PE(f)
            file_type = 'PE'
        except Exception as e:
            if VERBOSE:
                sys.stderr.write("[*] Error with %s - %s\n" % (f, str(e)))

        if not file_type:
            try:
                fe = macholib.MachO.MachO(f)
                file_type = 'MACHO'
            except Exception as e:
                if VERBOSE:
                    sys.stderr.write("[*] Error with %s - %s\n" % (f, str(e)))

        if not file_type:
            sys.stderr.write("[*] Error with %s - not a PE or Mach-O\n" % f)
            continue

        if file_type == 'PE':
            try:
                # per-file values, so nothing leaks over from the previous file
                nos = 0
                ep = 0
                optional_header = getattr(fe, 'OPTIONAL_HEADER', None)
                minor_linker = getattr(optional_header, 'MinorLinkerVersion', 0)
                major_linker = getattr(optional_header, 'MajorLinkerVersion', 0)
                if hasattr(fe, 'FILE_HEADER') and hasattr(fe.FILE_HEADER, 'NumberOfSections'):
                    nos = fe.FILE_HEADER.NumberOfSections
                if hasattr(fe, 'OPTIONAL_HEADER') and hasattr(fe.OPTIONAL_HEADER, 'AddressOfEntryPoint'):
                    ep = fe.OPTIONAL_HEADER.AddressOfEntryPoint
                if hasattr(fe, 'OPTIONAL_HEADER') and hasattr(fe.OPTIONAL_HEADER, 'ImageBase') and ep > 0:
                    data = fe.get_memory_mapped_image()[ep:ep+BYTES]
                    #
                    # Determine if the file is 32bit or 64bit
                    #
                    mode = CS_MODE_32
                    if fe.OPTIONAL_HEADER.Magic == 0x20b:
                        mode = CS_MODE_64

                    md = Cs(CS_ARCH_X86, mode)
                    match = []
                    for (address, size, mnemonic, op_str) in md.disasm_lite(data, 0x1000):
                        match.append(mnemonic.encode('utf-8').strip())

                    _print_signature_matches(
                        f, match, signatures, NUM_MNEM, THRESHOLD,
                        DIR_PROCESSING, VERBOSE,
                        header_facts={
                            'minor_linker': minor_linker,
                            'major_linker': major_linker,
                            'numberofsections': nos,
                        })
            except Exception as e:
                print(str(e))
        elif file_type == 'MACHO':
            with open(f, 'rb') as macho_file:
                macho_data = macho_file.read()
            for header in fe.headers:
                # Limit it to X86
                if header.header.cputype not in [7, 0x01000007]:
                    continue

                # Limit it to Object and Executable files
                if header.header.filetype not in [1, 2]:
                    continue

                magic = int(header.MH_MAGIC)
                offset = int(header.offset)

                all_sections = []
                entrypoint_type = ''
                entrypoint_address = 0
                for cmd in header.commands:
                    load_cmd = cmd[0]
                    cmd_info = cmd[1]
                    cmd_data = cmd[2]
                    cmd_name = load_cmd.get_cmd_name()
                    if cmd_name in ('LC_SEGMENT', 'LC_SEGMENT_64'):
                        for section_data in cmd_data:
                            sd = section_data.describe()
                            all_sections.append(sd)

                    elif cmd_name in ('LC_THREAD', 'LC_UNIXTHREAD'):
                        entrypoint_type = 'old'
                        flavor = int(struct.unpack(header.endian + 'I', cmd_data[0:4])[0])
                        if flavor == 1:
                            entrypoint_address = int(struct.unpack(header.endian + 'I', cmd_data[48:52])[0])
                        elif flavor == 4:
                            entrypoint_address = int(struct.unpack(header.endian + 'Q', cmd_data[136:144])[0])

                    elif cmd_name == 'LC_MAIN':
                        entrypoint_type = 'new'
                        entrypoint_address = cmd_info.describe()['entryoff']

                entrypoint_data = ''
                if entrypoint_type == 'new':
                    entrypoint_offset = offset + entrypoint_address
                    entrypoint_data = macho_data[entrypoint_offset:entrypoint_offset+BYTES]
                elif entrypoint_type == 'old':
                    # translate the entry address into a file offset via the
                    # section that contains it
                    found_section = False
                    for sec in all_sections:
                        if entrypoint_address >= sec['addr'] and entrypoint_address < (sec['addr'] + sec['size']):
                            found_section = True
                            entrypoint_address = (entrypoint_address - sec['addr']) + sec['offset']
                            break

                    if found_section:
                        entrypoint_offset = offset + entrypoint_address
                        entrypoint_data = macho_data[entrypoint_offset:entrypoint_offset+BYTES]

                mode = CS_MODE_32
                if magic == 0xcffaedfe:
                    mode = CS_MODE_64

                md = Cs(CS_ARCH_X86, mode)
                match = []
                if entrypoint_data:
                    try:
                        for (address, size, mnemonic, op_str) in md.disasm_lite(entrypoint_data, 0x1000):
                            match.append(mnemonic.encode('utf-8').strip())
                    except Exception as e:
                        print(str(e))

                    _print_signature_matches(
                        f, match, signatures, NUM_MNEM, THRESHOLD,
                        DIR_PROCESSING, VERBOSE)
Ejemplo n.º 9
0
def _siggen_macho_entry_bytes(header, macho_data, length=500):
    """Return the raw bytes located at the entry point of one Mach-O header.

    The entry point is taken from the header's load commands: either an
    ``LC_MAIN`` command, whose ``entryoff`` is used directly as an offset
    relative to the header, or an ``LC_THREAD``/``LC_UNIXTHREAD`` command,
    whose saved instruction pointer is a virtual address that is mapped to a
    file offset through the sections collected from the segment commands.

    :param header: one element of ``macholib.MachO.MachO(...).headers``.
    :param macho_data: complete content of the file the header belongs to.
    :param length: maximum number of bytes to return.
    :returns: up to *length* bytes starting at the entry point, or ``''``
        when no entry point (or no section containing it) was found.
    """
    base_offset = int(header.offset)
    sections = []
    entry_kind = ''
    entry_address = 0
    for cmd in header.commands:
        load_cmd = cmd[0]
        cmd_info = cmd[1]
        cmd_data = cmd[2]
        cmd_name = load_cmd.get_cmd_name()
        if cmd_name in ('LC_SEGMENT', 'LC_SEGMENT_64'):
            sections.extend(section.describe() for section in cmd_data)
        elif cmd_name in ('LC_THREAD', 'LC_UNIXTHREAD'):
            entry_kind = 'old'
            flavor = int(
                struct.unpack(header.endian + 'I', cmd_data[0:4])[0])
            # The thread state follows the 8-byte flavor/count prefix.
            # NOTE(review): flavor 1 / offset 48 and flavor 4 / offset 136
            # are presumably eip of the 32-bit and rip of the 64-bit x86
            # thread state - confirm against the Mach-O headers.
            if flavor == 1:
                entry_address = int(
                    struct.unpack(header.endian + 'I', cmd_data[48:52])[0])
            elif flavor == 4:
                entry_address = int(
                    struct.unpack(header.endian + 'Q',
                                  cmd_data[136:144])[0])
        elif cmd_name == 'LC_MAIN':
            entry_kind = 'new'
            entry_address = cmd_info.describe()['entryoff']

    if entry_kind == 'new':
        start = base_offset + entry_address
    elif entry_kind == 'old':
        for sec in sections:
            if sec['addr'] <= entry_address < sec['addr'] + sec['size']:
                start = base_offset + (entry_address -
                                       sec['addr']) + sec['offset']
                break
        else:
            # The address is not covered by any known section.
            return ''
    else:
        return ''
    return macho_data[start:start + length]


def main():
    """Print a mnemonic signature for the entry point of a PE or Mach-O file.

    The file is first parsed as PE and, if that fails, as Mach-O.  The bytes
    at the entry point are disassembled as x86 (32 or 64 bit) and the
    resulting mnemonics are printed as ``mnemonics = a,b,c``; ``-t``, ``-l``
    and ``-s`` add the section title, linker version and section count for PE
    files.  Exits with status 1 when the file is neither PE nor Mach-O.
    """
    parser = ArgumentParser(
        description="Mnemonic PE Signature Matching, signature generator")
    # type=int lets argparse reject non-numeric values with a clean message.
    parser.add_argument("-n",
                        "--num-mnem",
                        dest="num_mnem",
                        type=int,
                        help="Use a length of 'n' mnemonics (default: 30)")
    parser.add_argument("-t",
                        "--title",
                        dest="sig_title",
                        help="Title (name) to use for the signature")
    parser.add_argument(
        "-l",
        "--linker",
        dest="linker",
        help="Use Major and Minor linker versions in the signature",
        action="store_true")
    parser.add_argument(
        "-s",
        "--numofsections",
        dest="nos",
        help="Use the number of sections in the PE file in the signature",
        action="store_true")
    parser.add_argument("file", nargs=1, help='File to analyze')
    args = parser.parse_args()

    filename = args.file[0]
    # -n used to be parsed but ignored; 30 stays the default length.
    num_mnem = 30 if args.num_mnem is None else args.num_mnem

    file_type = None
    errors = []
    try:
        fe = pefile.PE(filename)
        file_type = 'PE'
    except Exception as e:
        errors.append("PE: %s" % e)

    if not file_type:
        try:
            fe = macholib.MachO.MachO(filename)
            file_type = 'MACHO'
        except Exception as e:
            # Bind the exception here; the old code reported the PE error.
            errors.append("Mach-O: %s" % e)

    if not file_type:
        # Report the parsed file name, not sys.argv[1], which is an option
        # whenever one precedes the file on the command line.
        sys.stderr.write("[*] Error with %s - not a PE or Mach-O\n" %
                         filename)
        for detail in errors:
            sys.stderr.write("    %s\n" % detail)
        sys.exit(1)

    if file_type == 'PE':
        try:
            if args.sig_title:
                print("[%s]" % (args.sig_title))

            if args.linker:
                maj_linker = 0
                min_linker = 0
                try:
                    maj_linker = fe.OPTIONAL_HEADER.MajorLinkerVersion
                    min_linker = fe.OPTIONAL_HEADER.MinorLinkerVersion
                except Exception:
                    # Best effort: fall back to 0 when the header is missing.
                    pass
                print("major_linker = %s" % (maj_linker))
                print("minor_linker = %s" % (min_linker))

            if args.nos:
                try:
                    print("numberofsections = %s" %
                          (fe.FILE_HEADER.NumberOfSections))
                except Exception:
                    sys.stderr.write(
                        "Image File Header not found in PE file\n")

            ep = fe.OPTIONAL_HEADER.AddressOfEntryPoint
            data = fe.get_memory_mapped_image()[ep:ep + 500]
            # Optional-header magic 0x20b marks a 64-bit (PE32+) image.
            mode = CS_MODE_32
            if fe.OPTIONAL_HEADER.Magic == 0x20b:
                mode = CS_MODE_64

            md = Cs(CS_ARCH_X86, mode)
            match = []
            for (address, size, mnemonic,
                 op_str) in md.disasm_lite(data, 0x1000):
                match.append(mnemonic.encode('utf-8').strip())

            print('mnemonics = ' + ','.join(match[:num_mnem]))
        except Exception as e:
            print(str(e))

    elif file_type == 'MACHO':
        with open(filename, 'rb') as f:
            macho_data = f.read()
        for header in fe.headers:
            # Limit it to X86
            if header.header.cputype not in [7, 0x01000007]:
                continue

            # Limit it to Object and Executable files
            if header.header.filetype not in [1, 2]:
                continue

            entrypoint_data = _siggen_macho_entry_bytes(header, macho_data)

            mode = CS_MODE_32
            if int(header.MH_MAGIC) == 0xcffaedfe:
                mode = CS_MODE_64

            md = Cs(CS_ARCH_X86, mode)
            match = []
            try:
                for (address, size, mnemonic,
                     op_str) in md.disasm_lite(entrypoint_data, 0x1000):
                    match.append(mnemonic.encode('utf-8').strip())
            except Exception as e:
                print(str(e))
            print('mnemonics = ' + ','.join(match[:num_mnem]))
Ejemplo n.º 10
0
Archivo: mpesm.py Proyecto: tsmolka/tic
def _mpesm_macho_entry_bytes(header, macho_data, length):
    """Return the raw bytes located at the entry point of one Mach-O header.

    The entry point comes from the header's load commands: ``LC_MAIN``
    supplies ``entryoff``, used directly as an offset relative to the header;
    ``LC_THREAD``/``LC_UNIXTHREAD`` supply a virtual address that is mapped
    to a file offset through the sections collected from the segment
    commands.

    :param header: one element of ``macholib.MachO.MachO(...).headers``.
    :param macho_data: complete content of the file the header belongs to.
    :param length: maximum number of bytes to return.
    :returns: up to *length* bytes starting at the entry point, or ``''``
        when no entry point (or no section containing it) was found.
    """
    base_offset = int(header.offset)
    sections = []
    entry_kind = ''
    entry_address = 0
    for cmd in header.commands:
        load_cmd = cmd[0]
        cmd_info = cmd[1]
        cmd_data = cmd[2]
        cmd_name = load_cmd.get_cmd_name()
        if cmd_name in ('LC_SEGMENT', 'LC_SEGMENT_64'):
            sections.extend(section.describe() for section in cmd_data)
        elif cmd_name in ('LC_THREAD', 'LC_UNIXTHREAD'):
            entry_kind = 'old'
            flavor = int(
                struct.unpack(header.endian + 'I', cmd_data[0:4])[0])
            # The thread state follows the 8-byte flavor/count prefix.
            # NOTE(review): flavor 1 / offset 48 and flavor 4 / offset 136
            # are presumably eip of the 32-bit and rip of the 64-bit x86
            # thread state - confirm against the Mach-O headers.
            if flavor == 1:
                entry_address = int(
                    struct.unpack(header.endian + 'I', cmd_data[48:52])[0])
            elif flavor == 4:
                entry_address = int(
                    struct.unpack(header.endian + 'Q',
                                  cmd_data[136:144])[0])
        elif cmd_name == 'LC_MAIN':
            entry_kind = 'new'
            entry_address = cmd_info.describe()['entryoff']

    if entry_kind == 'new':
        start = base_offset + entry_address
    elif entry_kind == 'old':
        for sec in sections:
            if sec['addr'] <= entry_address < sec['addr'] + sec['size']:
                start = base_offset + (entry_address -
                                       sec['addr']) + sec['offset']
                break
        else:
            # The address is not covered by any known section.
            return ''
    else:
        return ''
    return macho_data[start:start + length]


def _mpesm_report_matches(path, match, signatures, num_mnem, threshold,
                          show_path, verbose, pe_fields=None):
    """Print every signature whose mnemonics are similar enough to *match*.

    :param path: file being analysed; printed only when *show_path* is true.
    :param match: list of mnemonics disassembled from the entry point.
    :param signatures: mapping of signature name to its option dict.
    :param num_mnem: number of mnemonics compared when the signature has no
        ``num_mnemonics`` of its own.
    :param threshold: a signature is printed when similarity exceeds this.
    :param show_path: prefix each result with *path* (directory mode).
    :param verbose: also print the two compared mnemonic lists.
    :param pe_fields: for PE files, a dict with ``minor_linker``,
        ``major_linker`` and ``numberofsections``; ``None`` for Mach-O, in
        which case no additional header info is reported.
    """
    header_checks = (
        ('minor_linker', 'Minor Linker Version Match'),
        ('major_linker', 'Major Linker Version Match'),
        ('numberofsections', 'Number Of Sections Match'),
    )
    for name in signatures:
        signature = signatures[name]
        all_mnemonics = signature['mnemonics']
        # Pre-filter on the first mnemonic unless the threshold is below .7,
        # in which case every signature is compared.
        first_matches = bool(match) and match[0] == all_mnemonics[0]
        if not (first_matches or threshold < .7):
            continue

        additional_info = []
        if pe_fields is not None:
            for key, label in header_checks:
                if key in signature:
                    additional_info.append(
                        '%s: %s' % (label, pe_fields[key] == signature[key]))

        limit = signature.get('num_mnemonics', num_mnem)
        m = match[:limit]
        sig = all_mnemonics[:limit]
        longest = max(len(sig), len(m))
        if not longest:
            # Nothing to compare; also avoids dividing by zero below.
            continue

        distance = tapered_levenshtein(sig, m)
        similarity = 1.0 - distance / float(longest)
        if similarity > threshold:
            if show_path:
                print("[%s] [%s] (Edits: %s | Similarity: %0.3f) (%s)" % (
                    path, name, distance, similarity,
                    ' | '.join(additional_info)))
            else:
                print("[%s] (Edits: %s | Similarity: %0.3f) (%s)" % (
                    name, distance, similarity,
                    ' | '.join(additional_info)))
            if verbose:
                print("%s\n%s\n" % (sig, m))


def main():
    """Match the entry-point mnemonics of PE/Mach-O files against signatures.

    Loads the signature file (``./mpesm.sig`` unless ``-s`` is given), then
    for the given file - or every file of the given directory - disassembles
    the bytes at the entry point as x86 and prints every signature whose
    similarity exceeds the threshold.  Exits with status 1 when the signature
    file is empty or missing.
    """
    BYTES = 500
    NUM_MNEM = 30
    SIG_FILE = "./mpesm.sig"
    THRESHOLD = .85
    VERBOSE = False
    DIR_PROCESSING = False
    signatures = {}
    file_list = []

    parser = ArgumentParser(description="Mnemonic PE Signature Matching")
    # The numeric options are converted by argparse: kept as strings they
    # broke slicing (match[:NUM_MNEM]) and offset arithmetic (ep + BYTES).
    parser.add_argument("-n",
                        "--num-mnem",
                        dest="num_mnem",
                        type=int,
                        help="Use a lenght of 'n' mnemonics (default: " +
                        str(NUM_MNEM) + ')')
    parser.add_argument("-s",
                        "--signatures",
                        dest="sig_file",
                        help="signature file to use (default: " + SIG_FILE +
                        ')')
    parser.add_argument(
        "-b",
        "--bytes",
        dest="bytes",
        type=int,
        help=
        "Grab and disassemble x bytes from EP, you should only need to change this if you give a super large number for -n (default: "
        + str(BYTES) + ')')
    parser.add_argument(
        "-t",
        "--threshold",
        dest="threshold",
        type=float,
        help=
        "Display all matches greater than -t supplied similarity (default: " +
        str(THRESHOLD) + ')')
    parser.add_argument("-v",
                        "--verbose",
                        dest="verbose",
                        help="Verbose output",
                        action='store_true')
    parser.add_argument("file", nargs=1, help='File to analyze')
    args = parser.parse_args()

    if args.sig_file:
        SIG_FILE = args.sig_file
    # "is not None" so that an explicit 0 is honoured.
    if args.threshold is not None:
        THRESHOLD = args.threshold
    if args.bytes is not None:
        BYTES = args.bytes
    if args.num_mnem is not None:
        NUM_MNEM = args.num_mnem
    if args.verbose:
        VERBOSE = True

    config = ConfigParser.RawConfigParser()
    config.read(SIG_FILE)

    if len(config.sections()) == 0:
        print("Error Reading from config file: %s, it's either empty or not present" % (
            SIG_FILE))
        sys.exit(1)
    for s in config.sections():
        entry = {'mnemonics': config.get(s, 'mnemonics').split(',')}
        for option in ('num_mnemonics', 'major_linker', 'minor_linker',
                       'numberofsections'):
            if config.has_option(s, option):
                entry[option] = config.getint(s, option)
        signatures[s] = entry

    if os.path.isdir(args.file[0]):
        file_list = glob.glob(args.file[0] + '/*')
        DIR_PROCESSING = True
    else:
        file_list.append(args.file[0])

    for f in file_list:
        file_type = None
        if VERBOSE:
            print('[*] Processing: ' + f)
        try:
            fe = pefile.PE(f)
            file_type = 'PE'
        except Exception as e:
            if VERBOSE:
                sys.stderr.write("[*] Error with %s - %s\n" % (f, str(e)))

        if not file_type:
            try:
                fe = macholib.MachO.MachO(f)
                file_type = 'MACHO'

            except Exception as e:
                if VERBOSE:
                    sys.stderr.write("[*] Error with %s - %s\n" % (f, str(e)))

        if not file_type:
            sys.stderr.write("[*] Error with %s - not a PE or Mach-O\n" % f)

        if file_type == 'PE':
            try:
                # Reset per file so that values of a previous file are never
                # reused when a header field is missing.
                minor_linker = 0
                major_linker = 0
                nos = 0
                ep = 0
                try:
                    minor_linker = fe.OPTIONAL_HEADER.MinorLinkerVersion
                    major_linker = fe.OPTIONAL_HEADER.MajorLinkerVersion
                except Exception:
                    # Best effort: keep 0 when the optional header is missing.
                    pass
                if hasattr(fe, 'FILE_HEADER') and hasattr(
                        fe.FILE_HEADER, 'NumberOfSections'):
                    nos = fe.FILE_HEADER.NumberOfSections
                if hasattr(fe, 'OPTIONAL_HEADER') and hasattr(
                        fe.OPTIONAL_HEADER, 'AddressOfEntryPoint'):
                    ep = fe.OPTIONAL_HEADER.AddressOfEntryPoint
                if hasattr(fe, 'OPTIONAL_HEADER') and hasattr(
                        fe.OPTIONAL_HEADER, 'ImageBase') and ep > 0:
                    data = fe.get_memory_mapped_image()[ep:ep + BYTES]
                    # Optional-header magic 0x20b marks a 64-bit image.
                    mode = CS_MODE_32
                    if fe.OPTIONAL_HEADER.Magic == 0x20b:
                        mode = CS_MODE_64

                    md = Cs(CS_ARCH_X86, mode)
                    match = []
                    for (address, size, mnemonic,
                         op_str) in md.disasm_lite(data, 0x1000):
                        match.append(mnemonic.encode('utf-8').strip())

                    _mpesm_report_matches(
                        f, match, signatures, NUM_MNEM, THRESHOLD,
                        DIR_PROCESSING, VERBOSE,
                        pe_fields={
                            'minor_linker': minor_linker,
                            'major_linker': major_linker,
                            'numberofsections': nos,
                        })
            except Exception as e:
                print(str(e))
        elif file_type == 'MACHO':
            with open(f, 'rb') as macho_file:
                macho_data = macho_file.read()
            for header in fe.headers:
                # Limit it to X86
                if header.header.cputype not in [7, 0x01000007]:
                    continue

                # Limit it to Object and Executable files
                if header.header.filetype not in [1, 2]:
                    continue

                # Honour -b here as well (this branch used a fixed 500).
                entrypoint_data = _mpesm_macho_entry_bytes(
                    header, macho_data, BYTES)
                if not entrypoint_data:
                    continue

                mode = CS_MODE_32
                if int(header.MH_MAGIC) == 0xcffaedfe:
                    mode = CS_MODE_64

                md = Cs(CS_ARCH_X86, mode)
                match = []
                try:
                    for (address, size, mnemonic,
                         op_str) in md.disasm_lite(entrypoint_data,
                                                   0x1000):
                        match.append(mnemonic.encode('utf-8').strip())
                except Exception as e:
                    print(str(e))

                _mpesm_report_matches(f, match, signatures, NUM_MNEM,
                                      THRESHOLD, DIR_PROCESSING, VERBOSE)
Ejemplo n.º 11
0
class IntelDisassembler(object):
    def __init__(self, config, forced_bitness=None):
        """Create a disassembler with an empty result and no engines attached.

        Args:
            config: configuration object; passed to every label provider and
                read here for ``VERSION`` and ``CONFIDENCE_THRESHOLD``.
            forced_bitness: stored in ``_forced_bitness``; not otherwise used
                by the constructor.
        """
        self.config = config
        self._forced_bitness = forced_bitness
        # Set by _initCapstone() and _initTfIdf() respectively.
        self.capstone = None
        self._tfidf = None
        self.binary_info = None
        # Must exist before _addLabelProviders() appends to it.
        self.label_providers = []
        self._addLabelProviders()
        # Analysis helpers; left as None by the constructor.
        self.fc_manager = None
        self.tailcall_analyzer = None
        self.indcall_analyzer = None
        self.jumptable_analyzer = None
        # Result container, stamped with the version and threshold from config.
        self.disassembly = DisassemblyResult()
        self.disassembly.smda_version = config.VERSION
        self.disassembly.setConfidenceThreshold(config.CONFIDENCE_THRESHOLD)

    def _initCapstone(self):
        """Create the Capstone x86 engine matching the binary's bitness."""
        # Any bitness other than 64 is disassembled in 32-bit mode.
        if self.disassembly.binary_info.bitness == 64:
            cs_mode = CS_MODE_64
        else:
            cs_mode = CS_MODE_32
        self.capstone = Cs(CS_ARCH_X86, cs_mode)

    def _initTfIdf(self):
        """Create the mnemonic TF-IDF helper matching the binary's bitness."""
        # Any bitness other than 64 is treated as 32.
        if self.disassembly.binary_info.bitness == 64:
            tfidf_bitness = 64
        else:
            tfidf_bitness = 32
        self._tfidf = MnemonicTfIdf(bitness=tfidf_bitness)

    def getBitMask(self):
        if self.disassembly.binary_info.bitness == 64:
            return 0xFFFFFFFFFFFFFFFF
        return 0xFFFFFFFF

    def _addLabelProviders(self):
        """Append one instance of each label provider, built from the config.

        The providers are appended in a fixed order: Windows API resolver,
        ELF API resolver, ELF symbol provider, PDB symbol provider.
        """
        provider_classes = (
            WinApiResolver,
            ElfApiResolver,
            ElfSymbolProvider,
            PdbSymbolProvider,
        )
        for provider_class in provider_classes:
            self.label_providers.append(provider_class(self.config))

    def _updateLabelProviders(self, binary_info):
        for provider in self.label_providers:
            provider.update(binary_info)

    def addPdbFile(self, binary_info, pdb_path):
        """Feed a PDB file to all label providers.

        Nothing happens unless both *pdb_path* and ``binary_info.base_addr``
        are truthy.  The providers receive a fresh, empty ``BinaryInfo`` that
        carries only the PDB path and the base address of *binary_info*.
        """
        LOGGER.debug("adding PDB file: %s", pdb_path)
        if not (pdb_path and binary_info.base_addr):
            return
        pdb_info = BinaryInfo(b"")
        pdb_info.file_path = pdb_path
        pdb_info.base_addr = binary_info.base_addr
        for label_provider in self.label_providers:
            label_provider.update(pdb_info)

    def resolveApi(self, to_address, api_address):
        for provider in self.label_providers:
            if not provider.isApiProvider():
                continue
            dll, api = provider.getApi(to_address, api_address)
            if dll or api:
                return (dll, api)

        return ("", "")

    def resolveSymbol(self, address):
        for provider in self.label_providers:
            if not provider.isSymbolProvider():
                continue
            result = provider.getSymbol(address)
            if result:
                return result
        return ""

    def getSymbolCandidates(self):
        symbol_offsets = set([])
        for provider in self.label_providers:
            if not provider.isSymbolProvider():
                continue
            function_symbols = provider.getFunctionSymbols()
            symbol_offsets.update(list(function_symbols.keys()))
        return list(symbol_offsets)

    def getReferencedAddr(self, op_str):
        """Return the first ``0x``-prefixed hex number in *op_str* as an int.

        Returns 0 when the operand string contains no such number.
        """
        hex_literal = re.search(r"0x[a-fA-F0-9]+", op_str)
        return int(hex_literal.group(), 16) if hex_literal else 0

    def resolveIndirectSwitch(self, addr_switch_array, size):
        indirect_switch_bytes = []
        current_offset = addr_switch_array + size * 4
        if self.disassembly.isAddrWithinMemoryImage(current_offset):
            LOGGER.debug(
                "0x%08x analyzing potentially indirect switch table (size: 0x%08x).",
                current_offset, size)
            current_byte = self.disassembly.getByte(current_offset)
            if isinstance(current_byte, str):
                current_byte = ord(current_byte)
            while current_byte < size and not current_offset in self.fc_manager.getFunctionStartCandidates(
            ):
                indirect_switch_bytes.append(current_offset)
                current_offset += 1
                current_byte = self.disassembly.getByte(current_offset)
                if isinstance(current_byte, str):
                    current_byte = ord(current_byte)
            LOGGER.debug("0x%08x found %d bytes.", current_offset,
                         len(indirect_switch_bytes))
        return indirect_switch_bytes

    def _analyzeCallInstruction(self, i, state):
        i_address, i_size, i_mnemonic, i_op_str = i
        state.setLeaf(False)
        # case = "FALLTHROUGH"
        call_destination = self.getReferencedAddr(i_op_str)
        if ":" in i_op_str:
            # case = "LONG-CALL"
            pass
        if i_op_str.startswith("dword ptr ["):
            # reg+offset is currently ignored as it is a minority of calls
            # case = "DWORD-PTR-REG"
            if i_op_str.startswith("dword ptr [0x"):
                # case = "DWORD-PTR"
                dereferenced = self.disassembly.dereferenceDword(
                    call_destination)
                if dereferenced is not None:
                    state.addCodeRef(i_address, dereferenced)
                    self._handleCallTarget(state, i_address, dereferenced)
                    self._handleApiTarget(i_address, call_destination,
                                          dereferenced)
        elif i_op_str.startswith("qword ptr [rip"):
            rip = i_address + i_size
            call_destination = rip + self.getReferencedAddr(i_op_str)
            dereferenced = self.disassembly.dereferenceQword(call_destination)
            state.addCodeRef(i_address, call_destination)
            if dereferenced is not None:
                self._handleApiTarget(i_address, call_destination,
                                      dereferenced)
        elif i_op_str.startswith("0x"):
            # case = "DIRECT"
            self._handleCallTarget(state, i_address, call_destination)
            self._handleApiTarget(i_address, call_destination,
                                  call_destination)
        elif i_op_str.lower() in REGS_32BIT or i_op_str.lower() in REGS_64BIT:
            # case = "REG"
            # this is resolved by backtracking at the end of function analysis.
            state.call_register_ins.append(i_address)

    def _handleCallTarget(self, state, from_addr, to_addr):
        if to_addr and self.disassembly.isAddrWithinMemoryImage(to_addr):
            state.addCodeRef(from_addr, to_addr)
        if state.start_addr == to_addr:
            state.setRecursion(True)

    def _handleApiTarget(self, from_addr, to_addr, dereferenced):
        if to_addr:
            # identify API calls on the fly
            dll, api = self.resolveApi(to_addr, dereferenced)
            if dll or api:
                self._updateApiInformation(from_addr, dereferenced, dll, api)
                return (dll, api)
            elif not self.disassembly.isAddrWithinMemoryImage(to_addr):
                LOGGER.debug("potentially uncovered DLL address: 0x%08x",
                             to_addr)

    def _updateApiInformation(self, from_addr, to_addr, dll, api):
        api_entry = {"referencing_addr": [], "dll_name": dll, "api_name": api}
        if to_addr in self.disassembly.apis:
            api_entry = self.disassembly.apis[to_addr]
        if from_addr not in api_entry["referencing_addr"]:
            api_entry["referencing_addr"].append(from_addr)
        self.disassembly.apis[to_addr] = api_entry

    def _analyzeCondJmpInstruction(self, i, state):
        i_address, i_size, i_mnemonic, i_op_str = i
        state.addBlockToQueue(i_address + i_size)
        jump_destination = self.getReferencedAddr(i_op_str)
        # case = "FALLTHROUGH"
        self.tailcall_analyzer.addJump(i_address, jump_destination)
        if jump_destination:
            if jump_destination in self.disassembly.functions:
                # case = "TAILCALL!"
                state.setSanelyEnding(True)
            elif jump_destination in self.fc_manager.getFunctionStartCandidates(
            ):
                # it's tough to decide whether this should be disassembled here or not. topic of "code-sharing functions".
                # case = "TAILCALL?"
                pass
            else:
                # case = "OFFSET-QUEUE"
                state.addBlockToQueue(int(i_op_str, 16))
            state.addCodeRef(i_address, int(i_op_str, 16), by_jump=True)
        state.setBlockEndingInstruction(True)

    def _analyzeLoopInstruction(self, i, state):
        i_address, i_size, i_mnemonic, i_op_str = i
        jump_destination = self.getReferencedAddr(i_op_str)
        if jump_destination:
            state.addCodeRef(i_address, int(i_op_str, 16), by_jump=True)
        # loops have two exits and should thus be handled as block ending instruction
        state.addBlockToQueue(i_address + i_size)
        state.setBlockEndingInstruction(True)

    def _analyzeJmpInstruction(self, i, state):
        i_address, i_size, i_mnemonic, i_op_str = i
        # case = "FALLTHROUGH"
        if ":" in i_op_str:
            # case = "LONG-JMP"
            pass
        elif i_op_str.startswith("dword ptr [0x"):
            # case = "DWORD-PTR"
            # Handles mostly jmp-to-api, stubs or tailcalls, all should be handled sanely this way.
            jump_destination = self.getReferencedAddr(i_op_str)
            dereferenced = self.disassembly.dereferenceDword(jump_destination)
            state.addCodeRef(i_address, jump_destination, by_jump=True)
            self.tailcall_analyzer.addJump(i_address, jump_destination)
            if dereferenced is not None:
                self._handleApiTarget(i_address, jump_destination,
                                      dereferenced)
        elif i_op_str.startswith("qword ptr [rip"):
            # case = "QWORD-PTR, RIP-relative"
            # Handles mostly jmp-to-api, stubs or tailcalls, all should be handled sanely this way.
            rip = i_address + i_size
            jump_destination = rip + self.getReferencedAddr(i_op_str)
            dereferenced = self.disassembly.dereferenceQword(jump_destination)
            state.addCodeRef(i_address, jump_destination, by_jump=True)
            self.tailcall_analyzer.addJump(i_address, jump_destination)
            if dereferenced is not None:
                self._handleApiTarget(i_address, jump_destination,
                                      dereferenced)
        elif i_op_str.startswith("0x"):
            jump_destination = self.getReferencedAddr(i_op_str)
            self.tailcall_analyzer.addJump(i_address, jump_destination)
            if jump_destination in self.disassembly.functions:
                # case = "TAILCALL!"
                state.setSanelyEnding(True)
            elif jump_destination in self.fc_manager.getFunctionStartCandidates(
            ):
                # case = "TAILCALL?"
                pass
            else:
                if state.isFirstInstruction():
                    # case = "STUB-TAILCALL!"
                    pass
                else:
                    # case = "OFFSET-QUEUE"
                    state.addBlockToQueue(int(i_op_str, 16))
            state.addCodeRef(i_address, int(i_op_str, 16), by_jump=True)
        else:
            jumptable_targets = self.jumptable_analyzer.getJumpTargets(
                i, state)
            for target in jumptable_targets:
                if self.disassembly.isAddrWithinMemoryImage(target):
                    state.addBlockToQueue(target)
                    state.addCodeRef(i_address, target, by_jump=True)
        state.setNextInstructionReachable(False)
        state.setBlockEndingInstruction(True)

    def _analyzeEndInstruction(self, state):
        """Mark the current block as finished by a function-ending instruction.

        The function is flagged as sanely ending, the following instruction
        as unreachable, and the current instruction as block ending.
        """
        state.setSanelyEnding(True)
        state.setNextInstructionReachable(False)
        state.setBlockEndingInstruction(True)

    def _getDisasmWindowBuffer(self, addr):
        relative_start = addr - self.disassembly.binary_info.base_addr
        relative_end = relative_start + 15
        return self.disassembly.binary_info.binary[relative_start:relative_end]

    def analyzeFunction(self, start_addr, as_gap=False):
        """Disassemble and analyze the function candidate starting at start_addr.

        Blocks are taken from the queue of a fresh FunctionAnalysisState and
        disassembled instruction by instruction. Control-flow instructions are
        dispatched to the dedicated _analyze*Instruction handlers, which may
        queue further blocks. Once the queue is drained, the state is labeled,
        finalized and reported to the function candidate manager.

        Args:
            start_addr: address of the function candidate to analyze.
            as_gap: forwarded unchanged to state.finalizeAnalysis().

        Returns:
            An empty list if start_addr already belongs to a processed function
            (the candidate is then marked as aborted). Otherwise the
            FunctionAnalysisState; this is also returned, without finalization,
            when the analysis is aborted because of too many suspicious
            instructions. Callers must be prepared for both return types.
        """
        LOGGER.debug(
            "analyzeFunction() starting analysis of candidate @0x%08x",
            start_addr)
        self.tailcall_analyzer.initFunction()
        # holds the most recently processed instruction tuple; consulted by the diagnostics after each block
        i = None
        state = FunctionAnalysisState(start_addr, self.disassembly)
        if state.isProcessedFunction():
            self.fc_manager.updateAnalysisAborted(
                start_addr,
                "collision with existing code of function 0x{:08x}".format(
                    self.disassembly.ins2fn[start_addr]))
            # NOTE: an empty list, not a state object, is returned on collision
            return []
        while state.hasUnprocessedBlocks():
            LOGGER.debug(
                "  current block queue: %s",
                ", ".join(["0x%x" % addr for addr in state.block_queue]))
            state.chooseNextBlock()
            LOGGER.debug("  analyzeFunction() now processing block @0x%08x",
                         state.block_start)
            # in capstone, disassembly is more expensive than calling the function, so we only disassemble a small
            # lookahead window of 15 bytes (the maximum x86/x64 instruction length, see _getDisasmWindowBuffer) at a time.
            # disasm_lite() also provides up to 30% faster disassembly than disasm(), so we work with tuples instead of objects
            cache = [
                i for i in self.capstone.disasm_lite(
                    self._getDisasmWindowBuffer(state.block_start),
                    state.block_start)
            ]
            # number of bytes consumed within the current block; determines where the next window starts
            cache_pos = 0
            # the previously handled instruction, needed to recognize push/ret and call/alignment patterns
            previous_address = None
            previous_mnemonic = None
            previous_op_str = None
            # outer loop: one iteration per lookahead window; inner loop: one iteration per decoded instruction
            while True:
                for i in cache:
                    i_address, i_size, i_mnemonic, i_op_str = i
                    i_op_str = i_op_str.strip()
                    i_relative_address = i_address - self.disassembly.binary_info.base_addr
                    i_bytes = self.disassembly.binary_info.binary[
                        i_relative_address:i_relative_address + i_size]
                    LOGGER.debug(
                        "  analyzeFunction() now processing instruction @0x%08x: %s",
                        i_address, i_mnemonic + " " + i_op_str)
                    cache_pos += i_size
                    # default assumption, revoked by the handlers for jumps and ending instructions
                    state.setNextInstructionReachable(True)
                    # count appearances of "suspicious" byte patterns (like 00 00) that indicate non-function code
                    if i_bytes == DOUBLE_ZERO:
                        state.suspicious_ins_count += 1
                        LOGGER.debug(
                            "    analyzeFunction() found suspicious function @0x%08x",
                            i_address)
                        # the second suspicious instruction aborts the whole candidate; the state is returned unfinalized
                        if state.suspicious_ins_count > 1:
                            self.fc_manager.updateAnalysisAborted(
                                start_addr,
                                "too many suspicious instructions @0x%08x" %
                                i_address)
                            return state
                    # dispatch on the mnemonic class; the first matching branch wins
                    if i_mnemonic in CALL_INS:
                        self._analyzeCallInstruction(i, state)
                    elif i_mnemonic in JMP_INS:
                        self._analyzeJmpInstruction(i, state)
                    elif i_mnemonic in LOOP_INS:
                        self._analyzeLoopInstruction(i, state)
                    elif i_mnemonic in CJMP_INS:
                        self._analyzeCondJmpInstruction(i, state)
                    elif i_mnemonic.startswith("j"):
                        # a jump mnemonic that is in none of the known sets: only reported, not followed
                        LOGGER.error(
                            "unsupported jump @0x%08x (0x%08x): %s %s",
                            i_address, start_addr, i_mnemonic, i_op_str)
                    elif i_mnemonic in RET_INS:
                        self._analyzeEndInstruction(state)
                        LOGGER.debug(
                            "  analyzeFunction() found ending instruction @0x%08x",
                            i_address)
                        # a push directly followed by a return acts as a jump to the pushed value;
                        # if that value lies within the memory image, treat it as a jump target
                        if previous_address and previous_mnemonic == "push":
                            push_ret_destination = self.getReferencedAddr(
                                previous_op_str)
                            if self.disassembly.isAddrWithinMemoryImage(
                                    push_ret_destination):
                                LOGGER.debug(
                                    "  analyzeFunction() found push-return jump obfuscation: @0x%08x",
                                    i_address)
                                state.addBlockToQueue(push_ret_destination)
                                state.addCodeRef(i_address,
                                                 push_ret_destination,
                                                 by_jump=True)
                    elif i_mnemonic in ["int3", "hlt"]:
                        # we do not analyze any potential exception handler (tricks), so treat breakpoints as exit condition
                        self._analyzeEndInstruction(state)
                        LOGGER.debug(
                            "  analyzeFunction() found ending instruction @0x%08x",
                            i_address)
                    elif previous_address and i_address != start_addr and previous_mnemonic == "call":
                        # ordinary instruction directly after a call: re-disassemble the window with full
                        # instruction objects so that the candidate manager can inspect the sequence
                        instruction_sequence = [
                            ins for ins in self.capstone.disasm(
                                self._getDisasmWindowBuffer(i_address),
                                i_address)
                        ]
                        if self.fc_manager.isAlignmentSequence(
                                instruction_sequence
                        ) or self.fc_manager.isFunctionCandidate(i_address):
                            # LLVM and GCC sometimes tends to produce lots of tailcalls that basically mess with function end detection, we cut whenever we find effective nops after calls
                            LOGGER.debug(
                                "    current function: 0x%x ---> ran into alignment sequence after call -> 0x%08x, cutting block here.",
                                start_addr, i_address)
                            state.setBlockEndingInstruction(True)
                            state.endBlock()
                            state.setSanelyEnding(True)
                            if self.fc_manager.isAlignmentSequence(
                                    instruction_sequence):
                                # next 16-byte boundary strictly above the address of the preceding call
                                next_aligned_address = previous_address + (
                                    16 - previous_address % 16)
                                LOGGER.debug("  Adding: 0x%x as candidate.",
                                             next_aligned_address)
                                self.fc_manager.addCandidate(
                                    next_aligned_address, is_gap=True)
                            # the current instruction is not booked for this function; the block is already closed
                            break
                    previous_address = i_address
                    previous_mnemonic = i_mnemonic
                    previous_op_str = i_op_str
                    # book the instruction unless it is already known as code, as data, or within this function
                    if not i_address in self.disassembly.code_map and not i_address in self.disassembly.data_map and not state.isProcessed(
                            i_address):
                        LOGGER.debug(
                            "  analyzeFunction() booked instruction @0x%08x: %s for processed state",
                            i_address, i_mnemonic + " " + i_op_str)
                        state.addInstruction(i_address, i_size, i_mnemonic,
                                             i_op_str, i_bytes)
                    elif i_address in self.disassembly.code_map:
                        # ran into code that is already present in the global code map: end the block and flag a collision
                        LOGGER.debug(
                            "  analyzeFunction() was already present?! instruction @0x%08x: %s (function: 0x%08x)",
                            i_address, i_mnemonic + " " + i_op_str,
                            self.disassembly.ins2fn[i_address])
                        state.setBlockEndingInstruction(True)
                        state.setCollision(True)
                    else:
                        # address is in the data map or was already processed for this function: just end the block
                        LOGGER.debug(
                            "  analyzeFunction() was already present in local function."
                        )
                        state.setBlockEndingInstruction(True)
                    if state.isBlockEndingInstruction():
                        state.endBlock()
                        break
                else:
                    # if the inner loop did not break, we need to refill the cache in order to finish the block-analysis
                    cache = [
                        i for i in self.capstone.disasm_lite(
                            self._getDisasmWindowBuffer(state.block_start +
                                                        cache_pos),
                            state.block_start + cache_pos)
                    ]
                    # nothing decodable at the continuation address: give up on this block
                    if not cache:
                        break
                    continue
                # if the inner loop did break, the cache didn't run empty and thus block-analysis is finished
                break
            # diagnostics only: the block ran out of decodable instructions without reaching a block-ending instruction
            if not state.isBlockEndingInstruction():
                if i is not None:
                    LOGGER.debug(
                        "No block submitted, last instruction: 0x%08x -> 0x%08x %s || %s",
                        start_addr, i_address, i_mnemonic + " " + i_op_str,
                        self.fc_manager.getFunctionCandidate(start_addr))
                else:
                    LOGGER.debug(
                        "No block submitted with no ins, last instruction: 0x%08x || %s",
                        start_addr,
                        self.fc_manager.getFunctionCandidate(start_addr))
        # all queued blocks are processed: label and finalize the function, then report to the candidate manager
        state.label = self.resolveSymbol(state.start_addr)
        analysis_result = state.finalizeAnalysis(as_gap)
        # both post-processing steps run only for a truthy finalization result with RESOLVE_REGISTER_CALLS enabled
        if analysis_result and self.config.RESOLVE_REGISTER_CALLS:
            self.indcall_analyzer.resolveRegisterCalls(state)
            self.tailcall_analyzer.finalizeFunction(state)
        self.fc_manager.updateAnalysisFinished(start_addr)
        self.fc_manager.updateCandidates(state)
        return state

    def analyzeBuffer(self, binary_info, cbAnalysisTimeout=None):
        """Run the complete function-recovery analysis on a binary buffer.

        A fresh DisassemblyResult is created and stored as ``self.disassembly``,
        the helper analyzers and the function candidate manager are set up, and
        then the analysis runs in three passes:

        1. analyze all function start candidates provided by the candidate manager,
        2. iteratively analyze gap candidates,
        3. if ``config.RESOLVE_TAILCALLS`` is set, resolve tailcalls and
           register the resulting addresses as tailcall candidates.

        Finally, all candidates are attached to the result; those that became
        functions get their TF-IDF value set beforehand.

        Args:
            binary_info: description of the binary to analyze; its bitness is
                determined automatically unless it is already 32 or 64, and is
                overridden by ``self._forced_bitness`` when that is set.
            cbAnalysisTimeout: optional callable without arguments. It is polled
                before each candidate of the first two passes; a truthy return
                value stops the respective pass. It is polled once more at the
                end to set ``analysis_timeout`` on the result. May be None.

        Returns:
            The populated DisassemblyResult (identical to ``self.disassembly``).
        """
        LOGGER.debug("Analyzing buffer with %d bytes @0x%08x",
                     binary_info.binary_size, binary_info.base_addr)
        self._updateLabelProviders(binary_info)
        self.disassembly = DisassemblyResult()
        self.disassembly.smda_version = self.config.VERSION
        self.disassembly.setBinaryInfo(binary_info)
        self.disassembly.binary_info.architecture = "intel"
        self.disassembly.analysis_start_ts = datetime.datetime.utcnow()
        if self.disassembly.binary_info.bitness not in [32, 64]:
            bitness_analyzer = BitnessAnalyzer()
            self.disassembly.binary_info.bitness = bitness_analyzer.determineBitnessFromDisassembly(
                self.disassembly)
            LOGGER.debug("Automatically Recognized Bitness as: %d",
                         self.disassembly.binary_info.bitness)
        else:
            LOGGER.debug("Using defined Bitness as: %d",
                         self.disassembly.binary_info.bitness)
        # an explicitly forced bitness always wins over the provided or detected one
        if self._forced_bitness:
            self.disassembly.binary_info.bitness = self._forced_bitness
            LOGGER.debug("Forced Bitness override to: %d",
                         self.disassembly.binary_info.bitness)

        self.tailcall_analyzer = TailcallAnalyzer()
        self.indcall_analyzer = IndirectCallAnalyzer(self)
        self.jumptable_analyzer = JumpTableAnalyzer(self)

        self.fc_manager = FunctionCandidateManager(self.config)
        if self.config.USE_SYMBOLS_AS_CANDIDATES:
            self.fc_manager.symbol_addresses = self.getSymbolCandidates()
        self.fc_manager.init(self.disassembly)
        self._initCapstone()
        self._initTfIdf()
        # first pass, analyze locations identifiable by heuristics (e.g. call-reference, common prologue)
        for candidate in self.fc_manager.getNextFunctionStartCandidate():
            if cbAnalysisTimeout and cbAnalysisTimeout():
                break
            self.analyzeFunction(candidate.addr)
        LOGGER.debug("Finished heuristical analysis, functions: %d",
                     len(self.disassembly.functions))
        # second pass, analyze remaining gaps for additional candidates in an iterative way
        gap_candidate = self.fc_manager.nextGapCandidate()
        while gap_candidate is not None:
            if cbAnalysisTimeout and cbAnalysisTimeout():
                break
            LOGGER.debug(
                "based on gap, performing function analysis of 0x%08x",
                gap_candidate)
            state = self.analyzeFunction(gap_candidate, as_gap=True)
            # analyzeFunction() hands back an empty list instead of a state object when the
            # candidate collides with already processed code, so do not assume getBlocks() exists
            function_blocks = state.getBlocks() if hasattr(
                state, "getBlocks") else []
            if function_blocks:
                LOGGER.debug("+ got some blocks here -> 0x%08x", gap_candidate)
            if gap_candidate in self.disassembly.functions:
                fn_min, fn_max = self.disassembly.function_borders[
                    gap_candidate][:2]
                LOGGER.debug("+++ YAY, is now a function! -> 0x%08x - 0x%08x",
                             fn_min, fn_max)
            else:
                self.fc_manager.updateAnalysisAborted(
                    gap_candidate,
                    "Gap candidate did not fulfil function criteria.")
            # continue with the next gap reported by the candidate manager
            next_gap = self.fc_manager.getNextGap(dont_skip=True)
            gap_candidate = self.fc_manager.nextGapCandidate(next_gap)
        LOGGER.debug("Finished gap analysis, functions: %d",
                     len(self.disassembly.functions))
        # third pass, fix potential tailcall functions that were identified during analysis
        if self.config.RESOLVE_TAILCALLS:
            tailcalled_functions = self.tailcall_analyzer.resolveTailcalls(
                self)
            for addr in tailcalled_functions:
                self.fc_manager.addTailcallCandidate(addr)
            LOGGER.debug("Finished tailcall analysis, functions.")
        self.disassembly.failed_analysis_addr = self.fc_manager.getAbortedCandidates(
        )
        # package up and finish
        for addr, candidate in self.fc_manager.candidates.items():
            if addr in self.disassembly.functions:
                function_blocks = self.disassembly.getBlocksAsDict(addr)
                function_tfidf = self._tfidf.getTfIdfFromBlocks(
                    function_blocks)
                candidate.setTfIdf(function_tfidf)
                candidate.getConfidence()
            self.disassembly.candidates[addr] = candidate
        self.disassembly.analysis_end_ts = datetime.datetime.utcnow()
        # the callback is optional (defaults to None), so it must be guarded here just like in the passes above
        if cbAnalysisTimeout and cbAnalysisTimeout():
            self.disassembly.analysis_timeout = True
        return self.disassembly