def dissemble_code(self, code, baseaddr):
    """Disassemble 32-bit big-endian PowerPC machine code and print it.

    Ported to Python 3: the Python 2 ``print`` statement (a SyntaxError on
    Python 3, which the rest of this file targets) was replaced with the
    print function. Output format is unchanged: "0x<addr>:\\t<mnemonic>\\t<ops>".

    :param code: raw machine code buffer (bytes)
    :param baseaddr: virtual address of the first instruction
    """
    md = Cs(capstone.CS_ARCH_PPC, capstone.CS_MODE_32 | capstone.CS_MODE_BIG_ENDIAN)
    # NOTE(review): CS_OPT_SYNTAX_INTEL is an x86 syntax option; capstone
    # ignores it for PPC. Kept to preserve the original behavior.
    md.syntax = capstone.CS_OPT_SYNTAX_INTEL
    # disasm_lite() yields lightweight (address, size, mnemonic, op_str)
    # tuples instead of full CsInsn objects.
    for (address, size, mnemonic, op_str) in md.disasm_lite(code, baseaddr):
        print("0x%x:\t%s\t%s" % (address, mnemonic, op_str))
class GenericDisassembler:
    """Capstone-backed linear-sweep disassembler (instruction cache builder).

    Ported to Python 3 for consistency with the sibling implementation of
    this class later in the file:

    * ``print`` statement -> ``print()`` function,
    * ``except Exception, e`` -> ``except Exception as e``,
    * prologue patterns are now ``bytes`` literals (a str pattern such as
      ``"\\\\\\x55..."`` cannot be matched against a ``bytes`` buffer by
      ``re`` under Python 3). None of the prologue bytes are regex
      metacharacters, so no escaping is required.
    """

    def __init__(self, arch, mode):
        self.arch = arch
        self.mode = mode
        self.capstone = Cs(self.arch, self.mode)

        # Common x86/x64 function prologues, usable directly as bytes regexes.
        self.prologues = {
            CS_MODE_32: [
                b"\x55\x89\xE5",      # push ebp & mov ebp, esp
                b"\x55\x8B\xEC",      # push ebp & mov ebp, esp
                b"\x55\x8b\x6c\x24",  # push ebp & mov ebp, [esp+?]
            ],
            CS_MODE_64: [
                b"\x55\x48\x89\xE5",  # push rbp & mov rbp, rsp
            ]
        }[mode]

        # Mnemonics that conditionally branch and may also fall through.
        self.conditional_jmp_mnemonics = {
            'jz', 'je', 'jcxz', 'jecxz', 'jrcxz', 'jnz', 'jp', 'jpe', 'jnp',
            'ja', 'jae', 'jb', 'jbe', 'jg', 'jge', 'jl', 'jle', 'js', 'jns',
            'jo', 'jno', 'loop', 'loopne', 'loope', 'jne'
        }
        self.x86_32_registers = {'eax', 'ebx', 'ecx', 'edx', 'esi', 'edi', 'esp', 'ebp'}
        # x86 instructions are at most 15 bytes long; 16 is a safe upper bound.
        self.max_instruction_size = 16

    def linear_sweep_cache(self, data, offset, insts, bin_instance, verbose=False):
        """Disassemble linearly from <offset> to the end of <data>, storing one
        cache-only Instruction per decoded instruction into <insts> (a dict
        keyed by file offset). Stops silently on the first decode failure.

        :returns: the (mutated) <insts> dict
        """
        curr_offset = offset
        try:
            inst_va = self.get_va_from_offset(bin_instance, curr_offset)
            instructions = self.capstone.disasm_lite(data[offset:], inst_va)
            curr_offset = offset
            for (address, size, mnemonic, op_str) in instructions:
                inst = Instruction(
                    offset=curr_offset,
                    va=inst_va,
                    address=address,
                    mnemonic=mnemonic,
                    op_str=op_str,
                    size=size,
                    bytes=data[curr_offset:curr_offset + size],
                    cache_only=True,  # provisional: not yet reached by control flow
                )
                insts[curr_offset] = inst
                curr_offset += size
                inst_va += size
        except Exception as e:
            # best-effort sweep: report and return what was decoded so far
            print("WARNING:", repr(e))
        return insts
def __gadgetsFinding(self, section, gadgets, arch, mode): PREV_BYTES = 9 # Number of bytes prior to the gadget to store. opcodes = section["opcodes"] sec_vaddr = section["vaddr"] ret = [] md = Cs(arch, mode) for gad_op, gad_size, gad_align in gadgets: allRefRet = [m.start() for m in re.finditer(gad_op, opcodes)] for ref in allRefRet: end = ref + gad_size for i in range(self.__options.depth): start = ref - (i * gad_align) if (sec_vaddr+start) % gad_align == 0: code = opcodes[start:end] decodes = md.disasm_lite(code, sec_vaddr+ref) decodes = list(decodes) if sum(size for _, size, _, _ in decodes) != i*gad_align + gad_size: # We've read less instructions than planned so something went wrong continue if self.passClean(decodes): continue off = self.__offset vaddr = off+sec_vaddr+start g = {"vaddr" : vaddr} #if not self.__options.noinstr: g["gadget"] = " ; ".join("{}{}{}".format(mnemonic, " " if op_str else "", op_str) for _, _, mnemonic, op_str in decodes).replace(" ", " ") if self.__options.callPreceded: prevBytesAddr = max(sec_vaddr, vaddr - PREV_BYTES) g["prev"] = opcodes[prevBytesAddr-sec_vaddr:vaddr-sec_vaddr] #if self.__options.dump: g["bytes"] = code ret.append(g) return ret
class GenericDisassembler:
    """Capstone-backed recursive-traversal disassembler for x86/x64 PE code.

    Builds a dict of Instruction objects keyed by file offset, linking them
    into a control-flow graph (ifrom/ito edges), and can export that graph
    to DOT. Python 3 variant of the class defined earlier in this file.
    """

    def __init__(self, arch, mode):
        self.arch = arch
        self.mode = mode
        self.capstone = Cs(self.arch, self.mode)

        # Common function prologues used as bytes regexes (no metacharacters,
        # so no escaping needed despite the historical comment below).
        self.prologues = {
            # Triple backslash (\\\) are needed to escape bytes in the compiled regex
            CS_MODE_32: [
                b"\x55\x89\xE5",      # push ebp & mov ebp, esp
                b"\x55\x8B\xEC",      # push ebp & mov ebp, esp
                b"\x55\x8b\x6c\x24",  # push ebp & mov ebp, [esp+?]
            ],
            CS_MODE_64: [
                b"\x55\x48\x89\xE5",  # push rbp & mov rbp, rsp
            ]
        }[mode]

        # Mnemonics that conditionally branch and may also fall through.
        # NOTE(review): 'jecxz' appears twice; harmless in a set literal.
        self.conditional_jmp_mnemonics = {'jz', 'je', 'jcxz', 'jecxz', 'jrcxz', 'jnz', 'jp', 'jpe', 'jnp', 'ja', 'jae', 'jb', 'jbe', 'jg', 'jge', 'jl', 'jle', 'js', 'jns', 'jo', 'jno', 'jecxz', 'loop', 'loopne', 'loope', 'jne'}
        self.x86_32_registers = {'eax', 'ebx', 'ecx', 'edx', 'esi', 'edi', 'esp', 'ebp'}
        # x86 instructions are at most 15 bytes; 16 is a safe upper bound.
        self.max_instruction_size = 16

    def linear_sweep_cache(self, data, offset, insts, bin_instance, verbose=False):
        """Disassemble linearly from <offset>, storing cache-only Instructions
        in <insts> (dict keyed by file offset) until the end of the current
        section or the first decode failure. Returns the mutated <insts>.
        """
        section_offset_last = self.get_section_offset_last(bin_instance, offset)
        curr_offset = offset
        try:
            inst_va = self.get_va_from_offset(bin_instance, curr_offset)
            instructions = self.capstone.disasm_lite(data[offset:], inst_va)
            curr_offset = offset
            for (address, size, mnemonic, op_str) in instructions:
                inst = Instruction(
                    offset=curr_offset,
                    va=inst_va,
                    address=address,
                    mnemonic=mnemonic,
                    op_str=op_str,
                    size=size,
                    bytes=data[curr_offset:curr_offset + size],
                    cache_only=True,  # provisional: not yet reached by control flow
                )
                insts[curr_offset] = inst
                curr_offset += size
                inst_va += size
                # Do not sweep past the end of the containing section.
                if section_offset_last is not None and curr_offset > section_offset_last:
                    break
        except Exception as e:
            print("WARNING:", repr(e))
        return insts

    def _dis(self, data, offset, insts, bin_instance, iat_api=dict(),
             verbose=False, ifrom=None, from_pred=True, is_rva=False):
        """Recursive-traversal disassembly (iterative, via a work queue).

        <insts> is a dict like {'offset': <Instruction>}. Starting from
        <offset>, follows fall-through and branch targets, promoting
        cache-only instructions and wiring ifrom/ito CFG edges. Calls through
        the IAT get their op_str rewritten to the imported API name.

        NOTE(review): <iat_api> has a mutable default (dict()), shared across
        calls; it is only read here, so this is currently harmless.
        NOTE(review): <is_rva> is accepted but never used in this body.
        """
        args_queue = []
        args_queue.append((offset, ifrom, from_pred))

        while args_queue != []:
            offset, ifrom, from_pred = args_queue.pop(0)
            if offset is None:
                continue
            inst = None
            if offset in insts:
                inst = insts[offset]
                if inst.cache_only:
                    # First control-flow visit of a linearly-swept instruction.
                    inst.cache_only = False
                else:
                    # Already explored: just add the CFG edge and stop here.
                    if ifrom:
                        inst.add_ifrom(ifrom.offset)
                        insts[ifrom.offset].add_ito(inst.offset, from_pred)
                    continue
            if inst is None:
                # Not cached: decode a single instruction at this offset.
                try:
                    inst_va = self.get_va_from_offset(bin_instance, offset)
                    (address, size, mnemonic, op_str) = next(
                        self.capstone.disasm_lite(
                            data[offset:offset + self.max_instruction_size],
                            inst_va,
                            count=1))
                    inst = Instruction(
                        offset=offset,
                        va=inst_va,
                        address=address,
                        mnemonic=mnemonic,
                        op_str=op_str,
                        size=size,
                        bytes=data[offset:offset + size],
                        cache_only=False,
                    )
                    insts[inst.offset] = inst
                except Exception as e:
                    if verbose:
                        print("WARNING:", repr(e))
                    continue
            if ifrom:
                insts[inst.offset].add_ifrom(ifrom.offset)
                insts[ifrom.offset].add_ito(inst.offset, from_pred)

            # No child
            if inst.mnemonic in ['ret', 'retf']:
                pass

            # 1 remote child
            elif inst.mnemonic in ['jmp', 'jmpf']:
                if "word ptr [0x" in inst.op_str:
                    # jmp through the IAT: annotate with the API name.
                    iat_va = int(inst.op_str.split('[')[1].split(']')[0], 16)
                    if iat_va in iat_api:
                        inst.op_str = iat_api[iat_va]
                else:
                    # Direct jump: follow the target as a non-predecessor edge.
                    try:
                        remote_offset = self.get_offset_from_va(
                            bin_instance, int(inst.op_str, 16))
                        if remote_offset is not None:
                            args_queue.insert(
                                0, (remote_offset, insts[inst.offset], False))
                    except Exception as e:
                        if verbose:
                            print("WARNING:", repr(e))
                        pass

            # 2 children (next, then remote) - except call
            elif inst.mnemonic in self.conditional_jmp_mnemonics:
                next_offset = inst.offset + inst.size
                args_queue.insert(0, (next_offset, insts[inst.offset], True))
                # Call to Imported API (in IAT)
                # dword ptr [0x........] or qword ptr [0x........]
                if "word ptr [0x" in inst.op_str:
                    iat_va = int(inst.op_str.split('[')[1].split(']')[0], 16)
                    if iat_va in iat_api:
                        inst.op_str = iat_api[iat_va]
                # NOTE(review): inline register list duplicates
                # self.x86_32_registers (used by the 'call' branch below).
                elif inst.op_str in [
                        'eax', 'ebx', 'ecx', 'edx', 'esi', 'edi', 'esp', 'ebp'
                ]:
                    pass
                else:
                    try:
                        remote_offset = self.get_offset_from_va(
                            bin_instance, int(inst.op_str, 16))
                    except Exception as e:
                        if verbose:
                            print("WARNING:", repr(e))
                        continue
                    # NOTE(review): unlike the 'call' branch, remote_offset is
                    # not None-checked here; None targets are skipped at the
                    # top of the loop, so behavior is equivalent.
                    args_queue.insert(1, (remote_offset, insts[inst.offset], False))

            # 2 children (next, then remote) - call
            elif inst.mnemonic in ['call']:
                next_offset = inst.offset + inst.size
                remote_offset = None
                args_queue.insert(0, (next_offset, insts[inst.offset], True))
                # Call to Imported API (in IAT)
                # dword ptr [0x........] or qword ptr [0x........]
                if "word ptr [0x" in inst.op_str:
                    iat_va = int(inst.op_str.split('[')[1].split(']')[0], 16)
                    if iat_va in iat_api:
                        inst.op_str = iat_api[iat_va]
                elif inst.op_str in self.x86_32_registers:
                    # Indirect call through a register: target unknown.
                    pass
                else:
                    try:
                        remote_offset = self.get_offset_from_va(
                            bin_instance, int(inst.op_str, 16))
                    except Exception as e:
                        if verbose:
                            print("WARNING:", repr(e))
                        pass
                if remote_offset:
                    args_queue.insert(
                        1, (remote_offset, insts[inst.offset], False))

            # 1 child (next) - basic instruction
            else:
                next_offset = inst.offset + inst.size
                args_queue.insert(0, (next_offset, insts[inst.offset], True))
        return insts

    def dis_prologues(self, data, bin_instance, iat_api, insts, verbose):
        """Find function prologues in <data> and disassemble from each match
        that is not already fully explored. Returns the mutated <insts>.
        """
        prologues_re = "|".encode().join(self.prologues)
        compiled_re = re.compile(prologues_re)
        for m in compiled_re.finditer(data):
            function_offset = m.start()
            inst = insts.get(function_offset, None)
            if inst is None or inst.cache_only:
                insts = self._dis(data=data,
                                  offset=function_offset,
                                  iat_api=iat_api,
                                  bin_instance=bin_instance,
                                  insts=insts,
                                  verbose=verbose)
        return insts

    def dis(self, data, offset, iat_api, bin_instance, verbose=False):
        '''
        Full disassembly pipeline: linear sweep cache, recursive traversal
        from <offset>, exported functions, then prologue-based discovery.

        data: raw binary of full PE
        va: va of the instruction located at <data[index]>
        iat_api: dict of imported API like {VA_IN_IAT: API_NAME}
        '''
        insts = dict()
        insts = self.linear_sweep_cache(data=data,
                                        offset=offset,
                                        insts=insts,
                                        bin_instance=bin_instance,
                                        verbose=verbose)
        insts = self._dis(data=data,
                          offset=offset,
                          iat_api=iat_api,
                          bin_instance=bin_instance,
                          insts=insts,
                          verbose=verbose)
        # Exploration of the exported functions
        self._dis_exported_funcs(bin_instance=bin_instance,
                                 insts=insts,
                                 data=data,
                                 verbose=verbose,
                                 iat_api=iat_api)
        # Search for unrecognized functions from their prolog function
        insts = self.dis_prologues(data=data,
                                   bin_instance=bin_instance,
                                   iat_api=iat_api,
                                   insts=insts,
                                   verbose=verbose)
        return insts

    def display(self, insts, offset_from=0):
        """Print all instructions at or after <offset_from>, in offset order."""
        for offset, inst in sorted(insts.items()):
            if offset >= offset_from:
                print(inst)

    def export_to_dot(self, insts, oep_offset, displayable=True):
        '''
        Export the instruction graph to DOT format. Cache-only (unreached)
        instructions are skipped. <oep_offset> marks the root node; with
        displayable=True nodes carry label/shape attributes for rendering.
        '''
        nodes = io.StringIO()
        edges = io.StringIO()
        dot = io.StringIO()
        header = "digraph G {\n"
        footer = "}"
        if displayable:
            for offset, inst in sorted(insts.items()):
                if not inst.cache_only:
                    if inst.op_str == "":
                        inst_str = "%s" % inst.mnemonic
                    else:
                        inst_str = "%s %s" % (inst.mnemonic, inst.op_str)
                    if offset != oep_offset:
                        nodes.write(
                            ('"%X" [label="%s", address="0x%X", inst="%s", '
                             'style="", shape=box, fillcolor="white"]\n') %
                            (inst.va, "%016X: %s %s" %
                             (inst.va, inst.mnemonic, inst.op_str), inst.va,
                             inst_str))
                    else:
                        # entry point gets root=true
                        nodes.write((
                            '"%X" [label="%s", address="0x%X", inst="%s", '
                            'style="", shape=box, fillcolor="white", root=true]\n'
                        ) % (inst.va, "%016X: %s %s" %
                             (inst.va, inst.mnemonic, inst.op_str), inst.va,
                             inst_str))
                    # black edge: fall-through successor; red edge: branch target
                    if inst.to_succ is not None:
                        edges.write((
                            '"%X" -> "%X" [label=0, color=%s, child_number=1]\n'
                        ) % (inst.va, insts[inst.to_succ].va, "black"))
                    if inst.to_other is not None:
                        edges.write((
                            '"%X" -> "%X" [label=1, color=%s, child_number=2]\n'
                        ) % (inst.va, insts[inst.to_other].va, "red"))
        else:
            for offset, inst in sorted(insts.items()):
                if not inst.cache_only:
                    if inst.op_str == "":
                        inst_str = "%s" % inst.mnemonic
                    else:
                        inst_str = "%s %s" % (inst.mnemonic, inst.op_str)
                    if offset != oep_offset:
                        nodes.write(('"%X" [inst="%s", address="0x%X"]\n') %
                                    (inst.va, inst_str, inst.va))
                    else:
                        nodes.write(
                            ('"%X" [inst="%s", address="0x%X", root=true]\n')
                            % (inst.va, inst_str, inst.va))
                    if inst.to_succ is not None:
                        edges.write(('"%X" -> "%X" [child_number=1]\n') %
                                    (inst.va, insts[inst.to_succ].va))
                    if inst.to_other is not None:
                        edges.write(('"%X" -> "%X" [child_number=2]\n') %
                                    (inst.va, insts[inst.to_other].va))
        dot.write(header)
        dot.write(nodes.getvalue())
        dot.write(edges.getvalue())
        dot.write(footer)
        return dot.getvalue()
class FunctionCandidateManager(object):
    """Collects, scores, and queues function-start candidates for a binary.

    Candidates come from symbols, call references, prologue patterns,
    language-specific heuristics (Delphi), stub chains, exception handler
    tables (.pdata), and gap scanning between already-disassembled code.

    Fixes applied in this revision (Python 3 bytes-correctness, consistent
    with the b"..." regexes already used in locateReferenceCandidates):
    * checkCodePadding: str regex over a bytes buffer raised TypeError.
    * resolvePointerReference: compared bytes from getRawBytes() with str
      literals "\\xFF\\x25"/"\\xFF\\x15", which is always False on Python 3
      and made every 64-bit resolution raise spuriously.
    """

    def __init__(self, config):
        self.config = config
        self.lang_analyzer = None
        self.disassembly = None
        self.bitness = None
        self._code_areas = []
        self.candidates = {}
        self.candidate_queue = []
        self.cached_candidates = None
        self._candidate_offsets = []
        self.candidate_index = 0
        self._all_call_refs = {}
        self.symbol_addresses = []
        self.identified_alignment = 0
        # gap filling
        self.function_gaps = None
        self.max_function_addr = 0
        self.gap_pointer = None
        self.previously_analyzed_gap = 0
        self.capstone = None

    def init(self, disassembly):
        """Bind a disassembly result, detect language/bitness, locate all
        candidates, and build the priority queue."""
        if disassembly.binary_info.code_areas:
            self._code_areas = disassembly.binary_info.code_areas
        self.disassembly = disassembly
        self.lang_analyzer = LanguageAnalyzer(disassembly)
        self.disassembly.language = self.lang_analyzer.identify()
        self.bitness = disassembly.binary_info.bitness
        self.capstone = Cs(CS_ARCH_X86, CS_MODE_32)
        if self.bitness == 64:
            self.capstone = Cs(CS_ARCH_X86, CS_MODE_64)
        self.locateCandidates()
        self.disassembly.identified_alignment = self.identified_alignment
        self._buildQueue()

    def _passesCodeFilter(self, addr):
        """True if <addr> lies in a known code area (or no areas are known)."""
        if addr is None:
            return False
        if self._code_areas:
            for area in self._code_areas:
                if area[0] <= addr < area[1]:
                    return True
            return False
        return True

    def getBitMask(self):
        """Address mask for the current bitness (64 or 32 bit)."""
        if self.bitness == 64:
            return 0xFFFFFFFFFFFFFFFF
        return 0xFFFFFFFF

    def setInitialCandidate(self, addr):
        if addr in self.candidates:
            self.candidates[addr].setInitialCandidate(True)

    def isFunctionCandidate(self, addr):
        return addr in self.candidates

    def getFunctionCandidate(self, addr):
        if addr in self.candidates:
            return self.candidates[addr]
        return None

    def getAbortedCandidates(self):
        """Addresses of all candidates whose analysis was aborted."""
        aborted = []
        for addr, candidate in self.candidates.items():
            if candidate.analysis_aborted:
                aborted.append(addr)
        return aborted

    def updateAnalysisAborted(self, addr, reason):
        LOGGER.debug("function analysis of 0x%08x aborted: %s", addr, reason)
        if addr in self.candidates:
            self.candidates[addr].setAnalysisAborted(reason)

    def updateAnalysisFinished(self, addr):
        LOGGER.debug("function analysis of 0x%08x successfully completed.", addr)
        if addr in self.candidates:
            self.candidates[addr].setAnalysisCompleted()

    def updateCandidates(self, state):
        """In high-accuracy mode, drop call refs that conflict with the given
        analysis state and re-sort the queue."""
        if self.config.HIGH_ACCURACY:
            conflicts = state.identifyCallConflicts(self._all_call_refs)
            if conflicts:
                for candidate_addr, conflict in conflicts.items():
                    self.candidates[candidate_addr].removeCallRefs(conflict)
                self.candidate_queue.update()

    def addCandidate(self, addr, is_gap=False, reference_source=None):
        if not self._passesCodeFilter(addr):
            return False
        self.ensureCandidate(addr)
        self.candidates[addr].setIsGapCandidate(is_gap)
        if reference_source:
            self.candidates[addr].addCallRef(reference_source)
        self.candidate_queue.add(self.candidates[addr])
        self.candidate_queue.update()

    def getNextFunctionStartCandidate(self):
        """Yield unfinished, non-zero-score candidates honoring any
        identified function alignment."""
        for candidate in self.candidate_queue:
            if not (candidate.isFinished() or candidate.getScore() == 0):
                if self.identified_alignment and candidate.alignment < self.identified_alignment:
                    continue
                yield candidate

    def _logCandidateStats(self):
        """Debug-log min/max scores and counts per score bucket."""
        LOGGER.debug("Candidate Statistics:")
        try:
            maxc = max([c.getScore() for c in self.candidates.values()])
            minc = min([c.getScore() for c in self.candidates.values()])
            candidates_2 = len([c.getScore() for c in self.candidates.values() if c.getScore() == 2])
            candidates_1 = len([c.getScore() for c in self.candidates.values() if c.getScore() == 1])
            candidates_0 = len([c.getScore() for c in self.candidates.values() if c.getScore() == 0])
            LOGGER.debug(" Max: %f, Min: %f", maxc, minc)
            LOGGER.debug(" 2: %d, 1: %d, 0: %d", candidates_2, candidates_1, candidates_0)
        except:
            LOGGER.debug(" No candidates found.")

    def getFunctionStartCandidates(self):
        return self._candidate_offsets

    def updateFunctionGaps(self):
        """Rebuild self.function_gaps: [start, end, size] spans of bytes not
        covered by the code map, including area edges before/after code."""
        gaps = []
        prev_ins = 0
        min_code = min(self.disassembly.code_map) if self.disassembly.code_map else self.getBitMask()
        max_code = max(self.disassembly.code_map) if self.disassembly.code_map else 0
        for code_area in self._code_areas:
            if code_area[0] < min_code < code_area[1] and min_code != code_area[0]:
                gaps.append([code_area[0], min_code, min_code - code_area[0]])
            if code_area[0] < max_code < code_area[1] and max_code != code_area[1]:
                gaps.append([max_code, code_area[1], code_area[1] - max_code])
        for ins in sorted(self.disassembly.code_map.keys()):
            if prev_ins != 0:
                if ins - prev_ins > 1:
                    gaps.append([prev_ins + 1, ins, ins - prev_ins])
            prev_ins = ins
        self.function_gaps = sorted(gaps)

    def initGapSearch(self):
        """Initialize the gap pointer to the first gap (once)."""
        if self.gap_pointer is None:
            LOGGER.debug("initGapSearch()")
            self.gap_pointer = self.getBitMask()
            self.updateFunctionGaps()
            if self.function_gaps:
                self.gap_pointer = self.function_gaps[0][0]
            LOGGER.debug("initGapSearch() gaps are:")
            for gap in self.function_gaps:
                LOGGER.debug("initGapSearch() 0x%08x - 0x%08x == %d", gap[0], gap[1], gap[2])
        return

    def getNextGap(self, dont_skip=False):
        """Next gap start after the current gap pointer; with dont_skip, do
        not jump past the function that contains the pointer."""
        next_gap = self.getBitMask()
        for gap in self.function_gaps:
            if gap[0] > self.gap_pointer:
                next_gap = gap[0]
                break
        LOGGER.debug("getNextGap(%s) for 0x%08x based on gap_map: 0x%08x", dont_skip, self.gap_pointer, next_gap)
        # we potentially just disassembled a function and want to continue directly behind it in case we would otherwise miss more
        if dont_skip:
            if self.gap_pointer in self.disassembly.code_map:
                function = self.disassembly.ins2fn[self.gap_pointer]
                next_gap = min(next_gap, self.disassembly.function_borders[function][1])
                LOGGER.debug("getNextGap(%s) without skip => after checking versus code map: 0x%08x", dont_skip, next_gap)
        LOGGER.debug("getNextGap(%s) final gap_ptr: 0x%08x", dont_skip, next_gap)
        return next_gap

    def isEffectiveNop(self, byte_sequence):
        """True if the byte sequence is a known padding/effective-NOP pattern."""
        if byte_sequence in GAP_SEQUENCES[len(byte_sequence)]:
            return True
        return False

    def isAlignmentSequence(self, instruction_sequence):
        """True if the sequence consists of effective NOPs that end exactly on
        a 16-byte boundary (typical compiler alignment padding)."""
        is_alignment_sequence = False
        if len(instruction_sequence) > 0:
            current_offset = instruction_sequence[0].address
            for instruction in instruction_sequence:
                if instruction.bytes in GAP_SEQUENCES[len(instruction.bytes)]:
                    current_offset += len(instruction.bytes)
                    if current_offset % 16 == 0:
                        is_alignment_sequence = True
                        break
                else:
                    break
        return is_alignment_sequence

    def nextGapCandidate(self, start_gap_pointer=None):
        """Advance the gap pointer over padding/NOPs/known code+data and
        return the next plausible function-start address, or None."""
        if self.gap_pointer is None:
            self.initGapSearch()
        if start_gap_pointer:
            self.gap_pointer = start_gap_pointer
        LOGGER.debug("nextGapCandidate() finding new gap candidate, current gap_ptr: 0x%08x", self.gap_pointer)
        while True:
            if self.disassembly.binary_info.base_addr + self.disassembly.binary_info.binary_size < self.gap_pointer:
                LOGGER.debug("nextGapCandidate() gap_ptr: 0x%08x - finishing", self.gap_pointer)
                return None
            gap_offset = self.gap_pointer - self.disassembly.binary_info.base_addr
            if gap_offset >= self.disassembly.binary_info.binary_size:
                return None
            # compatibility with python2/3...
            try:
                byte = self.disassembly.getRawByte(gap_offset)
            except:
                print("0x%08x" % self.disassembly.binary_info.base_addr,
                      "0x%08x" % self.disassembly.binary_info.binary_size,
                      "0x%08x" % self.gap_pointer,
                      "0x%08x" % gap_offset)
            # try to find padding symbols and skip them
            if isinstance(byte, int):
                byte = struct.pack("B", byte)
            if byte in GAP_SEQUENCES[1]:
                LOGGER.debug("nextGapCandidate() found 0xCC / 0x00 - gap_ptr += 1: 0x%08x", self.gap_pointer)
                self.gap_pointer += 1
                continue
            # try to find instructions that directly encode as NOP and skip them
            ins_buf = [i for i in self.capstone.disasm_lite(self.disassembly.getRawBytes(gap_offset, 15), gap_offset)]
            if ins_buf:
                i_address, i_size, i_mnemonic, i_op_str = ins_buf[0]
                if i_mnemonic == "nop":
                    nop_instruction = i_mnemonic + " " + i_op_str
                    nop_length = i_size
                    LOGGER.debug("nextGapCandidate() found nop instruction (%s) - gap_ptr += %d: 0x%08x", nop_instruction, nop_length, self.gap_pointer)
                    self.gap_pointer += nop_length
                    continue
            # try to find effective NOPs and skip them.
            found_multi_byte_nop = False
            for gap_length in range(max(GAP_SEQUENCES.keys()), 1, -1):
                if self.disassembly.getRawBytes(gap_offset, gap_length) in GAP_SEQUENCES[gap_length]:
                    LOGGER.debug("nextGapCandidate() found %d byte effective nop - gap_ptr += %d: 0x%08x", gap_length, gap_length, self.gap_pointer)
                    self.gap_pointer += gap_length
                    found_multi_byte_nop = True
                    break
            if found_multi_byte_nop:
                continue
            # we know this place from data already
            if self.gap_pointer in self.disassembly.data_map:
                LOGGER.debug("nextGapCandidate() gap_ptr is already inside data map: 0x%08x", self.gap_pointer)
                self.gap_pointer += 1
                continue
            if self.gap_pointer in self.disassembly.code_map:
                LOGGER.debug("nextGapCandidate() gap_ptr is already inside code map: 0x%08x", self.gap_pointer)
                self.gap_pointer = self.getNextGap()
                continue
            # we may have a candidate here
            LOGGER.debug("nextGapCandidate() using 0x%08x as candidate", self.gap_pointer)
            start_byte = self.disassembly.getRawByte(gap_offset)
            has_common_prologue = True  # start_byte in FunctionCandidate(self.gap_pointer, start_byte, self.bitness).common_gap_starts[self.bitness]
            if self.previously_analyzed_gap == self.gap_pointer:
                LOGGER.debug("--- HRM, nextGapCandidate() gap_ptr at: 0x%08x was previously analyzed", self.gap_pointer)
                self.gap_pointer = self.getNextGap(dont_skip=True)
            elif not has_common_prologue:
                LOGGER.debug("--- HRM, nextGapCandidate() gap_ptr at: 0x%08x has no common prologue (0x%08x)", self.gap_pointer, ord(start_byte))
                self.gap_pointer = self.getNextGap(dont_skip=True)
            else:
                self.previously_analyzed_gap = self.gap_pointer
                self.addGapCandidate(self.gap_pointer)
                return self.gap_pointer
        return None

    def checkFunctionOverlap(self):
        """True if any two disassembled functions have overlapping address
        ranges."""
        function_boundaries = []
        for function in self.disassembly.functions:
            min_addr = self.getBitMask()
            max_addr = 0
            for block in self.disassembly.functions[function]:
                min_addr = min(min_addr, min([instruction[0] for instruction in block]))
                max_addr = max(max_addr, max([instruction[0] + instruction[1] for instruction in block]))
            function_boundaries.append((min_addr, max_addr))
        current_entry = (0, 0)
        for entry in sorted(function_boundaries):
            if current_entry[1] > entry[0]:
                return True
            current_entry = entry
        return False

    def checkCodePadding(self):
        """Count runs of padding bytes (0xCC / 0x90) in the binary.

        FIX: the binary blob is bytes, so the pattern must be a bytes regex;
        the previous str pattern raised TypeError on Python 3.
        NOTE(review): results are computed but not returned or stored,
        matching the original behavior.
        """
        pattern_count = 0
        pattern_functions = []
        for pattern in re.finditer(rb"((\xCC){2,}|(\x90){2,})", self.disassembly.binary_info.binary):
            pattern_count += 1
            pattern_functions.append(pattern.span()[1] + 1)

    def ensureCandidate(self, addr):
        """ create candidate if it does not exist yet, returns True if newly created, else False """
        if addr not in self.candidates:
            self.candidates[addr] = FunctionCandidate(self.disassembly.binary_info, addr)
            return True
        return False

    def addGapCandidate(self, addr):
        if not self._passesCodeFilter(addr):
            return False
        self.ensureCandidate(addr)
        self.candidates[addr].setIsGapCandidate(True)

    def addTailcallCandidate(self, addr):
        if not self._passesCodeFilter(addr):
            return False
        self.ensureCandidate(addr)
        self.candidates[addr].setIsTailcallCandidate(True)

    def addReferenceCandidate(self, addr, source_ref):
        if not self._passesCodeFilter(addr):
            return False
        if self.ensureCandidate(addr):
            self._all_call_refs[source_ref] = addr
        self.candidates[addr].addCallRef(source_ref)

    def addLanguageSpecCandidate(self, addr, lang_spec):
        if not self._passesCodeFilter(addr):
            return False
        self.ensureCandidate(addr)
        self.candidates[addr].setLanguageSpec(lang_spec)

    def addPrologueCandidate(self, addr):
        if not self._passesCodeFilter(addr):
            return False
        return self.ensureCandidate(addr)

    def addSymbolCandidate(self, addr):
        if not self._passesCodeFilter(addr):
            return False
        self.ensureCandidate(addr)
        self.candidates[addr].setIsSymbol(True)
        self.candidates[addr].setInitialCandidate(True)

    def addExceptionCandidate(self, addr):
        if not self._passesCodeFilter(addr):
            return False
        self.ensureCandidate(addr)
        self.candidates[addr].setIsExceptionHandler(True)
        self.candidates[addr].setInitialCandidate(True)

    def resolvePointerReference(self, offset):
        """Resolve the target of a `call/jmp [ptr]` located at <offset>.

        32 bit: absolute pointer is dereferenced via the disassembly.
        64 bit: RIP-relative displacement (FF 25 = jmp, FF 15 = call).

        FIX: opcode comparisons now use bytes literals; the previous str
        literals never matched bytes on Python 3, so the 64-bit path always
        raised.
        """
        if self.bitness == 32:
            addr_block = self.disassembly.getRawBytes(offset + 2, 4)
            function_pointer = struct.unpack("I", addr_block)[0]
            return self.disassembly.dereferenceDword(function_pointer)
        if self.bitness == 64:
            addr_block = self.disassembly.getRawBytes(offset + 2, 4)
            function_pointer = struct.unpack("i", addr_block)[0]
            # we need to calculate RIP + offset + 7 (48 ff 25 ** ** ** **)
            if self.disassembly.getRawBytes(offset, 2) == b"\xFF\x25":
                function_pointer += offset + 7
            elif self.disassembly.getRawBytes(offset, 2) == b"\xFF\x15":
                function_pointer += offset + 6
            else:
                raise Exception("resolvePointerReference: should only be used on call/jmp * ptr")
            return self.disassembly.binary_info.base_addr + function_pointer
        raise Exception("resolvePointerReference: undefined bitness")

    def _identifyAlignment(self):
        """Infer a global function alignment (4 or 16) when >95% of
        multi-referenced candidates share it."""
        identified_alignment = 0
        if self.config.USE_ALIGNMENT:
            num_candidates = sum([1 for addr, candidate in self.candidates.items() if len(candidate.call_ref_sources) > 1])
            num_aligned_16_candidates = sum([1 for addr, candidate in self.candidates.items() if len(candidate.call_ref_sources) > 1 and candidate.alignment == 16])
            num_aligned_4_candidates = sum([1 for addr, candidate in self.candidates.items() if len(candidate.call_ref_sources) > 1 and candidate.alignment >= 4])
            if num_candidates:
                alignment_16_ratio = 1.0 * num_aligned_16_candidates / num_candidates
                alignment_4_ratio = 1.0 * num_aligned_4_candidates / num_candidates
                if num_candidates > 20 and alignment_4_ratio > 0.95:
                    identified_alignment = 4
                if num_candidates > 20 and alignment_16_ratio > 0.95:
                    identified_alignment = 16
        return identified_alignment

    def locateCandidates(self):
        """Run all candidate locators, then derive the global alignment."""
        self.locateSymbolCandidates()
        self.locateReferenceCandidates()
        self.locatePrologueCandidates()
        self.locateLangSpecCandidates()
        self.locateStubChainCandidates()
        self.locateExceptionHandlerCandidates()
        self.identified_alignment = self._identifyAlignment()

    def _buildQueue(self):
        LOGGER.debug("Located %d function candidates", len(self.candidates))
        # increase lookup speed with static list
        self._candidate_offsets = [c.addr for c in self.candidates.values()]
        self.cached_candidates = list(self.candidates.values())
        self.candidate_queue = PriorityQueue(content=self.cached_candidates)

    def locateSymbolCandidates(self):
        for symbol_addr in self.symbol_addresses:
            self.addSymbolCandidate(symbol_addr)

    def locateReferenceCandidates(self):
        """Derive candidates from call (0xE8) targets and 32-bit
        jmp/call-through-pointer (FF 25 / FF 15) references."""
        # check for potential call instructions and check if their destinations have a common function prologue
        for call_match in re.finditer(b"\xE8", self.disassembly.binary_info.binary):
            if not self._passesCodeFilter(self.disassembly.binary_info.base_addr + call_match.start()):
                continue
            if len(self.disassembly.binary_info.binary) - call_match.start() > 5:
                packed_call = self.disassembly.getRawBytes(call_match.start() + 1, 4)
                rel_call_offset = struct.unpack("i", packed_call)[0]
                # ignore zero offset calls, as they will likely not lead to functions but are rather used for positioning in shellcode etc
                if rel_call_offset == 0:
                    continue
                call_destination = (self.disassembly.binary_info.base_addr + rel_call_offset + call_match.start() + 5) & self.getBitMask()
                if self.disassembly.isAddrWithinMemoryImage(call_destination):
                    self.addReferenceCandidate(call_destination, self.disassembly.binary_info.base_addr + call_match.start())
                    self.setInitialCandidate(call_destination)
        # also check for "jmp dword ptr <offset>", as they sometimes point to local functions (i.e. non-API)
        if self.bitness == 32:
            for match in re.finditer(b"\xFF\x25", self.disassembly.binary_info.binary):
                function_addr = self.resolvePointerReference(match.start())
                if not self._passesCodeFilter(function_addr):
                    continue
                if self.disassembly.isAddrWithinMemoryImage(function_addr):
                    self.addReferenceCandidate(function_addr, self.disassembly.binary_info.base_addr + match.start())
                    self.setInitialCandidate(function_addr)
            # also check for "call dword ptr <offset>", as they sometimes point to local functions (i.e. non-API)
            for match in re.finditer(b"\xFF\x15", self.disassembly.binary_info.binary):
                function_addr = self.resolvePointerReference(match.start())
                if not self._passesCodeFilter(function_addr):
                    continue
                if self.disassembly.isAddrWithinMemoryImage(function_addr):
                    self.addReferenceCandidate(function_addr, self.disassembly.binary_info.base_addr + match.start())
                    self.setInitialCandidate(function_addr)

    def locatePrologueCandidates(self):
        # next check for the default function prologue regardless of references
        for re_prologue in DEFAULT_PROLOGUES:
            for prologue_match in re.finditer(re_prologue, self.disassembly.binary_info.binary):
                if not self._passesCodeFilter(self.disassembly.binary_info.base_addr + prologue_match.start()):
                    continue
                self.addPrologueCandidate((self.disassembly.binary_info.base_addr + prologue_match.start()) & self.getBitMask())
                self.setInitialCandidate((self.disassembly.binary_info.base_addr + prologue_match.start()) & self.getBitMask())

    def locateLangSpecCandidates(self):
        # if the sample is highly likely delphi, extract t-string-objects and use their function-addresses as high-confidence function starts
        delphi_candidates = set([])
        if self.lang_analyzer.checkDelphi():
            LOGGER.debug("Programming language recognized as Delphi, adding function start addresses from TObjects")
            t_objects = self.lang_analyzer.getDelphiObjects()
            for t_string in t_objects:
                delphi_candidates.update(set(t_objects[t_string]))
            LOGGER.debug("delphi candidates based on TObject analysis: %d", len(delphi_candidates))
        for obj in delphi_candidates:
            self.addLanguageSpecCandidate(obj, "delphi")

    def locateStubChainCandidates(self):
        # binaries often contain long sequences of stubs, consisting only of jmp dword ptr <offset>, add such chains as candidates
        for block in re.finditer(b"(?P<block>(\xFF\x25[\S\s]{4}){2,})", self.disassembly.binary_info.binary):
            for match in re.finditer(b"\xFF\x25(?P<function>[\S\s]{4})", block.group("block")):
                stub_addr = self.disassembly.binary_info.base_addr + block.start() + match.start()
                if not self._passesCodeFilter(stub_addr):
                    continue
                self.addPrologueCandidate(stub_addr & self.getBitMask())
                self.setInitialCandidate(stub_addr & self.getBitMask())
                self.candidates[stub_addr].setIsStub(True)
        # structure for plt entries is similar but interleaved with additional code not considered functions
        for block in re.finditer(b"(?P<block>(\xFF\x25[\S\s]{4}\x68[\S\s]{4}\xE9[\S\s]{4}){2,})", self.disassembly.binary_info.binary):
            for match in re.finditer(b"\xFF\x25(?P<function>[\S\s]{4})", block.group("block")):
                stub_addr = self.disassembly.binary_info.base_addr + block.start() + match.start()
                if not self._passesCodeFilter(stub_addr):
                    continue
                self.addPrologueCandidate(stub_addr & self.getBitMask())
                self.setInitialCandidate(stub_addr & self.getBitMask())
                self.candidates[stub_addr].setIsStub(True)
                # define data bytes inbetween
                for offset in range(10):
                    self.disassembly.data_map.add(stub_addr + 6 + offset)

    def locateExceptionHandlerCandidates(self):
        # 64bit only - if we have a .pdata section describing exception handlers, we extract entries of guaranteed function starts from it.
        # TODO 2020-10-29 continue here and extract function start candidates
        if self.disassembly.binary_info.bitness == 64:
            for section_info in self.disassembly.binary_info.getSections():
                section_name, section_va_start, section_va_end = section_info
                if section_name == ".pdata":
                    rva_start = section_va_start - self.disassembly.binary_info.base_addr
                    rva_end = section_va_end - self.disassembly.binary_info.base_addr
                    # RUNTIME_FUNCTION entries are 12 bytes; the first DWORD
                    # is the function-start RVA.
                    for offset in range(rva_start, rva_end + 1, 12):
                        rva_function_candidate = struct.unpack("I", self.disassembly.binary_info.binary[offset:offset + 4])[0]
                        self.addExceptionCandidate(self.disassembly.binary_info.base_addr + rva_function_candidate)
                        if not rva_function_candidate:
                            break
class IdaExporter(object):
    """Exports disassembly data from a running IDA session into an SMDA-style
    DisassemblyResult, instead of performing SMDA's own recursive analysis.
    """

    def __init__(self, config, bitness=None):
        self.config = config
        self.ida_interface = IdaInterface()
        # fall back to the bitness reported by IDA if none was forced by the caller
        self.bitness = bitness if bitness else self.ida_interface.getBitness()
        self.capstone = None
        self.disassembly = DisassemblyResult()
        self.disassembly.smda_version = config.VERSION
        self._initCapstone()

    def _initCapstone(self):
        # x86/32bit by default, switch to 64bit decoding when required
        self.capstone = Cs(CS_ARCH_X86, CS_MODE_32)
        if self.bitness == 64:
            self.capstone = Cs(CS_ARCH_X86, CS_MODE_64)

    def _convertIdaInsToSmda(self, offset, instruction_bytes):
        """Disassemble a single instruction with capstone and convert it into
        the SMDA tuple format.

        Args:
            offset: virtual address of the instruction.
            instruction_bytes: raw bytes of exactly one instruction.
        Returns:
            A 5-tuple (address, size, mnemonic, op_str, bytes); on capstone
            failure a placeholder ("error") tuple is returned and the failure
            is recorded in self.disassembly.errors.
        """
        cache = list(self.capstone.disasm_lite(instruction_bytes, offset))
        if cache:
            # BUGFIX: previously unpacked from an empty literal list ([]),
            # which unconditionally raised ValueError; unpack the first (and
            # only expected) decoded instruction instead.
            i_address, i_size, i_mnemonic, i_op_str = cache[0]
            smda_ins = (i_address, i_size, i_mnemonic, i_op_str,
                        instruction_bytes)
        else:
            # record error and emit placeholder instruction
            bytes_as_hex = "".join(
                ["%02x" % c for c in bytearray(instruction_bytes)])
            print("missing capstone disassembly output at 0x%x (%s)" %
                  (offset, bytes_as_hex))
            self.disassembly.errors[offset] = {
                "type": "capstone disassembly failure",
                "instruction_bytes": bytes_as_hex
            }
            smda_ins = (offset, len(instruction_bytes), "error", "error",
                        bytearray(instruction_bytes))
        return smda_ins

    def analyzeBuffer(self, binary_info, cb_analysis_timeout=None):
        """ instead of performing a full analysis, simply collect all data from IDA and convert it into a report """
        self.disassembly.analysis_start_ts = datetime.datetime.utcnow()
        self.disassembly.binary_info = binary_info
        self.disassembly.binary_info.architecture = self.ida_interface.getArchitecture()
        # fill in any fields the caller did not already provide
        if not self.disassembly.binary_info.base_addr:
            self.disassembly.binary_info.base_addr = self.ida_interface.getBaseAddr()
        if not self.disassembly.binary_info.binary:
            self.disassembly.binary_info.binary = self.ida_interface.getBinary()
        if not self.disassembly.binary_info.bitness:
            self.disassembly.binary_info.bitness = self.bitness
        self.disassembly.function_symbols = self.ida_interface.getFunctionSymbols()
        api_map = self.ida_interface.getApiMap()
        for function_offset in self.ida_interface.getFunctions():
            if self.ida_interface.isExternalFunction(function_offset):
                continue
            converted_function = []
            for block in self.ida_interface.getBlocks(function_offset):
                converted_block = []
                for instruction_offset in block:
                    instruction_bytes = self.ida_interface.getInstructionBytes(
                        instruction_offset)
                    smda_instruction = self._convertIdaInsToSmda(
                        instruction_offset, instruction_bytes)
                    converted_block.append(smda_instruction)
                    # index: address -> (mnemonic, size)
                    self.disassembly.instructions[smda_instruction[0]] = (
                        smda_instruction[2], smda_instruction[1])
                    in_refs = self.ida_interface.getCodeInRefs(
                        smda_instruction[0])
                    for in_ref in in_refs:
                        self.disassembly.addCodeRefs(in_ref[0], in_ref[1])
                    out_refs = self.ida_interface.getCodeOutRefs(
                        smda_instruction[0])
                    for out_ref in out_refs:
                        self.disassembly.addCodeRefs(out_ref[0], out_ref[1])
                        # remember call targets that resolve to known APIs
                        if out_ref[1] in api_map:
                            self.disassembly.addr_to_api[
                                instruction_offset] = api_map[out_ref[1]]
                converted_function.append(converted_block)
            self.disassembly.functions[function_offset] = converted_function
            if self.disassembly.isRecursiveFunction(function_offset):
                self.disassembly.recursive_functions.add(function_offset)
            if self.disassembly.isLeafFunction(function_offset):
                self.disassembly.leaf_functions.add(function_offset)
        self.disassembly.analysis_end_ts = datetime.datetime.utcnow()
        return self.disassembly
def open_x64dbg_trace(filename):
    """Opens x64dbg trace file

    Args:
        filename: name of trace file
    Returns:
        TraceData object
    Raises:
        ValueError: if the file does not start with the b"TRAC" magic
    """
    with open(filename, "rb") as f:
        trace_data = TraceData()
        trace_data.filename = filename
        # check first 4 bytes
        magic = f.read(4)
        if magic != b"TRAC":
            raise ValueError("Error, wrong file format.")
        # little-endian dword: length of the JSON header blob that follows
        json_length_bytes = f.read(4)
        json_length = int.from_bytes(json_length_bytes, "little")
        # read JSON blob
        json_blob = f.read(json_length)
        json_str = str(json_blob, "utf-8")
        arch = json.loads(json_str)["arch"]
        # map register name -> index into the per-row register value list
        reg_indexes = {}
        if arch == "x64":
            regs = prefs.X64_REGS
            ip_reg = "rip"
            capstone_mode = CS_MODE_64
            pointer_size = 8  # qword
        else:
            regs = prefs.X32_REGS
            ip_reg = "eip"
            capstone_mode = CS_MODE_32
            pointer_size = 4  # dword
        for i, reg in enumerate(regs):
            reg_indexes[reg] = i
        trace_data.arch = arch
        trace_data.ip_reg = ip_reg
        trace_data.regs = reg_indexes
        trace_data.pointer_size = pointer_size
        md = Cs(CS_ARCH_X86, capstone_mode)
        # register values persist across rows; only changed ones are stored per row
        reg_values = [None] * len(reg_indexes)
        trace = []
        row_id = 0
        # every trace row starts with a 0x00 marker byte; anything else (or EOF) ends the trace
        while f.read(1) == b"\x00":
            register_changes = int.from_bytes(f.read(1), "little")
            memory_accesses = int.from_bytes(f.read(1), "little")
            flags_and_opcode_size = int.from_bytes(f.read(1),
                                                   "little")  # Bitfield
            thread_id_bit = (flags_and_opcode_size >> 7) & 1  # msb
            opcode_size = flags_and_opcode_size & 15  # 4 lsbs
            if thread_id_bit > 0:
                thread_id = int.from_bytes(f.read(4), "little")
            opcodes = f.read(opcode_size)
            # register deltas: positions are stored as gaps, values follow
            register_change_position = []
            for _ in range(register_changes):
                register_change_position.append(
                    int.from_bytes(f.read(1), "little"))
            register_change_new_data = []
            for _ in range(register_changes):
                register_change_new_data.append(
                    int.from_bytes(f.read(pointer_size), "little"))
            memory_access_flags = []
            for _ in range(memory_accesses):
                memory_access_flags.append(int.from_bytes(f.read(1), "little"))
            memory_access_addresses = []
            for _ in range(memory_accesses):
                memory_access_addresses.append(
                    int.from_bytes(f.read(pointer_size), "little"))
            memory_access_old_data = []
            for _ in range(memory_accesses):
                memory_access_old_data.append(
                    int.from_bytes(f.read(pointer_size), "little"))
            # new data is only present for accesses whose flag bit 0 is clear (writes)
            memory_access_new_data = []
            for i in range(memory_accesses):
                if memory_access_flags[i] & 1 == 0:
                    memory_access_new_data.append(
                        int.from_bytes(f.read(pointer_size), "little"))
            # apply register deltas: each stored position is a gap relative to
            # the previous changed register index
            reg_id = 0
            for i, change in enumerate(register_change_position):
                reg_id += change
                if reg_id + i < len(reg_indexes):
                    reg_values[reg_id + i] = register_change_new_data[i]
            # disassemble
            ip_value = reg_values[reg_indexes[ip_reg]]
            for (_address, _size, mnemonic,
                 op_str) in md.disasm_lite(opcodes, ip_value):
                disasm = mnemonic
                if op_str:
                    disasm += " " + op_str
            mems = []
            mem = {}
            new_data_counter = 0
            for i in range(memory_accesses):
                flag = memory_access_flags[i]
                value = memory_access_old_data[i]
                mem["access"] = "READ"
                if flag & 1 == 0:
                    value = memory_access_new_data[new_data_counter]
                    mem["access"] = "WRITE"
                    new_data_counter += 1
                else:
                    pass
                    # memory value didn't change
                    # (it is read or overwritten with identical value)
                    # this has to be fixed somehow in x64dbg
                mem["addr"] = memory_access_addresses[i]
                # fix value (x64dbg saves all values as qwords)
                if "qword" in disasm:
                    pass
                elif "dword" in disasm:
                    value &= 0xFFFFFFFF
                elif "word" in disasm:
                    value &= 0xFFFF
                elif "byte" in disasm:
                    value &= 0xFF
                mem["value"] = value
                mems.append(mem.copy())
            trace_row = {}
            trace_row["id"] = row_id
            trace_row["ip"] = ip_value
            trace_row["disasm"] = disasm
            trace_row["regs"] = reg_values.copy()
            trace_row["opcodes"] = opcodes.hex()
            trace_row["mem"] = mems.copy()
            trace_row["comment"] = ""
            trace.append(trace_row)
            row_id += 1
        trace_data.trace = trace
        return trace_data
def main(): BYTES = 500 NUM_MNEM = 30 SIG_FILE = "./mpesm.sig" THRESHOLD = .85 VERBOSE = False DIR_PROCESSING = False signatures = {} file_list = [] nos = 0 ep = 0 ep_ava = 0 parser = ArgumentParser(description="Mnemonic PE Signature Matching") parser.add_argument("-n", "--num-mnem", dest="num_mnem", help="Use a lenght of 'n' mnemonics (default: " + str(NUM_MNEM) + ')') parser.add_argument("-s", "--signatures", dest="sig_file", help="signature file to use (default: " + SIG_FILE + ')') parser.add_argument("-b", "--bytes", dest="bytes", help="Grab and disassemble x bytes from EP, you should only need to change this if you give a super large number for -n (default: " + str(BYTES) + ')') parser.add_argument("-t", "--threshold", dest="threshold", help="Display all matches greater than -t supplied similarity (default: " + str(THRESHOLD) + ')') parser.add_argument("-v", "--verbose", dest="verbose", help="Verbose output", action='store_true') parser.add_argument("file", nargs=1, help='File to analyze') args = parser.parse_args() if args.sig_file: SIG_FILE = args.sig_file if args.threshold: THRESHOLD = float(args.threshold) if args.bytes: BYTES = args.bytes if args.num_mnem: NUM_MNEM = args.num_mnem if args.verbose: VERBOSE = True config = ConfigParser.RawConfigParser() config.read(SIG_FILE) if len(config.sections()) == 0: print "Error Reading from config file: %s, it's either empty or not present" %(SIG_FILE) sys.exit(1) for s in config.sections(): signatures[s] = {} signatures[s]['mnemonics'] = config.get(s, 'mnemonics').split(',') if config.has_option(s, 'num_mnemonics'): signatures[s]['num_mnemonics'] = config.getint(s, 'num_mnemonics') if config.has_option(s, 'major_linker'): signatures[s]['major_linker'] = config.getint(s, 'major_linker') if config.has_option(s, 'minor_linker'): signatures[s]['minor_linker'] = config.getint(s, 'minor_linker') if config.has_option(s, 'numberofsections'): signatures[s]['numberofsections'] = config.getint(s, 'numberofsections') if 
os.path.isdir(args.file[0]): file_list = glob.glob(args.file[0]+'/*') DIR_PROCESSING = True else: file_list.append(args.file[0]) for f in file_list: file_type = None if VERBOSE: print '[*] Processing: ' + f try: fe = pefile.PE(f) file_type = 'PE' except Exception as e: if VERBOSE: sys.stderr.write("[*] Error with %s - %s\n" %(f, str(e))) if not file_type: try: fe = macholib.MachO.MachO(f) file_type = 'MACHO' except Exception as e: if VERBOSE: sys.stderr.write("[*] Error with %s - %s\n" %(f, str(e))) if not file_type: sys.stderr.write("[*] Error with %s - not a PE or Mach-O\n" % f) if file_type == 'PE': try: minor_linker = 0 major_linker = 0 try: minor_linker = fe.OPTIONAL_HEADER.MinorLinkerVersion major_linker = fe.OPTIONAL_HEADER.MajorLinkerVersion except Exception as e: pass if hasattr(fe, 'FILE_HEADER') and hasattr(fe.FILE_HEADER, 'NumberOfSections'): nos = fe.FILE_HEADER.NumberOfSections if hasattr(fe, 'OPTIONAL_HEADER') and hasattr(fe.OPTIONAL_HEADER, 'AddressOfEntryPoint'): ep = fe.OPTIONAL_HEADER.AddressOfEntryPoint if hasattr(fe, 'OPTIONAL_HEADER') and hasattr(fe.OPTIONAL_HEADER, 'ImageBase') and ep > 0: ep_ava = ep+fe.OPTIONAL_HEADER.ImageBase data = fe.get_memory_mapped_image()[ep:ep+BYTES] # # Determine if the file is 32bit or 64bit # mode = CS_MODE_32 if fe.OPTIONAL_HEADER.Magic == 0x20b: mode = CS_MODE_64 md = Cs(CS_ARCH_X86, mode) match = [] for (address, size, mnemonic, op_str) in md.disasm_lite(data, 0x1000): match.append(mnemonic.encode('utf-8').strip()) for s in signatures: m = match sig = signatures[s]['mnemonics'] if m and m[0] == sig[0] or THRESHOLD < .7: additional_info = [] if 'minor_linker' in signatures[s]: if minor_linker == signatures[s]['minor_linker']: additional_info.append('Minor Linker Version Match: True') else: additional_info.append('Minor Linker Version Match: False') if 'major_linker' in signatures[s]: if major_linker == signatures[s]['major_linker']: additional_info.append('Major Linker Version Match: True') else: 
additional_info.append('Major Linker Version Match: False') if 'numberofsections' in signatures[s]: if nos == signatures[s]['numberofsections']: additional_info.append('Number Of Sections Match: True') else: additional_info.append('Number Of Sections Match: False') if 'num_mnemonics' in signatures[s]: nm = signatures[s]['num_mnemonics'] m = match[:nm] sig = signatures[s]['mnemonics'][:nm] else: m = match[:NUM_MNEM] sig = signatures[s]['mnemonics'][:NUM_MNEM] distance = tapered_levenshtein(sig, m) similarity = 1.0 - distance/float(max(len(sig), len(m))) if similarity > THRESHOLD: if DIR_PROCESSING: print "[%s] [%s] (Edits: %s | Similarity: %0.3f) (%s)" %(f, s, distance, similarity, ' | '.join(additional_info)) else: print "[%s] (Edits: %s | Similarity: %0.3f) (%s)" %(s, distance, similarity, ' | '.join(additional_info)) if VERBOSE: print "%s\n%s\n" %(sig, m) except Exception as e: print str(e) elif file_type == 'MACHO': macho_file = open(f, 'rb') macho_data = macho_file.read() macho_file.close() for header in fe.headers: # Limit it to X86 if header.header.cputype not in [7, 0x01000007]: continue # Limit it to Object and Executable files if header.header.filetype not in [1, 2]: continue magic = int(header.MH_MAGIC) offset = int(header.offset) all_sections = [] entrypoint_type = '' entrypoint_address = 0 for cmd in header.commands: load_cmd = cmd[0] cmd_info = cmd[1] cmd_data = cmd[2] cmd_name = load_cmd.get_cmd_name() if cmd_name in ('LC_SEGMENT', 'LC_SEGMENT_64'): for section_data in cmd_data: sd = section_data.describe() all_sections.append(sd) elif cmd_name in ('LC_THREAD', 'LC_UNIXTHREAD'): entrypoint_type = 'old' flavor = int(struct.unpack(header.endian + 'I', cmd_data[0:4])[0]) count = int(struct.unpack(header.endian + 'I', cmd_data[4:8])[0]) if flavor == 1: entrypoint_address = int(struct.unpack(header.endian + 'I', cmd_data[48:52])[0]) elif flavor == 4: entrypoint_address = int(struct.unpack(header.endian + 'Q', cmd_data[136:144])[0]) elif cmd_name == 
'LC_MAIN': entrypoint_type = 'new' entrypoint_address = cmd_info.describe()['entryoff'] entrypoint_data = '' if entrypoint_type == 'new': entrypoint_offset = offset + entrypoint_address entrypoint_data = macho_data[entrypoint_offset:entrypoint_offset+500] elif entrypoint_type == 'old': found_section = False for sec in all_sections: if entrypoint_address >= sec['addr'] and entrypoint_address < (sec['addr'] + sec['size']): found_section = True entrypoint_address = (entrypoint_address - sec['addr']) + sec['offset'] break if found_section: entrypoint_offset = offset + entrypoint_address entrypoint_data = macho_data[entrypoint_offset:entrypoint_offset+500] mode = CS_MODE_32 if magic == 0xcffaedfe: mode = CS_MODE_64 md = Cs(CS_ARCH_X86, mode) match = [] if entrypoint_data: try: for (address, size, mnemonic, op_str) in md.disasm_lite(entrypoint_data, 0x1000): match.append(mnemonic.encode('utf-8').strip()) except Exception as e: print str(e) for s in signatures: m = match sig = signatures[s]['mnemonics'] if m and m[0] == sig[0] or THRESHOLD < .7: additional_info = [] if 'num_mnemonics' in signatures[s]: nm = signatures[s]['num_mnemonics'] m = match[:nm] sig = signatures[s]['mnemonics'][:nm] else: m = match[:NUM_MNEM] sig = signatures[s]['mnemonics'][:NUM_MNEM] distance = tapered_levenshtein(sig, m) similarity = 1.0 - distance/float(max(len(sig), len(m))) if similarity > THRESHOLD: if DIR_PROCESSING: print "[%s] [%s] (Edits: %s | Similarity: %0.3f) (%s)" %(f, s, distance, similarity, ' | '.join(additional_info)) else: print "[%s] (Edits: %s | Similarity: %0.3f) (%s)" %(s, distance, similarity, ' | '.join(additional_info)) if VERBOSE: print "%s\n%s\n" %(sig, m)
def main(): parser = ArgumentParser( description="Mnemonic PE Signature Matching, signature generator") parser.add_argument("-n", "--num-mnem", dest="num_mnem", help="Use a length of 'n' mnemonics (default: None)") parser.add_argument("-t", "--title", dest="sig_title", help="Title (name) to use for the signature") parser.add_argument( "-l", "--linker", dest="linker", help="Use Major and Minor linker versions in the signature", action="store_true") parser.add_argument( "-s", "--numofsections", dest="nos", help="Use the number of sections in the PE file in the signature", action="store_true") parser.add_argument("file", nargs=1, help='File to analyze') args = parser.parse_args() file_type = None filename = args.file[0] error = '' try: fe = pefile.PE(filename) file_type = 'PE' except Exception as e: error = str(e) pass if not file_type: try: fe = macholib.MachO.MachO(filename) file_type = 'MACHO' except Exception: error = str(e) pass if not file_type: sys.stderr.write("[*] Error with %s - not a PE or Mach-O\n" % sys.argv[1]) sys.exit(1) if file_type == 'PE': try: if args.sig_title and len(args.sig_title) > 0: print "[%s]" % (args.sig_title) if args.linker: maj_linker = 0 min_linker = 0 try: maj_linker = fe.OPTIONAL_HEADER.MajorLinkerVersion min_linker = fe.OPTIONAL_HEADER.MinorLinkerVersion except Exception as e: pass print "major_linker = %s" % (maj_linker) print "minor_linker = %s" % (min_linker) if args.nos: try: print "numberofsections = %s" % ( fe.FILE_HEADER.NumberOfSections) except Exception as e: sys.stderr.write( "Image File Header not found in PE file\n") ep = fe.OPTIONAL_HEADER.AddressOfEntryPoint ep_ava = ep + fe.OPTIONAL_HEADER.ImageBase data = fe.get_memory_mapped_image()[ep:ep + 500] # # Determine if the file is 32bit or 64bit # mode = CS_MODE_32 if fe.OPTIONAL_HEADER.Magic == 0x20b: mode = CS_MODE_64 md = Cs(CS_ARCH_X86, mode) match = [] for (address, size, mnemonic, op_str) in md.disasm_lite(data, 0x1000): 
match.append(mnemonic.encode('utf-8').strip()) print 'mnemonics = ' + ','.join(match[:30]) except Exception as e: print str(e) elif file_type == 'MACHO': f = open(filename, 'rb') macho_data = f.read() f.close() for header in fe.headers: # Limit it to X86 if header.header.cputype not in [7, 0x01000007]: continue # Limit it to Object and Executable files if header.header.filetype not in [1, 2]: continue magic = int(header.MH_MAGIC) offset = int(header.offset) all_sections = [] entrypoint_type = '' entrypoint_address = 0 for cmd in header.commands: load_cmd = cmd[0] cmd_info = cmd[1] cmd_data = cmd[2] cmd_name = load_cmd.get_cmd_name() if cmd_name in ('LC_SEGMENT', 'LC_SEGMENT_64'): for section_data in cmd_data: sd = section_data.describe() all_sections.append(sd) elif cmd_name in ('LC_THREAD', 'LC_UNIXTHREAD'): entrypoint_type = 'old' flavor = int( struct.unpack(header.endian + 'I', cmd_data[0:4])[0]) count = int( struct.unpack(header.endian + 'I', cmd_data[4:8])[0]) if flavor == 1: entrypoint_address = int( struct.unpack(header.endian + 'I', cmd_data[48:52])[0]) elif flavor == 4: entrypoint_address = int( struct.unpack(header.endian + 'Q', cmd_data[136:144])[0]) elif cmd_name == 'LC_MAIN': entrypoint_type = 'new' entrypoint_address = cmd_info.describe()['entryoff'] entrypoint_data = '' if entrypoint_type == 'new': entrypoint_offset = offset + entrypoint_address entrypoint_data = macho_data[ entrypoint_offset:entrypoint_offset + 500] elif entrypoint_type == 'old': found_section = False for sec in all_sections: if entrypoint_address >= sec[ 'addr'] and entrypoint_address < (sec['addr'] + sec['size']): found_section = True entrypoint_address = (entrypoint_address - sec['addr']) + sec['offset'] break if found_section: entrypoint_offset = offset + entrypoint_address entrypoint_data = macho_data[ entrypoint_offset:entrypoint_offset + 500] mode = CS_MODE_32 if magic == 0xcffaedfe: mode = CS_MODE_64 md = Cs(CS_ARCH_X86, mode) match = [] try: for (address, size, mnemonic, 
op_str) in md.disasm_lite(entrypoint_data, 0x1000): match.append(mnemonic.encode('utf-8').strip()) except Exception as e: print str(e) print 'mnemonics = ' + ','.join(match[:30])
def main(): BYTES = 500 NUM_MNEM = 30 SIG_FILE = "./mpesm.sig" THRESHOLD = .85 VERBOSE = False DIR_PROCESSING = False signatures = {} file_list = [] nos = 0 ep = 0 ep_ava = 0 parser = ArgumentParser(description="Mnemonic PE Signature Matching") parser.add_argument("-n", "--num-mnem", dest="num_mnem", help="Use a lenght of 'n' mnemonics (default: " + str(NUM_MNEM) + ')') parser.add_argument("-s", "--signatures", dest="sig_file", help="signature file to use (default: " + SIG_FILE + ')') parser.add_argument( "-b", "--bytes", dest="bytes", help= "Grab and disassemble x bytes from EP, you should only need to change this if you give a super large number for -n (default: " + str(BYTES) + ')') parser.add_argument( "-t", "--threshold", dest="threshold", help= "Display all matches greater than -t supplied similarity (default: " + str(THRESHOLD) + ')') parser.add_argument("-v", "--verbose", dest="verbose", help="Verbose output", action='store_true') parser.add_argument("file", nargs=1, help='File to analyze') args = parser.parse_args() if args.sig_file: SIG_FILE = args.sig_file if args.threshold: THRESHOLD = float(args.threshold) if args.bytes: BYTES = args.bytes if args.num_mnem: NUM_MNEM = args.num_mnem if args.verbose: VERBOSE = True config = ConfigParser.RawConfigParser() config.read(SIG_FILE) if len(config.sections()) == 0: print "Error Reading from config file: %s, it's either empty or not present" % ( SIG_FILE) sys.exit(1) for s in config.sections(): signatures[s] = {} signatures[s]['mnemonics'] = config.get(s, 'mnemonics').split(',') if config.has_option(s, 'num_mnemonics'): signatures[s]['num_mnemonics'] = config.getint(s, 'num_mnemonics') if config.has_option(s, 'major_linker'): signatures[s]['major_linker'] = config.getint(s, 'major_linker') if config.has_option(s, 'minor_linker'): signatures[s]['minor_linker'] = config.getint(s, 'minor_linker') if config.has_option(s, 'numberofsections'): signatures[s]['numberofsections'] = config.getint( s, 'numberofsections') if 
os.path.isdir(args.file[0]): file_list = glob.glob(args.file[0] + '/*') DIR_PROCESSING = True else: file_list.append(args.file[0]) for f in file_list: file_type = None if VERBOSE: print '[*] Processing: ' + f try: fe = pefile.PE(f) file_type = 'PE' except Exception as e: if VERBOSE: sys.stderr.write("[*] Error with %s - %s\n" % (f, str(e))) if not file_type: try: fe = macholib.MachO.MachO(f) file_type = 'MACHO' except Exception as e: if VERBOSE: sys.stderr.write("[*] Error with %s - %s\n" % (f, str(e))) if not file_type: sys.stderr.write("[*] Error with %s - not a PE or Mach-O\n" % f) if file_type == 'PE': try: minor_linker = 0 major_linker = 0 try: minor_linker = fe.OPTIONAL_HEADER.MinorLinkerVersion major_linker = fe.OPTIONAL_HEADER.MajorLinkerVersion except Exception as e: pass if hasattr(fe, 'FILE_HEADER') and hasattr( fe.FILE_HEADER, 'NumberOfSections'): nos = fe.FILE_HEADER.NumberOfSections if hasattr(fe, 'OPTIONAL_HEADER') and hasattr( fe.OPTIONAL_HEADER, 'AddressOfEntryPoint'): ep = fe.OPTIONAL_HEADER.AddressOfEntryPoint if hasattr(fe, 'OPTIONAL_HEADER') and hasattr( fe.OPTIONAL_HEADER, 'ImageBase') and ep > 0: ep_ava = ep + fe.OPTIONAL_HEADER.ImageBase data = fe.get_memory_mapped_image()[ep:ep + BYTES] # # Determine if the file is 32bit or 64bit # mode = CS_MODE_32 if fe.OPTIONAL_HEADER.Magic == 0x20b: mode = CS_MODE_64 md = Cs(CS_ARCH_X86, mode) match = [] for (address, size, mnemonic, op_str) in md.disasm_lite(data, 0x1000): match.append(mnemonic.encode('utf-8').strip()) for s in signatures: m = match sig = signatures[s]['mnemonics'] if m and m[0] == sig[0] or THRESHOLD < .7: additional_info = [] if 'minor_linker' in signatures[s]: if minor_linker == signatures[s][ 'minor_linker']: additional_info.append( 'Minor Linker Version Match: True') else: additional_info.append( 'Minor Linker Version Match: False') if 'major_linker' in signatures[s]: if major_linker == signatures[s][ 'major_linker']: additional_info.append( 'Major Linker Version Match: True') 
else: additional_info.append( 'Major Linker Version Match: False') if 'numberofsections' in signatures[s]: if nos == signatures[s]['numberofsections']: additional_info.append( 'Number Of Sections Match: True') else: additional_info.append( 'Number Of Sections Match: False') if 'num_mnemonics' in signatures[s]: nm = signatures[s]['num_mnemonics'] m = match[:nm] sig = signatures[s]['mnemonics'][:nm] else: m = match[:NUM_MNEM] sig = signatures[s]['mnemonics'][:NUM_MNEM] distance = tapered_levenshtein(sig, m) similarity = 1.0 - distance / float( max(len(sig), len(m))) if similarity > THRESHOLD: if DIR_PROCESSING: print "[%s] [%s] (Edits: %s | Similarity: %0.3f) (%s)" % ( f, s, distance, similarity, ' | '.join(additional_info)) else: print "[%s] (Edits: %s | Similarity: %0.3f) (%s)" % ( s, distance, similarity, ' | '.join(additional_info)) if VERBOSE: print "%s\n%s\n" % (sig, m) except Exception as e: print str(e) elif file_type == 'MACHO': macho_file = open(f, 'rb') macho_data = macho_file.read() macho_file.close() for header in fe.headers: # Limit it to X86 if header.header.cputype not in [7, 0x01000007]: continue # Limit it to Object and Executable files if header.header.filetype not in [1, 2]: continue magic = int(header.MH_MAGIC) offset = int(header.offset) all_sections = [] entrypoint_type = '' entrypoint_address = 0 for cmd in header.commands: load_cmd = cmd[0] cmd_info = cmd[1] cmd_data = cmd[2] cmd_name = load_cmd.get_cmd_name() if cmd_name in ('LC_SEGMENT', 'LC_SEGMENT_64'): for section_data in cmd_data: sd = section_data.describe() all_sections.append(sd) elif cmd_name in ('LC_THREAD', 'LC_UNIXTHREAD'): entrypoint_type = 'old' flavor = int( struct.unpack(header.endian + 'I', cmd_data[0:4])[0]) count = int( struct.unpack(header.endian + 'I', cmd_data[4:8])[0]) if flavor == 1: entrypoint_address = int( struct.unpack(header.endian + 'I', cmd_data[48:52])[0]) elif flavor == 4: entrypoint_address = int( struct.unpack(header.endian + 'Q', cmd_data[136:144])[0]) 
elif cmd_name == 'LC_MAIN': entrypoint_type = 'new' entrypoint_address = cmd_info.describe()['entryoff'] entrypoint_data = '' if entrypoint_type == 'new': entrypoint_offset = offset + entrypoint_address entrypoint_data = macho_data[ entrypoint_offset:entrypoint_offset + 500] elif entrypoint_type == 'old': found_section = False for sec in all_sections: if entrypoint_address >= sec[ 'addr'] and entrypoint_address < (sec['addr'] + sec['size']): found_section = True entrypoint_address = (entrypoint_address - sec['addr']) + sec['offset'] break if found_section: entrypoint_offset = offset + entrypoint_address entrypoint_data = macho_data[ entrypoint_offset:entrypoint_offset + 500] mode = CS_MODE_32 if magic == 0xcffaedfe: mode = CS_MODE_64 md = Cs(CS_ARCH_X86, mode) match = [] if entrypoint_data: try: for (address, size, mnemonic, op_str) in md.disasm_lite(entrypoint_data, 0x1000): match.append(mnemonic.encode('utf-8').strip()) except Exception as e: print str(e) for s in signatures: m = match sig = signatures[s]['mnemonics'] if m and m[0] == sig[0] or THRESHOLD < .7: additional_info = [] if 'num_mnemonics' in signatures[s]: nm = signatures[s]['num_mnemonics'] m = match[:nm] sig = signatures[s]['mnemonics'][:nm] else: m = match[:NUM_MNEM] sig = signatures[s]['mnemonics'][:NUM_MNEM] distance = tapered_levenshtein(sig, m) similarity = 1.0 - distance / float( max(len(sig), len(m))) if similarity > THRESHOLD: if DIR_PROCESSING: print "[%s] [%s] (Edits: %s | Similarity: %0.3f) (%s)" % ( f, s, distance, similarity, ' | '.join(additional_info)) else: print "[%s] (Edits: %s | Similarity: %0.3f) (%s)" % ( s, distance, similarity, ' | '.join(additional_info)) if VERBOSE: print "%s\n%s\n" % (sig, m)
class IntelDisassembler(object):
    """Recursive-descent disassembler for x86/x64 buffers built on Capstone.

    Orchestrates function discovery (heuristic candidates, gap analysis,
    tailcall resolution) and produces a DisassemblyResult.  Label providers
    (WinAPI/ELF/PDB) are queried to resolve API calls and symbols on the fly.
    """

    def __init__(self, config, forced_bitness=None):
        """Set up an empty analysis context.

        config: project config object (provides VERSION, CONFIDENCE_THRESHOLD,
            RESOLVE_REGISTER_CALLS, RESOLVE_TAILCALLS, USE_SYMBOLS_AS_CANDIDATES).
        forced_bitness: if given (32/64), overrides any detected bitness.
        """
        self.config = config
        self._forced_bitness = forced_bitness
        # capstone handle and tf-idf model are created lazily once bitness is known
        self.capstone = None
        self._tfidf = None
        self.binary_info = None
        self.label_providers = []
        self._addLabelProviders()
        # analysis helpers are (re)created per analyzeBuffer() run
        self.fc_manager = None
        self.tailcall_analyzer = None
        self.indcall_analyzer = None
        self.jumptable_analyzer = None
        self.disassembly = DisassemblyResult()
        self.disassembly.smda_version = config.VERSION
        self.disassembly.setConfidenceThreshold(config.CONFIDENCE_THRESHOLD)

    def _initCapstone(self):
        """Create the Capstone handle matching the detected bitness."""
        self.capstone = Cs(CS_ARCH_X86, CS_MODE_64) if self.disassembly.binary_info.bitness == 64 else Cs(CS_ARCH_X86, CS_MODE_32)

    def _initTfIdf(self):
        """Create the mnemonic tf-idf model matching the detected bitness."""
        self._tfidf = MnemonicTfIdf(bitness=64) if self.disassembly.binary_info.bitness == 64 else MnemonicTfIdf(bitness=32)

    def getBitMask(self) -> int:
        """Return the address mask for the current bitness (2**64-1 or 2**32-1)."""
        if self.disassembly.binary_info.bitness == 64:
            return 0xFFFFFFFFFFFFFFFF
        return 0xFFFFFFFF

    def _addLabelProviders(self):
        """Register all API/symbol resolvers used during analysis."""
        self.label_providers.append(WinApiResolver(self.config))
        self.label_providers.append(ElfApiResolver(self.config))
        self.label_providers.append(ElfSymbolProvider(self.config))
        self.label_providers.append(PdbSymbolProvider(self.config))

    def _updateLabelProviders(self, binary_info):
        """Push the current binary's metadata to every provider."""
        for provider in self.label_providers:
            provider.update(binary_info)

    def addPdbFile(self, binary_info, pdb_path):
        """Feed an external PDB file (rebased to the binary's base addr) to providers."""
        LOGGER.debug("adding PDB file: %s", pdb_path)
        if pdb_path and binary_info.base_addr:
            pdb_info = BinaryInfo(b"")
            pdb_info.file_path = pdb_path
            pdb_info.base_addr = binary_info.base_addr
            for provider in self.label_providers:
                provider.update(pdb_info)

    def resolveApi(self, to_address, api_address):
        """Return (dll, api) for an address via the first API provider that knows it, else ("", "")."""
        for provider in self.label_providers:
            if not provider.isApiProvider():
                continue
            dll, api = provider.getApi(to_address, api_address)
            if dll or api:
                return (dll, api)
        return ("", "")

    def resolveSymbol(self, address):
        """Return a symbol name for address via the first symbol provider that knows it, else ""."""
        for provider in self.label_providers:
            if not provider.isSymbolProvider():
                continue
            result = provider.getSymbol(address)
            if result:
                return result
        return ""

    def getSymbolCandidates(self):
        """Collect unique function-symbol addresses from all symbol providers."""
        symbol_offsets = set([])
        for provider in self.label_providers:
            if not provider.isSymbolProvider():
                continue
            function_symbols = provider.getFunctionSymbols()
            symbol_offsets.update(list(function_symbols.keys()))
        return list(symbol_offsets)

    def getReferencedAddr(self, op_str) -> int:
        """Extract the first hex constant (0x...) from a Capstone op_str, or 0 if none."""
        referenced_addr = re.search(r"0x[a-fA-F0-9]+", op_str)
        if referenced_addr:
            return int(referenced_addr.group(), 16)
        return 0

    def resolveIndirectSwitch(self, addr_switch_array, size):
        """Scan the bytes following a switch table of `size` dwords for indirect index bytes.

        Returns the offsets of bytes that look like table indices (value < size)
        up to the next known function start candidate.
        """
        indirect_switch_bytes = []
        current_offset = addr_switch_array + size * 4
        if self.disassembly.isAddrWithinMemoryImage(current_offset):
            LOGGER.debug("0x%08x analyzing potentially indirect switch table (size: 0x%08x).", current_offset, size)
            current_byte = self.disassembly.getByte(current_offset)
            # getByte may yield a 1-char str depending on backing buffer type
            if isinstance(current_byte, str):
                current_byte = ord(current_byte)
            while current_byte < size and current_offset not in self.fc_manager.getFunctionStartCandidates():
                indirect_switch_bytes.append(current_offset)
                current_offset += 1
                # NOTE(review): no bounds check here; assumes getByte tolerates
                # reads one past the table region -- confirm against its contract.
                current_byte = self.disassembly.getByte(current_offset)
                if isinstance(current_byte, str):
                    current_byte = ord(current_byte)
            LOGGER.debug("0x%08x found %d bytes.", current_offset, len(indirect_switch_bytes))
        return indirect_switch_bytes

    def _analyzeCallInstruction(self, i, state):
        """Classify a call instruction tuple (addr, size, mnemonic, op_str) and record code/API refs."""
        i_address, i_size, i_mnemonic, i_op_str = i
        state.setLeaf(False)
        # case = "FALLTHROUGH"
        call_destination = self.getReferencedAddr(i_op_str)
        if ":" in i_op_str:
            # case = "LONG-CALL"
            pass
        if i_op_str.startswith("dword ptr ["):
            # reg+offset is currently ignored as it is a minority of calls
            # case = "DWORD-PTR-REG"
            if i_op_str.startswith("dword ptr [0x"):
                # case = "DWORD-PTR"
                dereferenced = self.disassembly.dereferenceDword(call_destination)
                if dereferenced is not None:
                    state.addCodeRef(i_address, dereferenced)
                    self._handleCallTarget(state, i_address, dereferenced)
                    self._handleApiTarget(i_address, call_destination, dereferenced)
        elif i_op_str.startswith("qword ptr [rip"):
            # RIP-relative: effective address = next instruction address + displacement
            rip = i_address + i_size
            call_destination = rip + self.getReferencedAddr(i_op_str)
            dereferenced = self.disassembly.dereferenceQword(call_destination)
            state.addCodeRef(i_address, call_destination)
            if dereferenced is not None:
                self._handleApiTarget(i_address, call_destination, dereferenced)
        elif i_op_str.startswith("0x"):
            # case = "DIRECT"
            self._handleCallTarget(state, i_address, call_destination)
            self._handleApiTarget(i_address, call_destination, call_destination)
        elif i_op_str.lower() in REGS_32BIT or i_op_str.lower() in REGS_64BIT:
            # case = "REG"
            # this is resolved by backtracking at the end of function analysis.
            state.call_register_ins.append(i_address)

    def _handleCallTarget(self, state, from_addr, to_addr):
        """Record an in-image call edge and flag direct recursion."""
        if to_addr and self.disassembly.isAddrWithinMemoryImage(to_addr):
            state.addCodeRef(from_addr, to_addr)
        if state.start_addr == to_addr:
            state.setRecursion(True)

    def _handleApiTarget(self, from_addr, to_addr, dereferenced):
        """Try to resolve to_addr as an API; record it and return (dll, api) on success."""
        if to_addr:
            # identify API calls on the fly
            dll, api = self.resolveApi(to_addr, dereferenced)
            if dll or api:
                self._updateApiInformation(from_addr, dereferenced, dll, api)
                return (dll, api)
            elif not self.disassembly.isAddrWithinMemoryImage(to_addr):
                LOGGER.debug("potentially uncovered DLL address: 0x%08x", to_addr)

    def _updateApiInformation(self, from_addr, to_addr, dll, api):
        """Create or extend the API entry for to_addr, appending from_addr once."""
        api_entry = {"referencing_addr": [], "dll_name": dll, "api_name": api}
        if to_addr in self.disassembly.apis:
            api_entry = self.disassembly.apis[to_addr]
        if from_addr not in api_entry["referencing_addr"]:
            api_entry["referencing_addr"].append(from_addr)
        self.disassembly.apis[to_addr] = api_entry

    def _analyzeCondJmpInstruction(self, i, state):
        """Handle a conditional jump: queue fall-through block and jump target (unless it is a tailcall)."""
        i_address, i_size, i_mnemonic, i_op_str = i
        state.addBlockToQueue(i_address + i_size)
        jump_destination = self.getReferencedAddr(i_op_str)
        # case = "FALLTHROUGH"
        self.tailcall_analyzer.addJump(i_address, jump_destination)
        if jump_destination:
            if jump_destination in self.disassembly.functions:
                # case = "TAILCALL!"
                state.setSanelyEnding(True)
            elif jump_destination in self.fc_manager.getFunctionStartCandidates():
                # it's tough to decide whether this should be disassembled here or not. topic of "code-sharing functions".
                # case = "TAILCALL?"
                pass
            else:
                # case = "OFFSET-QUEUE"
                state.addBlockToQueue(int(i_op_str, 16))
                state.addCodeRef(i_address, int(i_op_str, 16), by_jump=True)
        state.setBlockEndingInstruction(True)

    def _analyzeLoopInstruction(self, i, state):
        """Handle loop/loopcc: record the jump edge and end the block (two exits)."""
        i_address, i_size, i_mnemonic, i_op_str = i
        jump_destination = self.getReferencedAddr(i_op_str)
        if jump_destination:
            state.addCodeRef(i_address, int(i_op_str, 16), by_jump=True)
        # loops have two exits and should thus be handled as block ending instruction
        state.addBlockToQueue(i_address + i_size)
        state.setBlockEndingInstruction(True)

    def _analyzeJmpInstruction(self, i, state):
        """Classify an unconditional jump (long, dword-ptr, rip-relative, direct, or jumptable) and end the block."""
        i_address, i_size, i_mnemonic, i_op_str = i
        # case = "FALLTHROUGH"
        if ":" in i_op_str:
            # case = "LONG-JMP"
            pass
        elif i_op_str.startswith("dword ptr [0x"):
            # case = "DWORD-PTR"
            # Handles mostly jmp-to-api, stubs or tailcalls, all should be handled sanely this way.
            jump_destination = self.getReferencedAddr(i_op_str)
            dereferenced = self.disassembly.dereferenceDword(jump_destination)
            state.addCodeRef(i_address, jump_destination, by_jump=True)
            self.tailcall_analyzer.addJump(i_address, jump_destination)
            if dereferenced is not None:
                self._handleApiTarget(i_address, jump_destination, dereferenced)
        elif i_op_str.startswith("qword ptr [rip"):
            # case = "QWORD-PTR, RIP-relative"
            # Handles mostly jmp-to-api, stubs or tailcalls, all should be handled sanely this way.
            rip = i_address + i_size
            jump_destination = rip + self.getReferencedAddr(i_op_str)
            dereferenced = self.disassembly.dereferenceQword(jump_destination)
            state.addCodeRef(i_address, jump_destination, by_jump=True)
            self.tailcall_analyzer.addJump(i_address, jump_destination)
            if dereferenced is not None:
                self._handleApiTarget(i_address, jump_destination, dereferenced)
        elif i_op_str.startswith("0x"):
            jump_destination = self.getReferencedAddr(i_op_str)
            self.tailcall_analyzer.addJump(i_address, jump_destination)
            if jump_destination in self.disassembly.functions:
                # case = "TAILCALL!"
                state.setSanelyEnding(True)
            elif jump_destination in self.fc_manager.getFunctionStartCandidates():
                # case = "TAILCALL?"
                pass
            else:
                if state.isFirstInstruction():
                    # case = "STUB-TAILCALL!"
                    pass
                else:
                    # case = "OFFSET-QUEUE"
                    state.addBlockToQueue(int(i_op_str, 16))
                    state.addCodeRef(i_address, int(i_op_str, 16), by_jump=True)
        else:
            # register/memory jump: ask the jumptable analyzer for concrete targets
            jumptable_targets = self.jumptable_analyzer.getJumpTargets(i, state)
            for target in jumptable_targets:
                if self.disassembly.isAddrWithinMemoryImage(target):
                    state.addBlockToQueue(target)
                    state.addCodeRef(i_address, target, by_jump=True)
        state.setNextInstructionReachable(False)
        state.setBlockEndingInstruction(True)

    def _analyzeEndInstruction(self, state):
        """Mark the current block as a sane function exit (ret/int3/hlt)."""
        state.setSanelyEnding(True)
        state.setNextInstructionReachable(False)
        state.setBlockEndingInstruction(True)

    def _getDisasmWindowBuffer(self, addr):
        """Return up to 15 raw bytes starting at addr (one max-size instruction lookahead)."""
        relative_start = addr - self.disassembly.binary_info.base_addr
        relative_end = relative_start + 15
        return self.disassembly.binary_info.binary[relative_start:relative_end]

    def analyzeFunction(self, start_addr, as_gap=False):
        """Disassemble one function candidate starting at start_addr.

        Performs block-queue driven recursive descent, classifying each
        instruction and collecting code refs.  Returns the populated
        FunctionAnalysisState, or [] when the candidate collides with
        already-disassembled code.
        """
        LOGGER.debug("analyzeFunction() starting analysis of candidate @0x%08x", start_addr)
        self.tailcall_analyzer.initFunction()
        i = None
        state = FunctionAnalysisState(start_addr, self.disassembly)
        if state.isProcessedFunction():
            # NOTE(review): early return type ([]) differs from the normal
            # return type (state); callers must tolerate both -- verify.
            self.fc_manager.updateAnalysisAborted(start_addr, "collision with existing code of function 0x{:08x}".format(self.disassembly.ins2fn[start_addr]))
            return []
        while state.hasUnprocessedBlocks():
            LOGGER.debug(" current block queue: %s", ", ".join(["0x%x" % addr for addr in state.block_queue]))
            state.chooseNextBlock()
            LOGGER.debug(" analyzeFunction() now processing block @0x%08x", state.block_start)
            # in capstone, disassembly is more expensive than calling the function, so we use maximum x86/64 instruction size (14 bytes) as lookahead.
            # disasm_lite() also provides up to 30% faster disassembly than disasm(), so we work with tuples instead of objects
            cache = [i for i in self.capstone.disasm_lite(self._getDisasmWindowBuffer(state.block_start), state.block_start)]
            cache_pos = 0
            previous_address = None
            previous_mnemonic = None
            previous_op_str = None
            while True:
                for i in cache:
                    i_address, i_size, i_mnemonic, i_op_str = i
                    i_op_str = i_op_str.strip()
                    i_relative_address = i_address - self.disassembly.binary_info.base_addr
                    i_bytes = self.disassembly.binary_info.binary[i_relative_address:i_relative_address + i_size]
                    LOGGER.debug(" analyzeFunction() now processing instruction @0x%08x: %s", i_address, i_mnemonic + " " + i_op_str)
                    cache_pos += i_size
                    state.setNextInstructionReachable(True)
                    # count appearences of "suspicious" byte patterns (like 00 00) that indicate non-function code
                    if i_bytes == DOUBLE_ZERO:
                        state.suspicious_ins_count += 1
                        LOGGER.debug(" analyzeFunction() found suspicious function @0x%08x", i_address)
                        if state.suspicious_ins_count > 1:
                            self.fc_manager.updateAnalysisAborted(start_addr, "too many suspicious instructions @0x%08x" % i_address)
                            return state
                    if i_mnemonic in CALL_INS:
                        self._analyzeCallInstruction(i, state)
                    elif i_mnemonic in JMP_INS:
                        self._analyzeJmpInstruction(i, state)
                    elif i_mnemonic in LOOP_INS:
                        self._analyzeLoopInstruction(i, state)
                    elif i_mnemonic in CJMP_INS:
                        self._analyzeCondJmpInstruction(i, state)
                    elif i_mnemonic.startswith("j"):
                        LOGGER.error("unsupported jump @0x%08x (0x%08x): %s %s", i_address, start_addr, i_mnemonic, i_op_str)
                    # we do not analyze any potential exception handler (tricks), so treat breakpoints as exit condition
                    elif i_mnemonic in RET_INS:
                        self._analyzeEndInstruction(state)
                        LOGGER.debug(" analyzeFunction() found ending instruction @0x%08x", i_address)
                        # push <imm> directly before ret is a common jump obfuscation
                        if previous_address and previous_mnemonic == "push":
                            push_ret_destination = self.getReferencedAddr(previous_op_str)
                            if self.disassembly.isAddrWithinMemoryImage(push_ret_destination):
                                LOGGER.debug(" analyzeFunction() found push-return jump obfuscation: @0x%08x", i_address)
                                state.addBlockToQueue(push_ret_destination)
                                state.addCodeRef(i_address, push_ret_destination, by_jump=True)
                    elif i_mnemonic in ["int3", "hlt"]:
                        self._analyzeEndInstruction(state)
                        LOGGER.debug(" analyzeFunction() found ending instruction @0x%08x", i_address)
                    elif previous_address and i_address != start_addr and previous_mnemonic == "call":
                        instruction_sequence = [ins for ins in self.capstone.disasm(self._getDisasmWindowBuffer(i_address), i_address)]
                        if self.fc_manager.isAlignmentSequence(instruction_sequence) or self.fc_manager.isFunctionCandidate(i_address):
                            # LLVM and GCC sometimes tends to produce lots of tailcalls that basically mess with function end detection, we cut whenever we find effective nops after calls
                            LOGGER.debug(" current function: 0x%x ---> ran into alignment sequence after call -> 0x%08x, cutting block here.", start_addr, i_address)
                            state.setBlockEndingInstruction(True)
                            state.endBlock()
                            state.setSanelyEnding(True)
                            if self.fc_manager.isAlignmentSequence(instruction_sequence):
                                next_aligned_address = previous_address + (16 - previous_address % 16)
                                LOGGER.debug(" Adding: 0x%x as candidate.", next_aligned_address)
                                self.fc_manager.addCandidate(next_aligned_address, is_gap=True)
                            break
                    previous_address = i_address
                    previous_mnemonic = i_mnemonic
                    previous_op_str = i_op_str
                    if i_address not in self.disassembly.code_map and i_address not in self.disassembly.data_map and not state.isProcessed(i_address):
                        LOGGER.debug(" analyzeFunction() booked instruction @0x%08x: %s for processed state", i_address, i_mnemonic + " " + i_op_str)
                        state.addInstruction(i_address, i_size, i_mnemonic, i_op_str, i_bytes)
                    elif i_address in self.disassembly.code_map:
                        LOGGER.debug(" analyzeFunction() was already present?! instruction @0x%08x: %s (function: 0x%08x)", i_address, i_mnemonic + " " + i_op_str, self.disassembly.ins2fn[i_address])
                        state.setBlockEndingInstruction(True)
                        state.setCollision(True)
                    else:
                        LOGGER.debug(" analyzeFunction() was already present in local function.")
                        state.setBlockEndingInstruction(True)
                    if state.isBlockEndingInstruction():
                        state.endBlock()
                        break
                else:
                    # if the inner loop did not break, we need to refill the cache in order to finish the block-analysis
                    cache = [i for i in self.capstone.disasm_lite(self._getDisasmWindowBuffer(state.block_start + cache_pos), state.block_start + cache_pos)]
                    if not cache:
                        break
                    continue
                # if the inner loop did break, the cache didn't run empty and thus block-analysis is finished
                break
        if not state.isBlockEndingInstruction():
            if i is not None:
                LOGGER.debug("No block submitted, last instruction: 0x%08x -> 0x%08x %s || %s", start_addr, i_address, i_mnemonic + " " + i_op_str, self.fc_manager.getFunctionCandidate(start_addr))
            else:
                LOGGER.debug("No block submitted with no ins, last instruction: 0x%08x || %s", start_addr, self.fc_manager.getFunctionCandidate(start_addr))
        state.label = self.resolveSymbol(state.start_addr)
        analysis_result = state.finalizeAnalysis(as_gap)
        if analysis_result and self.config.RESOLVE_REGISTER_CALLS:
            self.indcall_analyzer.resolveRegisterCalls(state)
        self.tailcall_analyzer.finalizeFunction(state)
        self.fc_manager.updateAnalysisFinished(start_addr)
        self.fc_manager.updateCandidates(state)
        return state

    def analyzeBuffer(self, binary_info, cbAnalysisTimeout=None):
        """Run the full multi-pass analysis over a binary buffer.

        binary_info: BinaryInfo describing buffer, base address, bitness.
        cbAnalysisTimeout: optional zero-arg callable returning True when the
            analysis deadline has passed; checked between candidates.
        Returns the populated DisassemblyResult.
        """
        LOGGER.debug("Analyzing buffer with %d bytes @0x%08x", binary_info.binary_size, binary_info.base_addr)
        self._updateLabelProviders(binary_info)
        self.disassembly = DisassemblyResult()
        self.disassembly.smda_version = self.config.VERSION
        self.disassembly.setBinaryInfo(binary_info)
        self.disassembly.binary_info.architecture = "intel"
        self.disassembly.analysis_start_ts = datetime.datetime.utcnow()
        if self.disassembly.binary_info.bitness not in [32, 64]:
            bitness_analyzer = BitnessAnalyzer()
            self.disassembly.binary_info.bitness = bitness_analyzer.determineBitnessFromDisassembly(self.disassembly)
            LOGGER.debug("Automatically Recognized Bitness as: %d", self.disassembly.binary_info.bitness)
        else:
            LOGGER.debug("Using defined Bitness as: %d", self.disassembly.binary_info.bitness)
        if self._forced_bitness:
            self.disassembly.binary_info.bitness = self._forced_bitness
            LOGGER.debug("Forced Bitness override to: %d", self.disassembly.binary_info.bitness)
        self.tailcall_analyzer = TailcallAnalyzer()
        self.indcall_analyzer = IndirectCallAnalyzer(self)
        self.jumptable_analyzer = JumpTableAnalyzer(self)
        self.fc_manager = FunctionCandidateManager(self.config)
        if self.config.USE_SYMBOLS_AS_CANDIDATES:
            self.fc_manager.symbol_addresses = self.getSymbolCandidates()
        self.fc_manager.init(self.disassembly)
        self._initCapstone()
        self._initTfIdf()
        # first pass, analyze locations identifiable by heuristics (e.g. call-reference, common prologue)
        for candidate in self.fc_manager.getNextFunctionStartCandidate():
            if cbAnalysisTimeout and cbAnalysisTimeout():
                break
            state = self.analyzeFunction(candidate.addr)
        LOGGER.debug("Finished heuristical analysis, functions: %d", len(self.disassembly.functions))
        # second pass, analyze remaining gaps for additional candidates in an iterative way
        gap_candidate = self.fc_manager.nextGapCandidate()
        while gap_candidate is not None:
            if cbAnalysisTimeout and cbAnalysisTimeout():
                break
            LOGGER.debug("based on gap, performing function analysis of 0x%08x", gap_candidate)
            state = self.analyzeFunction(gap_candidate, as_gap=True)
            function_blocks = state.getBlocks()
            if function_blocks:
                LOGGER.debug("+ got some blocks here -> 0x%08x", gap_candidate)
            if gap_candidate in self.disassembly.functions:
                fn_min = self.disassembly.function_borders[gap_candidate][0]
                fn_max = self.disassembly.function_borders[gap_candidate][1]
                LOGGER.debug("+++ YAY, is now a function! -> 0x%08x - 0x%08x", fn_min, fn_max)
                # start looking directly after our new function
            else:
                self.fc_manager.updateAnalysisAborted(gap_candidate, "Gap candidate did not fulfil function criteria.")
            next_gap = self.fc_manager.getNextGap(dont_skip=True)
            gap_candidate = self.fc_manager.nextGapCandidate(next_gap)
        LOGGER.debug("Finished gap analysis, functions: %d", len(self.disassembly.functions))
        # third pass, fix potential tailcall functions that were identified during analysis
        if self.config.RESOLVE_TAILCALLS:
            tailcalled_functions = self.tailcall_analyzer.resolveTailcalls(self)
            for addr in tailcalled_functions:
                self.fc_manager.addTailcallCandidate(addr)
        LOGGER.debug("Finished tailcall analysis, functions.")
        self.disassembly.failed_analysis_addr = self.fc_manager.getAbortedCandidates()
        # package up and finish
        for addr, candidate in self.fc_manager.candidates.items():
            if addr in self.disassembly.functions:
                function_blocks = self.disassembly.getBlocksAsDict(addr)
                function_tfidf = self._tfidf.getTfIdfFromBlocks(function_blocks)
                candidate.setTfIdf(function_tfidf)
                candidate.getConfidence()
                self.disassembly.candidates[addr] = candidate
        self.disassembly.analysis_end_ts = datetime.datetime.utcnow()
        # BUGFIX: guard the callback like every other call site above --
        # with the default cbAnalysisTimeout=None this line used to raise
        # TypeError ("'NoneType' object is not callable").
        if cbAnalysisTimeout and cbAnalysisTimeout():
            self.disassembly.analysis_timeout = True
        return self.disassembly