def __init__(self, filename, raw_type, raw_base, raw_big_endian, database):
    """Build a Disassembler bound to `database`.

    Loads the binary (`instanciate_binary`), validates the architecture,
    wires all shared state (memory map, xrefs, comments, functions) to the
    database, loads or reuses symbols, and creates the capstone handle.

    Raises:
        ExcArch: if the detected architecture is not supported.
    """
    import capstone as CAPSTONE

    arch_lookup = {
        "x86": CAPSTONE.CS_ARCH_X86,
        "x64": CAPSTONE.CS_ARCH_X86,
        "ARM": CAPSTONE.CS_ARCH_ARM,
        "MIPS32": CAPSTONE.CS_ARCH_MIPS,
        "MIPS64": CAPSTONE.CS_ARCH_MIPS,
    }

    mode_lookup = {
        "x86": CAPSTONE.CS_MODE_32,
        "x64": CAPSTONE.CS_MODE_64,
        # FIX: was CS_ARCH_ARM (an arch constant) — same numeric value,
        # but the mode table must use the mode constant.
        "ARM": CAPSTONE.CS_MODE_ARM,
        "MIPS32": CAPSTONE.CS_MODE_MIPS32,
        "MIPS64": CAPSTONE.CS_MODE_MIPS64,
    }

    word_size_lookup = {
        "x86": 4,
        "x64": 8,
        "ARM": 4,
        "MIPS32": 4,
        "MIPS64": 8,
    }

    self.capstone_inst = {}  # capstone instruction cache
    self.db = database

    # Reuse the memory map from an already-loaded database, otherwise
    # create a fresh one and register it.
    if database.loaded:
        self.mem = database.mem
    else:
        self.mem = Memory()
        database.mem = self.mem

    self.instanciate_binary(filename, raw_type, raw_base, raw_big_endian)

    if self.binary.arch not in ("x86", "x64", "MIPS32", "MIPS64", "ARM"):
        raise ExcArch(self.binary.arch)

    self.wordsize = word_size_lookup.get(self.binary.arch, None)
    self.binary.wordsize = self.wordsize
    self.is_mips = self.binary.arch in ("MIPS32", "MIPS64")
    self.is_x86 = self.binary.arch in ("x86", "x64")
    # FIX: was `in ("ARM")` — ("ARM") is a plain string, so that was a
    # substring test, not tuple membership; equality is what is meant.
    self.is_arm = self.binary.arch == "ARM"
    self.is_big_endian = self.binary.is_big_endian()
    self.binary.load_section_names()

    # Share database state by reference.
    self.jmptables = database.jmptables
    self.user_inline_comments = database.user_inline_comments
    self.internal_inline_comments = database.internal_inline_comments
    self.user_previous_comments = database.user_previous_comments
    self.internal_previous_comments = database.internal_previous_comments
    self.functions = database.functions
    self.func_id = database.func_id
    self.end_functions = database.end_functions
    self.xrefs = database.xrefs
    self.mem.xrefs = database.xrefs
    self.mem.data_sub_xrefs = database.data_sub_xrefs
    self.mips_gp = database.mips_gp

    if not database.loaded:
        # First load: extract symbols from the binary and publish them.
        self.load_symbols()
        database.symbols = self.binary.symbols
        database.reverse_symbols = self.binary.reverse_symbols
        database.demangled = self.binary.demangled
        database.reverse_demangled = self.binary.reverse_demangled
        database.imports = self.binary.imports
    else:
        # Reload: the database is authoritative.
        self.binary.symbols = database.symbols
        self.binary.reverse_symbols = database.reverse_symbols
        self.binary.demangled = database.demangled
        self.binary.reverse_demangled = database.reverse_demangled
        self.binary.imports = database.imports

    cs_arch = arch_lookup.get(self.binary.arch, None)
    cs_mode = mode_lookup.get(self.binary.arch, None)

    if self.is_big_endian:
        cs_mode |= CAPSTONE.CS_MODE_BIG_ENDIAN
    else:
        cs_mode |= CAPSTONE.CS_MODE_LITTLE_ENDIAN

    self.capstone = CAPSTONE
    self.md = CAPSTONE.Cs(cs_arch, cs_mode)
    self.md.detail = True

    for s in self.binary.iter_sections():
        s.big_endian = cs_mode & CAPSTONE.CS_MODE_BIG_ENDIAN
class Disassembler():
    """Front-end over capstone for plasma.

    Owns the capstone handle, an instruction cache, and references into the
    shared database (functions, xrefs, comments...). Provides text dumps
    (`dump_asm`, `dump_xrefs`, `hexdump`) and flow-graph construction
    (`get_graph`).
    """

    def __init__(self, filename, raw_type, raw_base, raw_big_endian, database):
        """Load the binary, validate its arch and set up capstone.

        Raises:
            ExcArch: if the detected architecture is not supported.
        """
        import capstone as CAPSTONE

        arch_lookup = {
            "x86": CAPSTONE.CS_ARCH_X86,
            "x64": CAPSTONE.CS_ARCH_X86,
            "ARM": CAPSTONE.CS_ARCH_ARM,
            "MIPS32": CAPSTONE.CS_ARCH_MIPS,
            "MIPS64": CAPSTONE.CS_ARCH_MIPS,
        }

        mode_lookup = {
            "x86": CAPSTONE.CS_MODE_32,
            "x64": CAPSTONE.CS_MODE_64,
            # FIX: was CS_ARCH_ARM (an arch constant) — same numeric value,
            # but the mode table must use the mode constant.
            "ARM": CAPSTONE.CS_MODE_ARM,
            "MIPS32": CAPSTONE.CS_MODE_MIPS32,
            "MIPS64": CAPSTONE.CS_MODE_MIPS64,
        }

        word_size_lookup = {
            "x86": 4,
            "x64": 8,
            "ARM": 4,
            "MIPS32": 4,
            "MIPS64": 8,
        }

        self.capstone_inst = {}  # capstone instruction cache
        self.db = database

        if database.loaded:
            self.mem = database.mem
        else:
            self.mem = Memory()
            database.mem = self.mem

        self.instanciate_binary(filename, raw_type, raw_base, raw_big_endian)

        if self.binary.arch not in ("x86", "x64", "MIPS32", "MIPS64", "ARM"):
            raise ExcArch(self.binary.arch)

        self.wordsize = word_size_lookup.get(self.binary.arch, None)
        self.binary.wordsize = self.wordsize
        self.is_mips = self.binary.arch in ("MIPS32", "MIPS64")
        self.is_x86 = self.binary.arch in ("x86", "x64")
        # FIX: was `in ("ARM")` — ("ARM") is a plain string, so that was a
        # substring test, not tuple membership; equality is what is meant.
        self.is_arm = self.binary.arch == "ARM"
        self.is_big_endian = self.binary.is_big_endian()
        self.binary.load_section_names()

        # Share database state by reference.
        self.jmptables = database.jmptables
        self.user_inline_comments = database.user_inline_comments
        self.internal_inline_comments = database.internal_inline_comments
        self.user_previous_comments = database.user_previous_comments
        self.internal_previous_comments = database.internal_previous_comments
        self.functions = database.functions
        self.func_id = database.func_id
        self.end_functions = database.end_functions
        self.xrefs = database.xrefs
        self.mem.xrefs = database.xrefs
        self.mem.data_sub_xrefs = database.data_sub_xrefs
        self.mips_gp = database.mips_gp

        if not database.loaded:
            # First load: extract symbols from the binary and publish them.
            self.load_symbols()
            database.symbols = self.binary.symbols
            database.reverse_symbols = self.binary.reverse_symbols
            database.demangled = self.binary.demangled
            database.reverse_demangled = self.binary.reverse_demangled
            database.imports = self.binary.imports
        else:
            # Reload: the database is authoritative.
            self.binary.symbols = database.symbols
            self.binary.reverse_symbols = database.reverse_symbols
            self.binary.demangled = database.demangled
            self.binary.reverse_demangled = database.reverse_demangled
            self.binary.imports = database.imports

        cs_arch = arch_lookup.get(self.binary.arch, None)
        cs_mode = mode_lookup.get(self.binary.arch, None)

        if self.is_big_endian:
            cs_mode |= CAPSTONE.CS_MODE_BIG_ENDIAN
        else:
            cs_mode |= CAPSTONE.CS_MODE_LITTLE_ENDIAN

        self.capstone = CAPSTONE
        self.md = CAPSTONE.Cs(cs_arch, cs_mode)
        self.md.detail = True

        for s in self.binary.iter_sections():
            s.big_endian = cs_mode & CAPSTONE.CS_MODE_BIG_ENDIAN

    def instanciate_binary(self, filename, raw_type, raw_base, raw_big_endian):
        """Create `self.binary` from a raw blob or an auto-detected ELF/PE.

        Raises:
            ExcFileFormat: if the file is neither ELF nor PE (and not raw).
        """
        if raw_type is not None:  # FIX: idiomatic None test (was `!= None`)
            import plasma.lib.fileformat.raw as LIB_RAW
            self.binary = LIB_RAW.Raw(filename, raw_type, raw_base,
                                      raw_big_endian)
            self.type = T_BIN_RAW
            return

        start = time()

        ty = self.get_magic(filename)

        if ty == T_BIN_ELF:
            import plasma.lib.fileformat.elf as LIB_ELF
            self.binary = LIB_ELF.ELF(self.db, filename)
        elif ty == T_BIN_PE:
            import plasma.lib.fileformat.pe as LIB_PE
            self.binary = LIB_PE.PE(self.db, filename)
        else:
            raise ExcFileFormat()

        self.binary.type = ty

        elapsed = time() - start
        debug__("Binary loaded in %fs" % elapsed)

    def load_symbols(self):
        """Load static + dynamic symbols, demangle them, and make sure the
        entry point has a symbol (named `_start`, suffixed if taken)."""
        start = time()

        self.binary.load_static_sym()
        self.binary.load_dyn_sym()
        self.binary.demangle_symbols()

        ep = self.binary.get_entry_point()
        if ep not in self.binary.reverse_symbols:
            # Find a free name: _start, _start_0, _start_1, ...
            name = "_start"
            n = name
            i = 0
            while n in self.binary.symbols:
                n = "%s_%d" % (name, i)
                i += 1
            name = n
            self.binary.symbols[name] = ep
            self.binary.reverse_symbols[ep] = name

        elapsed = time() - start
        debug__("Found %d symbols in %fs"
                % (len(self.binary.symbols), elapsed))

    def get_magic(self, filename):
        """Return T_BIN_ELF / T_BIN_PE from the file's magic bytes, or None."""
        # FIX: use a context manager so the handle is closed even on error.
        with open(filename, "rb") as f:
            magic = f.read(8)
        if magic.startswith(b"\x7fELF"):
            return T_BIN_ELF
        if magic.startswith(b"MZ"):
            return T_BIN_PE
        return None

    # `func_ad` is the function address where the variable `name`
    # is supposed to be.
    def var_get_offset(self, func_ad, name):
        """Return the stack offset of local variable `name` in the function
        at `func_ad`, or None if the function or variable is unknown."""
        if func_ad not in self.functions:
            return None
        func_obj = self.functions[func_ad]
        if func_obj is None:
            return None
        for off, val in func_obj[FUNC_VARS].items():
            if val[VAR_NAME] == name:
                return off
        return None

    def load_arch_module(self):
        """Import and return the plasma arch package matching the binary."""
        if self.binary.arch in ("x86", "x64"):
            import plasma.lib.arch.x86 as ARCH
        elif self.binary.arch == "ARM":
            import plasma.lib.arch.arm as ARCH
        elif self.binary.arch in ("MIPS32", "MIPS64"):
            import plasma.lib.arch.mips as ARCH
        else:
            raise NotImplementedError
        return ARCH

    def dump_xrefs(self, ctx, ad):
        """Build an Output listing every cross-reference to address `ad`,
        one line per referencing head address."""
        ARCH = self.load_arch_module()
        ARCH_OUTPUT = ARCH.output

        o = ARCH_OUTPUT.Output(ctx)
        o._new_line()
        o.print_labels = False

        xrefs = list(ctx.gctx.api.xrefsto(ad))
        xrefs.sort()

        seen = set()

        for x in xrefs:
            x = self.mem.get_head_addr(x)
            if x in seen:
                continue
            seen.add(x)

            s = self.binary.get_section(x)
            ty = self.mem.get_type(x)

            # A PE import should not be displayed as a subroutine
            if not(self.binary.type == T_BIN_PE and x in self.binary.imports) \
                    and (ty == MEM_FUNC or ty == MEM_CODE):
                func_id = self.mem.get_func_id(x)
                if func_id != -1:
                    # Show the enclosing function label plus the offset of
                    # the referencing instruction inside it.
                    fad = self.func_id[func_id]
                    o._label(fad)
                    diff = x - fad
                    if diff >= 0:
                        o._add(" + %d " % diff)
                    else:
                        o._add(" - %d " % (-diff))
                    o._pad_width(20)
                i = self.lazy_disasm(x, s.start)
                o._asm_inst(i)

            elif MEM_WOFFSET <= ty <= MEM_QOFFSET:
                o.set_line(x)
                o._pad_width(20)
                o._address(x)
                sz = self.mem.get_size(x)
                off = s.read_int(x, sz)
                if off is None:
                    continue
                o._data_prefix(sz)
                o._add(" ")
                o._imm(off, sz, True, print_data=False,
                       force_dont_print_data=True)
                o._new_line()

            elif ty == MEM_ARRAY:
                o.set_line(x)
                o._pad_width(20)
                o._address(x)
                o._label(x, print_colon=True)
                o._new_line()

            else:
                o._pad_width(20)
                o._address(x)
                o.set_line(x)
                sz = self.mem.get_size_from_type(ty)
                o._word(s.read_int(x, sz), sz)
                o._new_line()

        # remove the last empty line
        o.lines.pop(-1)
        o.token_lines.pop(-1)

        o.join_lines()
        return o

    def is_label(self, ad):
        """True if `ad` carries a symbol or is the target of an xref."""
        return ad in self.db.reverse_symbols or ad in self.xrefs

    def dump_asm(self, ctx, lines=NB_LINES_TO_DISASM, until=-1):
        """Linear-sweep dump starting at ctx.entry.

        Stops after `lines` printed lines (when `until == -1`) or when the
        address `until` is reached. Returns an Output, or None if no section
        contains the entry.
        """
        ARCH = self.load_arch_module()
        ARCH_OUTPUT = ARCH.output
        ARCH_UTILS = ARCH.utils

        ad = ctx.entry
        s = self.binary.get_section(ad)

        if s is None:
            # until is != -1 only from the visual mode
            # It allows to not go before the first section.
            if until != -1:
                return None
            # Get the next section, it's not mandatory that sections
            # are consecutives !
            s = self.binary.get_next_section(ad)
            if s is None:
                return None
            ad = s.start

        o = ARCH_OUTPUT.Output(ctx)
        o._new_line()
        o.curr_section = s
        o.mode_dump = True
        l = 0
        api = ctx.gctx.api

        # For mips: after a jump we add a newline, but for mips we should
        # add this newline after the prefetch instruction.
        prefetch_after_branch = False

        while 1:
            if ad == s.start:
                # Section banner.
                if not o.last_2_lines_are_empty():
                    o._new_line()
                o._dash()
                o._section(s.name)
                o._add(" 0x%x -> 0x%x" % (s.start, s.end))
                o._new_line()
                o._new_line()

            while ((l < lines and until == -1)
                    or (ad < until and until != -1)) and ad <= s.end:
                ty = self.mem.get_type(ad)

                # A PE import should not be displayed as a subroutine
                if not(self.binary.type == T_BIN_PE
                        and ad in self.binary.imports) \
                        and self.mem.is_code(ad):
                    is_func = ad in self.functions

                    if is_func:
                        if not o.last_2_lines_are_empty():
                            o._new_line()
                        o._dash()
                        o._user_comment("; SUBROUTINE")
                        o._new_line()
                        o._dash()

                    i = self.lazy_disasm(ad, s.start)

                    if not is_func and ad in self.xrefs and \
                            not o.last_2_lines_are_empty():
                        o._new_line()

                    o._asm_inst(i)

                    is_end = ad in self.end_functions

                    # mips: newline deferred until after the delay slot.
                    if prefetch_after_branch:
                        prefetch_after_branch = False
                        if not is_end:
                            o._new_line()

                    if is_end:
                        for fad in self.end_functions[ad]:
                            sy = api.get_symbol(fad)
                            o._user_comment("; end function %s" % sy)
                            o._new_line()
                        o._new_line()

                    elif ARCH_UTILS.is_uncond_jump(i) or ARCH_UTILS.is_ret(i):
                        if self.is_mips:
                            prefetch_after_branch = True
                        else:
                            o._new_line()

                    elif ARCH_UTILS.is_call(i):
                        # A call to a noreturn function also ends the flow.
                        op = i.operands[0]
                        if op.type == self.capstone.CS_OP_IMM:
                            imm = unsigned(op.value.imm)
                            if imm in self.functions and self.is_noreturn(imm):
                                if self.is_mips:
                                    prefetch_after_branch = True
                                else:
                                    o._new_line()

                    ad += i.size

                elif MEM_WOFFSET <= ty <= MEM_QOFFSET:
                    prefetch_after_branch = False
                    o._label_and_address(ad)
                    o.set_line(ad)
                    sz = self.mem.get_size(ad)
                    off = s.read_int(ad, sz)
                    # NOTE(review): on a failed read `ad` is not advanced
                    # before `continue` — looks like a potential infinite
                    # loop; preserved as-is, confirm upstream intent.
                    if off is None:
                        continue
                    if ctx.gctx.print_bytes:
                        o._bytes(s.read(ad, sz))
                    o._data_prefix(sz)
                    o._add(" ")
                    o._imm(off, sz, True, print_data=False,
                           force_dont_print_data=True)
                    o._new_line()
                    ad += sz

                elif ty == MEM_ASCII:
                    prefetch_after_branch = False
                    o._label_and_address(ad)
                    o.set_line(ad)
                    sz = self.mem.get_size(ad)
                    buf = self.binary.get_string(ad, sz)

                    if buf is not None:
                        if ctx.gctx.print_bytes:
                            o._bytes(s.read(ad, sz))

                        # Split the string into multi lines
                        splitted = buf.split("\n")
                        j = 0  # byte offset of the current piece inside buf
                        for i, st in enumerate(splitted):
                            if i > 0 and len(st) != 0:
                                o._new_line()
                                o.set_line(ad + j)
                                o._address(ad + j)

                            # Wrap long pieces every `bs` characters.
                            ibs = 0
                            bs = 65
                            while ibs < len(st):
                                if ibs > 0:
                                    o._new_line()
                                    o.set_line(ad + j)
                                    o._address(ad + j)

                                blk = st[ibs:ibs + bs]

                                if i < len(splitted) - 1 \
                                        and ibs + bs >= len(st):
                                    # Last chunk before a '\n': show it and
                                    # count the newline byte.
                                    o._string('"' + blk + '\\n"')
                                    j += len(blk) + 1
                                else:
                                    o._string('"' + blk + '"')
                                    j += len(blk)

                                ibs += bs

                    o._add(", 0")
                    o._new_line()
                    ad += sz

                elif ty == MEM_ARRAY:
                    prefetch_after_branch = False
                    o._label_and_address(ad)
                    array_info = self.mem.mm[ad]
                    total_size = array_info[0]
                    entry_type = array_info[2]
                    entry_size = self.mem.get_size_from_type(entry_type)
                    o.set_line(ad)
                    o._data_prefix(entry_size)
                    k = 0
                    while k < total_size:
                        # Wrap the array dump when the line gets long.
                        if o.curr_index > 70:
                            o._new_line()
                            o.set_line(ad)
                            o._address(ad)
                            o._data_prefix(entry_size)
                            l += 1
                        val = s.read_int(ad, entry_size)
                        if MEM_WOFFSET <= entry_type <= MEM_QOFFSET:
                            o._add(" ")
                            o._imm(val, entry_size, True, print_data=False,
                                   force_dont_print_data=True)
                        else:
                            o._word(val, entry_size, is_from_array=True)
                        ad += entry_size
                        k += entry_size
                        if k < total_size:
                            o._add(",")
                    o._new_line()

                else:
                    prefetch_after_branch = False
                    o._label_and_address(ad)
                    o.set_line(ad)
                    sz = self.mem.get_size_from_type(ty)
                    if ctx.gctx.print_bytes:
                        o._bytes(s.read(ad, sz))
                    o._word(s.read_int(ad, sz), sz)
                    o._new_line()
                    ad += sz

                l += 1

            s = self.binary.get_section(ad)
            if s is None:
                # Get the next section, it's not mandatory that sections
                # are consecutives !
                s = self.binary.get_next_section(ad)
                if s is None:
                    break
                o._new_line()
                ad = s.start

            if until != -1 and ad >= until:
                break

            if (l >= lines and until == -1) or (ad >= until and until != -1):
                break

            o.curr_section = s

        if until == ad:
            if self.mem.is_code(ad) and ad in self.xrefs or \
                    s is not None and ad == s.start:
                if not o.last_2_lines_are_empty():
                    o._new_line()

        # remove the last empty line
        o.lines.pop(-1)
        o.token_lines.pop(-1)

        o.join_lines()
        return o

    def hexdump(self, ctx, lines):
        """Print `lines` rows of a classic 16-bytes-per-row hexdump
        starting at ctx.entry."""
        MAX_NB_BYTES = 16

        def print_line(ad, line):
            # One row: address, hex bytes, padding, printable ASCII column.
            if not line:
                return
            print_no_end(color_addr(ad))
            for by in line:
                print_no_end("%.2x " % by)
            if len(line) != MAX_NB_BYTES:
                # 3 columns per missing byte ("%.2x " is 3 chars wide).
                print_no_end("   " * (MAX_NB_BYTES - len(line)))
            print_no_end("| ")
            for by in line:
                if by in BYTES_PRINTABLE_SET \
                        and by != 13 and by != 9 and by != 10:
                    print_no_end("%c" % by)
                else:
                    print_no_end(".")
            print()

        ad = ctx.entry
        s = self.binary.get_section(ad)
        off = ad - s.start
        l = 0
        buf = []
        first_ad = ad

        while off < s.real_size and l < lines:
            buf.append(s.data[off])
            if len(buf) == MAX_NB_BYTES:
                l += 1
                print_line(first_ad, buf)
                buf.clear()
                # FIX: the next chunk begins at data[off + 1], so its row
                # address is s.start + off + 1 (was off: one byte too low).
                first_ad = s.start + off + 1
            off += 1

        print_line(first_ad, buf)

    def print_functions(self, api):
        """Print every known function address with its (demangled) name."""
        total = 0

        lst = list(self.functions)
        lst.sort()

        # TODO: race condition with the analyzer ?
        for ad in lst:
            print_no_end(color_addr(ad))
            sy = api.get_symbol(ad)
            if ad in self.db.reverse_demangled:
                print_no_end(" %s (%s) "
                             % (self.db.reverse_demangled[ad],
                                color_comment(sy)))
            else:
                print_no_end(" " + sy)
            print()
            total += 1

        print("Total:", total)

    #
    # sym_filter : search a symbol, non case-sensitive
    #    if it starts with '-', it prints non-matching symbols
    #
    def print_symbols(self, sym_filter=None):
        """Print symbols (and demangled names), optionally filtered by a
        case-insensitive substring; a leading '-' inverts the match."""
        if sym_filter is not None:
            sym_filter = sym_filter.lower()
            if sym_filter[0] == "-":
                invert_match = True
                sym_filter = sym_filter[1:]
            else:
                invert_match = False

        total = 0

        # TODO: race condition with the analyzer ?
        for sy in list(self.db.symbols):
            ad = self.db.symbols[sy]
            if ad in self.db.reverse_demangled:
                dem = self.db.reverse_demangled[ad]
            else:
                dem = None

            # FIX: removed unused local `print_sym = True`.
            if sym_filter is None or \
                    (invert_match and sym_filter not in sy.lower()) or \
                    (not invert_match and sym_filter in sy.lower()) or \
                    (dem is not None and
                        ((invert_match and sym_filter not in dem.lower()) or
                         (not invert_match and sym_filter in dem.lower()))):
                if sy:
                    print_no_end(color_addr(ad))
                    if dem is not None:
                        print_no_end(" %s (%s) " % (dem, color_comment(sy)))
                    else:
                        print_no_end(" " + sy)
                    print()
                    total += 1

        print("Total:", total)

    def lazy_disasm(self, ad, stay_in_section=-1, s=None):
        """Return the capstone instruction at `ad`, using (and filling) the
        instruction cache; disassembles ahead in 128-byte blocks.

        `stay_in_section` and `s` are accepted for backward compatibility
        but currently unused (`s` is always re-resolved from `ad`).
        """
        s = self.binary.get_section(ad)
        if s is None:
            return None

        if ad in self.capstone_inst:
            return self.capstone_inst[ad]

        # TODO: remove when it's too big ?
        if len(self.capstone_inst) > CAPSTONE_CACHE_SIZE:
            self.capstone_inst.clear()

        # Disassemble by block of N bytes
        N = 128
        d = s.read(ad, N)
        gen = self.md.disasm(d, ad)

        try:
            first = next(gen)
        except StopIteration:
            return None

        self.capstone_inst[first.address] = first

        # Opportunistically cache the following instructions until we hit
        # an already-cached address.
        for i in gen:
            if i.address in self.capstone_inst:
                break
            self.capstone_inst[i.address] = i

        return first

    def __add_prefetch(self, addr_set, inst):
        """On MIPS, return the delay-slot instruction after `inst` and add
        its address to `addr_set`; None otherwise."""
        if self.is_mips:
            prefetch = self.lazy_disasm(inst.address + inst.size)
            # FIX: lazy_disasm can return None (end of section / undecodable
            # bytes); guard before dereferencing.
            if prefetch is not None:
                addr_set.add(prefetch.address)
            return prefetch
        return None

    def is_noreturn(self, ad):
        """True if the function at `ad` is flagged noreturn."""
        func_obj = self.functions[ad]
        if func_obj is None:
            return False
        return func_obj[FUNC_FLAGS] & FUNC_FLAG_NORETURN

    # Generate a flow graph of the given function (addr)
    def get_graph(self, entry):
        """Build the control-flow graph of the function at `entry`.

        Returns (graph, nb_new_symbols), or (None, 0) if nothing could be
        disassembled.
        """
        ARCH_UTILS = self.load_arch_module().utils

        gph = Graph(self, entry)
        stack = [entry]
        start = time()
        prefetch = None
        addresses = set()

        # WARNING: this assume that on every architectures the jump
        # address is the last operand (operands[-1])

        # Here each instruction is a node. Blocks will be created in the
        # function __simplify.

        while stack:
            ad = stack.pop()
            inst = self.lazy_disasm(ad)

            if inst is None:
                # Remove all previous instructions which have a link
                # to this instruction.
                if ad in gph.link_in:
                    for i in gph.link_in[ad]:
                        gph.link_out[i].remove(ad)
                    for i in gph.link_in[ad]:
                        if not gph.link_out[i]:
                            del gph.link_out[i]
                    del gph.link_in[ad]
                continue

            if gph.exists(inst):
                continue

            addresses.add(ad)

            if ARCH_UTILS.is_ret(inst):
                prefetch = self.__add_prefetch(addresses, inst)
                gph.new_node(inst, prefetch, None)

            elif ARCH_UTILS.is_uncond_jump(inst):
                prefetch = self.__add_prefetch(addresses, inst)
                gph.uncond_jumps_set.add(ad)
                op = inst.operands[-1]

                if op.type == self.capstone.CS_OP_IMM:
                    nxt = unsigned(op.value.imm)
                    if nxt in self.functions:
                        # Tail-jump into another function: stop here.
                        gph.new_node(inst, prefetch, None)
                    else:
                        stack.append(nxt)
                        gph.new_node(inst, prefetch, [nxt])
                else:
                    if inst.address in self.jmptables:
                        table = self.jmptables[inst.address].table
                        stack += table
                        gph.new_node(inst, prefetch, table)
                    else:
                        # Can't interpret jmp ADDR|reg
                        gph.new_node(inst, prefetch, None)

            elif ARCH_UTILS.is_cond_jump(inst):
                prefetch = self.__add_prefetch(addresses, inst)
                gph.cond_jumps_set.add(ad)
                op = inst.operands[-1]

                if op.type == self.capstone.CS_OP_IMM:
                    # Fall-through is after the delay slot when present.
                    if prefetch is None:
                        direct_nxt = inst.address + inst.size
                    else:
                        direct_nxt = prefetch.address + prefetch.size

                    nxt_jmp = unsigned(op.value.imm)
                    stack.append(direct_nxt)

                    if nxt_jmp in self.functions:
                        gph.new_node(inst, prefetch, [direct_nxt])
                    else:
                        stack.append(nxt_jmp)
                        gph.new_node(inst, prefetch, [direct_nxt, nxt_jmp])
                else:
                    # Can't interpret jmp ADDR|reg
                    gph.new_node(inst, prefetch, None)

            else:
                if ad != entry and ARCH_UTILS.is_call(inst):
                    # TODO: like in the analyzer, simulate registers
                    # -> during the analysis, save in the database
                    # the immediate value.
                    op = inst.operands[0]

                    if op.type == self.capstone.CS_OP_IMM:
                        imm = unsigned(op.value.imm)
                        if imm in self.functions and self.is_noreturn(imm):
                            prefetch = self.__add_prefetch(addresses, inst)
                            gph.new_node(inst, prefetch, None)
                            gph.exit_or_ret.add(ad)
                            continue

                    if op.type == self.capstone.CS_OP_MEM and \
                            op.mem.disp in self.binary.imports and \
                            self.binary.imports[op.mem.disp] & \
                            FUNC_FLAG_NORETURN:
                        prefetch = self.__add_prefetch(addresses, inst)
                        gph.new_node(inst, prefetch, None)
                        gph.exit_or_ret.add(ad)
                        continue

                nxt = inst.address + inst.size
                stack.append(nxt)
                gph.new_node(inst, None, [nxt])

        if len(gph.nodes) == 0:
            return None, 0

        if self.binary.type == T_BIN_PE:
            nb_new_syms = self.binary.reverse_stripped_list(self, addresses)
        else:
            nb_new_syms = 0

        elapsed = time() - start
        debug__("Graph built in %fs (%d instructions)"
                % (elapsed, len(gph.nodes)))

        return gph, nb_new_syms
def __init__(self, filename, raw_type, raw_base, raw_big_endian, database):
    """Build a Disassembler bound to `database` (variant with the x86
    frame-size warning).

    Loads the binary, validates the architecture, wires shared state to the
    database, loads or reuses symbols, and creates the capstone handle.

    Raises:
        ExcArch: if the detected architecture is not supported.
    """
    import capstone as CAPSTONE

    arch_lookup = {
        "x86": CAPSTONE.CS_ARCH_X86,
        "x64": CAPSTONE.CS_ARCH_X86,
        "ARM": CAPSTONE.CS_ARCH_ARM,
        "MIPS32": CAPSTONE.CS_ARCH_MIPS,
        "MIPS64": CAPSTONE.CS_ARCH_MIPS,
    }

    mode_lookup = {
        "x86": CAPSTONE.CS_MODE_32,
        "x64": CAPSTONE.CS_MODE_64,
        # FIX: was CS_ARCH_ARM (an arch constant) — same numeric value,
        # but the mode table must use the mode constant.
        "ARM": CAPSTONE.CS_MODE_ARM,
        "MIPS32": CAPSTONE.CS_MODE_MIPS32,
        "MIPS64": CAPSTONE.CS_MODE_MIPS64,
    }

    word_size_lookup = {
        "x86": 4,
        "x64": 8,
        "ARM": 4,
        "MIPS32": 4,
        "MIPS64": 8,
    }

    self.capstone_inst = {}  # capstone instruction cache
    self.db = database

    if database.loaded:
        self.mem = database.mem
    else:
        self.mem = Memory()
        database.mem = self.mem

    self.instanciate_binary(filename, raw_type, raw_base, raw_big_endian)

    if self.binary.arch not in ("x86", "x64", "MIPS32", "MIPS64", "ARM"):
        # FIX: was `raise ExcArch(arch)` — `arch` is undefined here and
        # raised a NameError instead of the intended ExcArch.
        raise ExcArch(self.binary.arch)

    self.wordsize = word_size_lookup.get(self.binary.arch, None)
    self.binary.wordsize = self.wordsize
    self.is_mips = self.binary.arch in ("MIPS32", "MIPS64")
    self.is_x86 = self.binary.arch in ("x86", "x64")
    # FIX: was `in ("ARM")` — ("ARM") is a plain string, so that was a
    # substring test, not tuple membership; equality is what is meant.
    self.is_arm = self.binary.arch == "ARM"
    self.is_big_endian = self.binary.is_big_endian()
    self.binary.load_section_names()

    # Share database state by reference.
    self.jmptables = database.jmptables
    self.user_inline_comments = database.user_inline_comments
    self.internal_inline_comments = database.internal_inline_comments
    self.user_previous_comments = database.user_previous_comments
    self.internal_previous_comments = database.internal_previous_comments
    self.functions = database.functions
    self.func_id = database.func_id
    self.end_functions = database.end_functions
    self.xrefs = database.xrefs
    self.mem.xrefs = database.xrefs
    self.mem.data_sub_xrefs = database.data_sub_xrefs
    self.mips_gp = database.mips_gp

    if not database.loaded:
        # First load: extract symbols from the binary and publish them.
        self.load_symbols()
        database.symbols = self.binary.symbols
        database.reverse_symbols = self.binary.reverse_symbols
        database.demangled = self.binary.demangled
        database.reverse_demangled = self.binary.reverse_demangled
        database.imports = self.binary.imports
    else:
        # Reload: the database is authoritative.
        self.binary.symbols = database.symbols
        self.binary.reverse_symbols = database.reverse_symbols
        self.binary.demangled = database.demangled
        self.binary.reverse_demangled = database.reverse_demangled
        self.binary.imports = database.imports

    cs_arch = arch_lookup.get(self.binary.arch, None)
    cs_mode = mode_lookup.get(self.binary.arch, None)

    if self.is_big_endian:
        cs_mode |= CAPSTONE.CS_MODE_BIG_ENDIAN
    else:
        cs_mode |= CAPSTONE.CS_MODE_LITTLE_ENDIAN

    self.capstone = CAPSTONE
    self.md = CAPSTONE.Cs(cs_arch, cs_mode)
    self.md.detail = True

    for s in self.binary.iter_sections():
        s.big_endian = cs_mode & CAPSTONE.CS_MODE_BIG_ENDIAN

    if self.binary.arch == "x86":
        warning("To compute correctly the value of esp, the frame size must")
        warning("be correct. But the heuristic is very simple actually.")
        warning("So every references to ebp should be correct but for esp it")
        warning("may have some errors.")
        warning("In the visual press I to show original instructions.")
class Disassembler(): def __init__(self, filename, raw_type, raw_base, raw_big_endian, database): import capstone as CAPSTONE arch_lookup = { "x86": CAPSTONE.CS_ARCH_X86, "x64": CAPSTONE.CS_ARCH_X86, "ARM": CAPSTONE.CS_ARCH_ARM, "MIPS32": CAPSTONE.CS_ARCH_MIPS, "MIPS64": CAPSTONE.CS_ARCH_MIPS, } mode_lookup = { "x86": CAPSTONE.CS_MODE_32, "x64": CAPSTONE.CS_MODE_64, "ARM": CAPSTONE.CS_ARCH_ARM, "MIPS32": CAPSTONE.CS_MODE_MIPS32, "MIPS64": CAPSTONE.CS_MODE_MIPS64, } word_size_lookup = { "x86": 4, "x64": 8, "ARM": 4, "MIPS32": 4, "MIPS64": 8, } self.capstone_inst = {} # capstone instruction cache self.db = database if database.loaded: self.mem = database.mem else: self.mem = Memory() database.mem = self.mem self.instanciate_binary(filename, raw_type, raw_base, raw_big_endian) if self.binary.arch not in ("x86", "x64", "MIPS32", "MIPS64", "ARM"): raise ExcArch(arch) self.wordsize = word_size_lookup.get(self.binary.arch, None) self.binary.wordsize = self.wordsize self.is_mips = self.binary.arch in ("MIPS32", "MIPS64") self.is_x86 = self.binary.arch in ("x86", "x64") self.is_arm = self.binary.arch in ("ARM") self.is_big_endian = self.binary.is_big_endian() self.binary.load_section_names() self.jmptables = database.jmptables self.user_inline_comments = database.user_inline_comments self.internal_inline_comments = database.internal_inline_comments self.user_previous_comments = database.user_previous_comments self.internal_previous_comments = database.internal_previous_comments self.functions = database.functions self.func_id = database.func_id self.end_functions = database.end_functions self.xrefs = database.xrefs self.mem.xrefs = database.xrefs self.mem.data_sub_xrefs = database.data_sub_xrefs self.mips_gp = database.mips_gp if not database.loaded: self.load_symbols() database.symbols = self.binary.symbols database.reverse_symbols = self.binary.reverse_symbols database.demangled = self.binary.demangled database.reverse_demangled = self.binary.reverse_demangled 
database.imports = self.binary.imports else: self.binary.symbols = database.symbols self.binary.reverse_symbols = database.reverse_symbols self.binary.demangled = database.demangled self.binary.reverse_demangled = database.reverse_demangled self.binary.imports = database.imports cs_arch = arch_lookup.get(self.binary.arch, None) cs_mode = mode_lookup.get(self.binary.arch, None) if self.is_big_endian: cs_mode |= CAPSTONE.CS_MODE_BIG_ENDIAN else: cs_mode |= CAPSTONE.CS_MODE_LITTLE_ENDIAN self.capstone = CAPSTONE self.md = CAPSTONE.Cs(cs_arch, cs_mode) self.md.detail = True for s in self.binary.iter_sections(): s.big_endian = cs_mode & CAPSTONE.CS_MODE_BIG_ENDIAN if self.binary.arch == "x86": warning("To compute correctly the value of esp, the frame size must") warning("be correct. But the heuristic is very simple actually.") warning("So every references to ebp should be correct but for esp it") warning("may have some errors.") warning("In the visual press I to show original instructions.") def instanciate_binary(self, filename, raw_type, raw_base, raw_big_endian): if raw_type != None: import plasma.lib.fileformat.raw as LIB_RAW self.binary = LIB_RAW.Raw(filename, raw_type, raw_base, raw_big_endian) self.type = T_BIN_RAW return start = time() ty = self.get_magic(filename) if ty == T_BIN_ELF: import plasma.lib.fileformat.elf as LIB_ELF self.binary = LIB_ELF.ELF(self.db, filename) elif ty == T_BIN_PE: import plasma.lib.fileformat.pe as LIB_PE self.binary = LIB_PE.PE(self.db, filename) else: raise ExcFileFormat() self.binary.type = ty elapsed = time() elapsed = elapsed - start debug__("Binary loaded in %fs" % elapsed) def load_symbols(self): start = time() self.binary.load_static_sym() self.binary.load_dyn_sym() self.binary.demangle_symbols() elapsed = time() elapsed = elapsed - start debug__("Found %d symbols in %fs" % (len(self.binary.symbols), elapsed)) def get_magic(self, filename): f = open(filename, "rb") magic = f.read(8) f.close() if magic.startswith(b"\x7fELF"): 
return T_BIN_ELF elif magic.startswith(b"MZ"): return T_BIN_PE return None # `func_ad` is the function address where the variable `name` # is supposed to be. def var_get_offset(self, func_ad, name): if func_ad not in self.functions: return None func_obj = self.functions[func_ad] if func_obj is None: return None for off, val in func_obj[FUNC_OFF_VARS].items(): if val[VAR_NAME] == name: return off return None def var_rename(self, func_ad, off, name): if func_ad not in self.functions: return func_obj = self.functions[func_ad] if func_obj is None: return func_obj[FUNC_OFF_VARS][off][VAR_NAME] = name def load_arch_module(self): if self.binary.arch in ("x86", "x64"): import plasma.lib.arch.x86 as ARCH elif self.binary.arch == "ARM": import plasma.lib.arch.arm as ARCH elif self.binary.arch in ("MIPS32", "MIPS64"): import plasma.lib.arch.mips as ARCH else: raise NotImplementedError return ARCH def dump_xrefs(self, ctx, ad): ARCH = self.load_arch_module() ARCH_OUTPUT = ARCH.output o = ARCH_OUTPUT.Output(ctx) o._new_line() o.print_labels = False xrefs = list(ctx.gctx.api.xrefsto(ad)) xrefs.sort() seen = set() for x in xrefs: x = self.mem.get_head_addr(x) if x in seen: continue seen.add(x) s = self.binary.get_section(x) ty = self.mem.get_type(x) # A PE import should not be displayed as a subroutine if not(self.binary.type == T_BIN_PE and ad in self.binary.imports) \ and (ty == MEM_FUNC or ty == MEM_CODE): func_id = self.mem.get_func_id(x) if func_id != -1: fad = self.func_id[func_id] o._label(fad) diff = x - fad if diff >= 0: o._add(" + %d " % diff) else: o._add(" - %d " % (-diff)) o._pad_width(20) i = self.lazy_disasm(x, s.start) o._asm_inst(i) elif MEM_WOFFSET <= ty <= MEM_QOFFSET: o.set_line(x) o._pad_width(20) o._address(x) sz = self.mem.get_size(x) off = s.read_int(x, sz) if off is None: continue o._data_prefix(sz) o._add(" ") o._imm(off, sz, True, print_data=False, force_dont_print_data=True) o._new_line() elif ty == MEM_ARRAY: o.set_line(x) o._pad_width(20) 
o._address(x) o._label(x, print_colon=True) o._new_line() else: o._pad_width(20) o._address(x) o.set_line(x) sz = self.mem.get_size_from_type(ty) o._word(s.read_int(x, sz), sz) o._new_line() # remove the last empty line o.lines.pop(-1) o.token_lines.pop(-1) o.join_lines() return o def is_label(self, ad): return ad in self.db.reverse_symbols or ad in self.xrefs def dump_asm(self, ctx, lines=NB_LINES_TO_DISASM, until=-1): ARCH = self.load_arch_module() ARCH_OUTPUT = ARCH.output ARCH_UTILS = ARCH.utils ad = ctx.entry s = self.binary.get_section(ad) if s is None: # until is != -1 only from the visual mode # It allows to not go before the first section. if until != -1: return None # Get the next section, it's not mandatory that sections # are consecutives ! s = self.binary.get_next_section(ad) if s is None: return None ad = s.start o = ARCH_OUTPUT.Output(ctx) o._new_line() o.curr_section = s o.mode_dump = True l = 0 api = ctx.gctx.api # For mips: after a jump we add a newline, but for mips we should # add this newline after the prefetch instruction. 
        # --- Tail of a dump/output method whose `def` line lies above this
        # chunk (not visible here).  Locals `o` (output builder), `ad`
        # (current address), `s` (current section), `l` / `lines` (line
        # budget), `until` (stop address or -1), `ctx` and `api` are bound
        # by that enclosing method — TODO confirm against the full file.
        # `prefetch_after_branch` implements the MIPS branch-delay-slot
        # display: the instruction after a branch is printed before the
        # separating blank line.
        prefetch_after_branch = False

        while 1:
            # Print a section banner when entering a new section.
            if ad == s.start:
                if not o.is_last_2_line_empty():
                    o._new_line()
                o._dash()
                o._section(s.name)
                o._add(" 0x%x -> 0x%x" % (s.start, s.end))
                o._new_line()
                o._new_line()

            # Walk the section until the line budget or the stop address
            # (`until`) is reached.
            while ((l < lines and until == -1) or (ad < until and until != -1)) \
                    and ad <= s.end:
                ty = self.mem.get_type(ad)

                # A PE import should not be displayed as a subroutine
                if not(self.binary.type == T_BIN_PE and ad in self.binary.imports) \
                        and self.mem.is_code(ad):
                    is_func = ad in self.functions

                    if is_func:
                        # Banner before a known function entry point.
                        if not o.is_last_2_line_empty():
                            o._new_line()
                        o._dash()
                        o._user_comment("; SUBROUTINE")
                        o._new_line()
                        o._dash()

                    i = self.lazy_disasm(ad, s.start)

                    # Blank line before a plain address that has xrefs,
                    # so jump targets stand out.
                    if not is_func and ad in self.xrefs and \
                            not o.is_last_2_line_empty():
                        o._new_line()

                    o._asm_inst(i)

                    is_end = ad in self.end_functions

                    # mips
                    if prefetch_after_branch:
                        prefetch_after_branch = False
                        if not is_end:
                            o._new_line()

                    if is_end:
                        for fad in self.end_functions[ad]:
                            sy = api.get_symbol(fad)
                            o._user_comment("; end function %s" % sy)
                            o._new_line()
                        o._new_line()

                    elif ARCH_UTILS.is_uncond_jump(i) or ARCH_UTILS.is_ret(i):
                        # End of a flow: on MIPS defer the blank line until
                        # the delay-slot instruction has been printed.
                        if self.is_mips:
                            prefetch_after_branch = True
                        else:
                            o._new_line()

                    elif ARCH_UTILS.is_call(i):
                        op = i.operands[0]
                        if op.type == self.capstone.CS_OP_IMM:
                            imm = unsigned(op.value.imm)
                            # A call to a noreturn function also ends the flow.
                            if imm in self.functions and self.is_noreturn(imm):
                                if self.is_mips:
                                    prefetch_after_branch = True
                                else:
                                    o._new_line()

                    ad += i.size

                elif MEM_WOFFSET <= ty <= MEM_QOFFSET:
                    # Data word holding an offset / pointer.
                    prefetch_after_branch = False
                    o._label_and_address(ad)
                    o.set_line(ad)
                    sz = self.mem.get_size(ad)
                    off = s.read_int(ad, sz)
                    if off is None:
                        continue
                    if ctx.gctx.print_bytes:
                        o._bytes(s.read(ad, sz))
                    o._data_prefix(sz)
                    o._add(" ")
                    o._imm(off, sz, True, print_data=False,
                           force_dont_print_data=True)
                    o._new_line()
                    ad += sz

                elif ty == MEM_ASCII:
                    # String data.
                    prefetch_after_branch = False
                    o._label_and_address(ad)
                    o.set_line(ad)
                    sz = self.mem.get_size(ad)
                    buf = self.binary.get_string(ad, sz)

                    if buf is not None:
                        if ctx.gctx.print_bytes:
                            o._bytes(s.read(ad, sz))

                        # Split the string into multi lines
                        splitted = buf.split("\n")
                        j = 0
                        for i, st in enumerate(splitted):
                            if i > 0 and len(st) != 0:
                                o._new_line()
                                o.set_line(ad + j)
                                o._address(ad + j)

                            ibs = 0
                            bs = 65  # max printed characters per output line
                            while ibs < len(st):
                                if ibs > 0:
                                    o._new_line()
                                    o.set_line(ad + j)
                                    o._address(ad + j)
                                blk = st[ibs:ibs + bs]
                                # Last chunk of a non-final split keeps its
                                # (consumed) "\n" visible as an escape.
                                if i < len(splitted) - 1 and ibs + bs >= len(st):
                                    o._string('"' + blk + '\\n"')
                                    j += len(blk) + 1
                                else:
                                    o._string('"' + blk + '"')
                                    j += len(blk)
                                ibs += bs

                        # NOTE(review): nesting reconstructed from a
                        # collapsed source line — the ", 0" terminator is
                        # assumed to be emitted only when a string was
                        # actually read; confirm against upstream.
                        o._add(", 0")

                    o._new_line()
                    ad += sz

                elif ty == MEM_ARRAY:
                    prefetch_after_branch = False
                    o._label_and_address(ad)
                    array_info = self.mem.mm[ad]
                    total_size = array_info[0]
                    entry_type = array_info[2]
                    entry_size = self.mem.get_size_from_type(entry_type)
                    n = int(total_size / entry_size)  # NOTE(review): unused
                    o.set_line(ad)
                    o._data_prefix(entry_size)
                    k = 0
                    while k < total_size:
                        # Wrap long lines of array entries.
                        if o.curr_index > 70:
                            o._new_line()
                            o.set_line(ad)
                            o._address(ad)
                            o._data_prefix(entry_size)
                            l += 1
                        val = s.read_int(ad, entry_size)
                        if MEM_WOFFSET <= entry_type <= MEM_QOFFSET:
                            o._add(" ")
                            o._imm(val, entry_size, True, print_data=False,
                                   force_dont_print_data=True)
                        else:
                            o._word(val, entry_size, is_from_array=True)
                        ad += entry_size
                        k += entry_size
                        if k < total_size:
                            o._add(",")
                    o._new_line()

                else:
                    # Plain data word of a size derived from its type.
                    prefetch_after_branch = False
                    o._label_and_address(ad)
                    o.set_line(ad)
                    sz = self.mem.get_size_from_type(ty)
                    if ctx.gctx.print_bytes:
                        o._bytes(s.read(ad, sz))
                    o._word(s.read_int(ad, sz), sz)
                    o._new_line()
                    ad += sz

                l += 1

            s = self.binary.get_section(ad)

            if s is None:
                # Get the next section, it's not mandatory that sections
                # are consecutive!
                s = self.binary.get_next_section(ad)
                if s is None:
                    break
                o._new_line()
                ad = s.start

            if until != -1 and ad >= until:
                break
            if (l >= lines and until == -1) or (ad >= until and until != -1):
                break

            o.curr_section = s

        if until == ad:
            if self.mem.is_code(ad) and ad in self.xrefs or \
                    s is not None and ad == s.start:
                if not o.is_last_2_line_empty():
                    o._new_line()

        # remove the last empty line
        o.lines.pop(-1)
        o.token_lines.pop(-1)

        o.join_lines()
        return o

    def hexdump(self, ctx, lines):
        """Print a classic hexdump (16 bytes + ASCII gutter per row)
        starting at ctx.entry, for at most `lines` rows."""
        MAX_NB_BYTES = 16

        def print_line(ad, line):
            # Print one row: address, hex bytes, padding, printable chars.
            if not line:
                return
            print_no_end(color_addr(ad))
            for by in line:
                print_no_end("%.2x " % by)
            if len(line) != MAX_NB_BYTES:
                # Pad a short final row so the ASCII gutter stays aligned
                # ("%.2x " is 3 chars wide).
                # NOTE(review): pad literal reconstructed from a
                # whitespace-mangled source line — confirm width.
                print_no_end("   " * (MAX_NB_BYTES - len(line)))
            print_no_end("| ")
            for by in line:
                # Show printable bytes, but never CR/TAB/LF which would
                # break the row layout.
                if by in BYTES_PRINTABLE_SET and by != 13 and by != 9 and by != 10:
                    print_no_end("%c" % by)
                else:
                    print_no_end(".")
            print()

        ad = ctx.entry
        s = self.binary.get_section(ad)
        off = ad - s.start
        l = 0
        buf = []
        first_ad = ad
        while off < s.real_size and l < lines:
            buf.append(s.data[off])
            if len(buf) == MAX_NB_BYTES:
                l += 1
                print_line(first_ad, buf)
                buf.clear()
                # NOTE(review): next row is labelled s.start + off before
                # off is incremented — looks one byte short; confirm.
                first_ad = s.start + off
            off += 1
        # Flush the (possibly partial) last row.
        print_line(first_ad, buf)

    def print_functions(self, api):
        """Print every known function address with its (demangled) symbol
        name, then a total count."""
        total = 0
        lst = list(self.functions)
        lst.sort()
        # TODO: race condition with the analyzer ?
        for ad in lst:
            print_no_end(color_addr(ad))
            sy = api.get_symbol(ad)
            if ad in self.db.reverse_demangled:
                print_no_end(" %s (%s) " % (self.db.reverse_demangled[ad], color_comment(sy)))
            else:
                print_no_end(" " + sy)
            print()
            total += 1
        print("Total:", total)

    #
    # sym_filter : search a symbol, non case-sensitive
    #     if it starts with '-', it prints non-matching symbols
    #
    def print_symbols(self, sym_filter=None):
        """Print symbols (and their demangled form) matching sym_filter,
        case-insensitively; a leading '-' inverts the match."""
        if sym_filter is not None:
            sym_filter = sym_filter.lower()
            if sym_filter[0] == "-":
                invert_match = True
                sym_filter = sym_filter[1:]
            else:
                invert_match = False
        # NOTE: invert_match is undefined when sym_filter is None, but the
        # condition below short-circuits before reading it in that case.

        total = 0
        # TODO: race condition with the analyzer ?
        for sy in list(self.db.symbols):
            ad = self.db.symbols[sy]
            if ad in self.db.reverse_demangled:
                dem = self.db.reverse_demangled[ad]
            else:
                dem = None
            print_sym = True  # NOTE(review): unused
            # Match on the raw symbol or on its demangled form.
            if sym_filter is None or \
                    (invert_match and sym_filter not in sy.lower()) or \
                    (not invert_match and sym_filter in sy.lower()) or \
                    (dem is not None and
                     ((invert_match and sym_filter not in dem.lower()) or \
                      (not invert_match and sym_filter in dem.lower()))):
                if sy:
                    print_no_end(color_addr(ad))
                    if dem is not None:
                        print_no_end(" %s (%s) " % (dem, color_comment(sy)))
                    else:
                        print_no_end(" " + sy)
                    print()
                    total += 1
        print("Total:", total)

    def lazy_disasm(self, ad, stay_in_section=-1, s=None):
        """Return the capstone instruction at `ad`, or None when the
        address is outside any section or undecodable.  Decodes in blocks
        of 128 bytes and caches the results in self.capstone_inst."""
        s = self.binary.get_section(ad)
        if s is None:
            return None

        # if stay_in_section != -1 and s.start != stay_in_section:
            # return None, s

        if ad in self.capstone_inst:
            return self.capstone_inst[ad]

        # TODO: remove when it's too big ?
        if len(self.capstone_inst) > CAPSTONE_CACHE_SIZE:
            self.capstone_inst.clear()

        # Disassemble by block of N bytes
        N = 128
        d = s.read(ad, N)
        gen = self.md.disasm(d, ad)

        try:
            first = next(gen)
        except StopIteration:
            return None

        self.capstone_inst[first.address] = first
        # Cache the rest of the decoded block, stopping as soon as we
        # reach an address that is already cached.
        for i in gen:
            if i.address in self.capstone_inst:
                break
            self.capstone_inst[i.address] = i

        return first

    def __add_prefetch(self, addr_set, inst):
        """On MIPS, disassemble the delay-slot instruction following
        `inst`, record its address in addr_set and return it; on other
        architectures return None."""
        if self.is_mips:
            prefetch = self.lazy_disasm(inst.address + inst.size)
            addr_set.add(prefetch.address)
            return prefetch
        return None

    def is_noreturn(self, ad):
        """True when the function at `ad` is flagged noreturn in the
        functions table (entry may be None while analysis is pending)."""
        func_obj = self.functions[ad]
        if func_obj is None:
            return False
        return self.functions[ad][FUNC_FLAGS] & FUNC_FLAG_NORETURN

    # Generate a flow graph of the given function (addr)
    def get_graph(self, entry):
        """Build and return (graph, nb_new_symbols) for the function at
        `entry` by following jumps/calls from a worklist; returns
        (None, 0) when nothing could be disassembled."""
        ARCH_UTILS = self.load_arch_module().utils

        gph = Graph(self, entry)
        stack = [entry]
        start = time()
        prefetch = None
        addresses = set()

        # WARNING: this assume that on every architectures the jump
        # address is the last operand (operands[-1])

        # Here each instruction is a node. Blocks will be created in the
        # function __simplify.

        while stack:
            ad = stack.pop()
            inst = self.lazy_disasm(ad)

            if inst is None:
                # Remove all previous instructions which have a link
                # to this instruction.
                if ad in gph.link_in:
                    for i in gph.link_in[ad]:
                        gph.link_out[i].remove(ad)
                    for i in gph.link_in[ad]:
                        if not gph.link_out[i]:
                            del gph.link_out[i]
                    del gph.link_in[ad]
                continue

            if gph.exists(inst):
                continue

            addresses.add(ad)

            if ARCH_UTILS.is_ret(inst):
                prefetch = self.__add_prefetch(addresses, inst)
                gph.new_node(inst, prefetch, None)

            elif ARCH_UTILS.is_uncond_jump(inst):
                prefetch = self.__add_prefetch(addresses, inst)
                gph.uncond_jumps_set.add(ad)
                op = inst.operands[-1]
                if op.type == self.capstone.CS_OP_IMM:
                    nxt = unsigned(op.value.imm)
                    if nxt in self.functions:
                        # Tail-jump into another function: don't follow.
                        gph.new_node(inst, prefetch, None)
                    else:
                        stack.append(nxt)
                        gph.new_node(inst, prefetch, [nxt])
                else:
                    if inst.address in self.jmptables:
                        # Indirect jump resolved through a known jump table.
                        table = self.jmptables[inst.address].table
                        stack += table
                        gph.new_node(inst, prefetch, table)
                    else:
                        # Can't interpret jmp ADDR|reg
                        gph.new_node(inst, prefetch, None)

            elif ARCH_UTILS.is_cond_jump(inst):
                prefetch = self.__add_prefetch(addresses, inst)
                gph.cond_jumps_set.add(ad)
                op = inst.operands[-1]
                if op.type == self.capstone.CS_OP_IMM:
                    # Fall-through address accounts for the delay slot.
                    if prefetch is None:
                        direct_nxt = inst.address + inst.size
                    else:
                        direct_nxt = prefetch.address + prefetch.size

                    nxt_jmp = unsigned(op.value.imm)
                    stack.append(direct_nxt)

                    if nxt_jmp in self.functions:
                        gph.new_node(inst, prefetch, [direct_nxt])
                    else:
                        stack.append(nxt_jmp)
                        gph.new_node(inst, prefetch, [direct_nxt, nxt_jmp])
                else:
                    # Can't interpret jmp ADDR|reg
                    gph.new_node(inst, prefetch, None)

            else:
                if ad != entry and ARCH_UTILS.is_call(inst):
                    op = inst.operands[0]
                    if op.type == self.capstone.CS_OP_IMM:
                        imm = unsigned(op.value.imm)
                        # Calls to noreturn functions terminate the flow.
                        if imm in self.functions and self.is_noreturn(imm):
                            prefetch = self.__add_prefetch(addresses, inst)
                            gph.new_node(inst, prefetch, None)
                            continue

                nxt = inst.address + inst.size
                stack.append(nxt)
                gph.new_node(inst, None, [nxt])

        if len(gph.nodes) == 0:
            return None, 0

        if self.binary.type == T_BIN_PE:
            # Recover symbols for stripped PE binaries from the visited
            # addresses.
            nb_new_syms = self.binary.reverse_stripped_list(self, addresses)
        else:
            nb_new_syms = 0

        elapsed = time()
        elapsed = elapsed - start
        debug__("Graph built in %fs (%d instructions)" % (elapsed, len(gph.nodes)))

        return gph, nb_new_syms
def __init__(self, filename, raw_type, raw_base, raw_big_endian, database): import capstone as CAPSTONE arch_lookup = { "x86": CAPSTONE.CS_ARCH_X86, "x64": CAPSTONE.CS_ARCH_X86, "ARM": CAPSTONE.CS_ARCH_ARM, "MIPS32": CAPSTONE.CS_ARCH_MIPS, "MIPS64": CAPSTONE.CS_ARCH_MIPS, } mode_lookup = { "x86": CAPSTONE.CS_MODE_32, "x64": CAPSTONE.CS_MODE_64, "ARM": CAPSTONE.CS_ARCH_ARM, "MIPS32": CAPSTONE.CS_MODE_MIPS32, "MIPS64": CAPSTONE.CS_MODE_MIPS64, } word_size_lookup = { "x86": 4, "x64": 8, "ARM": 4, "MIPS32": 4, "MIPS64": 8, } self.capstone_inst = {} # capstone instruction cache self.db = database if database.loaded: self.mem = database.mem else: self.mem = Memory() database.mem = self.mem self.instanciate_binary(filename, raw_type, raw_base, raw_big_endian) if self.binary.arch not in ("x86", "x64", "MIPS32", "MIPS64", "ARM"): raise ExcArch(arch) self.wordsize = word_size_lookup.get(self.binary.arch, None) self.binary.wordsize = self.wordsize self.is_mips = self.binary.arch in ("MIPS32", "MIPS64") self.is_x86 = self.binary.arch in ("x86", "x64") self.is_arm = self.binary.arch in ("ARM") self.is_big_endian = self.binary.is_big_endian() self.binary.load_section_names() self.jmptables = database.jmptables self.user_inline_comments = database.user_inline_comments self.internal_inline_comments = database.internal_inline_comments self.user_previous_comments = database.user_previous_comments self.internal_previous_comments = database.internal_previous_comments self.functions = database.functions self.func_id = database.func_id self.end_functions = database.end_functions self.xrefs = database.xrefs self.mips_gp = database.mips_gp if not database.loaded: self.load_symbols() database.symbols = self.binary.symbols database.reverse_symbols = self.binary.reverse_symbols database.demangled = self.binary.demangled database.reverse_demangled = self.binary.reverse_demangled database.imports = self.binary.imports cs_arch = arch_lookup.get(self.binary.arch, None) cs_mode = 
mode_lookup.get(self.binary.arch, None) if self.is_big_endian: cs_mode |= CAPSTONE.CS_MODE_BIG_ENDIAN else: cs_mode |= CAPSTONE.CS_MODE_LITTLE_ENDIAN self.capstone = CAPSTONE self.md = CAPSTONE.Cs(cs_arch, cs_mode) self.md.detail = True for s in self.binary.iter_sections(): s.big_endian = cs_mode & CAPSTONE.CS_MODE_BIG_ENDIAN
def __load_memory(self, data): self.mem = Memory() self.mem.mm = data["mem"]