def add_file(self, filepath, offset):
    '''
    For the ELF file at filepath, adds a mapping to self and loads debug
    symbols into GDB.  offset is added to the start and end addresses for the
    mapping (offset is the offset that the ELF is loaded to)
    '''
    # Skip files already registered (keyed by basename only, so two files
    # with the same basename in different directories are treated as one).
    bn = os.path.basename(filepath)
    if bn in self._files:
        return
    elf_sects = read_elf_sects(filepath)
    sects = {}
    for sect, (start, size) in elf_sects.items():
        # Relocate each section by the ELF's load offset and record one
        # mapping entry per section.
        start += offset
        sects[sect] = start
        # offset=0 here is the file offset field of the mapping entry, not
        # the load offset passed to this method.
        ad = AttrDict(start=start, end=start + size, size=size, offset=0,
                      name=bn, sect=sect)
        self.append(ad)
    # add-symbol-file takes the .text address first, then -s <name> <addr>
    # for every other section.  NOTE(review): sects[".text"] raises KeyError
    # for an ELF with no .text section -- confirm callers only pass
    # executable images.
    cmd = "add-symbol-file '{}' {:#x} {}".format(
        filepath, sects[".text"],
        " ".join("-s {} {:#x}".format(nm, st)
                 for (nm, st) in sects.items() if nm != ".text"))
    gdb.execute(cmd, False, True)
    self._files.add(bn)
def __init__(self, backtrace_text, blacklist=None, major_depth=5):
    '''
    Parses a textual GDB backtrace (backtrace_text) into Frame objects and
    appends them to self.  Frames matching blacklist are marked blacklisted
    and excluded from hashing.  The first major_depth non-blacklisted frames
    feed the major hash; all non-blacklisted frames feed the minor hash.
    Sets self.hash to an AttrDict with "major" and "minor" hex digests.
    '''
    list.__init__(self)
    self.blacklist = blacklist
    hc = 0  # number of non-blacklisted frames hashed so far
    major = "0"
    minor = "0"
    self.abnormal_termination = False
    for line in backtrace_text.splitlines():
        # BUG FIX: the original tested '"#" == line[0]', which raises
        # IndexError on empty lines; startswith() handles them safely.
        if line.startswith("#"):
            self.append(Frame(line))
    for frame in self:
        # The check below is a workaround for a known libc/gdb runaway
        # backtrace issue, see
        # http://sourceware.org/ml/libc-alpha/2012-03/msg00573.html
        if frame.name() and "libc_start_main" in frame.name():
            break
        if not self._in_blacklist(frame):
            # Major hash is depth-limited; minor hash covers every
            # non-blacklisted frame.
            if hc < major_depth:
                major = hashlib.md5(
                    (major + str(hash(frame))).encode()).hexdigest()
            minor = hashlib.md5(
                (minor + str(hash(frame))).encode()).hexdigest()
            hc += 1
        else:
            frame.blacklisted = True
    self.hash = AttrDict(major=major, minor=minor)
def __init__(self, target, blocklist=None, major_depth=5, limit=0):
    '''
    Uses the GDB API to populate self. Any frames in blocklist are marked as
    such. The first non-blocklisted, major_depth frames are used to calculate
    the backtrace's major hash.

    limit, when non-zero, caps the number of frames walked.  Sets
    self.abnormal_termination when the unwinder reports an error or raises.
    '''
    list.__init__(self)
    self.blocklist = blocklist
    # NOTE(review): this initial call takes no arguments while the call in
    # the loop below passes (target, frame, i) -- confirm _next_frame
    # supports both arities.
    frame = self._next_frame()
    hc = 0  # number of non-blocklisted frames hashed so far
    i = 0   # total frames walked (for the limit check)
    major = "0"
    minor = "0"
    self.abnormal_termination = False
    while frame:
        frame = self._next_frame(target, frame, i)
        if not frame:
            break
        # The check below is a workaround for a known libc/gdb runaway
        # backtrace issue, see
        # http://sourceware.org/ml/libc-alpha/2012-03/msg00573.html
        if frame.name() and "libc_start_main" in frame.name():
            break
        if not self._in_blocklist(frame):
            # Major hash is depth-limited; minor hash covers every
            # non-blocklisted frame.
            if hc < major_depth:
                major = hashlib.md5(
                    (major + str(frame)).encode()).hexdigest()
            minor = hashlib.md5((minor + str(frame)).encode()).hexdigest()
            hc += 1
        else:
            frame.blocklisted = True
        self.append(frame)
        # some versions of the GDB Python API do not expose a frame unwind
        # error sentinel; if it is not available a hardcoded value based on
        # an enum from GDB's gdb/frames.h is used
        unwind_error = getattr(gdb, "FRAME_UNWIND_FIRST_ERROR", 3)
        if frame.unwind_stop_reason() >= unwind_error:
            self.abnormal_termination = True
            break
        try:
            frame = frame.older()
        except RuntimeError:
            # GDB raises RuntimeError when it cannot unwind further.
            self.abnormal_termination = True
            break
        i += 1
        # limit == 0 means unlimited.
        if limit and i >= limit:
            break
    self.hash = AttrDict(major=major, minor=minor)
def _add_by_target(self, num_initial_stacks=0): ''' Searches for thread stacks and adds corresponding ranges to self. Also sets self.tgt_img to the name associated with the first thread stack. The first num_initial_stacks sections in the target (such as a core file) are treated as thread stacks. This logic is currently only used by the ASan logic, which is used by a specific user and largely untested. ''' # Collect thread stack info mapstr = str(gdb.execute("info target", False, True)) ranges = [] fn = None for m in self._re_info_target.finditer(mapstr): if m.group("file") is not None: fn = os.path.basename(m.group("file")) else: lib = m.group("lib") if lib: lib = os.path.basename(lib) ranges.append((int(m.group("start"), 16), int(m.group("end"), 16), fn, lib, m.group("section"))) # Append thread stack info to self for i, (st, en, fn, lib, sect) in enumerate(ranges): if i < num_initial_stacks: # We could correlate thread stack pointer to one of the ranges, # but we'll just assume for now that for num_initial_stacks, # the first num_intial_stacks sections in the core are the stacks name = "[stack]" elif lib is not None: name = lib else: name = fn ad = AttrDict(start=st, end=en, size=(en - st), offset=0, name=name, sect=sect) self.append(ad) #gdb.write("{}: {:#x}-{:#x} {}".format(file, ad.start, ad.end, sect)) self._files.add(os.path.basename(fn)) if ranges: self.tgt_img = ranges[0][2]
def __init__(self):
    '''
    Queries the GDB Python API for the process address space, parses it, and
    appends it to self
    '''
    self._common_init()
    mapstr = str(gdb.execute("info proc map", False, True))
    # Everything before the "Start Addr" column header is preamble.
    header_pos = mapstr.find("Start Addr")
    if header_pos == -1:
        raise GdbWrapperError("Unable to parse \"info proc map\" string")
    # [1:] skips the header line itself.
    maplines = mapstr[header_pos:].splitlines()[1:]
    for line in maplines:
        line = line.split()
        # Columns: start, end, size, offset (all hex), then the mapping name
        # (which may itself contain spaces).
        start, end, size, offset = tuple(int(i, 16) for i in line[0:4])
        name = " ".join(line[4:])
        ad = AttrDict(start=start, end=end, size=size, offset=offset,
                      name=name)
        self.append(ad)
def __init__(self, mapstr):
    '''
    Parses a Linux /proc/<pid>/maps-style string (mapstr) and appends an
    AttrDict per mapping to self.

    (Docstring fix: the original claimed this queries the GDB Python API,
    but it only parses the string it is given.)
    '''
    for line in mapstr.splitlines():
        cols = line.strip().split()
        # Robustness fix: tolerate blank lines, which previously crashed
        # on cols[0].
        if not cols:
            continue
        # Format: "start-end perms offset dev inode [name with spaces]"
        start, end = [int(i, 16) for i in cols[0].split("-")]
        size = end - start
        perms = cols[1]
        offset = int(cols[2], 16)
        dev = cols[3]
        inode = cols[4]
        name = " ".join(cols[5:]).strip()
        ad = AttrDict(start=start, end=end, size=size, offset=offset,
                      name=name, perms=perms, dev=dev, inode=inode)
        self.append(ad)
def getRules(self, target):
    '''
    Organizes the nested list of rules (dicts) for classification

    The rules specified in rules.py are organized into AttrDicts ("rules").
    Each rule is composed of a tag and a match_function.  Returns the list
    of processed rules, ranked in declaration order.
    '''
    processed_rules = []
    num_rules = sum(len(rl) for (_, rl) in rules.rules)
    ranking = 1
    for cat, user_rule_list in rules.rules:
        for user_rule in user_rule_list:
            # getattr on the analyzer already yields a bound method; the
            # original wrapped it in a no-argument functools.partial, which
            # added nothing.
            match_function = getattr(target.analyzer,
                                     user_rule["match_function"])
            # Deep-copy so deleting match_function below does not mutate
            # the shared rules.rules structure.
            tag_data = copy.deepcopy(user_rule)
            del tag_data["match_function"]
            tag_data["ranking"] = (ranking, num_rules)
            tag_data["category"] = cat
            rule = AttrDict(matches=match_function, tag=Tag(tag_data))
            processed_rules.append(rule)
            ranking += 1
    return processed_rules
def __init__(self, target):
    '''
    Initializes an empty tag container.

    target is accepted but not stored here; presumably subclasses or later
    calls use it -- TODO confirm.
    '''
    AttrDict.__init__(self)
    # Tags accumulated during classification.
    self.tags = []
class Target(object):
    '''
    A wrapper for a Linux GDB Inferior. Includes of various convenience
    methods used for classification.

    WARNING: Methods in this object may change the state of GDB. For example,
    the disassembly flavor may be left as "intel" after this code is executed.
    '''
    _re_info_frame = re.compile(
        r"""^\s*eip\s=\s([^\s;]*)(?:\sin\s)?   # addr
            ([^\s;]*)                          # fname
            ([^\s;]*)                          # source_file:line
        """, re.VERBOSE)
    _re_gdb_info_sym = re.compile(
        r"""^\s*(?P<sym>.*?)\s+\+\s+(?P<off>[0-9]+)\s+
            in\s+section\s+\.text(\s+
            of\s+(?P<lib>.*?)\s*)?$""", re.VERBOSE)
    _re_gdb_addr_bit = re.compile(r"^gdbarch_dump: addr_bit = ([0-9]+)$",
                                  re.MULTILINE)
    _re_gdb_osabi = re.compile(r"\(currently \"(.*)\"\)")
    _re_gdb_arch = re.compile(r"\(currently\s+(.+)\)")

    # these functions and libs are not considered to be at fault for a crash
    blocklist = AttrDict(
        functions=("__kernel_vsyscall", "abort", "raise", "malloc", "free",
                   "*__GI_abort", "*__GI_raise", "malloc_printerr",
                   "__libc_message", "_int_malloc", "_int_free"),
        map_regex=re.compile(r".*/libc(\.|-).*|.*/libm(\.|-).*"))

    def __init__(self, bt_limit=0):
        '''bt_limit caps the backtrace length; 0 means unlimited.'''
        self._check_inferior_state()
        self.bt_limit = bt_limit

    def _check_inferior_state(self):
        '''Raises GdbWrapperError unless there is exactly one inferior with
        at least one thread whose primary thread is stopped.'''
        if len(gdb.inferiors()) != 1:
            raise GdbWrapperError(
                "Unsupported number of inferiors ({})".format(
                    len(gdb.inferiors())))
        if len(gdb.inferiors()[0].threads()) == 0:
            raise GdbWrapperError("No threads running")
        if not gdb.inferiors()[0].threads()[0].is_stopped:
            raise GdbWrapperError("Inferior's primary thread is not stopped")

    @memoized
    def backtrace(self):
        return Backtrace(self, self.blocklist, limit=self.bt_limit)

    def hash(self):
        '''Major/minor crash hash of the backtrace.'''
        return self.backtrace().hash

    @memoized
    def procmaps(self):
        return ProcMaps()

    @memoized
    def faulting_frame(self):
        '''Returns the first non-blocklisted frame, or None if every frame
        is blocklisted.'''
        for frame in self.backtrace():
            if not frame.blocklisted:
                return frame
        warnings.warn("All frames blocklisted")
        return None

    @staticmethod
    def sym_addr(sym):
        '''Address of symbol sym, or None if GDB cannot evaluate it.'''
        try:
            return gdb_uint(gdb.parse_and_eval(str(sym)))
        except gdb.error:
            return None

    @memoized
    def current_instruction(self):
        '''Disassembles and returns the instruction at $pc, or None if GDB
        cannot read/disassemble that address.'''
        try:
            gdbstr = gdb.execute("x/i 0x%x" % self.pc(), False,
                                 True).splitlines()[0]
            return self._getInstruction(gdbstr)
        except RuntimeError:
            return None

    def _getInstruction(self, gdbstr):
        return x86Instruction(gdbstr)

    @memoized
    def pc(self):
        return gdb_uint(gdb.parse_and_eval("$pc"))

    @memoized
    def stack_pointer(self):
        return gdb_uint(gdb.parse_and_eval("$sp"))

    @memoized
    def pid(self):
        return gdb.inferiors()[0].pid

    @memoized
    def pointer_size(self):
        '''Pointer size in bytes, derived from gdbarch's addr_bit.

        BUG FIX: the original used true division ("/ 8"), which returns a
        float under Python 3; floor division keeps this an int.
        '''
        return int(
            self._re_gdb_addr_bit.search(
                gdb.execute("maint print architecture", False,
                            True)).group(1)) // 8

    @memoized
    def si_signo(self):
        # This is a workaround to a bug in the GDB Python API:
        # The only reliable way to cause GDB to raise an exception when
        # $_siginfo is not available it to call __str__() -- otherwise
        # (such as when casting the Gdb.Value to another type), GDB may
        # force Python to abruptly exit rather than raising an exception
        signo = gdb.parse_and_eval("$_siginfo.si_signo")
        str(signo)
        return signo

    @memoized
    def si_addr(self):
        # str() call: same $_siginfo availability workaround as si_signo().
        str(gdb.parse_and_eval("$_siginfo._sifields._sigfault.si_addr"))
        return gdb_uint(
            gdb.parse_and_eval("$_siginfo._sifields._sigfault.si_addr"))
def __init__(self, asan_output, bt_limit=0):
    '''
    Parses and symbolizes an AddressSanitizer report (asan_output), writing
    the symbolized report back through gdb, recording the ASan stack in
    self.asan_stack and seeding self.__memo__ with values (pc, sp, si_addr,
    si_signo) extracted from the report.  bt_limit caps the number of ASan
    frames processed; 0 means unlimited.  Raises GdbWrapperError when
    asan_output is empty or contains no frames.
    '''
    self.__memo__ = {
        "isPossibleStackCorruption()": False,
        "isStackCorruption()": False,
        "isStackOverflow()": False,
        "si_signo()": 11
    }
    if not asan_output:
        raise GdbWrapperError("no ASan data to analyze")
    # symbolize asan_message
    self.asan_stack = []
    out = []          # report text fragments; None placeholders are filled
                      # with symbolized frame text in the second pass
    last = 0
    all_frames = []
    maps = self.procmaps()
    i = 0
    for m in self._re_asan_bt.finditer(asan_output):
        frame, addr, img, offset = m.group("frame", "addr", "img", "offset")
        frame = int(frame)
        addr = int(addr, 16)  # + 1
        if img:
            # Load symbols for the image this frame resides in.
            maps.add_file(img, addr - offset)
        out.append(asan_output[last:m.end("all")])
        all_frames.append((frame, addr, offset, img, len(out)))
        out.append(None)
        last = m.end()
        i += 1
        # BUG FIX: the original tested "i >= bt_limit" unconditionally, so
        # the default bt_limit=0 stopped after the first frame; 0 means
        # unlimited (consistent with Backtrace's limit handling).
        if bt_limit and i >= bt_limit:
            break
    if not all_frames:
        raise GdbWrapperError("No frames found in address sanitizer log")
    out.append(asan_output[last:])
    frame = -1  # last frame number seen; None once numbering breaks
    for num, addr, offset, img, outpos in all_frames:
        region = maps.findByAddr(addr)
        symbol = gdb.execute("info symbol {:#x}".format(addr), False, True)
        symline = gdb.execute("info line *{:#x}".format(addr), False, True)
        if symline and symline.startswith("Line"):
            symline = "\n\t{}".format(
                self._re_symline_trim.sub("", symline))
        else:
            symline = ""
        symbol_m = self._re_gdb_info_sym.search(symbol)
        if img:
            lib = img
        elif region:
            lib = region.name
        else:
            lib = None
        if symbol_m is None:
            sym = None
            off = offset
        else:
            sym = symbol_m.group("sym")
            off = int(symbol_m.group("off"))
        if frame == -1:
            # First frame: remember the faulting image and offset.
            self.asan_pc_img = lib, offset
        if frame is not None and num > frame:
            frame = num
            if lib:
                lib = os.path.basename(lib)
            self.asan_stack.append(
                AttrDict(addr=addr, lib=lib, off=off, name=sym))
        else:
            # Non-increasing frame number: stop collecting stack entries
            # (but keep symbolizing the remaining report text).
            frame = None
        out[outpos] = "{}){}".format(
            ASanFrame.create(self, addr, sym, off).terse(), symline)
    asan_output = "".join(out)
    gdb.write(asan_output)
    gdb.flush()
    # parse ASAN's analysis
    m = self._re_asan_fault.search(asan_output)
    self.__memo__["si_addr()"] = int(m.group("fault"), 16)
    self.asan_reason = m.group("desc")
    if self.asan_reason == "double-free":
        self.__memo__["pc()"] = self.asan_stack[1].addr
        self.__memo__["stack_pointer()"] = None  # what to do? ....
    else:
        self.__memo__["pc()"] = int(m.group("pc"), 16)
        if m.group("bspid1") == "sp":
            self.__memo__["stack_pointer()"] = int(m.group("bsp1"), 16)
        else:
            self.__memo__["stack_pointer()"] = int(m.group("bsp2"), 16)
        if self.asan_reason != "SEGV":
            self.asan_operation = m.group("operation")
class Target(object):
    '''
    A wrapper for a Linux GDB Inferior. Includes of various convenience
    methods used for classification.

    This variant reads saved crash artifacts (disassembly, stacktrace,
    registers, proc maps, metadata) from a bug directory instead of a live
    inferior.

    WARNING: Methods in this object may change the state of GDB. For example,
    the disassembly flavor may be left as "intel" after this code is executed.
    '''
    _re_info_frame = re.compile(
        r"""^\s*eip\s=\s([^\s;]*)(?:\sin\s)?   # addr
            ([^\s;]*)                          # fname
            ([^\s;]*)                          # source_file:line
        """, re.VERBOSE)
    _re_gdb_info_sym = re.compile(
        r"""^\s*(?P<sym>.*?)\s+\+\s+(?P<off>[0-9]+)\s+
            in\s+section\s+\.text(\s+
            of\s+(?P<lib>.*?)\s*)?$""", re.VERBOSE)
    _re_gdb_addr_bit = re.compile(r"^gdbarch_dump: addr_bit = ([0-9]+)$",
                                  re.MULTILINE)
    _re_gdb_osabi = re.compile(r"\(currently \"(.*)\"\)")
    _re_gdb_arch = re.compile(r"\(currently\s+(.+)\)")
    # Continuation lines of a wrapped disassembled instruction do not start
    # with a hex address.
    _re_hex_int = re.compile(r"^(0x[A-Fa-f0-9]+).*$")

    # these functions and libs are not considered to be at fault for a crash
    # (FIX: the original tuple listed many entries twice; duplicates removed,
    # membership semantics unchanged)
    blacklist = AttrDict(
        functions=("__kernel_vsyscall", "abort", "raise", "malloc", "free",
                   "*__GI_abort", "*__GI_raise", "malloc_printerr",
                   "__libc_message", "_int_malloc", "_int_free",
                   "malloc_consolidate", "__libc_calloc", "_dl_new_object",
                   "_dl_map_object_from_fd", "_dl_catch_error", "_dl_open",
                   "do_dlopen", "dlerror_run", "*__GI___libc_dlopen_mode",
                   "_dl_map_object", "dl_open_worker", "munmap_chunk",
                   "*__GI___backtrace", "_dl_addr_inside_object",
                   "*__GI___libc_free", "__malloc_assert", "sYSMALLOc",
                   "_int_realloc", "*__GI___libc_malloc",
                   "*__GI___libc_realloc", "_int_memalign",
                   "*__GI___libc_memalign", "__posix_memalign",
                   "__libc_malloc", "__libc_realloc"),
        map_regex=re.compile(r".*/libc(\.|-).*|.*/libm(\.|-).*"))

    def __init__(self, bug_dirpath):
        '''Loads the saved crash artifacts from bug_dirpath.'''
        basis = [('disassembly_text', '/Disassembly.txt'),
                 ('stacktrace_text', '/Stacktrace.txt'),
                 ('registers_text', '/Registers.txt'),
                 ('procmap_text', '/ProcMaps.txt')]
        for k, v in basis:
            # FIX: close file handles (originally leaked via open().read()).
            with open(bug_dirpath + v, "rt") as f:
                setattr(self, k, f.read())
        with open("%s/vulture.json" % bug_dirpath, "rt") as f:
            self.metadata = json.load(f)

    @memoized
    def arch(self):
        '''"x64" or "x86", inferred from the saved register names.'''
        if "rip" in self.registers():
            return "x64"
        elif "eip" in self.registers():
            return "x86"
        else:
            raise NotImplementedError("Unknown arch: rip/eip not found")

    @memoized
    def registers(self):
        '''Parses Registers.txt ("name 0xvalue" per line) into a dict.'''
        regs = {}
        for line in self.registers_text.splitlines():
            parts = line.strip().split()
            # Robustness fix: skip blank/malformed lines instead of raising
            # IndexError.
            if len(parts) < 2:
                continue
            name = parts[0]
            val = int(parts[1], 16)
            regs[name] = val
        return regs

    @memoized
    def backtrace(self):
        return Backtrace(self.stacktrace_text, self.blacklist)

    def hash(self):
        '''Major/minor crash hash of the backtrace.'''
        return self.backtrace().hash

    @memoized
    def procmaps(self):
        return ProcMaps(self.procmap_text)

    @memoized
    def faulting_frame(self):
        '''Returns the first non-blacklisted frame, or None if every frame
        is blacklisted.'''
        for frame in self.backtrace():
            if not frame.blacklisted:
                return frame
        warnings.warn("All frames blacklisted")
        return None

    @staticmethod
    def sym_addr(sym):
        '''Address of symbol sym, or None if GDB cannot evaluate it.'''
        try:
            return gdb_uint(gdb.parse_and_eval(str(sym)))
        except gdb.error:
            return None

    @memoized
    def current_instruction(self):
        '''Extracts the faulting instruction (the "=>" line, or the first
        line as a fallback) from the saved disassembly, joining wrapped
        continuation lines.

        FIXES: removed a redundant function-local "import re" (re is already
        used at class scope), call .match() on the compiled pattern directly,
        and bound the continuation-line scan so an instruction wrapped at the
        end of the file no longer raises IndexError.
        '''
        lines = [l.strip() for l in self.disassembly_text.splitlines()]
        inst = None
        for i, line in enumerate(lines):
            if "=>" in line:
                inst = line
                # handle long instructions that break across lines
                j = i + 1
                while j < len(lines) and not self._re_hex_int.match(lines[j]):
                    inst += lines[j]
                    j += 1
                break
        # if no "=>", then try the first line
        if not inst:
            inst = lines[0]
            # handle long instructions that break across lines
            for cont in lines[1:]:
                if self._re_hex_int.match(cont):
                    break
                inst += cont
        return x86Instruction(inst, self)

    @memoized
    def pc(self):
        if self.arch() == "x86":
            return self.registers()['eip']
        else:
            return self.registers()['rip']

    @memoized
    def stack_pointer(self):
        if self.arch() == "x86":
            return self.registers()['esp']
        else:
            return self.registers()['rsp']

    @memoized
    def counter(self):
        if self.arch() == "x86":
            return self.registers()['ecx']
        else:
            return self.registers()['rcx']

    @memoized
    def pid(self):
        # NOTE(review): this queries the live GDB inferior even though the
        # rest of this class works from saved files -- looks like a
        # copy/paste leftover from the live-inferior Target; confirm.
        return gdb.inferiors()[0].pid

    @memoized
    def pointer_size(self):
        '''Pointer size in bytes for the detected architecture.'''
        if self.arch() == "x86":
            return 4
        elif self.arch() == "x64":
            return 8
        raise NotImplementedError("unsupported arch")

    @memoized
    def signal(self):
        return self.metadata['sigtext']

    @memoized
    def si_addr(self):
        # NOTE(review): also queries live GDB state rather than the saved
        # artifacts; str() call is the $_siginfo availability workaround
        # used by the live-inferior Target -- confirm this path is reachable
        # offline.
        str(gdb.parse_and_eval("$_siginfo._sifields._sigfault.si_addr"))
        return gdb_uint(
            gdb.parse_and_eval("$_siginfo._sifields._sigfault.si_addr"))