import gzip
import os
from collections import defaultdict
from typing import DefaultDict, Iterable, Optional, Set

# The IDA modules (ida, idautils, ida_lines) and the project-local types
# (TypeLib, TypeLibCodec, TypeInfo, Location, Stack, Register, Variable,
# AST, Function, CollectedFunction) are assumed importable from the
# surrounding plugin package.


class Collector(ida.action_handler_t):
    """Generic class to collect information from a binary"""

    def __init__(self):
        # Load the type library
        self.type_lib_file_name = os.path.join(
            os.environ["OUTPUT_DIR"],
            "types",
            os.environ["PREFIX"] + ".json.gz",
        )
        try:
            with gzip.open(self.type_lib_file_name, "rt") as type_lib_file:
                self.type_lib = TypeLibCodec.decode(type_lib_file.read())
        except Exception as e:
            print(e)
            print("Could not find type library, creating a new one")
            self.type_lib = TypeLib()
        super().__init__()

    def write_type_lib(self) -> None:
        """Dumps the type library to `self.type_lib_file_name`, derived from
        the environment variables `OUTPUT_DIR` and `PREFIX`.
        """
        with gzip.open(self.type_lib_file_name, "wt") as type_lib_file:
            encoded = TypeLibCodec.encode(self.type_lib)
            type_lib_file.write(encoded)
            type_lib_file.flush()

    def collect_variables(
        self,
        frsize: int,
        stkoff_delta: int,
        variables: Iterable[ida.lvar_t],
    ) -> DefaultDict[Location, Set[Variable]]:
        """Collects Variables from a list of lvar_ts and adds their types to
        the type library."""
        collected_vars: DefaultDict[Location, Set[Variable]] = defaultdict(set)
        for v in variables:
            if v.name == "" or not v.type():
                continue
            # Add all types to the typelib
            self.type_lib.add_ida_type(v.type())
            typ: TypeInfo = TypeLib.parse_ida_type(v.type())

            loc: Optional[Location] = None
            if v.is_stk_var():
                # Normalize the raw stack offset to be relative to the frame size
                corrected = v.get_stkoff() - stkoff_delta
                offset = frsize - corrected
                loc = Stack(offset)
            if v.is_reg_var():
                loc = Register(v.get_reg1())
            if loc is not None:
                collected_vars[loc].add(
                    Variable(typ=typ, name=v.name, user=v.has_user_info)
                )
        return collected_vars

    def activate(self, ctx) -> int:
        """Runs the collector"""
        raise NotImplementedError
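# Usage sketch (hypothetical, not part of the class above): a concrete
# subclass supplies `activate`, and collection is typically driven headlessly
# in IDA batch mode, e.g. `idat64 -B -S"run_collector.py" <binary>`.
# `MyCollector` is a stand-in name for such a subclass.
import ida_auto
import ida_pro

ida_auto.auto_wait()       # let IDA's auto-analysis finish first
collector = MyCollector()  # hypothetical Collector subclass implementing activate()
collector.activate(None)   # ctx is unused by the collectors shown here
ida_pro.qexit(0)           # end the batch session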
def activate(self, ctx) -> int:
    """Collects types, user-defined variables, and their locations, in
    addition to the AST and raw code.
    """
    print("Collecting vars and types.")
    for ea in (ea for ea in idautils.Functions()
               if ea in self.debug_functions):
        # Decompile
        f = ida.get_func(ea)
        cfunc = None
        try:
            cfunc = ida.decompile(f)
        except ida.DecompilationFailure:
            continue
        if cfunc is None:
            continue

        # Function info
        name: str = ida.get_func_name(ea)

        self.type_lib.add_ida_type(cfunc.type.get_rettype())
        return_type = TypeLib.parse_ida_type(cfunc.type.get_rettype())

        arguments = self.collect_variables(
            f.frsize, cfunc.get_stkoff_delta(), cfunc.arguments
        )
        local_vars = self.collect_variables(
            f.frsize,
            cfunc.get_stkoff_delta(),
            [v for v in cfunc.get_lvars() if not v.is_arg_var],
        )
        raw_code = ""
        for line in cfunc.get_pseudocode():
            raw_code += f"{' '.join(ida_lines.tag_remove(line.line).split())}\n"
        ast = AST(function=cfunc)
        decompiler = Function(
            ast=ast,
            name=name,
            return_type=return_type,
            arguments=arguments,
            local_vars=local_vars,
            raw_code=raw_code,
        )
        self.functions.append(
            CollectedFunction(
                ea=ea,
                debug=self.debug_functions[ea],
                decompiler=decompiler,
            )
        )
    self.write_info()
    return 1
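# What the raw_code loop above does, in isolation: ida_lines.tag_remove strips
# Hex-Rays color tags from a pseudocode line, and " ".join(text.split())
# collapses runs of whitespace. A minimal illustration on a plain string
# (no color tags), with a hypothetical input:
line_text = "  v1  =  a1 + 2;   "
normalized = " ".join(line_text.split())  # -> "v1 = a1 + 2;"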
def from_item(cls, item: ida.carg_t, ast: "AST") -> "Call.Arg":  # type: ignore
    node_id = ast.next_id()
    is_vararg = item.is_vararg
    idx = None
    name = None
    if item.v:
        idx = item.v.idx
        assert ast.function is not None
        name = ast.function.lvars[idx].name
    formal_type = TypeLib.parse_ida_type(item.formal_type)
    return cls(
        node_id=node_id,
        is_vararg=is_vararg,
        idx=idx,
        name=name,
        formal_type=formal_type,
    )
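# Hypothetical call site for Call.Arg.from_item: while walking a decompiled
# function's ctree, each argument of a call expression (cexpr_t.a is the
# carg_list_t holding carg_t items) is lowered into a Call.Arg node.
# `call_expr` and `ast` are assumed to come from the surrounding AST builder.
def lower_call_args(call_expr, ast):
    return [Call.Arg.from_item(arg, ast) for arg in call_expr.a]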
def type_dumper(args):
    tgt_folder, fname = args
    typelib = TypeLib()
    with open(fname, "r") as f:
        for line in f:
            e = Example.from_json(json.loads(line))
            for var in e.target.values():
                typelib.add(var.typ)
    typelib.sort()
    with open(os.path.join(tgt_folder, "types",
                           fname.split("/")[-1]), "w") as type_lib_file:
        encoded = TypeLibCodec.encode(typelib)
        type_lib_file.write(encoded)
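# Standalone usage sketch for type_dumper (paths hypothetical): it takes a
# single (target_folder, shard_file) tuple so that it can also be mapped over
# a multiprocessing.Pool, as main() below does.
type_dumper(("/data/preprocessed", "/data/preprocessed/files/example.jsonl"))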
def activate(self, ctx) -> int:
    """Collects types, user-defined variables, and their locations"""
    print("Collecting vars and types.")
    # `ea` is the start address of a single function
    for ea in idautils.Functions():
        # Decompile
        f = ida.get_func(ea)
        cfunc = None
        try:
            cfunc = ida.decompile(f)
        except ida.DecompilationFailure:
            continue
        if cfunc is None:
            continue

        # Function info
        name: str = ida.get_func_name(ea)

        self.type_lib.add_ida_type(cfunc.type.get_rettype())
        return_type = TypeLib.parse_ida_type(cfunc.type.get_rettype())

        arguments = self.collect_variables(
            f.frsize, cfunc.get_stkoff_delta(), cfunc.arguments
        )
        local_vars = self.collect_variables(
            f.frsize,
            cfunc.get_stkoff_delta(),
            [v for v in cfunc.get_lvars() if not v.is_arg_var],
        )
        self.functions[ea] = Function(
            name=name,
            return_type=return_type,
            arguments=arguments,
            local_vars=local_vars,
        )
    self.write_type_lib()
    self.write_functions()
    return 1
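# How the two collection passes fit together (a sketch using names from the
# snippets above): this pass stores debug-info Function objects keyed by entry
# address (self.functions[ea]), and the decompiler pass joins on the same ea
# to build CollectedFunction records. `debug_functions` and `decomp_functions`
# below are hypothetical pre-loaded dicts standing in for the two passes'
# outputs.
collected = [
    CollectedFunction(ea=ea, debug=debug_functions[ea],
                      decompiler=decomp_functions[ea])
    for ea in debug_functions.keys() & decomp_functions.keys()
]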
def from_item(cls, item: ida.cexpr_t, ast: "AST") -> "Type":  # type: ignore
    node_id = ast.next_id()
    return cls(node_id=node_id, typ=TypeLib.parse_ida_type(item.type))
import glob
import json
import multiprocessing
import os
import random
import shutil
import tarfile

import numpy as np
from tqdm import tqdm

# json_line_reader, example_generator, Example, TypeLib, and TypeLibCodec are
# assumed to come from the surrounding preprocessing package.


def main(args):
    np.random.seed(1234)
    random.seed(1992)
    tgt_folder = args["TARGET_FOLDER"]
    input_folder = args["INPUT_FOLDER"]
    input_fnames_file = args["INPUT_FNAMES"]
    input_fnames = []
    max_files = int(args["--max"])
    with open(input_fnames_file) as f:
        for s in f:
            s = s.strip()
            if s.endswith(".gz"):
                input_fnames.append(s)
                if len(input_fnames) >= max_files:
                    break
    shard_size = int(args["--shard-size"])

    if os.path.exists(tgt_folder):
        op = input(f"{tgt_folder} exists. remove? (y/n) ")
        if op == "y":
            shutil.rmtree(tgt_folder)

    os.system(f"mkdir -p {tgt_folder}")
    os.system(f"mkdir -p {tgt_folder}/files")
    os.system(f"mkdir -p {tgt_folder}/types")

    num_workers = 16
    valid_example_count = 0
    all_functions = dict()  # maps file name -> {function name: canonical code}
    print("loading examples")
    with multiprocessing.Pool(num_workers) as pool:
        json_iter = pool.imap(
            json_line_reader,
            ((input_folder, fname) for fname in input_fnames),
            chunksize=64,
        )
        example_iter = pool.imap(example_generator, json_iter, chunksize=64)
        for examples in tqdm(example_iter):
            if not examples:
                continue
            json_file_name = examples[0].binary_file["file_name"].split("/")[-1]
            with open(os.path.join(tgt_folder, "files/", json_file_name), "w") as f:
                for example in examples:
                    f.write(json.dumps(example.to_json()) + "\n")
                    all_functions.setdefault(
                        json_file_name, dict()
                    )[example.name] = example.canonical_code
            valid_example_count += len(examples)
    print("valid examples: ", valid_example_count)

    cur_dir = os.getcwd()
    all_files = glob.glob(os.path.join(tgt_folder, "files/*.jsonl"))
    file_prefix = os.path.join(tgt_folder, "files/")
    all_files = sorted(all_files)  # sort all files by name
    file_num = len(all_files)
    print("Total valid binary file num: ", file_num)

    test_file = args["--test-file"]
    if test_file:
        print(f"using test file {test_file}")
        with tarfile.open(test_file, "r") as f:
            test_files = [
                os.path.join(file_prefix, x.name.split("/")[-1])
                for x in f.getmembers()
                if x.name.endswith(".jsonl")
            ]
        dev_file_num = 0
    else:
        print("randomly sampling test files")
        test_file_num = int(file_num * 0.1)
        dev_file_num = int(file_num * 0.1)
        test_files = list(
            np.random.choice(all_files, size=test_file_num, replace=False)
        )

    test_files_set = set(test_files)
    train_files = [fname for fname in all_files if fname not in test_files_set]
    if dev_file_num == 0:
        dev_file_num = int(len(train_files) * 0.1)
    np.random.shuffle(train_files)
    dev_files = train_files[-dev_file_num:]
    train_files = train_files[:-dev_file_num]

    # Create types from filtered training set
    with multiprocessing.Pool(num_workers) as pool:
        pool.map(
            type_dumper,
            ((tgt_folder, fname) for fname in train_files),
            chunksize=64,
        )

    print("reading typelib")
    typelib = TypeLib()
    for fname in tqdm(train_files):
        fname = os.path.basename(fname)
        fname = fname[:fname.index(".")] + ".jsonl"
        typelib.add_json_file(os.path.join(tgt_folder, "types", fname))
    typelib.prune(5)
    typelib.sort()
    print("dumping typelib")
    with open(os.path.join(tgt_folder, "typelib.json"), "w") as type_lib_file:
        encoded = TypeLibCodec.encode(typelib)
        type_lib_file.write(encoded)

    train_functions = dict()
    for train_file in train_files:
        file_name = train_file.split("/")[-1]
        for func_name, func in all_functions[file_name].items():
            train_functions.setdefault(func_name, set()).add(func)

    print(
        f"number training: {len(train_files)}",
        f"number dev: {len(dev_files)}",
        f"number test: {len(test_files)}",
        sep=", ",
    )

    print("dump training files")
    shards = [
        train_files[i:i + shard_size]
        for i in range(0, len(train_files), shard_size)
    ]
    for shard_id, shard_files in enumerate(shards):
        print(f"Preparing shard {shard_id}, {len(shard_files)} files:")
        with open(os.path.join(tgt_folder, "file_list.txt"), "w") as f:
            for file_name in shard_files:
                f.write(file_name.split("/")[-1] + "\n")
        os.chdir(os.path.join(tgt_folder, "files"))
        print("creating tar file...")
        os.system(f"tar cf ../train-shard-{shard_id}.tar -T ../file_list.txt")
        os.chdir(cur_dir)

    def _dump_dev_file(tgt_file_name, file_names):
        with open(os.path.join(tgt_folder, "file_list.txt"), "w") as f:
            for file_name in file_names:
                last_file_name = file_name.split("/")[-1]
                f.write(last_file_name + "\n")
                with open(file_name) as fr:
                    all_lines = fr.readlines()
                replace_lines = []
                for line in all_lines:
                    json_dict = json.loads(line.strip())
                    func_name = json_dict["name"]
                    canonical_code = all_functions[last_file_name][func_name]
                    func_name_in_train = False
                    func_body_in_train = False
                    if func_name in train_functions:
                        func_name_in_train = True
                        if canonical_code in train_functions[func_name]:
                            func_body_in_train = True
                    json_dict["test_meta"] = dict(
                        function_name_in_train=func_name_in_train,
                        function_body_in_train=func_body_in_train,
                    )
                    new_json_str = json.dumps(json_dict)
                    replace_lines.append(new_json_str.strip())
                with open(file_name, "w") as fw:
                    for line in replace_lines:
                        fw.write(line + "\n")
        os.chdir(os.path.join(tgt_folder, "files"))
        print("creating tar file...")
        os.system(f"tar cf ../{tgt_file_name} -T ../file_list.txt")
        os.chdir(cur_dir)

    print("dump dev files")
    _dump_dev_file("dev.tar", dev_files)
    print("dump test files")
    _dump_dev_file("test.tar", test_files)
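# Hypothetical entry point. The args dict above is indexed with docopt-style
# keys ("TARGET_FOLDER", "--max", "--shard-size", "--test-file"), so a
# matching command-line interface might look like this; option names and
# defaults here are assumptions, not the project's actual CLI.
_USAGE = """
Usage:
    preprocess.py [options] INPUT_FOLDER INPUT_FNAMES TARGET_FOLDER

Options:
    --max=<n>         Maximum number of input files to read [default: 1000000]
    --shard-size=<n>  Number of files per training shard [default: 3000]
    --test-file=<f>   Existing tar of test files to reuse for the split
"""

if __name__ == "__main__":
    from docopt import docopt
    main(docopt(_USAGE))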