Example #1
0
class Collector(ida.action_handler_t):
    """Base action handler that gathers variable and type information.

    Construction loads an existing type library from disk or, failing that,
    starts with an empty one. Subclasses are expected to override
    `activate` to perform the actual collection.
    """

    def __init__(self):
        # The library is stored at $OUTPUT_DIR/types/$PREFIX.json.gz
        output_dir = os.environ["OUTPUT_DIR"]
        archive_name = os.environ["PREFIX"] + ".json.gz"
        self.type_lib_file_name = os.path.join(output_dir, "types",
                                               archive_name)
        try:
            with gzip.open(self.type_lib_file_name, "rt") as stream:
                self.type_lib = TypeLibCodec.decode(stream.read())
        except Exception as err:
            # Any failure while opening or decoding falls back to a new,
            # empty library; the error is only reported on stdout.
            print(err)
            print("Could not find type library, creating a new one")
            self.type_lib = TypeLib()
        super().__init__()

    def write_type_lib(self) -> None:
        """Encode the type library and write it, gzip-compressed, to
        `self.type_lib_file_name`.
        """
        with gzip.open(self.type_lib_file_name, "wt") as stream:
            stream.write(TypeLibCodec.encode(self.type_lib))
            stream.flush()

    def collect_variables(
        self,
        frsize: int,
        stkoff_delta: int,
        variables: Iterable[ida.lvar_t],
    ) -> DefaultDict[Location, Set[Variable]]:
        """Group the given local variables by location and register their types
        in the type library.

        Variables with an empty name or a falsy type are skipped, as are
        variables that are neither stack nor register variables.
        """
        by_location: DefaultDict[Location, Set[Variable]] = defaultdict(set)
        for lvar in variables:
            if not (lvar.name != "" and lvar.type()):
                continue
            # Every type that is seen goes into the type library
            self.type_lib.add_ida_type(lvar.type())
            parsed: TypeInfo = TypeLib.parse_ida_type(lvar.type())

            where: Optional[Location] = None
            if lvar.is_stk_var():
                # Stack slot: frame size minus the delta-corrected offset
                where = Stack(frsize - (lvar.get_stkoff() - stkoff_delta))
            if lvar.is_reg_var():
                where = Register(lvar.get_reg1())
            if where is None:
                continue
            by_location[where].add(
                Variable(typ=parsed, name=lvar.name, user=lvar.has_user_info))
        return by_location

    def activate(self, ctx) -> int:
        """Entry point of the collector; must be provided by subclasses."""
        raise NotImplementedError
Example #2
0
 def __init__(self):
     """Load the type library from disk, or start an empty one on failure."""
     # The library is stored at $OUTPUT_DIR/types/$PREFIX.json.gz
     output_dir = os.environ["OUTPUT_DIR"]
     archive_name = os.environ["PREFIX"] + ".json.gz"
     self.type_lib_file_name = os.path.join(output_dir, "types", archive_name)
     try:
         with gzip.open(self.type_lib_file_name, "rt") as stream:
             self.type_lib = TypeLibCodec.decode(stream.read())
     except Exception as err:
         # Any failure while opening or decoding falls back to a new,
         # empty library; the error is only reported on stdout.
         print(err)
         print("Could not find type library, creating a new one")
         self.type_lib = TypeLib()
     super().__init__()
Example #3
0
    def collect_variables(
        self,
        frsize: int,
        stkoff_delta: int,
        variables: Iterable[ida.lvar_t],
    ) -> DefaultDict[Location, Set[Variable]]:
        """Group the given local variables by location and register their types
        in the type library.

        Variables with an empty name or a falsy type are skipped, as are
        variables that are neither stack nor register variables.
        """
        by_location: DefaultDict[Location, Set[Variable]] = defaultdict(set)
        for lvar in variables:
            if not (lvar.name != "" and lvar.type()):
                continue
            # Every type that is seen goes into the type library
            self.type_lib.add_ida_type(lvar.type())
            parsed: TypeInfo = TypeLib.parse_ida_type(lvar.type())

            where: Optional[Location] = None
            if lvar.is_stk_var():
                # Stack slot: frame size minus the delta-corrected offset
                where = Stack(frsize - (lvar.get_stkoff() - stkoff_delta))
            if lvar.is_reg_var():
                where = Register(lvar.get_reg1())
            if where is None:
                continue
            by_location[where].add(
                Variable(typ=parsed, name=lvar.name, user=lvar.has_user_info))
        return by_location
Example #4
0
    def activate(self, ctx) -> int:
        """Decompile each function found in `self.debug_functions` and record
        its types, variables (with locations), AST and raw pseudocode.

        Functions that fail to decompile are skipped. Once all functions are
        processed, `self.write_info()` is called and 1 is returned.
        """
        print("Collecting vars and types.")
        for ea in idautils.Functions():
            # Only functions with a debug counterpart are of interest
            if ea not in self.debug_functions:
                continue

            # Decompile
            func = ida.get_func(ea)
            try:
                cfunc = ida.decompile(func)
            except ida.DecompilationFailure:
                continue
            if cfunc is None:
                continue

            # Function info
            name: str = ida.get_func_name(ea)

            self.type_lib.add_ida_type(cfunc.type.get_rettype())
            return_type = TypeLib.parse_ida_type(cfunc.type.get_rettype())

            frame_size = func.frsize
            arguments = self.collect_variables(
                frame_size, cfunc.get_stkoff_delta(), cfunc.arguments)
            local_vars = self.collect_variables(
                frame_size,
                cfunc.get_stkoff_delta(),
                [lvar for lvar in cfunc.get_lvars() if not lvar.is_arg_var],
            )

            # One pseudocode line per output line: tags removed and runs of
            # whitespace collapsed to single spaces
            raw_code = "".join(
                " ".join(ida_lines.tag_remove(sline.line).split()) + "\n"
                for sline in cfunc.get_pseudocode())

            decompiled = Function(
                ast=AST(function=cfunc),
                name=name,
                return_type=return_type,
                arguments=arguments,
                local_vars=local_vars,
                raw_code=raw_code,
            )
            collected = CollectedFunction(
                ea=ea,
                debug=self.debug_functions[ea],
                decompiler=decompiled,
            )
            self.functions.append(collected)
        self.write_info()
        return 1
Example #5
0
 def from_item(cls, item: ida.carg_t,
               ast: "AST") -> "Call.Arg":  # type: ignore
     """Build an argument node from a call argument item.

     A fresh node id is taken from `ast`. When the item refers to a
     variable (`item.v` is truthy), its index and the name of the matching
     entry in `ast.function.lvars` are recorded; otherwise both stay None.
     """
     fresh_id = ast.next_id()
     vararg_flag = item.is_vararg
     var_idx, var_name = None, None
     if item.v:
         var_idx = item.v.idx
         assert ast.function is not None
         var_name = ast.function.lvars[var_idx].name
     parsed_formal = TypeLib.parse_ida_type(item.formal_type)
     return cls(
         node_id=fresh_id,
         is_vararg=vararg_flag,
         idx=var_idx,
         name=var_name,
         formal_type=parsed_formal,
     )
Example #6
0
def type_dumper(args):
    """Build a type library from one JSON-lines file and write it to disk.

    `args` is a `(tgt_folder, fname)` pair. Every line of `fname` is parsed
    into an `Example`, and the type of each of its target variables is added
    to a new `TypeLib`. The sorted, encoded library is written to
    `<tgt_folder>/types/<last path component of fname>`.
    """
    tgt_folder, fname = args
    collected_types = TypeLib()
    with open(fname, "r") as source:
        for raw_line in source:
            example = Example.from_json(json.loads(raw_line))
            for variable in example.target.values():
                collected_types.add(variable.typ)
    collected_types.sort()

    out_path = os.path.join(tgt_folder, "types", fname.split("/")[-1])
    with open(out_path, "w") as sink:
        sink.write(TypeLibCodec.encode(collected_types))
Example #7
0
    def activate(self, ctx) -> int:
        """Record the return type, arguments and local variables of every
        function that decompiles, then write out the results.

        Functions that fail to decompile are skipped. After the loop the type
        library and the collected functions are written via
        `self.write_type_lib()` and `self.write_functions()`; returns 1.
        """
        print("Collecting vars and types.")
        # Each `ea` is the start address of one function
        for ea in idautils.Functions():
            # Decompile
            func = ida.get_func(ea)
            try:
                cfunc = ida.decompile(func)
            except ida.DecompilationFailure:
                continue
            if cfunc is None:
                continue

            # Function info
            name: str = ida.get_func_name(ea)
            self.type_lib.add_ida_type(cfunc.type.get_rettype())
            return_type = TypeLib.parse_ida_type(cfunc.type.get_rettype())

            frame_size = func.frsize
            arguments = self.collect_variables(
                frame_size, cfunc.get_stkoff_delta(), cfunc.arguments)
            local_vars = self.collect_variables(
                frame_size,
                cfunc.get_stkoff_delta(),
                [lvar for lvar in cfunc.get_lvars() if not lvar.is_arg_var],
            )

            summary = Function(
                name=name,
                return_type=return_type,
                arguments=arguments,
                local_vars=local_vars,
            )
            self.functions[ea] = summary

        self.write_type_lib()
        self.write_functions()
        return 1
Example #8
0
 def from_item(cls, item: ida.cexpr_t,
               ast: "AST") -> "Type":  # type: ignore
     """Build a node holding the parsed type of `item`, with a fresh id
     taken from `ast`."""
     fresh_id = ast.next_id()
     parsed = TypeLib.parse_ida_type(item.type)
     return cls(node_id=fresh_id, typ=parsed)
Example #9
0
def main(args):
    """Turn raw input files into train/dev/test archives plus a type library.

    `args` is a mapping that must provide the keys "TARGET_FOLDER",
    "INPUT_FOLDER", "INPUT_FNAMES", "--max", "--shard-size" and
    "--test-file".

    Steps:
      1. Read up to `--max` file names ending in ".gz" from `INPUT_FNAMES`.
      2. Optionally remove an existing target folder (interactive prompt),
         then create `<target>/files` and `<target>/types`.
      3. Convert the inputs into examples in a worker pool and write one
         JSON-lines file per binary into `<target>/files`.
      4. Split the produced files into train/dev/test.
      5. Build per-file type libraries for the training files, merge them,
         prune and sort, and write `<target>/typelib.json`.
      6. Tar the training files in shards of `--shard-size` files, and tar
         the dev and test files after annotating every example with a
         "test_meta" entry.

    NOTE(review): `all_functions` is not defined in this function; it is
    presumably a module-level dict keyed by file name -- confirm.
    """
    np.random.seed(1234)
    random.seed(1992)

    tgt_folder = args["TARGET_FOLDER"]
    input_folder = args["INPUT_FOLDER"]
    input_fnames_file = args["INPUT_FNAMES"]
    input_fnames = []
    max_files = int(args["--max"])
    with open(input_fnames_file) as f:
        for s in f:
            s = s.strip()
            if s.endswith(".gz"):
                input_fnames.append(s)
            if len(input_fnames) >= max_files:
                break
    shard_size = int(args["--shard-size"])

    if os.path.exists(tgt_folder):
        op = input(f"{tgt_folder} exists. remove? (y/n) ")
        if op == "y":
            shutil.rmtree(tgt_folder)

    # Create the output tree without going through a shell, so paths that
    # contain spaces or shell metacharacters work as well.
    os.makedirs(os.path.join(tgt_folder, "files"), exist_ok=True)
    os.makedirs(os.path.join(tgt_folder, "types"), exist_ok=True)
    num_workers = 16

    valid_example_count = 0

    print("loading examples")
    with multiprocessing.Pool(num_workers) as pool:
        json_iter = pool.imap(
            json_line_reader,
            ((input_folder, fname) for fname in input_fnames),
            chunksize=64,
        )

        example_iter = pool.imap(example_generator, json_iter, chunksize=64)

        for examples in tqdm(example_iter):
            if not examples:
                continue
            # All examples in a batch are written to one file, named after
            # the first example's binary file name
            json_file_name = examples[0].binary_file["file_name"].split(
                "/")[-1]
            with open(os.path.join(tgt_folder, "files/", json_file_name),
                      "w") as f:
                for example in examples:
                    f.write(dumps(example.to_json()) + "\n")
                    # Remember the canonical code of every function so that
                    # dev/test examples can later be compared to training
                    all_functions.setdefault(
                        json_file_name,
                        dict())[example.name] = example.canonical_code

            valid_example_count += len(examples)

    print("valid examples: ", valid_example_count)

    cur_dir = os.getcwd()
    file_prefix = os.path.join(tgt_folder, "files/")
    # Sort by name so the seeded sampling below does not depend on the
    # order in which glob happens to return the files.
    all_files = sorted(glob.glob(os.path.join(tgt_folder, "files/*.jsonl")))
    file_num = len(all_files)
    print("Total valid binary file num: ", file_num)

    test_file = args["--test-file"]
    if test_file:
        print(f"using test file {test_file}")
        with tarfile.open(test_file, "r") as f:
            test_files = [
                os.path.join(file_prefix,
                             x.name.split("/")[-1]) for x in f.getmembers()
                if x.name.endswith(".jsonl")
            ]
        dev_file_num = 0
    else:
        print(f"randomly sample test file {test_file}")
        test_file_num = int(file_num * 0.1)
        dev_file_num = int(file_num * 0.1)
        test_files = list(
            np.random.choice(all_files, size=test_file_num, replace=False))

    test_files_set = set(test_files)
    train_files = [fname for fname in all_files if fname not in test_files_set]

    if dev_file_num == 0:
        dev_file_num = int(len(train_files) * 0.1)

    np.random.shuffle(train_files)
    # Split by explicit index: slicing with `-dev_file_num` would, for a
    # count of 0, put every file into dev and leave the training set empty.
    split_at = max(0, len(train_files) - dev_file_num)
    dev_files = train_files[split_at:]
    train_files = train_files[:split_at]

    # Create types from filtered training set
    with multiprocessing.Pool(num_workers) as pool:
        pool.map(
            type_dumper,
            ((tgt_folder, fname) for fname in train_files),
            chunksize=64,
        )
    print("reading typelib")
    typelib = TypeLib()
    for fname in tqdm(train_files):
        fname = os.path.basename(fname)
        fname = fname[:fname.index(".")] + ".jsonl"
        typelib.add_json_file(os.path.join(tgt_folder, "types", fname))
    typelib.prune(5)
    typelib.sort()

    print("dumping typelib")
    with open(os.path.join(tgt_folder, "typelib.json"), "w") as type_lib_file:
        encoded = TypeLibCodec.encode(typelib)
        type_lib_file.write(encoded)

    # Map every training function name to the set of canonical bodies seen
    train_functions = dict()
    for train_file in train_files:
        file_name = train_file.split("/")[-1]
        for func_name, func in all_functions[file_name].items():
            train_functions.setdefault(func_name, set()).add(func)

    print(
        f"number training: {len(train_files)}",
        f"number dev: {len(dev_files)}",
        f"number test: {len(test_files)}",
        sep=", ",
    )
    print("dump training files")
    shards = [
        train_files[i:i + shard_size]
        for i in range(0, len(train_files), shard_size)
    ]
    for shard_id, shard_files in enumerate(shards):
        print(f"Preparing shard {shard_id}, {len(shard_files)} files: ")
        # file_list.txt is rewritten for every shard and fed to tar via -T
        with open(os.path.join(tgt_folder, "file_list.txt"), "w") as f:
            for file_name in shard_files:
                f.write(file_name.split("/")[-1] + "\n")

        # tar runs from inside files/ so the archive holds bare file names
        os.chdir(os.path.join(tgt_folder, "files"))
        print("creating tar file...")
        os.system(f"tar cf ../train-shard-{shard_id}.tar -T ../file_list.txt")
        os.chdir(cur_dir)

    def _dump_dev_file(tgt_file_name, file_names):
        """Annotate each example in `file_names` with "test_meta" (rewriting
        the files in place) and tar them into `<target>/<tgt_file_name>`."""
        with open(os.path.join(tgt_folder, "file_list.txt"), "w") as f:
            for file_name in file_names:
                last_file_name = file_name.split("/")[-1]
                f.write(last_file_name + "\n")

                with open(file_name) as fr:
                    all_lines = fr.readlines()

                replace_lines = []
                for line in all_lines:
                    json_dict = json.loads(line.strip())
                    func_name = json_dict["name"]
                    canonical_code = all_functions[last_file_name][func_name]
                    func_name_in_train = False
                    func_body_in_train = False
                    if func_name in train_functions:
                        func_name_in_train = True
                        if canonical_code in train_functions[func_name]:
                            func_body_in_train = True

                    json_dict["test_meta"] = dict(
                        function_name_in_train=func_name_in_train,
                        function_body_in_train=func_body_in_train,
                    )
                    new_json_str = json.dumps(json_dict)
                    replace_lines.append(new_json_str.strip())

                with open(file_name, "w") as fw:
                    for line in replace_lines:
                        fw.write(line + "\n")

        os.chdir(os.path.join(tgt_folder, "files"))
        print("creating tar file...")
        os.system(f"tar cf ../{tgt_file_name} -T ../file_list.txt")
        os.chdir(cur_dir)

    print("dump dev files")
    _dump_dev_file("dev.tar", dev_files)
    print("dump test files")
    _dump_dev_file("test.tar", test_files)