Example #1
def load_func_features(input_list, options, features):
    grouped_bins, packages = group_binaries(input_list)
    func_features_list = do_multiprocess(
        load_func_features_helper,
        grouped_bins.values(),
        chunk_size=1,
        threshold=1,
        initializer=_init_load,
        initargs=(options, features),
    )
    funcs = {}
    for func_features in func_features_list:
        funcs.update(func_features)
    return funcs
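
All of these examples call do_multiprocess, TikNib's helper for fanning work out over a process pool. Its implementation does not appear on this page, so the following is only a minimal sketch under assumed semantics: pool_size defaults to the CPU count (as Example #7 suggests), and threshold is taken to be a cutoff below which the work runs sequentially in the current process. Both assumptions are marked in the comments.

import multiprocessing

def do_multiprocess(func, args, chunk_size=1, pool_size=None,
                    threshold=1, initializer=None, initargs=()):
    # Sketch only; the real TikNib helper may differ.
    args = list(args)
    if pool_size is None:
        # Matches the default seen in Example #7.
        pool_size = multiprocessing.cpu_count()
    # Assumed meaning of `threshold`: with fewer items than this,
    # skip the pool and run everything in the current process.
    if len(args) < threshold:
        if initializer is not None:
            initializer(*initargs)
        return [func(arg) for arg in args]
    with multiprocessing.Pool(pool_size, initializer=initializer,
                              initargs=initargs) as pool:
        return pool.map(func, args, chunksize=chunk_size)
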
Example #2
    def run(self, input_path):
        elfs = self.get_elf_files(input_path)

        logger.info("[+] start extracting {0} files ...".format(len(elfs)))
        t0 = time.time()

        if self.debug:
            # We only fetch the first ELF for debugging.
            elfs = [elfs[0]]

        # IDA's processing time differs significantly from one binary to
        # another, so it is better to set the chunk size to 1.
        res = do_multiprocess(self.run_helper, elfs, chunk_size=1, threshold=1)
        logger.info("done in: (%0.3fs)" % (time.time() - t0))
        return res
Example #3
def calc_metric(funcs, funcs_strs, dst_options):
    # Now select features; this finds a local optimum using hill climbing.
    metric_results = do_multiprocess(
        calc_metric_helper,
        funcs.keys(),
        chunk_size=1,
        threshold=1,
        initializer=_init_calc,
        initargs=(funcs, funcs_strs, dst_options),
    )
    func_keys, tp_results, tn_results, target_opts = zip(*metric_results)
    # merge results into one numpy array
    tp_results = np.vstack([x for x in tp_results if len(x)])
    tn_results = np.vstack([x for x in tn_results if len(x)])
    assert len(tp_results) == len(tn_results)
    return func_keys, tp_results, tn_results, target_opts
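
The initializer/initargs pair above is the standard multiprocessing.Pool mechanism for handing large read-only objects (funcs, funcs_strs, dst_options) to each worker once, instead of pickling them into every task. The real _init_calc and calc_metric_helper are not shown on this page; a hypothetical pair following that pattern, with placeholder score arrays, could look like this:

import numpy as np

# Worker-side globals, populated once per process by the initializer.
# These names are illustrative; the real module may name them differently.
_g_funcs = None
_g_funcs_strs = None
_g_dst_options = None

def _init_calc(funcs, funcs_strs, dst_options):
    # Runs once in every worker process spawned by the pool.
    global _g_funcs, _g_funcs_strs, _g_dst_options
    _g_funcs = funcs
    _g_funcs_strs = funcs_strs
    _g_dst_options = dst_options

def calc_metric_helper(func_key):
    # Hypothetical body: read this function's data from the shared
    # globals and return a tuple matching zip(*metric_results) above.
    features = _g_funcs[func_key]                 # per-option feature data
    tp_rows = np.zeros((1, len(_g_dst_options)))  # placeholder TP scores
    tn_rows = np.zeros((1, len(_g_dst_options)))  # placeholder TN scores
    return func_key, tp_rows, tn_rows, _g_dst_options
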
Example #4
def calc_metric(funcs, options, target_key, option_idx, feature_indices):
    # Now select features; this finds a local optimum using hill climbing.
    metric_results = do_multiprocess(
        calc_metric_helper,
        funcs.keys(),
        chunk_size=1,
        threshold=1,
        initializer=_init_calc,
        initargs=(funcs, options, target_key, option_idx, feature_indices),
    )
    func_keys, results_arch, results = zip(*metric_results)
    scores_arch = {}
    scores = {}
    for idx, func_key in enumerate(func_keys):
        scores_arch[func_key] = results_arch[idx]
        scores[func_key] = results[idx]
    return scores_arch, scores
Example #5
def load_func_features(input_list, options, features, str_features):
    grouped_bins, packages = group_binaries(input_list, options)
    func_features_list = do_multiprocess(
        load_func_features_helper,
        grouped_bins.values(),
        chunk_size=1,
        threshold=1,
        initializer=_init_load,
        initargs=(options, features, str_features),
    )
    funcs = {}
    funcs_strs = {}
    duplicate_cnt = 0
    for func_features, func_types, dup_cnt in func_features_list:
        funcs.update(func_features)
        funcs_strs.update(func_types)
        duplicate_cnt += dup_cnt
    num_funcs = sum([len(x) for x in funcs.values()])
    logger.info("%d functions loaded.", num_funcs)
    logger.info("%d compiler-generated duplicates.", duplicate_cnt)
    return funcs, funcs_strs
Example #6
def get_rank(func_keys, scores, options, target_key, interested_keys,
             target_option):
    src_options = [op for op in options if op != target_option]
    metric_results = do_multiprocess(
        get_rank_helper,
        src_options,
        chunk_size=1,
        threshold=1,
        initializer=_init_rank,
        initargs=(func_keys, scores, options, target_key, interested_keys),
    )
    src_option, total_rank, total_funcs, total_other_ranks = zip(
        *metric_results)
    ranks = {}
    func_counts = {}
    other_ranks = {}
    for idx, option in enumerate(src_option):
        if option is None:
            continue
        ranks[option] = total_rank[idx]
        func_counts[option] = total_funcs[idx]
        other_ranks[option] = total_other_ranks[idx]
    return [ranks, func_counts, other_ranks]
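
The zip(*metric_results) idiom used in Examples #3, #4, and #6 transposes a list of per-function result tuples into parallel sequences, one per field. A minimal illustration:

results = [("f1", 0.9, 10), ("f2", 0.7, 12)]
keys, scores, counts = zip(*results)
# keys   == ("f1", "f2")
# scores == (0.9, 0.7)
# counts == (10, 12)
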
Example #7
        default=1,
        help="number of binaries to handle in each process",
    )
    op.add_option(
        "--pool_size",
        type="int",
        action="store",
        dest="pool_size",
        default=multiprocessing.cpu_count(),
        help="number of processes",
    )
    op.add_option("--debug", action="store_true", dest="debug")
    (opts, args) = op.parse_args()

    assert opts.input_list and os.path.isfile(opts.input_list)
    # Add features to functions in each binary
    with open(opts.input_list, "r") as f:
        bins = f.read().splitlines()
    if opts.debug:
        bins = [bins[0]]
    t0 = time.time()
    logger.info("Processing %d binaries ...", len(bins))
    do_multiprocess(
        extract_features,
        bins,
        chunk_size=opts.chunk_size,
        pool_size=opts.pool_size,
        threshold=opts.threshold,
    )
    logger.info("done. (%0.3fs)", (time.time() - t0))
Example #8
        for ctags_fname in glob.glob(
                os.path.join(opts.ctags_dir, "[!include]*.tags")):
            update_type_map(type_map, ctags_fname)

        logger.info("done ... %0.3fs", time.time() - t0)
        store_cache(type_map, fname="ctags_cache", cache_dir=".tiknib_cache")

    # Add abstracted type data to functions in each binary
    with open(opts.input_list, "r") as f:
        bins = f.read().splitlines()

    t0 = time.time()
    logger.info("Processing %d binaries ...", len(bins))
    bins = list(map(lambda x: (type_map, x), bins))
    do_multiprocess(extract_func_types,
                    bins,
                    chunk_size=opts.chunk_size,
                    threshold=opts.threshold)
    logger.info("done. (%0.3fs)", (time.time() - t0))

    # The code below is not used for now.
#    t0 = time.time()
#    func_cnt = 0
#    for i in range(0, len(bins), opts.chunk_size):
#        logger.info("Processing %d/%d binaries ...", i, len(bins))
#        args = do_multiprocess(load_func_data,
#                               bins[i:i+opts.chunk_size],
#                               chunk_size=1)
#        # We do not want to share the large type_map dictionary, so we
#        # process it sequentially.
#        args = make_functype_abstract(type_map, args)
#        args = list(filter(lambda x: x and x[1], args))
Example #9
        bins = f.read().splitlines()

    pack_bins = {}
    for bin_path in bins:
        package, compiler, arch, opti, bin_name = parse_fname(bin_path)
        if package not in pack_bins:
            pack_bins[package] = []
        pack_bins[package].append(bin_path)

    result = {}
    logger.info("Processing %d binaries ...", len(bins))
    t0 = time.time()
    for package, bin_list in pack_bins.items():
        logger.info("Processing %d binaries in %s ...", len(bin_list), package)
        numbers = do_multiprocess(
            filter_funcs, bin_list, chunk_size=opts.chunk_size,
            threshold=opts.threshold
        )
        numbers.sort()

        # build oracle to pick functions uniquely.
        oracle = {}
        done = {}
        for data in numbers:
            pack_name, bin_path, num_funcs, names, sources = data
            package, compiler, arch, opti, bin_name = parse_fname(bin_path)
            if pack_name not in oracle:
                oracle[pack_name] = {}
                done[pack_name] = set()
            if bin_name not in oracle[pack_name]:
                oracle[pack_name][bin_name] = {}
            # sources = (source file, source line)
Example #10
    #bins = list(filter(lambda x: "_find" in x, bins))

    # Adjust this function to filter out specific options.
    def filter_bins(bin_path):
        package, compiler, arch, opti, bin_name = parse_fname(bin_path)
        if compiler not in ["clang-7.0", "gcc-8.2.0"]:
            return False
        return True

    bins = list(filter(filter_bins, bins))

    result = {}
    logger.info("Processing %d binaries ...", len(bins))
    t0 = time.time()
    numbers = do_multiprocess(count_funcs,
                              bins,
                              chunk_size=opts.chunk_size,
                              threshold=opts.threshold)
    logger.info("done. (%0.3fs)", (time.time() - t0))

    filtered_num_funcs = {}
    filtered_num_bbs = {}
    for data in numbers:
        bin_path, num_funcs, num_bbs = data
        package, compiler, arch, opti, bin_name = parse_fname(bin_path)
        if arch.endswith("_64"):
            continue
        if "eb" in arch:
            continue
        #compiler = compiler.split("-")[0]
        key = (opti, arch, compiler)
        if key not in filtered_num_funcs:
Example #11
        dest="chunk_size",
        default=1,
        help="number of binaries to process in each process",
    )
    op.add_option("--force", action="store_true", dest="force")
    (opts, args) = op.parse_args()

    assert opts.input_list

    with open(opts.input_list, "r") as f:
        bins = f.read().splitlines()

    t0 = time.time()
    logger.info("Processing %d binaries ...", len(bins))
    failed_bins = do_multiprocess(extract_func_lineno,
                                  bins,
                                  chunk_size=opts.chunk_size,
                                  threshold=opts.threshold)
    logger.info("done. (%0.3fs)", (time.time() - t0))

    failed_bins = list(filter(lambda x: x is not None, failed_bins))
    if failed_bins:
        print("{} bins failed.".format(len(failed_bins)))

        with open("failed_bins.txt", "w") as f:
            for b in failed_bins:
                f.write(b + "\n")

        from tiknib.idascript import IDAScript
        idascript = IDAScript(
            idapath=IDA_PATH,
            idc=IDA_FETCH_FUNCDATA,