def load_func_features_helper(bin_paths):
    """Collect numeric feature vectors for the functions of several binaries.

    For every path in ``bin_paths`` the function data is loaded and each
    function located in the ``.text`` segment whose name does not start
    with ``sub_`` contributes one vector.

    Returns:
        {function_key: {option_idx: np.array(feature_values)}} where
        function_key is (package, bin_name, function name) and the array
        has one float64 slot per entry of ``g_features``.
    """
    global g_options, g_features
    collected = {}
    vector_len = len(g_features)
    optionidx_map = get_optionidx_map(g_options)

    for bin_path in bin_paths:
        package, compiler, arch, opti, bin_name = parse_fname(bin_path)
        others = parse_other_options(bin_path)
        _, func_data_list = load_func_data(bin_path)

        for func_data in func_data_list:
            # Use only named .text functions for testing.
            if (func_data["seg_name"] != ".text"
                    or func_data["name"].startswith("sub_")):
                continue

            func_key = (package, bin_name, func_data["name"])
            option_key = (opti, arch, compiler, others)
            if option_key not in optionidx_map:
                continue
            option_idx = optionidx_map[option_key]

            per_option = collected.setdefault(func_key, {})
            # Note: membership is tested with option_key, while the entries
            # themselves are stored under option_idx.
            if option_key not in per_option:
                per_option[option_idx] = np.zeros(
                    vector_len, dtype=np.float64)

            # Features missing from this function keep their current value.
            for feature_idx, feature in enumerate(g_features):
                feature_values = func_data["feature"]
                if feature in feature_values:
                    per_option[option_idx][feature_idx] = (
                        feature_values[feature])

    return collected
def load_func_features_helper(bin_paths):
    """Collect numeric and string features for the functions of several binaries.

    Function data is loaded with the "filtered2" suffix. Each function in
    the ``.text`` segment whose name does not start with ``sub_`` is keyed
    by (package, bin_name, src_file, src_line). When several entries map to
    the same function key and option, only the last one is kept and every
    replacement is counted.

    Returns:
        A tuple (func_features, func_str_features, duplicate_cnt):
        func_features is
        {function_key: {option_idx: np.array(feature_values)}},
        func_str_features is
        {function_key: {option_idx: [string feature values]}},
        duplicate_cnt is the number of entries that replaced an earlier
        entry for the same function key and option.
    """
    # TODO: handle suffix correctly.
    global g_options, g_features, g_str_features
    func_features = {}
    func_str_features = {}
    num_features = len(g_features) + len(g_str_features)
    optionidx_map = get_optionidx_map(g_options)

    # This counts compiler-generated duplicates (.isra, .part, .cold)
    duplicate_cnt = 0

    for bin_path in bin_paths:
        package, compiler, arch, opti, bin_name = parse_fname(bin_path)
        others = parse_other_options(bin_path)
        _, func_data_list = load_func_data(bin_path, suffix="filtered2")

        for func_data in func_data_list:
            # Use only .text functions for testing
            # These are already filtered in filter_functions.py
            if func_data["seg_name"] != ".text":
                continue
            if func_data["name"].startswith("sub_"):
                continue

            func_key = (package, bin_name,
                        func_data["src_file"], func_data["src_line"])
            option_key = (opti, arch, compiler, others)
            if option_key not in optionidx_map:
                continue
            option_idx = optionidx_map[option_key]

            if func_key not in func_features:
                func_features[func_key] = {}
                func_str_features[func_key] = {}

            # Entries are stored under option_idx, so the duplicate check has
            # to look up option_idx as well. The storage is re-created for
            # every entry, which filters duplicate functions and leaves only
            # the last one.
            # TODO: move this filtering to filter_functions.py
            if option_idx in func_features[func_key]:
                duplicate_cnt += 1
            func_features[func_key][option_idx] = np.zeros(
                num_features, dtype=np.float64)
            func_str_features[func_key][option_idx] = []

            # Numeric features: slots of features missing from this function
            # stay at zero.
            for feature_idx, feature in enumerate(g_features):
                if feature not in func_data["feature"]:
                    continue
                val = func_data["feature"][feature]
                func_features[func_key][option_idx][feature_idx] = val

            # String features are appended in g_str_features order; features
            # missing from this function are skipped.
            for str_feature in g_str_features:
                if str_feature not in func_data:
                    continue
                val = func_data[str_feature]
                # NOTE(review): the nesting of the next lines was inferred
                # from a whitespace-collapsed source -- confirm that the
                # normalisation and enumeration apply to "type" features only.
                if "type" in str_feature:
                    if not isinstance(val, list):
                        val = [val]
                    val = normalize_type(val)
                    val = list(enumerate(val))
                func_str_features[func_key][option_idx].append(val)

    return func_features, func_str_features, duplicate_cnt
def extract_features(bin_name):
    """Compute features for every function of a binary and store them.

    The function data is loaded, each entry receives the result of
    ``FeatureManager.get_all`` under the "feature" key, and the updated
    list is stored back.
    """
    global feature_funcs
    bin_name, func_data_list = load_func_data(bin_name)
    manager = FeatureManager()
    for entry in func_data_list:
        entry["feature"] = manager.get_all(entry)
    store_func_data(bin_name, func_data_list)
def count_funcs(bin_path):
    """Count the functions and basic blocks stored for a binary.

    Function data is loaded with the "filtered" suffix.

    Returns:
        (bin_path, num_funcs, num_bbs) where num_bbs is the sum of the
        'cfg_size' values over all loaded functions.
    """
    # TODO: handle suffix correctly.
    bin_path, loaded = load_func_data(bin_path, suffix="filtered")
    ordered = sorted(loaded, key=lambda entry: entry['name'])
    total_bbs = sum(entry['cfg_size'] for entry in ordered)
    return bin_path, len(ordered), total_bbs
def extract_func_types(args):
    """Attach abstracted return and argument types to each function.

    ``args`` is a (type_map, bin_name) pair. Every function entry gets
    "abstract_args_type" and "abstract_ret_type" computed with
    ``fetch_type``, and the updated list is stored back.
    """
    type_map, bin_name = args
    bin_name, func_data_list = load_func_data(bin_name)
    for entry in func_data_list:
        # The return type is resolved before the argument types.
        abstract_ret = fetch_type(type_map, entry["ret_type"])
        # Each argument is a 4-tuple; only its third item (the type) is used.
        abstract_args = [
            fetch_type(type_map, arg_type)
            for _, _, arg_type, _ in entry["args"]
        ]
        entry["abstract_args_type"] = abstract_args
        entry["abstract_ret_type"] = abstract_ret
    store_func_data(bin_name, func_data_list)
def extract_func_types(args):
    """Attach abstracted return and argument types to each filtered function.

    ``args`` is a (type_map, bin_name) pair. Function data is loaded and
    stored with the "filtered" suffix; every entry gets
    "abstract_args_type" and "abstract_ret_type" computed with
    ``fetch_type``.
    """
    # TODO: handle suffix correctly.
    type_map, bin_name = args
    bin_name, func_data_list = load_func_data(bin_name, suffix="filtered")
    for entry in func_data_list:
        # The return type is resolved before the argument types.
        abstract_ret = fetch_type(type_map, entry["ret_type"])
        # Each argument is a 4-tuple; only its third item (the type) is used.
        abstract_args = [
            fetch_type(type_map, arg_type)
            for _, _, arg_type, _ in entry["args"]
        ]
        entry["abstract_args_type"] = abstract_args
        entry["abstract_ret_type"] = abstract_ret
    store_func_data(bin_name, func_data_list, suffix="filtered")
def filter_funcs(bin_path):
    """Filter the functions of one binary and report how many survive each step.

    The filters are applied in this order: functions in the '.text'
    segment, functions with a non-empty 'src_path', functions whose
    'src_path' contains the package name, functions whose name does not
    start with 'sub_', and finally (only when ``g_oracle`` is set)
    functions whose source file and line are listed in the oracle.

    Returns:
        (pack_name, bin_path, num_funcs, names, sources). ``num_funcs`` is
        a 7-tuple of counts (original, code, source, package, non-'sub_',
        oracle, readelf). ``names`` and ``sources`` are sets collected
        before the oracle filter is applied.
    """
    global g_oracle
    bin_path, loaded = load_func_data(bin_path)
    ordered = sorted(loaded, key=lambda entry: entry['name'])
    num_orig_funcs = len(ordered)
    pack_name = ordered[0]['package']

    # filter functions by segment name (consider functions in code segment)
    funcs = [f for f in ordered if f['seg_name'] == '.text']
    num_code_funcs = len(funcs)

    # keep functions that carry a non-empty source path
    funcs = [f for f in funcs if 'src_path' in f and f['src_path']]
    num_src_funcs = len(funcs)

    # To identify functions inserted by compilers
    #for func in funcs:
    #    if func['package'] not in func['src_file']:
    #        print(func['name'], func['src_file'], func['src_line'])

    # filter functions by package name (remove functions inserted by compilers)
    funcs = [f for f in funcs if pack_name in f['src_path']]
    num_pack_funcs = len(funcs)
    if num_pack_funcs == 0:
        print("No functions: ", pack_name, bin_path, num_orig_funcs)

    # drop functions whose name starts with 'sub_'
    funcs = [f for f in funcs if not f['name'].startswith('sub_')]
    num_sub_funcs = len(funcs)

    names = {f['name'] for f in funcs}
    sources = {(f['src_file'], f['src_line']) for f in funcs}

    if g_oracle:
        _, _, _, _, bin_name = parse_fname(bin_path)
        funcs = [
            f for f in funcs
            if f['src_file'] in g_oracle[pack_name][bin_name]
            and f['src_line'] in g_oracle[pack_name][bin_name][f['src_file']]
        ]
        # NOTE(review): the source layout was whitespace-collapsed; storing
        # only inside the oracle branch is an assumption -- confirm.
        # TODO: handle suffix correctly.
        store_func_data(bin_path, funcs, suffix="filtered")

    num_oracle_funcs = len(funcs)

    # The readelf/objdump based count is commented out, so it is always 0.
    num_readelf_funcs = 0
    # if g_oracle:
    #     cmd = "readelf -s {} | grep FUNC | grep -v UND | wc -l".format(bin_path)
    #     cmd = " objdump --syms -j .text {} | grep \"F .text\" | ".format(bin_path)
    #     cmd += " cut -d \" \" -f 1 | sort | uniq | wc -l"
    #     num_readelf_funcs = int(system(cmd))

    num_funcs = (
        num_orig_funcs,
        num_code_funcs,
        num_src_funcs,
        num_pack_funcs,
        num_sub_funcs,
        num_oracle_funcs,
        num_readelf_funcs,
    )
    return pack_name, bin_path, num_funcs, names, sources
def extract_features(bin_name):
    """Compute features for every filtered function of a binary and store them.

    Function data is loaded with the "filtered" suffix, each entry receives
    the result of ``FeatureManager.get_all`` under the "feature" key, and
    the list is stored with the "filtered2" suffix.

    If feature extraction fails for any function, the traceback and the
    binary name are printed and the function returns without storing
    anything.
    """
    import traceback

    # TODO: handle suffix correctly.
    bin_name, func_data_list = load_func_data(bin_name, suffix="filtered")
    fm = FeatureManager()
    for func_data in func_data_list:
        try:
            features = fm.get_all(func_data)
        except Exception:
            # Catch ordinary errors only, so KeyboardInterrupt and SystemExit
            # still propagate instead of being reported as a failed binary.
            traceback.print_exc()
            print("Error: ", bin_name)
            return
        func_data["feature"] = features
    store_func_data(bin_name, func_data_list, suffix="filtered2")
def extract_func_lineno(bin_name):
    """Annotate each function of a binary with its source location.

    Line information is obtained from ``fetch_lineno`` for the start
    addresses of all functions. Functions with a known, non-empty source
    path get "src_path", "src_file" and "src_line" set; the others are
    left untouched. The updated list is stored back.

    Returns:
        ``bin_name`` if the function data could not be loaded (the name is
        also printed), otherwise None.
    """
    try:
        bin_name, func_data_list = load_func_data(bin_name)
    except Exception:
        # Catch ordinary errors only, so KeyboardInterrupt and SystemExit
        # still propagate instead of being reported as a failed binary.
        print(bin_name)
        return bin_name

    func_addrs = {func["startEA"]: func["name"] for func in func_data_list}
    line_map = fetch_lineno(bin_name, func_addrs)
    for func in func_data_list:
        func_addr = func["startEA"]
        # Skip functions without line information or with an empty path.
        if func_addr not in line_map or not line_map[func_addr][0]:
            continue
        func["src_path"] = line_map[func_addr][0]
        func["src_file"] = parse_source_path(func["src_path"])
        func["src_line"] = line_map[func_addr][1]

        # Fix ase18 source paths coreutils-6.7-6.5 / coreutils-6.7-6.7
        # (src_file was already derived from the unmodified path above).
        if 'coreutils-6.7-6.5' in func['src_path']:
            func['src_path'] = func['src_path'].replace('6.7-6.5', '6.5')

    store_func_data(bin_name, func_data_list)
    return None