def main():
    """Transform the balanced dataset's return-type labels to ints, in parallel."""
    config = parseArgs()
    check_config(config)
    print(f'config >{config}<')

    cpu_total = psutil.cpu_count(logical=True)
    print(f'We got nr_of_cpus >{cpu_total}<')

    # Mapping: return-type string -> integer label.
    ret_type_dict = pickle_lib.get_pickle_file_content(config['return_type_dict_file'])
    print(f"ret-type-dict >{ret_type_dict}<")

    dataset_files = common_stuff_lib.get_all_filenames_of_type(
        config['balanced_dataset_dir'], '.pickle')

    # Transform dataset ret-types to ints and persist them.
    print(f"Transform return-type to int and save to >{config['tfrecord_save_dir']}<")

    full_paths = [config['balanced_dataset_dir'] + "/" + name for name in dataset_files]
    work_items = zip(full_paths, repeat(ret_type_dict), repeat(config))

    pool = Pool(cpu_total)
    all_ret_types = pool.starmap(proc_build, work_items)
    pool.close()
    pool.join()

    print("Done. Run train_arg_one_model_lstm.py next")
def main():
    """Print one sample (text, label) pair and the element count for every
    pickle file in the save directory."""
    config = common_stuff_lib.parseArgs()
    print(f'config >{config}<')
    print()
    check_config(config)

    cpus = psutil.cpu_count(logical=True)
    print(f'We got >{cpus}< CPUs for threading')
    print()

    print(f"Using files in directory >{config['save_dir']}<")
    for filename in common_stuff_lib.get_all_filenames_of_type(config['save_dir'], '.pickle'):
        content = pickle_lib.get_pickle_file_content(config['save_dir'] + filename)
        shown = 0
        for item in content:
            # Only the first element of each file is printed in full.
            if shown < 1:
                print(f"return type >{item[1]}< from file >{config['save_dir'] + filename}<")
                print(f'{item[0]}')
            shown += 1
        print(f'Counted >{shown}< text,label elements')
        print()
def main():
    """Unpack every .tar.bz2 pickle archive and build per-file datasets in parallel."""
    config = parseArgs()
    print(f'config >{config}<')

    # All four working directories must exist before we start.
    for dir_key in ('pickle_dir', 'work_dir', 'save_dir', 'tfrecord_save_dir'):
        check_if_dir_exists(config[dir_key])

    cpu_total = psutil.cpu_count(logical=True)
    print(f'We got nr_of_cpus >{cpu_total}<')

    # Collect all pickle archives.
    archives = common_stuff_lib.get_all_filenames_of_type(config['pickle_dir'], '.tar.bz2')

    # Show a handful of filenames as a quick sanity check.
    pickle_lib.print_X_pickle_filenames(archives, 5)

    archive_paths = [config["pickle_dir"] + "/" + name for name in archives]
    job_args = zip(archive_paths,
                   repeat(config['work_dir']),
                   repeat(config['save_dir']),
                   repeat(config))

    worker_pool = Pool(cpu_total)
    all_ret_types = worker_pool.starmap(proc_build, job_args)
    worker_pool.close()
    worker_pool.join()

    print("Done. Run build_ret_type__vocab__seq_len.py next")
def main():
    """Show one sample label per balanced-dataset pickle file and count its elements."""
    config = parseArgs()

    logical_cpus = psutil.cpu_count(logical=True)
    print(f'We got nr_of_cpus >{logical_cpus}<')
    print(f"Using files in directory >{config['balanced_dataset_dir']}<")

    for fname in common_stuff_lib.get_all_filenames_of_type(
            config['balanced_dataset_dir'], '.pickle'):
        pairs = pickle_lib.get_pickle_file_content(config['balanced_dataset_dir'] + fname)
        seen = 0
        for pair in pairs:
            # Print only the first label of each file.
            if seen < 1:
                print(f"return type >{pair[1]}< from file >{config['balanced_dataset_dir'] + fname}<")
            seen += 1
        print(f'Counted >{seen}< text,label elements')
        print()
def main():
    """Stage pickle archives into the pickle dir, then build datasets in parallel."""
    config = common_stuff_lib.parseArgs()
    check_config(config)

    n_cpus = psutil.cpu_count(logical=True)
    print(f'We got >{n_cpus}< CPUs for threading')
    print()

    copy_files_to_build_dataset(config)

    tar_files = common_stuff_lib.get_all_filenames_of_type(config['pickle_dir'], '.tar.bz2')
    # Print 5 filenames for a quick check.
    pickle_lib.print_X_pickle_filenames(tar_files, 5)

    tar_paths = [config["pickle_dir"] + "/" + name for name in tar_files]
    args_iter = zip(tar_paths,
                    repeat(config['work_dir']),
                    repeat(config['save_dir']),
                    repeat(config))

    pool = Pool(n_cpus)
    all_ret_types = pool.starmap(proc_build, args_iter)
    pool.close()
    pool.join()

    print(f'Run build_ret_type__vocab__seq_len.py next')
def main():
    """Stage pickle archives and build per-file datasets in parallel via joblib.

    joblib.Parallel replaced an earlier multiprocessing.Pool implementation
    (left here commented out by the original author, now removed).
    """
    config = common_stuff_lib.parseArgs()
    check_config(config)

    nr_of_cpus = psutil.cpu_count(logical=True)
    print(f'We got >{nr_of_cpus}< CPUs for threading\n')
    print()

    copy_files_to_build_dataset(config)

    ### get all pickle archives
    pickle_files = common_stuff_lib.get_all_filenames_of_type(
        config['pickle_dir'], '.tar.bz2')

    ### print 5 files, check and debug
    pickle_lib.print_X_pickle_filenames(pickle_files, 5)

    # BUGFIX: the directory-prefix line was accidentally commented out together
    # with the old Pool code, so bare filenames were handed to proc_build.
    # The sibling builders all pass full paths; restore that behavior.
    pickle_files = [config["pickle_dir"] + "/" + f for f in pickle_files]
    star_list = zip(pickle_files,
                    repeat(config['work_dir']),
                    repeat(config['save_dir']),
                    repeat(config))

    # n_jobs=-1: use every available core as a separate process.
    test = joblib.Parallel(n_jobs=-1, prefer="processes")(
        joblib.delayed(proc_build)(a, b, c, d) for a, b, c, d in star_list)

    print("Done. Run build_ret_type__vocab__seq_len.py next")
def main():
    """Count arg_three occurrences across all save_dir pickles, ask the user
    for a minimum count, then build a balanced dataset from the types that
    occur at least that often (one worker thread per surviving type)."""
    config = common_stuff_lib.parseArgs()
    check_config(config)

    nr_of_cpus = psutil.cpu_count(logical=True)
    print(f'We got >{nr_of_cpus}< CPUs for threading')
    print()

    # Mapping: type string -> int label (also used as the set of known types).
    ret_type_dict = pickle_lib.get_pickle_file_content(config['return_type_dict_file'])

    pickle_files = common_stuff_lib.get_all_filenames_of_type(config['save_dir'], '.pickle')
    pickle_files_save_dir = [config['save_dir'] + "/" + f for f in pickle_files]

    # Count label occurrences per file in parallel; each worker returns a
    # {type: count} dict for its file.
    p = Pool(nr_of_cpus)
    star_list = zip(pickle_files_save_dir, repeat(ret_type_dict), repeat(config))
    all_ret_types = p.starmap(proc_count, star_list)
    p.close()
    p.join()

    # Merge per-file counts into one total counter.
    ret_type_counter = dict.fromkeys(ret_type_dict, 0)
    for counts_dict in all_ret_types:
        for counts_dict_key in counts_dict:
            ret_type_counter[counts_dict_key] += counts_dict[counts_dict_key]

    print(f"The counts of every arg_three :")
    for key in ret_type_counter:
        print(f"arg_three type >{key}< exists\t\t\t>{ret_type_counter[key]}< \ttimes")

    config['minimum_nr_of_return_types'] = input(
        'Put in minimum nr of arg_three to build balanced dataset:')
    minimum = int(config['minimum_nr_of_return_types'])

    ### keep only types that occur at least `minimum` times
    ret_type_counter_filtered = {key: ret_type_counter[key]
                                 for key in ret_type_dict
                                 if ret_type_counter[key] >= minimum}
    print(f"The filtered counts (>={minimum}) of every type >{ret_type_counter_filtered}<")

    ### build the balanced dataset: one thread per surviving type, each
    ### selecting `minimum` text,label pairs for its type
    threads = []
    for key in ret_type_counter_filtered:
        print(f'build balanced with key >{key}<')
        t = Thread(target=proc_build_balanced,
                   args=(pickle_files_save_dir, key, minimum, config,))
        t.start()
        threads.append(t)

    # BUGFIX: the original never joined the workers, so the "run next"
    # message could appear before the dataset was finished (it also had a
    # broken, line-split string literal in this final print).
    for t in threads:
        t.join()
    print('Run build_balanced_ret_type__vocab__seq_len.py next')
def main():
    """Delete the stale .pickle.tar.bz2 copy in ~/tmptest for every pickle
    found in ~/test/save_dir, printing each pickle name as it is visited."""
    home = os.path.expanduser('~')
    pickle_names = common_stuff_lib.get_all_filenames_of_type(
        home + "/test/save_dir/", '.pickle')
    for name in pickle_names:
        archived = home + "/tmptest/" + name.replace('.pickle', '.pickle.tar.bz2')
        if os.path.isfile(archived):
            os.remove(archived)
        print(f'file >{name}<')
def copy_files_to_build_dataset(config):
    """Copy a user-chosen number of pickle archives from the git repo into
    config['pickle_dir'].

    If the pickle dir already contains archives, the user is asked whether
    to reuse them; answering 'y' skips the copy entirely (answering 'n'
    proceeds to copy more files alongside the existing ones).
    """
    pickle_files = common_stuff_lib.get_all_filenames_of_type(
        config['pickle_dir'], '.tar.bz2')
    if len(pickle_files) > 0:
        decision = 'z'
        # Loop until we get an explicit y/n.
        while decision not in ('y', 'n'):
            decision = input(
                f"There are still files in >{config['pickle_dir']}< . Do you want to use them: Type in (y/n):"
            )
        if decision == 'y':
            print(f'Using files still there')
            return

    pickle_path = config['git_repo_path'] + '/ubuntu-20-04-pickles/'
    pickle_files = common_stuff_lib.get_all_filenames_of_type(
        pickle_path, '.tar.bz2')

    # len() instead of the original hand-rolled counting loop.
    available = len(pickle_files)

    nr_files = 'z'
    # Loop until the user types a non-negative integer.
    while not nr_files.isdecimal():
        nr_files = input(
            f'In directory >{pickle_path}< are >{available}< files.\nHow many files to use for dataset? Type in:'
        )

    copied = 0
    for file in pickle_files:
        print(f'Copy file >{file}< ', end='\r')
        copyfile(pickle_path + file, config['pickle_dir'] + file)
        copied += 1
        if copied >= int(nr_files):
            break

    print(f'Copied >{nr_files}< files')
    print()
def main():
    """Dump every element of every pickle file under ~/ret-type/work_dir/."""
    home_dir = os.path.expanduser('~')
    work_path = home_dir + "/ret-type/work_dir/"
    for pickle_name in common_stuff_lib.get_all_filenames_of_type(work_path, '.pickle'):
        for elem in pickle_lib.get_pickle_file_content(work_path + pickle_name):
            print(f'elem >{elem}<')
def main():
    """Print one sample per save_dir pickle file, per-file stats (element
    count, longest disassembly), and the set of all labels seen overall."""
    config = common_stuff_lib.parseArgs()
    print(f'config >{config}<')
    print()
    check_config(config)

    nr_of_cpus = psutil.cpu_count(logical=True)
    print(f'We got >{nr_of_cpus}< CPUs for threading')
    print()

    print(f"Using files in directory >{config['save_dir']}<")
    print()
    pickle_files = common_stuff_lib.get_all_filenames_of_type(
        config['save_dir'], '.pickle')

    # All distinct labels across every file.
    all_ret_types_list = set()
    # NOTE: the original also initialised counter/max_seq_len here — dead
    # stores, since both are reset at the top of every file iteration.
    for file in pickle_files:
        cont = pickle_lib.get_pickle_file_content(config['save_dir'] + file)
        counter = 0
        max_seq_len = 0
        for item in cont:
            all_ret_types_list.add(item[1])
            # Print the first (text, label) pair of each file in full.
            if counter < 1:
                print(f"nr-of-arguments >{item[1]}< from file >{config['save_dir'] + file}<")
                print()
                print(f'text >{item[0]}<\nlabel >{item[1]}<')
            if len(item[0]) > max_seq_len:
                max_seq_len = len(item[0])
            counter += 1
        print(f'Counted >{counter}< text,label elements')
        print(f'longest disassembly got >{max_seq_len}< words')
        print('----------------------------------------')
        print()

    print(f'all_ret_types_list >{all_ret_types_list}<')
def check_if_balanced_dir_is_empty(config):
    """If leftover balanced-dataset pickles exist, ask whether to reuse them;
    delete them when the user declines."""
    leftovers = common_stuff_lib.get_all_filenames_of_type(
        config['balanced_dataset_dir'], '.pickle')
    if not leftovers:
        return

    answer = 'z'
    # Loop until we get an explicit y/n.
    while answer != 'y' and answer != 'n':
        answer = input(
            f"There are still files in >{config['balanced_dataset_dir']}< . Do you want to use them: Type in (y/n):"
        )
        print()

    if answer == 'y':
        print(f'Using files still there')
        print()
        return

    print(f"Deleting files in >{config['balanced_dataset_dir']}<")
    print()
    for leftover in leftovers:
        os.remove(config['balanced_dataset_dir'] + leftover)
def main():
    """Build the return-type dict, vocabulary and max-sequence-length files
    from all pickles in save_dir, scanning the files with a process pool."""
    config = common_stuff_lib.parseArgs()
    check_config(config)

    nr_of_cpus = psutil.cpu_count(logical=True)
    print(f'We got >{nr_of_cpus}< CPUs for threading')
    print()

    pickle_files = common_stuff_lib.get_all_filenames_of_type(
        config['save_dir'], '.pickle')
    print(f'pickle-files we use to build >{pickle_files}<')
    print(f'Building return-type dict, vocabulary and max-squenece-length')

    # Scan every file in parallel; each worker returns
    # (label set, vocabulary set, max sequence length) for its file.
    p = Pool(nr_of_cpus)
    pickle_files = [config['save_dir'] + "/" + f for f in pickle_files]
    star_list = zip(pickle_files, repeat(config))
    all_ret_types = p.starmap(proc_build, star_list)
    p.close()
    p.join()

    # Merge the per-file results.
    ret_set = set()
    vocab = set()
    seq_length = 0
    for ret_set1, vocab1, seq_length1 in all_ret_types:
        ret_set.update(ret_set1)
        vocab.update(vocab1)
        if seq_length1 > seq_length:
            seq_length = seq_length1

    print(f"Build return-type dict from set and save it to >{config['return_type_dict_file']}<")

    ## build ret-type-dict and save.
    # BUGFIX: sort the set before assigning int labels — plain set iteration
    # order varies between runs, producing different label assignments each
    # time; the balanced-dataset builder in this project already sorts.
    ret_type_dict = dict()
    counter = 0
    for elem in sorted(ret_set):
        ret_type_dict[elem] = counter
        counter += 1

    print(f"ret-type-dict :")
    for key in ret_type_dict:
        print(f"argument one >{key}< label >{ret_type_dict[key]}<")

    pickle_lib.save_to_pickle_file(ret_type_dict, config['return_type_dict_file'])

    print(f"Saving vocabulary to >{config['vocabulary_file']}<")
    ## build vocabulary list from set and save
    vocab_list = list(vocab)
    pickle_lib.save_to_pickle_file(vocab_list, config['vocabulary_file'])

    ## save max-seq-length
    print(f"Saving max-sequence-length to >{config['max_seq_length_file']}<")
    pickle_lib.save_to_pickle_file(seq_length, config['max_seq_length_file'])

    print("Done. Run build_balanced_dataset.py next")
def main():
    """Build the nr-of-args dict, vocabulary and max-sequence-length files
    from the balanced dataset, scanning the files with a process pool."""
    config = common_stuff_lib.parseArgs()
    print(f'config >{config}<')
    print()
    check_config(config)

    cpu_total = psutil.cpu_count(logical=True)
    print(f'We got >{cpu_total}< CPUs for threading')
    print()

    balanced_files = common_stuff_lib.get_all_filenames_of_type(
        config['balanced_dataset_dir'], '.pickle')
    if len(balanced_files) == 0:
        print(f"There are no files in >{config['balanced_dataset_dir']}<")
        exit()

    # Show a handful of filenames as a sanity check.
    pickle_lib.print_X_pickle_filenames(balanced_files, 5)
    print(f'Building return-type dict, vocabulary and max-squenece-length')
    print()

    balanced_paths = [config['balanced_dataset_dir'] + "/" + name
                      for name in balanced_files]
    pool = Pool(cpu_total)
    per_file_results = pool.starmap(proc_build, zip(balanced_paths, repeat(config)))
    pool.close()
    pool.join()

    # Merge per-file (labels, vocabulary, max length) triples.
    label_set = set()
    vocab = set()
    seq_length = 0
    for labels, words, longest in per_file_results:
        label_set.update(labels)
        vocab.update(words)
        if longest > seq_length:
            seq_length = longest

    print(f"Build return-type dict from set and save it to >{config['return_type_dict_file']}<")
    print()

    # Deterministic int labels: sort the label set before enumerating.
    ret_type_dict = {label: idx for idx, label in enumerate(sorted(label_set))}

    print(f"ret-type-dict :")
    for key in ret_type_dict:
        print(f"nr-of-args >{key}< label >{ret_type_dict[key]}<")
    print()

    pickle_lib.save_to_pickle_file(ret_type_dict, config['return_type_dict_file'])

    print(f"Saving vocabulary to >{config['vocabulary_file']}<")
    print()
    pickle_lib.save_to_pickle_file(list(vocab), config['vocabulary_file'])

    print(f"Saving max-sequence-length to >{config['max_seq_length_file']}<")
    print()
    pickle_lib.save_to_pickle_file(seq_length, config['max_seq_length_file'])

    print("Done. Run transform_ret_type_to_int.py next")
def main():
    """End-to-end build: unpack archives, build per-file datasets in parallel,
    derive the return-type dict / vocabulary / max-sequence-length, then
    transform labels to ints, write tfrecords and split train/val/test."""
    config = parseArgs()
    print(f'config >{config}<')
    check_if_dir_exists(config['pickle_dir'])
    check_if_dir_exists(config['work_dir'])
    check_if_dir_exists(config['save_dir'])
    check_if_dir_exists(config['tfrecord_save_dir'])

    # BUGFIX: nr_of_cpus was used below without ever being defined in this
    # script (NameError at Pool(...)); every sibling builder derives it this way.
    nr_of_cpus = psutil.cpu_count(logical=True)

    ### get all pickle archives
    pickle_files = common_stuff_lib.get_all_filenames_of_type(
        config['pickle_dir'], '.tar.bz2')

    ### print 5 files, check and debug
    pickle_lib.print_X_pickle_filenames(pickle_files, 5)

    ### build per-file datasets in parallel
    p = Pool(nr_of_cpus)
    pickle_files = [config["pickle_dir"] + "/" + f for f in pickle_files]
    star_list = zip(pickle_files, repeat(config['work_dir']),
                    repeat(config['save_dir']), repeat(config))
    all_ret_types = p.starmap(proc_build, star_list)
    p.close()
    p.join()

    ## build return-type dict, max-seq-length and vocabulary from the results
    pickle_files = common_stuff_lib.get_all_filenames_of_type(
        config['save_dir'], '.pickle')
    print(f'pickle-files >{pickle_files}<')
    print(f'Building return-type dict, vocabulary and max-squenece-length')

    ret_set = set()
    vocab = set()
    seq_length = 0
    counter = 1
    pickle_count = len(pickle_files)
    for file in pickle_files:
        print(f'File >{file}< >{counter}/{pickle_count}<', end='\r')
        counter += 1
        cont = pickle_lib.get_pickle_file_content(config['save_dir'] + file)
        for item in cont:
            ret_set.add(item[1])                # distinct return types
            if len(item[0]) > seq_length:       # longest disassembly
                seq_length = len(item[0])
            for word in item[0].split():        # vocabulary
                vocab.add(word)

    print(f"Build return-type dict and save it to >{config['return_type_dict_file']}<")
    ## build ret-type-dict and save
    ret_type_dict = dict()
    counter = 0
    for elem in ret_set:
        ret_type_dict[elem] = counter
        counter += 1
    pickle_lib.save_to_pickle_file(ret_type_dict, config['return_type_dict_file'])

    print(f"Build vocabulary and save it to >{config['vocabulary_file']}<")
    ## build vocabulary list from set and save
    vocab_list = list(vocab)
    pickle_lib.save_to_pickle_file(vocab_list, config['vocabulary_file'])

    ## save max-seq-length
    print(f"Saving max-sequence-length to >{config['max_seq_length_file']}<")
    pickle_lib.save_to_pickle_file(seq_length, config['max_seq_length_file'])

    ### transform dataset ret-types to ints and write one tfrecord per file
    print(f"Transform return-type to int and save to >{config['tfrecord_save_dir']}<")
    counter = 1
    for file in pickle_files:
        print(f'Transform File >{file}< >{counter}/{pickle_count}<', end='\r')
        counter += 1
        # BUGFIX: trans_ds is reset per file — the original accumulated across
        # files, so every tfrecord repeated all previous files' samples.
        trans_ds = list()
        cont = pickle_lib.get_pickle_file_content(config['save_dir'] + file)
        for item in cont:
            trans_ds.append((item[0], ret_type_dict[item[1]]))
        tfrecord_lib.save_caller_callee_to_tfrecord(
            trans_ds,
            config['tfrecord_save_dir'] + file.replace('.pickle', '.tfrecord'))

    print("Splitting dataset to train,val,test")
    tfrecord_lib.split_to_train_val_test(config['tfrecord_save_dir'])
    print("Done. Run build_caller_callee_model.py now")
def main():
    """Debug script: unpack each pickle archive, install its source package,
    check that the sources match the binary (via gdb), then try to locate
    exactly one source-code definition for the function in the pickle.

    NOTE(review): the trailing `break` stops after the first archive — this
    looks like a deliberate debugging aid; left in place.
    """
    config = common_stuff_lib.parseArgs()
    check_config(config)

    nr_of_cpus = psutil.cpu_count(logical=True)
    print(f'We got >{nr_of_cpus}< CPUs for threading\n')
    print()

    copy_files_to_build_dataset(config)

    pickle_files = common_stuff_lib.get_all_filenames_of_type(
        config['pickle_dir'], '.tar.bz2')

    ### print 5 files, check and debug
    pickle_lib.print_X_pickle_filenames(pickle_files, 5)

    ### loop through all pickle.tar.bz2 files
    for pickle_file in pickle_files:
        print(f"Untar pickle-file >{pickle_file}< to >{config['work_dir']}<")
        tarbz2_lib.untar_file_to_path(config['pickle_dir'] + pickle_file,
                                      config['work_dir'])

        ### install the source package belonging to this pickle file
        pickle_file_name = os.path.basename(pickle_file)
        pickle_file_name = pickle_file_name.replace('.pickle.tar.bz2', '')
        gdb_lib.install_source_package(pickle_file_name, config)

        ### check with gdb (list cmd) if the sources are newer/older than the
        ### binary ("warning: Source file is more recent than executable.")
        dir_name = config['ubuntu_src_pkgs']
        print(f'Dir with src is:{dir_name}')
        res = check_if_src_match_binary(pickle_file_name, dir_name, config)
        if not res:
            ### src and binary don't match: unpack the second src in the dir
            unpack_second_src(pickle_file_name)
            # BUGFIX: the retry dropped the config argument that the first
            # call passes (TypeError on this path).
            res = check_if_src_match_binary(pickle_file_name, dir_name, config)
            print(f'res of second src dir: {res}')
        else:
            print(f'src match binary')

        ### open the unpacked pickle
        print(f"Open untarred pickle file: >{config['work_dir'] + pickle_file}<")
        pickle_content = open_pickle(
            config['work_dir'] + pickle_file.replace('.tar.bz2', ''))

        fcn = ''
        fl = ''
        bina = ''
        gdb_func_sign = ''
        ### walk the pickle; the last record's fields are kept for the lookup
        for funcSign, gdb_ret_type, func_name, file_name, disas_att, \
                disas_intel, package_name, binary in pickle_content:
            print(f'funcSign: {funcSign}')
            print(f'func_name: {func_name}')
            print(f'file_name: {file_name}')
            print(f'package_name: {package_name}')
            print(f'binary: {binary}')
            fcn = func_name
            fl = file_name
            bina = binary
            gdb_func_sign = funcSign

        ### get source code of the function
        pkg_name = pickle_file.replace('.pickle.tar.bz2', '')
        pkg_name = os.path.basename(pkg_name)
        print(f'pkg_name:{pkg_name}')
        pkg_src_name = config['ubuntu_src_pkgs']
        print(f'pkg_src_name:{pkg_src_name}')

        full_path = get_full_path(pkg_src_name, fl)
        print(f'full-path:{full_path}')
        len_full_path = len(full_path)
        nr_of_empty_src_code = 0

        ### ctags does not get the return type if it is located lines above
        ### func_name; gdb's funcSign has it, so check whether we need more
        ### lines than ctags tells us
        for f in full_path:
            src_code = get_source_code(f, fcn, gdb_func_sign)
            if src_code:
                print(f'src-code:{src_code}')
            else:
                print(f'no src-code found')
                nr_of_empty_src_code += 1

        print(f'nr_of_empty_src_code:{nr_of_empty_src_code} len_full_path:{len_full_path}')
        if len_full_path == nr_of_empty_src_code + 1:
            print('only found one source code, thats good')
        else:
            print('ERROR found more than one source code for a function')
        break