def main():
    config = common_stuff_lib.parseArgs()
    print(f'config >{config}<')
    print()

    check_config(config)

    nr_of_cpus = psutil.cpu_count(logical=True)
    print(f'We got >{nr_of_cpus}< CPUs for threading')
    print()

    print(f"Using files in directory >{config['tfrecord_save_dir']}<")
    print()

    return_type_dict = pickle_lib.get_pickle_file_content(
        config['tfrecord_save_dir'] + 'return_type_dict.pickle')
    print(f'return_type_dict value >{return_type_dict}<')
    print()

    vocabulary_list = pickle_lib.get_pickle_file_content(
        config['tfrecord_save_dir'] + 'vocabulary_list.pickle')
    print(f'vocabulary_list >{vocabulary_list}<')
    print()
    print(f'vocabulary_list length >{len(vocabulary_list)}<')
    print()

    max_seq_length = pickle_lib.get_pickle_file_content(
        config['tfrecord_save_dir'] + 'max_seq_length.pickle')
    print(f'max_seq_length >{max_seq_length}<')
def proc_build(file, config):
    ret_set = set()
    vocab = set()
    seq_length = 0

    print(f'File >{file}<')
    cont = pickle_lib.get_pickle_file_content(file)

    for item in cont:
        #print(f'item-1 >{item[1]}<')
        ## build ret-type-dict
        ret_set.add(item[1])

        ## build max-seq-length
        if len(item[0]) > seq_length:
            if len(item[0]) > 100000:
                print(f'len-bigger 100.000')
            seq_length = len(item[0])

        ## build vocabulary
        for word in item[0].split():
            vocab.add(word)

    return (ret_set, vocab, seq_length)
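## A minimal sketch of how the per-file (ret_set, vocab, seq_length) tuples returned by
## proc_build() could be merged with a multiprocessing Pool, mirroring what the other
## scripts in this repo do. pickle_files and config are assumed to come from the
## surrounding main(); this is an illustration, not part of the original script.
from itertools import repeat
from multiprocessing import Pool

import psutil

nr_of_cpus = psutil.cpu_count(logical=True)

with Pool(nr_of_cpus) as p:
    results = p.starmap(proc_build, zip(pickle_files, repeat(config)))

ret_set, vocab, seq_length = set(), set(), 0
for file_ret_set, file_vocab, file_seq_length in results:
    ret_set |= file_ret_set                        # union of all return types seen
    vocab |= file_vocab                            # union of all vocabulary words
    seq_length = max(seq_length, file_seq_length)  # overall longest sequence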
def main():
    config = parseArgs()

    nr_of_cpus = psutil.cpu_count(logical=True)
    print(f'We got nr_of_cpus >{nr_of_cpus}<')

    print(f"Using files in directory >{config['balanced_dataset_dir']}<")

    pickle_files = common_stuff_lib.get_all_filenames_of_type(
        config['balanced_dataset_dir'], '.pickle')

    for file in pickle_files:
        cont = pickle_lib.get_pickle_file_content(
            config['balanced_dataset_dir'] + file)

        counter = 0
        for item in cont:
            #print(f'item[0] >{item[0]}< item[1] >{item[1]}<')
            if counter < 1:
                print(
                    f"return type >{item[1]}< from file >{config['balanced_dataset_dir'] + file}<"
                )
            counter += 1

        print(f'Counted >{counter}< text,label elements')
        print()
def main():
    config = parseArgs()
    check_config(config)
    print(f'config >{config}<')

    nr_of_cpus = psutil.cpu_count(logical=True)
    print(f'We got nr_of_cpus >{nr_of_cpus}<')

    ## load ret-type dict
    ret_type_dict = pickle_lib.get_pickle_file_content(
        config['return_type_dict_file'])
    print(f"ret-type-dict >{ret_type_dict}<")

    pickle_files = common_stuff_lib.get_all_filenames_of_type(
        config['balanced_dataset_dir'], '.pickle')

    ### transform dataset ret-types to ints
    print(
        f"Transform return-type to int and save to >{config['tfrecord_save_dir']}<"
    )

    p = Pool(nr_of_cpus)
    pickle_files = [
        config['balanced_dataset_dir'] + "/" + f for f in pickle_files
    ]
    star_list = zip(pickle_files, repeat(ret_type_dict), repeat(config))
    all_ret_types = p.starmap(proc_build, star_list)
    p.close()
    p.join()

    print("Done. Run train_arg_one_model_lstm.py next")
def proc_build_balanced(pickle_files, key, minimum_ret_type_count, config):
    #print(f'build balanced')
    ### filter and store the usable text,label pairs for this key

    ## counts how many text,label pairs of this key-type we already kept
    ret_type_count_watcher = 1
    # nr = 0
    # for key in ret_type_counter_filtered:
    #     ret_type_count_watcher[key] = 0

    ret_type_0 = list()

    for file in pickle_files:
        cont = pickle_lib.get_pickle_file_content(file)

        for item in cont:
            ## is the ret-type we found in our filtered list?
            #for key in ret_type_counter_filtered:
            if key == item[1]:
                #print(f'got filtered ret-type')
                if ret_type_count_watcher <= minimum_ret_type_count:
                    ret_type_0.append((item[0], item[1]))
                    ret_type_count_watcher += 1

            if ret_type_count_watcher > minimum_ret_type_count:
                break

        if ret_type_count_watcher > minimum_ret_type_count:
            break

    ### save them
    #print(f'Save balanced dataset')
    pickle_lib.save_to_pickle_file(
        ret_type_0,
        config['balanced_dataset_dir'] + key.replace(' ', '_') + '.pickle')
def main():
    config = common_stuff_lib.parseArgs()
    print(f'config >{config}<')
    print()

    check_config(config)

    nr_of_cpus = psutil.cpu_count(logical=True)
    print(f'We got >{nr_of_cpus}< CPUs for threading')
    print()

    print(f"Using files in directory >{config['save_dir']}<")

    pickle_files = common_stuff_lib.get_all_filenames_of_type(
        config['save_dir'], '.pickle')

    for file in pickle_files:
        cont = pickle_lib.get_pickle_file_content(config['save_dir'] + file)

        counter = 0
        for item in cont:
            #print(f'item[0] >{item[0]}< item[1] >{item[1]}<')
            if counter < 1:
                print(f"return type >{item[1]}< from file >{config['save_dir'] + file}<")
                print(f'{item[0]}')
            counter += 1

        print(f'Counted >{counter}< text,label elements')
        print()
def main():
    config = common_stuff_lib.parseArgs()
    check_config(config)

    nr_of_cpus = psutil.cpu_count(logical=True)
    print(f'We got >{nr_of_cpus}< CPUs for threading')
    print()

    ret_type_dict = pickle_lib.get_pickle_file_content(config['return_type_dict_file'])

    ## get number of different return types
    pickle_files = common_stuff_lib.get_all_filenames_of_type(config['save_dir'], '.pickle')

    p = Pool(nr_of_cpus)
    pickle_files_save_dir = [config['save_dir'] + "/" + f for f in pickle_files]
    star_list = zip(pickle_files_save_dir, repeat(ret_type_dict), repeat(config))
    all_ret_types = p.starmap(proc_count, star_list)
    p.close()
    p.join()

    ## build count dict
    ret_type_counter = dict()
    for key in ret_type_dict:
        ret_type_counter[key] = 0

    for counts_dict in all_ret_types:
        #print(f"counts_dict >{counts_dict}<")
        for counts_dict_key in counts_dict:
            #print(f"counts_dict[counts_dict_key] >{counts_dict[counts_dict_key]}<")
            ret_type_counter[counts_dict_key] += counts_dict[counts_dict_key]

    print(f"The counts of every arg_three :")
    for key in ret_type_counter:
        print(f"arg_three type >{key}< exists\t\t\t>{ret_type_counter[key]}< \ttimes")

    config['minimum_nr_of_return_types'] = input(
        'Put in minimum nr of arg_three to build balanced dataset:')

    ### keep only the types that occur >= int(config['minimum_nr_of_return_types']) times
    ret_type_counter_filtered = dict()
    for key in ret_type_dict:
        if ret_type_counter[key] >= int(config['minimum_nr_of_return_types']):
            ret_type_counter_filtered[key] = ret_type_counter[key]

    print(f"The filtered counts (>={int(config['minimum_nr_of_return_types'])}) of every type >{ret_type_counter_filtered}<")

    ### now select int(config['minimum_nr_of_return_types']) disassemblies,labels from
    ### the filtered types and store the usable text,label pairs
    for key in ret_type_counter_filtered:
        print(f'build balanced with key >{key}<')
        t = Thread(target=proc_build_balanced,
                   args=(pickle_files_save_dir, key,
                         int(config['minimum_nr_of_return_types']), config, ))
        t.start()

    print(f'Run build_balanced_ret_type__vocab__seq_len.py next')
def proc_build(file, ret_type_dict, config):
    trans_ds = list()
    #print(f'Transform File >{file}<')

    cont = pickle_lib.get_pickle_file_content(file)
    for item in cont:
        #print(f"item >{item[0]}< item-1 >{item[1]}< >{ret_type_dict[item[1]]}<")
        trans_ds.append((item[0], ret_type_dict[item[1]]))

    tfrecord_lib.save_caller_callee_to_tfrecord(
        trans_ds,
        config['tfrecord_save_dir'] +
        os.path.basename(file).replace('.pickle', '.tfrecord'))
def get_prediction(self, model, disasm_caller_callee_str, func_sign_prob_git_path):
    ### predict now
    model_path = func_sign_prob_git_path + \
        "ubuntu-20-04-scripts/trained_models/" + model + "/saved_model/"

    ### load vocabulary list
    vocab_file = func_sign_prob_git_path + \
        "ubuntu-20-04-scripts/trained_models/" + model + "/" + \
        'vocabulary_list.pickle'
    vocabulary = pickle_lib.get_pickle_file_content(vocab_file)

    ### load max-sequence-length
    max_seq_len_file = func_sign_prob_git_path + \
        "ubuntu-20-04-scripts/trained_models/" + model + "/" + \
        'max_seq_length.pickle'
    max_seq_length = pickle_lib.get_pickle_file_content(max_seq_len_file)

    ret = self.predict(model_path, len(vocabulary), max_seq_length,
                       disasm_caller_callee_str)

    ## get strings for ints, with ret_type_dict
    ret_type_dict_file = func_sign_prob_git_path + \
        "ubuntu-20-04-scripts/trained_models/" + model + "/" + \
        'return_type_dict.pickle'
    ret_type_dict = pickle_lib.get_pickle_file_content(ret_type_dict_file)

    ### get human-readable output
    prediction_summary_str = self.get_prediction_summary(ret_type_dict, ret)

    ## store for later
    # nr_of_args_model_summary_str = self.model_summary_str
    # self._disasTextEdit.setPlainText(f"tf model summary:\n{self.model_summary_str}\n \
    #     {nr_of_args_model_summary_str}")

    return prediction_summary_str
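## Hypothetical usage sketch for get_prediction(); 'Predictor' stands in for whatever
## class defines it (together with predict() and get_prediction_summary()), and the
## model name and git path are placeholders, not values from this repo. The example
## input string is the one used by the predict script further below.
import os

predictor = Predictor()
summary = predictor.get_prediction(
    model='arg_one',                                # assumed model directory name
    disasm_caller_callee_str='null x null 1 mov',   # example input from the predict script
    func_sign_prob_git_path=os.path.expanduser('~') + '/func_sign_prob/')
print(summary)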
def main():
    # tarbz2_files = common_stuff_lib.get_all_filenames_of_type("/tmp/test/", '.tar.bz2')
    #
    # work_dir = "/tmp/work_dir"
    # for tarbz2_file in tarbz2_files:
    #     tarbz2_lib.untar_file_to_path('/tmp/test/' + tarbz2_file, work_dir)

    user_home_path = os.path.expanduser('~')
    path = user_home_path + "/ret-type/work_dir/"

    pickle_files = common_stuff_lib.get_all_filenames_of_type(path, '.pickle')

    for file in pickle_files:
        cont = pickle_lib.get_pickle_file_content(path + file)
        for elem in cont:
            print(f'elem >{elem}<')
def proc_count(file, ret_type_dict, config):
    #ret_type_dict => 'char' = 0, 'int' = 1, ...

    ## build count dict
    ret_type_count = dict()
    for key in ret_type_dict:
        ret_type_count[key] = 0

    ## count
    cont = pickle_lib.get_pickle_file_content(file)
    for item in cont:
        ret_type_count[item[1]] = ret_type_count[item[1]] + 1

    #print(f"Counter >{ret_type_count}<")
    return ret_type_count
def main():
    config = common_stuff_lib.parseArgs()
    print(f'config >{config}<')
    print()

    check_config(config)

    nr_of_cpus = psutil.cpu_count(logical=True)
    print(f'We got >{nr_of_cpus}< CPUs for threading')
    print()

    print(f"Using files in directory >{config['save_dir']}<")
    print()

    pickle_files = common_stuff_lib.get_all_filenames_of_type(
        config['save_dir'], '.pickle')

    all_ret_types_list = set()

    for file in pickle_files:
        cont = pickle_lib.get_pickle_file_content(config['save_dir'] + file)

        counter = 0
        max_seq_len = 0
        for item in cont:
            all_ret_types_list.add(item[1])
            if counter < 1:
                print(
                    f"nr-of-arguments >{item[1]}< from file >{config['save_dir'] + file}<"
                )
                print()
                print(f'text >{item[0]}<\nlabel >{item[1]}<')
            if len(item[0]) > max_seq_len:
                max_seq_len = len(item[0])
            counter += 1

        print(f'Counted >{counter}< text,label elements')
        print(f'longest disassembly got >{max_seq_len}< words')
        print('----------------------------------------')
        print()

    print(f'all_ret_types_list >{all_ret_types_list}<')
def main():
    global vectorize_layer

    AUTOTUNE = tf.data.experimental.AUTOTUNE

    config = parseArgs()
    check_config(config)

    print(f'tensorflow version running now >{tf.__version__}<')

    print(
        f"Build tf.data.dataset with tfrecord files from directory >{config['tfrecord_dir'] + 'train/'}< \
>{config['tfrecord_dir'] + 'val/'}< >{config['tfrecord_dir'] + 'test/'}<"
    )

    if os.path.isdir(config['tfrecord_dir'] + 'train/'):
        print(
            f"Found directory >{config['tfrecord_dir'] + 'train/'}< , so we dont use balanced dataset"
        )

        tfrecord_train_dataset = tf.data.Dataset.list_files(
            config['tfrecord_dir'] + 'train/' + '*.tfrecord')
        train_dataset = tf.data.TFRecordDataset(tfrecord_train_dataset)

        tfrecord_val_dataset = tf.data.Dataset.list_files(
            config['tfrecord_dir'] + 'val/' + '*.tfrecord')
        val_dataset = tf.data.TFRecordDataset(tfrecord_val_dataset)

        tfrecord_test_dataset = tf.data.Dataset.list_files(
            config['tfrecord_dir'] + 'test/' + '*.tfrecord')
        test_dataset = tf.data.TFRecordDataset(tfrecord_test_dataset)

        ### de-serialize tfrecord examples to tensors
        train_dataset = train_dataset.map(_parse_function,
                                          num_parallel_calls=AUTOTUNE)
        val_dataset = val_dataset.map(_parse_function,
                                      num_parallel_calls=AUTOTUNE)
        test_dataset = test_dataset.map(_parse_function,
                                        num_parallel_calls=AUTOTUNE)
    else:
        print(f"Not found directory >{config['tfrecord_dir'] + 'train/'}<")
        print(
            f"We will use balanced dataset from directory >{config['tfrecord_dir']}<"
        )

        tfrecord_all_dataset = tf.data.Dataset.list_files(
            config['tfrecord_dir'] + '*.tfrecord')
        full_dataset = tf.data.TFRecordDataset(tfrecord_all_dataset)
        full_dataset = full_dataset.map(_parse_function,
                                        num_parallel_calls=AUTOTUNE)

        #DATASET_SIZE = full_dataset.cardinality().numpy()
        ## count the elements by iterating once (cardinality is unknown for TFRecord datasets)
        num = 0
        for num, _ in enumerate(full_dataset, start=1):
            pass
        DATASET_SIZE = num
        print(f'DATASET_SIZE >{DATASET_SIZE}<')

        train_size = int(0.7 * DATASET_SIZE)
        val_size = int(0.15 * DATASET_SIZE)
        test_size = int(0.15 * DATASET_SIZE)
        print(
            f'Split to train_size >{train_size}< val_size >{val_size}< test_size >{test_size}<'
        )

        #full_dataset = tf.data.TFRecordDataset(FLAGS.input_file)
        ## reshuffle_each_iteration=False keeps the take/skip splits below disjoint across epochs
        full_dataset = full_dataset.shuffle(1000,
                                            reshuffle_each_iteration=False)
        train_dataset = full_dataset.take(train_size)
        test_dataset = full_dataset.skip(train_size)
        val_dataset = test_dataset.skip(val_size)
        test_dataset = test_dataset.take(test_size)

    for text, label in train_dataset.take(1):
        print(
            f'One example from train_dataset with int-as-label:\nText: >{text}<\n Label: >{label}<'
        )

    ### load return-type-dict
    return_type_dict = pickle_lib.get_pickle_file_content(
        config['return_type_dict_file'])

    ### load max-sequence-length
    max_seq_length = pickle_lib.get_pickle_file_content(
        config['max_seq_length_file'])

    ### load vocabulary list
    vocabulary = pickle_lib.get_pickle_file_content(config['vocabulary_file'])

    # vectorize_layer = TextVectorization(standardize=None,
    #                                     max_tokens=len(vocabulary)+2,
    #                                     output_mode='int',
    #                                     output_sequence_length=max_seq_length)

    #vectorize_layer.set_vocabulary(vocabulary)
    #vocab = vectorize_layer.get_vocabulary()
    #print(f'10 vocab words >{vocab[:10]}<')

    text_ds = train_dataset.map(lambda x, y: x, num_parallel_calls=AUTOTUNE)
    tmp_ds = val_dataset.map(lambda x, y: x, num_parallel_calls=AUTOTUNE)
    text_ds = text_ds.concatenate(tmp_ds)
    tmp_ds = test_dataset.map(lambda x, y: x, num_parallel_calls=AUTOTUNE)
    text_ds = text_ds.concatenate(tmp_ds)

    print(f'text_ds element_spec >{text_ds.element_spec}<')

    print(
        f'Adapt text to TextVectorization layer, this takes time :( ~1hour-15min-->8xV100'
    )
    #text_ds = text_ds.apply(tf.data.experimental.unique())
    vectorize_layer.adapt(text_ds.batch(64))

    train_dataset = configure_for_performance(train_dataset)
    val_dataset = configure_for_performance(val_dataset)
    test_dataset = configure_for_performance(test_dataset)

    ### vec text
    train_dataset = train_dataset.map(vectorize_text,
                                      num_parallel_calls=AUTOTUNE)
    val_dataset = val_dataset.map(vectorize_text, num_parallel_calls=AUTOTUNE)
    test_dataset = test_dataset.map(vectorize_text,
                                    num_parallel_calls=AUTOTUNE)

    #exit()

    embedding_dim = 64

    # model = tf.keras.Sequential([tf.keras.Input(shape=(1,), dtype=tf.string),
    #                              vectorize_layer,
    #                              tf.keras.layers.Embedding(len(vocabulary)+2, embedding_dim, mask_zero=True,
    #                                                        name='embedding'),
    #                              tf.keras.layers.Dropout(0.2),
    #                              tf.keras.layers.GlobalAveragePooling1D(),
    #                              tf.keras.layers.Dropout(0.2),
    #                              tf.keras.layers.Dense(len(return_type_dict))])

    model = tf.keras.Sequential([
        tf.keras.layers.Embedding(len(vocabulary) + 2,
                                  embedding_dim,
                                  mask_zero=True),
        tf.keras.layers.Bidirectional(
            tf.keras.layers.LSTM(64, return_sequences=True)),
        tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(32)),
        tf.keras.layers.Dense(64, activation='relu'),
        tf.keras.layers.Dropout(0.5),
        tf.keras.layers.Dense(len(return_type_dict))
    ])

    # model = tf.keras.Sequential([tf.keras.layers.Embedding(len(vocabulary)+2, embedding_dim, mask_zero=True),
    #                              tf.keras.layers.LSTM(64),
    #                              tf.keras.layers.Dense(64),
    #                              tf.keras.layers.Dense(len(return_type_dict))])

    model.summary()

    ## callbacks to save tensorboard-files and model
    tensorboard_callback = tf.keras.callbacks.TensorBoard(
        log_dir=config['tensorboard_log_dir'],
        histogram_freq=1,
        write_graph=False,
        write_images=False)

    model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
        filepath=config['checkpoint_dir'],
        save_weights_only=True,
        monitor='accuracy',
        mode='max',
        save_best_only=True)

    model_checkpoint_callback2 = tf.keras.callbacks.ModelCheckpoint(
        filepath=config['save_model_dir'],
        save_weights_only=False,
        monitor='accuracy',
        mode='max',
        save_best_only=True)

    model.compile(
        loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
        optimizer='adam',
        metrics=['accuracy'])

    history = model.fit(train_dataset,
                        validation_data=val_dataset,
                        epochs=10,
                        callbacks=[
                            tensorboard_callback, model_checkpoint_callback,
                            model_checkpoint_callback2
                        ])

    ### evaluate the model
    loss, accuracy = model.evaluate(test_dataset)
    print("Loss: ", loss)
    print("Accuracy: ", accuracy)

    ### save trained word embeddings
    print(f'Saving trained word embeddings (meta.tsv,vecs.tsv) \
(usable in tensorboard->Projector, use chromium-browser to see it correctly, firefox does not always work)')
    save_trained_word_embeddings(model, config['trained_word_embeddings_dir'],
                                 vectorize_layer, embedding_dim)
    #out_v.write('\t\n')
    n = 1
    for vec in weights:
        if n == 0:
            n = 1
        else:
            out_v.write('\t'.join([str(x) for x in vec]) + "\n")

    out_v.close()
    out_m.close()


### load vocabulary list
user_home_path = os.path.expanduser('~')
vocabulary = pickle_lib.get_pickle_file_content(user_home_path +
                                                '/arg_three_save_dir/' +
                                                'tfrecord/' +
                                                'vocabulary_list.pickle')

### load max-sequence-length
max_seq_length = pickle_lib.get_pickle_file_content(user_home_path +
                                                    '/arg_three_save_dir/' +
                                                    'tfrecord/' +
                                                    'max_seq_length.pickle')

print(f'len-vocab-from-file >{len(vocabulary)}<')

vectorize_layer = TextVectorization(standardize=None,
                                    max_tokens=len(vocabulary) + 2,
                                    output_mode='int',
                                    output_sequence_length=max_seq_length)


def vectorize_text(text, label):
    ## same body as in the other training scripts: add a batch dimension, then vectorize
    text = tf.expand_dims(text, -1)
    return vectorize_layer(text), label
def main():
    print(f'Tensorflow version is >{tf.version.VERSION}<')

    config = parseArgs()
    check_config(config)

    model = tf.keras.models.load_model(config['checkpoint_dir'] + 'saved_model/')
    model.summary()

    export_model = tf.keras.Sequential(
        [vectorize_layer, model, tf.keras.layers.Activation('softmax')])

    # export_model = tf.keras.Sequential([vectorize_layer,
    #                                     model
    #                                     ])

    examples = ['null x null 1 mov']
    print(f'Example we predict >{examples}<')

    ret = export_model.predict(examples)
    print(f"Prediction: >{ret}<")
    print()  ## just a newline

    user_home_path = os.path.expanduser('~')
    ret_type_dict = pickle_lib.get_pickle_file_content(
        user_home_path + '/Documents/gcp-caller-callee/arg_one/' +
        'return_type_dict.pickle')

    reverse_ret_type_dict = dict()
    counter = 0
    for key in ret_type_dict:
        reverse_ret_type_dict[counter] = key
        counter += 1

    for item in ret:
        result = 0
        biggest = 0
        biggest_count = 0
        counter = 0
        for i in item:
            if i > biggest:
                biggest = i
                biggest_count = counter
            print(
                f'ret-type >{reverse_ret_type_dict[counter] : <{30}}< got probability of >{i}<'
            )
            counter += 1
            result += i

        for ret_name in ret_type_dict:
            if ret_type_dict[ret_name] == biggest_count:
                print()
                print(f'argument one is of type >{ret_name}<')
                print()

        print(f'Do the probabilities sum to 1 ? Result: >{result}<')
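## The manual search for the most probable class above could equivalently use numpy's
## argmax; a small sketch, reusing ret and reverse_ret_type_dict as defined in main().
import numpy as np

pred = ret[0]                    # probability vector for the single example
best_idx = int(np.argmax(pred))  # index of the highest probability
print(f'argument one is of type >{reverse_ret_type_dict[best_idx]}<')
print(f'probabilities sum to >{pred.sum()}<')  # should be ~1.0 after the softmax layer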
#def main():
#global vectorize_layer

AUTOTUNE = tf.data.experimental.AUTOTUNE

config = common_stuff_lib.parseArgs()
check_config(config)

nr_of_cpus = psutil.cpu_count(logical=True)
print(f'We got >{nr_of_cpus}< CPUs for threading')
print()

### load vocabulary list
vocabulary = pickle_lib.get_pickle_file_content(config['vocabulary_file'])

### load max-sequence-length
max_seq_length = pickle_lib.get_pickle_file_content(
    config['max_seq_length_file'])

print(f'len-vocab-from-file >{len(vocabulary)}<')

vectorize_layer = TextVectorization(standardize=None,
                                    max_tokens=len(vocabulary) + 2,
                                    output_mode='int',
                                    output_sequence_length=max_seq_length)

print(f'tensorflow version running now >{tf.__version__}<')

print(
    f"Build tf.data.dataset with tfrecord files from directory >{config['tfrecord_save_dir'] + 'train/'}< \
>{config['tfrecord_save_dir'] + 'val/'}< >{config['tfrecord_save_dir'] + 'test/'}<"
)
def main():
    global vectorize_layer

    AUTOTUNE = tf.data.experimental.AUTOTUNE

    config = parseArgs()
    check_config(config)

    print(f'tensorflow version running now >{tf.__version__}<')

    print(
        f"Build tf.data.dataset with tfrecord files from directory >{config['tfrecord_dir'] + 'train/'}< \
>{config['tfrecord_dir'] + 'val/'}< >{config['tfrecord_dir'] + 'test/'}<"
    )

    tfrecord_train_dataset = tf.data.Dataset.list_files(
        config['tfrecord_dir'] + 'train/' + '*.tfrecord')
    train_dataset = tf.data.TFRecordDataset(tfrecord_train_dataset)

    tfrecord_val_dataset = tf.data.Dataset.list_files(config['tfrecord_dir'] +
                                                      'val/' + '*.tfrecord')
    val_dataset = tf.data.TFRecordDataset(tfrecord_val_dataset)

    tfrecord_test_dataset = tf.data.Dataset.list_files(config['tfrecord_dir'] +
                                                       'test/' + '*.tfrecord')
    test_dataset = tf.data.TFRecordDataset(tfrecord_test_dataset)

    ### de-serialize tfrecord examples to tensors
    train_dataset = train_dataset.map(_parse_function,
                                      num_parallel_calls=AUTOTUNE)
    val_dataset = val_dataset.map(_parse_function, num_parallel_calls=AUTOTUNE)
    test_dataset = test_dataset.map(_parse_function,
                                    num_parallel_calls=AUTOTUNE)

    for text, label in train_dataset.take(1):
        print(
            f'One example from train_dataset with int-as-label:\nText: >{text}<\n Label: >{label}<'
        )

    ### load return-type-dict
    return_type_dict = pickle_lib.get_pickle_file_content(
        config['return_type_dict_file'])

    ### load max-sequence-length
    max_seq_length = pickle_lib.get_pickle_file_content(
        config['max_seq_length_file'])

    ### load vocabulary list
    vocabulary = pickle_lib.get_pickle_file_content(config['vocabulary_file'])

    # vectorize_layer = TextVectorization(standardize=None,
    #                                     max_tokens=len(vocabulary)+2,
    #                                     output_mode='int',
    #                                     output_sequence_length=max_seq_length)

    vectorize_layer.set_vocabulary(vocabulary)
    vocab = vectorize_layer.get_vocabulary()
    print(f'10 vocab words >{vocab[:10]}<')

    text_ds = train_dataset.map(lambda x, y: x, num_parallel_calls=AUTOTUNE)
    tmp_ds = val_dataset.map(lambda x, y: x, num_parallel_calls=AUTOTUNE)
    text_ds = text_ds.concatenate(tmp_ds)
    tmp_ds = test_dataset.map(lambda x, y: x, num_parallel_calls=AUTOTUNE)
    text_ds = text_ds.concatenate(tmp_ds)

    print(f'text_ds element_spec >{text_ds.element_spec}<')

    #text_ds = text_ds.apply(tf.data.experimental.unique())
    vectorize_layer.adapt(text_ds.batch(64))

    train_dataset = configure_for_performance(train_dataset)
    val_dataset = configure_for_performance(val_dataset)
    test_dataset = configure_for_performance(test_dataset)

    ### vec text
    train_dataset = train_dataset.map(vectorize_text,
                                      num_parallel_calls=AUTOTUNE)
    val_dataset = val_dataset.map(vectorize_text, num_parallel_calls=AUTOTUNE)
    test_dataset = test_dataset.map(vectorize_text,
                                    num_parallel_calls=AUTOTUNE)

    embedding_dim = 8

    # model = tf.keras.Sequential([tf.keras.Input(shape=(1,), dtype=tf.string),
    #                              vectorize_layer,
    #                              tf.keras.layers.Embedding(len(vocabulary)+2, embedding_dim, mask_zero=True,
    #                                                        name='embedding'),
    #                              tf.keras.layers.Dropout(0.2),
    #                              tf.keras.layers.GlobalAveragePooling1D(),
    #                              tf.keras.layers.Dropout(0.2),
    #                              tf.keras.layers.Dense(len(return_type_dict))])

    model = tf.keras.Sequential([
        tf.keras.layers.Embedding(len(vocabulary) + 2,
                                  embedding_dim,
                                  mask_zero=True),
        tf.keras.layers.Dropout(0.2),
        tf.keras.layers.GlobalAveragePooling1D(),
        tf.keras.layers.Dropout(0.2),
        tf.keras.layers.Dense(len(return_type_dict))
    ])

    model.summary()

    ## callbacks to save tensorboard-files and model
    tensorboard_callback = tf.keras.callbacks.TensorBoard(
        log_dir=config['tensorboard_log_dir'],
        histogram_freq=1,
        write_graph=False,
        write_images=False)

    model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
        filepath=config['checkpoint_dir'],
        save_weights_only=True,
        monitor='accuracy',
        mode='max',
        save_best_only=True)

    model_checkpoint_callback2 = tf.keras.callbacks.ModelCheckpoint(
        filepath=config['save_model_dir'],
        save_weights_only=False,
        monitor='accuracy',
        mode='max',
        save_best_only=True)

    model.compile(
        loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
        optimizer='adam',
        metrics=['accuracy'])

    history = model.fit(train_dataset,
                        validation_data=val_dataset,
                        epochs=2,
                        callbacks=[
                            tensorboard_callback, model_checkpoint_callback,
                            model_checkpoint_callback2
                        ])

    ### evaluate the model
    loss, accuracy = model.evaluate(test_dataset)
    print("Loss: ", loss)
    print("Accuracy: ", accuracy)

    ### save trained word embeddings
    print(
        f'Saving trained word embeddings (meta.tsv,vecs.tsv) (usable in tensorboard->Projector)'
    )
    save_trained_word_embeddings(model, config['trained_word_embeddings_dir'],
                                 vectorize_layer)
def proc_build(tarbz2_file, work_dir, save_dir, config):
    tarbz2_lib.untar_file_to_path(tarbz2_file, work_dir)
    #untar_one_pickle_file(tarbz2_file, work_dir)

    pickle_file = work_dir + os.path.basename(tarbz2_file).replace(
        '.tar.bz2', '')
    pickle_file_content = pickle_lib.get_pickle_file_content(pickle_file)
    #pickle_file_content = get_pickle_file_content(work_dir + os.path.basename(pickle_file).replace('.tar.bz2', ''))

    binaries = set()
    functions = set()
    for elem in pickle_file_content:
        binaries.add(elem[7])
        functions.add(elem[2])

    print(f'binaries >{binaries}<')

    counter = 0
    dataset_list = list()

    ## 1. get one binary
    ## 2. get one function of this binary
    ## 3. get disassembly of this function
    ## 4. check if this disassembly calls another function
    ## 4.1 filter @plt
    ## 5. if yes: get disassembly of caller function
    ## 6. save caller, callee, nr_of_args
    ## 7. check again, if it calls another function
    ## 8. if yes: get disassembly of caller function
    ## 9. save caller, callee, func_signature
    ## 10. get disassembly of next function of this binary
    ## 11. check if ....
    for bin in binaries:
        for func in functions:
            ## search for bin and func
            for elem in pickle_file_content:
                ### if we found bin and func
                if elem[7] == bin and elem[2] == func:
                    ## get att disassembly
                    att_dis = elem[4]
                    #print(f'att-dis >{att_dis}<')

                    ## check every line if there is a call
                    for item in att_dis:
                        ## find call in disas
                        if disassembly_lib.find_call_in_disassembly_line(item):
                            ## if found, get callee name
                            callee_name = disassembly_lib.get_callee_name_from_disassembly_line(
                                item)
                            #print(f'callee_name >{callee_name}<')

                            ## search for same bin, but callee func
                            for elem2 in pickle_file_content:
                                ### if we found it, get return type and disassembly
                                if elem2[7] == bin and elem2[2] == callee_name:
                                    if (len(elem2[4]) > (int(config['tokenized_disassembly_length']) / 2)
                                            or len(att_dis) > (int(config['tokenized_disassembly_length']) / 2)
                                            or len(elem2[4]) < 1 or len(att_dis) < 1):
                                        continue

                                    #return_type_func_sign = return_type_lib.get_return_type_from_function_signature(elem2[0])
                                    #return_type = return_type_lib.get_return_type_from_gdb_ptype(elem2[1])
                                    nr_of_args = return_type_lib.get_nr_of_args_from_function_signature(
                                        elem2[0])

                                    ### for debugging, what string is still unknown ?? should show nothing
                                    # if return_type == 'unknown':
                                    #     print(f'string_before_func_name: {return_type_func_sign}')
                                    #
                                    # if return_type == 'unknown':
                                    #     #print('unknown found')
                                    #     #breaker = True
                                    #     #break
                                    #     pass
                                    # elif return_type == 'delete':
                                    #     #print('delete found')
                                    #     ### no return type found, so delete this item
                                    #     pass
                                    # elif return_type == 'process_further':
                                    #     print(f'ERRROOOORRRR---------------')

                                    if nr_of_args == -1:
                                        print(f'Error nr_of_args')
                                    else:
                                        print(f'nr_of_args >{nr_of_args}<', end='\r')

                                        tmp_att_dis = att_dis
                                        #print(f'len att-dis 1 >{len(tmp_att_dis)}<')
                                        tmp_att_dis = disassembly_lib.clean_att_disassembly_from_comment(
                                            tmp_att_dis)
                                        callee_dis = disassembly_lib.clean_att_disassembly_from_comment(
                                            elem2[4])
                                        #print(f'len att-dis 1 >{len(tmp_att_dis)}<')
                                        #print(f'att-dis >{tmp_att_dis}<')

                                        dis1_str = ' '.join(tmp_att_dis)
                                        #dis2_str = ' '.join(elem2[4])
                                        dis2_str = ' '.join(callee_dis)

                                        dis1_str = disassembly_lib.split_disassembly(
                                            dis1_str)
                                        dis2_str = disassembly_lib.split_disassembly(
                                            dis2_str)
                                        #dis1_str = dis_split(dis1_str)
                                        #dis2_str = dis_split(dis2_str)
                                        #print(f'dis1_str >{dis1_str}<')

                                        ## the max-seq-length blows memory (>160GB RAM) with model.fit() if e.g. over 6 million
                                        if (len(dis1_str) > (int(config['tokenized_disassembly_length']) / 2)
                                                or len(dis2_str) > (int(config['tokenized_disassembly_length']) / 2)
                                                or len(dis1_str) < 1 or len(dis2_str) < 1):
                                            print(
                                                f'tokenized_disassembly_length caller >{len(dis1_str)}<'
                                            )
                                            print(
                                                f'tokenized_disassembly_length callee >{len(dis2_str)}<'
                                            )
                                            #print(f"package >{elem[2]}< bin >{elem[3]}< file >{elem[6]}< func >{elem[7]}<")
                                            #print(f"package >{elem2[2]}< bin >{elem2[3]}< file >{elem2[6]}< func >{elem2[7]}<")
                                        else:
                                            dis_str = dis1_str + dis2_str
                                            #print(f'dis_str >{dis_str}<')
                                            dataset_list.append(
                                                (dis_str, nr_of_args))
                                            counter += 1
                                    break

    if dataset_list:
        if config['save_file_type'] == 'pickle':
            ret_file = open(
                config['save_dir'] +
                os.path.basename(pickle_file).replace('.tar.bz2', ''), 'wb+')
            pickle_list = pickle.dump(dataset_list, ret_file)
            ret_file.close()
        else:
            ## save as tfrecord
            dis_list = list()
            ret_list = list()
            for item in dataset_list:
                dis_list.append(item[0])
                ret_list.append(item[1])

            raw_dataset = tf.data.Dataset.from_tensor_slices(
                (dis_list, ret_list))
            serialized_features_dataset = raw_dataset.map(tf_serialize_example)

            filename = config['save_dir'] + os.path.basename(
                tarbz2_file).replace('.pickle.tar.bz2', '') + '.tfrecord'
            writer = tf.data.experimental.TFRecordWriter(filename)
            writer.write(serialized_features_dataset)

    return counter
def proc_build(tarbz2_file, work_dir, save_dir, config):
    tarbz2_lib.untar_file_to_path(tarbz2_file, work_dir)
    #untar_one_pickle_file(tarbz2_file, work_dir)

    pickle_file = work_dir + os.path.basename(tarbz2_file).replace(
        '.tar.bz2', '')
    pickle_file_content = pickle_lib.get_pickle_file_content(pickle_file)
    #pickle_file_content = get_pickle_file_content(work_dir + os.path.basename(pickle_file).replace('.tar.bz2', ''))

    binaries = set()
    functions = set()
    for elem in pickle_file_content:
        binaries.add(elem[7])
        functions.add(elem[2])

    print(f'binaries >{binaries}<')

    counter = 0
    dataset_list = list()

    ## 1. get one binary
    ## 2. get one function of this binary
    ## 3. get disassembly of this function
    ## 4. check if this disassembly calls another function
    ## 4.1 filter @plt
    ## 5. if yes: get disassembly of caller function
    ## 6. save caller, callee, nr_of_args
    ## 7. check again, if it calls another function
    ## 8. if yes: get disassembly of caller function
    ## 9. save caller, callee, func_signature
    ## 10. get disassembly of next function of this binary
    ## 11. check if ....
    for bin in binaries:
        for func in functions:
            ## search for bin and func
            for elem in pickle_file_content:
                ### if we found bin and func
                if elem[7] == bin and elem[2] == func:
                    ## get att disassembly
                    att_dis = elem[4]
                    #print(f'att-dis >{att_dis}<')

                    ## check every line if there is a call
                    for item in att_dis:
                        ## find call in disas
                        if disassembly_lib.find_call_in_disassembly_line(item):
                            ## if found, get callee name
                            callee_name = disassembly_lib.get_callee_name_from_disassembly_line(
                                item)
                            #print(f'callee_name >{callee_name}<')

                            ## search for same bin, but callee func
                            for elem2 in pickle_file_content:
                                ### if we found it, get return type and disassembly
                                if elem2[7] == bin and elem2[2] == callee_name:
                                    #return_type_func_sign = return_type_lib.get_return_type_from_function_signature(elem2[0])
                                    #return_type = return_type_lib.get_return_type_from_gdb_ptype(elem2[1])
                                    nr_of_args = return_type_lib.get_nr_of_args_from_function_signature(
                                        elem2[0])

                                    arg_nr_we_want = 3
                                    if nr_of_args < arg_nr_we_want:
                                        #print(f'func got too few args for us')
                                        break

                                    arg_two = return_type_lib.get_arg_two_name_from_function_signature(
                                        elem2[0])
                                    result = common_stuff_lib.is_type_known(arg_two)

                                    if result == False:
                                        #print(f'arg_two not a known type')
                                        pass
                                    else:
                                        tmp_att_dis = att_dis
                                        #print(f'len att-dis 1 >{len(tmp_att_dis)}<')
                                        tmp_att_dis = disassembly_lib.clean_att_disassembly_from_comment(
                                            tmp_att_dis)
                                        callee_dis = disassembly_lib.clean_att_disassembly_from_comment(
                                            elem2[4])
                                        #print(f'len att-dis 1 >{len(tmp_att_dis)}<')
                                        #print(f'att-dis >{tmp_att_dis}<')

                                        dis1_str = ' '.join(tmp_att_dis)
                                        #dis2_str = ' '.join(elem2[4])
                                        dis2_str = ' '.join(callee_dis)

                                        dis1_str = disassembly_lib.split_disassembly(
                                            dis1_str)
                                        dis2_str = disassembly_lib.split_disassembly(
                                            dis2_str)
                                        #dis1_str = dis_split(dis1_str)
                                        #dis2_str = dis_split(dis2_str)

                                        ## the max-seq-length blows memory (>160GB RAM) with model.fit() if e.g. over 6 million
                                        if (len(dis1_str) > 100000 or len(dis2_str) > 100000
                                                or len(dis1_str) < 1 or len(dis2_str) < 1):
                                            print(f'dis1_str >{len(dis1_str)}<')
                                            print(f'dis2_str >{len(dis2_str)}<')
                                            #print(f"package >{elem[2]}< bin >{elem[3]}< file >{elem[6]}< func >{elem[7]}<")
                                            #print(f"package >{elem2[2]}< bin >{elem2[3]}< file >{elem2[6]}< func >{elem2[7]}<")
                                        else:
                                            dis_str = dis1_str + dis2_str
                                            #print(f'dis_str >{dis_str}<')
                                            dataset_list.append(
                                                (dis_str, arg_two))
                                            counter += 1
                                    break

    if dataset_list:
        if config['save_file_type'] == 'pickle':
            ret_file = open(
                config['save_dir'] +
                os.path.basename(pickle_file).replace('.tar.bz2', ''), 'wb+')
            pickle_list = pickle.dump(dataset_list, ret_file)
            ret_file.close()
        else:
            ## save as tfrecord
            dis_list = list()
            ret_list = list()
            for item in dataset_list:
                dis_list.append(item[0])
                ret_list.append(item[1])

            raw_dataset = tf.data.Dataset.from_tensor_slices(
                (dis_list, ret_list))
            serialized_features_dataset = raw_dataset.map(tf_serialize_example)

            filename = config['save_dir'] + os.path.basename(
                tarbz2_file).replace('.pickle.tar.bz2', '') + '.tfrecord'
            writer = tf.data.experimental.TFRecordWriter(filename)
            writer.write(serialized_features_dataset)

    return counter
import tarfile
import os
import sys
import pickle
#import tensorflow as tf
from datetime import datetime
from multiprocessing import Pool
import getopt
from itertools import repeat

import psutil

sys.path.append('../../lib/')
import return_type_lib
import common_stuff_lib
import tarbz2_lib
import pickle_lib
import disassembly_lib
#import tfrecord_lib

user_home_path = os.path.expanduser('~')
ret = pickle_lib.get_pickle_file_content(
    user_home_path + "/backup/save_dir/tfrecord/max_seq_length.pickle")
print(f'max-seq-length >{ret}<')
    out_v.write(out_str)
    #out_v.write('weight1\tweight2\tweight3\tweigth4\tweigth5\tweigth6\tweigth7\tweigth8\n')
    #out_v.write('\t\n')

    n = 1
    for vec in weights:
        if n == 0:
            n = 1
        else:
            out_v.write('\t'.join([str(x) for x in vec]) + "\n")

    out_v.close()
    out_m.close()


### load vocabulary list
vocabulary = pickle_lib.get_pickle_file_content('/tmp/save_dir/' +
                                                'tfrecord/' +
                                                'vocabulary_list.pickle')

### load max-sequence-length
max_seq_length = pickle_lib.get_pickle_file_content('/tmp/save_dir/' +
                                                    'tfrecord/' +
                                                    'max_seq_length.pickle')

print(f'len-vocab-from-file >{len(vocabulary)}<')

vectorize_layer = TextVectorization(standardize=None,
                                    max_tokens=len(vocabulary) + 2,
                                    output_mode='int',
                                    output_sequence_length=max_seq_length)


def vectorize_text(text, label):
    text = tf.expand_dims(text, -1)
    return vectorize_layer(text), label


def main():
def check_config(config):
    if not os.path.isdir(config['checkpoint_dir']):
        print(
            f"Directory >{config['checkpoint_dir']}< does not exist. Please specify model checkpoint dir, -h for help"
        )
        exit()

    # if not os.path.isdir(config['save_dir']):
    #     print(f"Directory >{config['save_dir']}< does not exist. Please specify save_dir dir, -h for help")
    #     exit()


### load vocabulary list
user_home_path = os.path.expanduser('~')
vocabulary = pickle_lib.get_pickle_file_content(
    user_home_path + '/Documents/gcp-caller-callee/arg_one/' +
    'vocabulary_list.pickle')

### load max-sequence-length
max_seq_length = pickle_lib.get_pickle_file_content(
    user_home_path + '/Documents/gcp-caller-callee/arg_one/' +
    'max_seq_length.pickle')

print(f'len-vocab-from-file >{len(vocabulary)}<')

vectorize_layer = TextVectorization(standardize=None,
                                    max_tokens=len(vocabulary) + 2,
                                    output_mode='int',
                                    output_sequence_length=max_seq_length)


def main():
def proc_build(tarbz2_file, work_dir, save_dir, config):
    tarbz2_lib.untar_file_to_path(tarbz2_file, work_dir)
    #untar_one_pickle_file(tarbz2_file, work_dir)

    pickle_file = work_dir + os.path.basename(tarbz2_file).replace(
        '.tar.bz2', '')
    pickle_file_content = pickle_lib.get_pickle_file_content(pickle_file)
    #pickle_file_content = get_pickle_file_content(work_dir + os.path.basename(pickle_file).replace('.tar.bz2', ''))

    binaries = set()
    functions = set()
    for elem in pickle_file_content:
        binaries.add(elem[7])
        functions.add(elem[2])

    print(f'binaries >{binaries}<')

    counter = 0
    dataset_list = list()

    ## 1. get one binary
    ## 2. get one function of this binary
    ## 3. get disassembly of this function
    ## 4. check if this disassembly calls another function
    ## 4.1 filter @plt
    ## 5. if yes: get disassembly of caller function
    ## 6. save caller, callee, func_signature
    ## 7. check again, if it calls another function
    ## 8. if yes: get disassembly of caller function
    ## 9. save caller, callee, func_signature
    ## 10. get disassembly of next function of this binary
    ## 11. check if ....
    for bin in binaries:
        for func in functions:
            ## search for bin and func
            for elem in pickle_file_content:
                ### if we found bin and func
                if elem[7] == bin and elem[2] == func:
                    ## get att disassembly
                    att_dis = elem[4]

                    ## check every line if there is a call
                    for item in att_dis:
                        ## find call in disas
                        if disassembly_lib.find_call_in_disassembly_line(item):
                            ## if found, get callee name
                            callee_name = disassembly_lib.get_callee_name_from_disassembly_line(
                                item)
                            #print(f'callee_name >{callee_name}<')

                            ## search for same bin, but callee func
                            for elem2 in pickle_file_content:
                                ### if we found it, get return type and disassembly
                                if elem2[7] == bin and elem2[2] == callee_name:
                                    return_type_func_sign = return_type_lib.get_return_type_from_function_signature(
                                        elem2[0])
                                    return_type = return_type_lib.get_return_type_from_gdb_ptype(
                                        elem2[1])

                                    ### for debugging, what string is still unknown ?? should show nothing
                                    if return_type == 'unknown':
                                        print(
                                            f'string_before_func_name: {return_type_func_sign}'
                                        )

                                    if return_type == 'unknown':
                                        #print('unknown found')
                                        #breaker = True
                                        #break
                                        pass
                                    elif return_type == 'delete':
                                        #print('delete found')
                                        ### no return type found, so delete this item
                                        pass
                                    elif return_type == 'process_further':
                                        print(f'ERRROOOORRRR---------------')
                                    else:
                                        dis1_str = ' '.join(att_dis)
                                        dis2_str = ' '.join(elem2[4])

                                        dis1_str = disassembly_lib.split_disassembly(
                                            dis1_str)
                                        dis2_str = disassembly_lib.split_disassembly(
                                            dis2_str)
                                        #dis1_str = dis_split(dis1_str)
                                        #dis2_str = dis_split(dis2_str)

                                        dis_str = dis1_str + dis2_str
                                        #print(f'dis_str >{dis_str}<')
                                        dataset_list.append(
                                            (dis_str, return_type))
                                        counter += 1
                                    break

    if dataset_list:
        if config['save_file_type'] == 'pickle':
            ret_file = open(
                config['save_dir'] +
                os.path.basename(pickle_file).replace('.tar.bz2', ''), 'wb+')
            pickle_list = pickle.dump(dataset_list, ret_file)
            ret_file.close()
        else:
            ## save as tfrecord
            dis_list = list()
            ret_list = list()
            for item in dataset_list:
                dis_list.append(item[0])
                ret_list.append(item[1])

            raw_dataset = tf.data.Dataset.from_tensor_slices(
                (dis_list, ret_list))
            serialized_features_dataset = raw_dataset.map(tf_serialize_example)

            filename = config['save_dir'] + os.path.basename(
                tarbz2_file).replace('.pickle.tar.bz2', '') + '.tfrecord'
            writer = tf.data.experimental.TFRecordWriter(filename)
            writer.write(serialized_features_dataset)

    return counter
def main():
    config = parseArgs()
    print(f'config >{config}<')

    check_if_dir_exists(config['pickle_dir'])
    check_if_dir_exists(config['work_dir'])
    check_if_dir_exists(config['save_dir'])
    check_if_dir_exists(config['tfrecord_save_dir'])

    nr_of_cpus = psutil.cpu_count(logical=True)

    ### get all pickle files
    #pickle_files = get_all_tar_filenames(config['pickle_dir'])
    pickle_files = common_stuff_lib.get_all_filenames_of_type(
        config['pickle_dir'], '.tar.bz2')

    ### print 5 files, check and debug
    pickle_lib.print_X_pickle_filenames(pickle_files, 5)

    ### build
    p = Pool(nr_of_cpus)
    pickle_files = [config["pickle_dir"] + "/" + f for f in pickle_files]
    star_list = zip(pickle_files, repeat(config['work_dir']),
                    repeat(config['save_dir']), repeat(config))
    all_ret_types = p.starmap(proc_build, star_list)
    p.close()
    p.join()

    ## build return type dict-file and max-seq-length-file and vocabulary
    pickle_files = common_stuff_lib.get_all_filenames_of_type(
        config['save_dir'], '.pickle')
    print(f'pickle-files >{pickle_files}<')

    print(f'Building return-type dict, vocabulary and max-sequence-length')

    ret_set = set()
    vocab = set()
    seq_length = 0

    counter = 1
    pickle_count = len(pickle_files)

    for file in pickle_files:
        print(f'File >{file}< >{counter}/{pickle_count}<', end='\r')
        counter += 1

        cont = pickle_lib.get_pickle_file_content(config['save_dir'] + file)
        for item in cont:
            #print(f'item-1 >{item[1]}<')
            ## build ret-type-dict
            ret_set.add(item[1])

            ## build max-seq-length
            if len(item[0]) > seq_length:
                seq_length = len(item[0])

            ## build vocabulary
            for word in item[0].split():
                vocab.add(word)

    print(
        f"Build return-type dict and save it to >{config['return_type_dict_file']}<"
    )

    ## build ret-type-dict and save
    ret_type_dict = dict()
    counter = 0
    for elem in ret_set:
        ret_type_dict[elem] = counter
        counter += 1
    pickle_lib.save_to_pickle_file(ret_type_dict,
                                   config['return_type_dict_file'])

    print(f"Build vocabulary and save it to >{config['vocabulary_file']}<")

    ## build vocabulary list from set and save
    vocab_list = list(vocab)
    pickle_lib.save_to_pickle_file(vocab_list, config['vocabulary_file'])

    ## save max-seq-length
    print(f"Saving max-sequence-length to >{config['max_seq_length_file']}<")
    pickle_lib.save_to_pickle_file(seq_length, config['max_seq_length_file'])

    ### transform dataset ret-types to ints
    print(
        f"Transform return-type to int and save to >{config['tfrecord_save_dir']}<"
    )
    counter = 1
    for file in pickle_files:
        print(f'Transform File >{file}< >{counter}/{pickle_count}<', end='\r')
        counter += 1

        ## reset per file so each tfrecord only holds that file's examples
        trans_ds = list()
        cont = pickle_lib.get_pickle_file_content(config['save_dir'] + file)
        for item in cont:
            trans_ds.append((item[0], ret_type_dict[item[1]]))

        tfrecord_lib.save_caller_callee_to_tfrecord(
            trans_ds,
            config['tfrecord_save_dir'] + file.replace('.pickle', '.tfrecord'))

    print("Splitting dataset to train,val,test")
    tfrecord_lib.split_to_train_val_test(config['tfrecord_save_dir'])

    print("Done. Run build_caller_callee_model.py now")