def main():
    config = common_stuff_lib.parseArgs()
    print(f'config >{config}<')
    print()
    check_config(config)

    nr_of_cpus = psutil.cpu_count(logical=True)
    print(f'We got >{nr_of_cpus}< CPUs for threading')
    print()

    print(f"Using files in directory >{config['tfrecord_save_dir']}<")
    print()

    return_type_dict = pickle_lib.get_pickle_file_content(
        config['tfrecord_save_dir'] + 'return_type_dict.pickle')

    print(f'return_type_dict value >{return_type_dict}<')
    print()

    vocabulary_list = pickle_lib.get_pickle_file_content(
        config['tfrecord_save_dir'] + 'vocabulary_list.pickle')

    print(f'vocabulary_list >{vocabulary_list}<')
    print()

    print(f'vocabulary_list length >{len(vocabulary_list)}<')
    print()

    max_seq_length = pickle_lib.get_pickle_file_content(
        config['tfrecord_save_dir'] + 'max_seq_length.pickle')

    print(f'max_seq_length >{max_seq_length}<')
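
## pickle_lib is a project-local helper that is not included in this dump. A minimal
## sketch of the two calls used throughout these scripts, assuming they are thin
## wrappers around the standard pickle module (function names taken from the calls above):
import pickle

def get_pickle_file_content(file_path):
    ## load and return whatever object was pickled into file_path
    with open(file_path, 'rb') as f:
        return pickle.load(f)

def save_to_pickle_file(content, file_path):
    ## pickle an arbitrary Python object to file_path
    with open(file_path, 'wb') as f:
        pickle.dump(content, f)
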
def proc_build(file, config):

    ret_set = set()
    vocab = set()
    seq_length = 0

    print(f'File >{file}<')

    cont = pickle_lib.get_pickle_file_content(file)
    for item in cont:
        #print(f'item-1 >{item[1]}<')
        ## build ret-type-dict
        ret_set.add(item[1])

        ##build max-seq-length
        if len(item[0]) > seq_length:
            if len(item[0]) > 100000:
                print('sequence length exceeds 100,000')
            seq_length = len(item[0])

        ## build vocabulary
        for word in item[0].split():
            vocab.add(word)

    return (ret_set, vocab, seq_length)
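
## proc_build() above returns one (ret_set, vocab, seq_length) tuple per pickle file;
## the results from Pool.starmap() still have to be merged. A minimal sketch of that
## merge step, mirroring what the single-process pipeline script further below does
## (the function name is an assumption):
def merge_build_results(results):
    all_ret_types = set()
    vocabulary = set()
    max_seq_length = 0
    for ret_set, vocab, seq_length in results:
        all_ret_types |= ret_set
        vocabulary |= vocab
        max_seq_length = max(max_seq_length, seq_length)

    ## turn the return-type set into the {type: int} dict that is used as labels later on
    ret_type_dict = {ret_type: idx for idx, ret_type in enumerate(all_ret_types)}
    return ret_type_dict, list(vocabulary), max_seq_length
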
def main():
    config = parseArgs()

    nr_of_cpus = psutil.cpu_count(logical=True)
    print(f'We got nr_of_cpus >{nr_of_cpus}<')

    print(f"Using files in directory >{config['balanced_dataset_dir']}<")

    pickle_files = common_stuff_lib.get_all_filenames_of_type(
        config['balanced_dataset_dir'], '.pickle')

    for file in pickle_files:
        cont = pickle_lib.get_pickle_file_content(
            config['balanced_dataset_dir'] + file)
        counter = 0

        for item in cont:
            #print(f'item[0] >{item[0]}<  item[1] >{item[1]}<')
            if counter < 1:
                print(
                    f"return type >{item[1]}< from file >{config['balanced_dataset_dir'] + file}<"
                )
            counter += 1

        print(f'Counted >{counter}< text,label elements')
        print()
Example n. 4
def main():
    config = parseArgs()

    check_config(config)

    print(f'config >{config}<')

    nr_of_cpus = psutil.cpu_count(logical=True)
    print(f'We got nr_of_cpus >{nr_of_cpus}<')

    ##load ret-type dict
    ret_type_dict = pickle_lib.get_pickle_file_content(
        config['return_type_dict_file'])
    print(f"ret-type-dict >{ret_type_dict}<")

    pickle_files = common_stuff_lib.get_all_filenames_of_type(
        config['balanced_dataset_dir'], '.pickle')

    ### transform dataset ret-types to ints
    print(
        f"Transform return-type to int and save to >{config['tfrecord_save_dir']}<"
    )
    p = Pool(nr_of_cpus)

    pickle_files = [
        config['balanced_dataset_dir'] + "/" + f for f in pickle_files
    ]

    star_list = zip(pickle_files, repeat(ret_type_dict), repeat(config))

    all_ret_types = p.starmap(proc_build, star_list)
    p.close()
    p.join()

    print("Done. Run train_arg_one_model_lstm.py next")
def proc_build_balanced(pickle_files, key, minimum_ret_type_count, config):
    #print(f'build balanced')
    ### filter and store to dict the usable text,label pairs

    ## counter for how many text,label pairs of this return type we have collected so far
    ret_type_count_watcher = 1
    #     nr = 0
    #     for key in ret_type_counter_filtered:
    #         ret_type_count_watcher[key] = 0

    ret_type_0 = list()
    for file in pickle_files:
        cont = pickle_lib.get_pickle_file_content(file)
        for item in cont:
            ## is the ret-type we found in our filtered list?
            #for key in ret_type_counter_filtered:
            if key == item[1]:
                #print(f'got filtered ret-type')
                if ret_type_count_watcher <= minimum_ret_type_count:
                    ret_type_0.append((item[0], item[1]))
                    ret_type_count_watcher += 1
                    if ret_type_count_watcher > minimum_ret_type_count:
                        break

        if ret_type_count_watcher > minimum_ret_type_count:
            break

    ### save them
    #print(f'Save balanced dataset')
    pickle_lib.save_to_pickle_file(
        ret_type_0,
        config['balanced_dataset_dir'] + key.replace(' ', '_') + '.pickle')
Example n. 6
def main():
    config = common_stuff_lib.parseArgs()
    print(f'config >{config}<')
    print()
    check_config(config)
    
    nr_of_cpus = psutil.cpu_count(logical=True)
    print(f'We got >{nr_of_cpus}< CPUs for threading')
    print()
    
    print(f"Using files in directory >{config['save_dir']}<")
    
    pickle_files = common_stuff_lib.get_all_filenames_of_type(config['save_dir'], '.pickle')
    
    for file in pickle_files:
        cont = pickle_lib.get_pickle_file_content(config['save_dir'] + file)
        counter = 0
        
        for item in cont:
            #print(f'item[0] >{item[0]}<  item[1] >{item[1]}<')
            if counter < 1:
                print(f"return type >{item[1]}< from file >{config['save_dir'] + file}<")
                print(f'{item[0]}')
            counter += 1
            
        print(f'Counted >{counter}< text,label elements')
        print()
Example n. 7
def main():
    config = common_stuff_lib.parseArgs()
    check_config(config)
    
    nr_of_cpus = psutil.cpu_count(logical=True)
    print(f'We got >{nr_of_cpus}< CPUs for threading')
    print()
    
    ret_type_dict = pickle_lib.get_pickle_file_content(config['return_type_dict_file'])
    
    ## get number of different return types
    pickle_files = common_stuff_lib.get_all_filenames_of_type(config['save_dir'], '.pickle')
    
    p = Pool(nr_of_cpus)
    
    pickle_files_save_dir = [config['save_dir'] + "/" + f for f in pickle_files]
    star_list = zip(pickle_files_save_dir, repeat(ret_type_dict), repeat(config))
    all_ret_types = p.starmap(proc_count, star_list)
    p.close()
    p.join()
    
    ## build count dict
    ret_type_counter = dict()
    for key in ret_type_dict:
        ret_type_counter[key] = 0
        
    for counts_dict in all_ret_types:
        #print(f"counts_dict >{counts_dict}<")
        for counts_dict_key in counts_dict:
            #print(f"counts_dict[counts_dict_key] >{counts_dict[counts_dict_key]}<")
            ret_type_counter[counts_dict_key]  += counts_dict[counts_dict_key]
        
    print(f"The counts of every arg_three :")
    for key in ret_type_counter:
        print(f"arg_three type >{key}< exists\t\t\t>{ret_type_counter[key]}< \ttimes")
    
    config['minimum_nr_of_return_types'] = input('Put in minimum nr of arg_three to build balanced dataset:')
    
    ### filter all that >= int(config['minimum_nr_of_return_types'])
    ret_type_counter_filtered = dict()
    for key in ret_type_dict:
        if ret_type_counter[key] >= int(config['minimum_nr_of_return_types']):
            ret_type_counter_filtered[key] = ret_type_counter[key]
            
    print(f"The filtered counts (>={int(config['minimum_nr_of_return_types'])}) of every type >{ret_type_counter_filtered}<")
    
    ### now select int(config['minimum_nr_of_return_types']) disassembly,label pairs per filtered return type
    ### filter and store the usable text,label pairs
    
    threads = list()
    for key in ret_type_counter_filtered:
        print(f'build balanced with key >{key}<')
        t = Thread(target=proc_build_balanced, args=(pickle_files_save_dir, key, int(config['minimum_nr_of_return_types']), config, ))
        t.start()
        threads.append(t)

    ## wait for all worker threads to finish before telling the user what to run next
    for t in threads:
        t.join()

    print('Run build_balanced_ret_type__vocab__seq_len.py next')
Example n. 8
def proc_build(file, ret_type_dict, config):
    trans_ds = list()

    #print(f'Transform File >{file}<')

    cont = pickle_lib.get_pickle_file_content(file)
    for item in cont:
        #print(f"item >{item[0]}<  item-1 >{item[1]}< >{ret_type_dict[item[1]]}<")
        trans_ds.append((item[0], ret_type_dict[item[1]]))

    tfrecord_lib.save_caller_callee_to_tfrecord(
        trans_ds, config['tfrecord_save_dir'] +
        os.path.basename(file).replace('.pickle', '.tfrecord'))
    def get_prediction(self, model, disasm_caller_callee_str, func_sign_prob_git_path):
        ### predict now    
        model_path = func_sign_prob_git_path + \
                            "ubuntu-20-04-scripts/trained_models/" + model + "/saved_model/"
                            
        ###load vocabulary list
        vocab_file = func_sign_prob_git_path + \
                            "ubuntu-20-04-scripts/trained_models/" + model + "/" + \
                            'vocabulary_list.pickle'
        
                                                    
        vocabulary = pickle_lib.get_pickle_file_content(vocab_file)
        
        ###load max-sequence-length
        max_seq_len_file = func_sign_prob_git_path + \
                            "ubuntu-20-04-scripts/trained_models/" + model + "/" + \
                            'max_seq_length.pickle'
                            
        max_seq_length = pickle_lib.get_pickle_file_content(max_seq_len_file)
        
        ret = self.predict(model_path, len(vocabulary), max_seq_length, disasm_caller_callee_str)
        
        ## get strings for ints, with ret_type_dict
        ret_type_dict_file = func_sign_prob_git_path + \
                                    "ubuntu-20-04-scripts/trained_models/" + model + "/" + \
                                    'return_type_dict.pickle'
                            
        ret_type_dict = pickle_lib.get_pickle_file_content(ret_type_dict_file)
        
        ### get human-readable output
        prediction_summary_str = self.get_prediction_summary(ret_type_dict, ret)
       
        ## store for later
#         nr_of_args_model_summary_str = self.model_summary_str
#         self._disasTextEdit.setPlainText(f"tf model summary:\n{self.model_summary_str}\n \
#                                         {nr_of_args_model_summary_str}")
       
        return prediction_summary_str   
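
## Hypothetical usage of get_prediction() above (the class name, model name and
## repository path are illustrative assumptions, not part of this dump):
# predictor = FuncSignPredictor()
# summary = predictor.get_prediction(
#     model='arg_one_model_lstm',
#     disasm_caller_callee_str='null x null 1 mov',
#     func_sign_prob_git_path=os.path.expanduser('~') + '/func_sign_prob/')
# print(summary)
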
def main():
    #     tarbz2_files = common_stuff_lib.get_all_filenames_of_type("/tmp/test/", '.tar.bz2')
    #
    #     work_dir = "/tmp/work_dir"
    #     for tarbz2_file in tarbz2_files:
    #         tarbz2_lib.untar_file_to_path('/tmp/test/' + tarbz2_file, work_dir)
    user_home_path = os.path.expanduser('~')
    path = user_home_path + "/ret-type/work_dir/"
    pickle_files = common_stuff_lib.get_all_filenames_of_type(path, '.pickle')

    for file in pickle_files:
        cont = pickle_lib.get_pickle_file_content(path + file)

        for elem in cont:
            print(f'elem >{elem}<')
def proc_count(file, ret_type_dict, config):
    #ret_type_dict => 'char' = 0   'int' = 1

    ## build count dict
    ret_type_count = dict()
    for key in ret_type_dict:
        ret_type_count[key] = 0

    ##count
    cont = pickle_lib.get_pickle_file_content(file)
    for item in cont:
        ret_type_count[item[1]] = ret_type_count[item[1]] + 1

    #print(f"Counter >{ret_type_count}<")

    return ret_type_count
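
## The per-file count dicts returned by proc_count() (collected via Pool.starmap in the
## script above) can equivalently be merged with collections.Counter instead of the
## manual loop; a small sketch, the function name is an assumption:
from collections import Counter

def merge_ret_type_counts(per_file_counts):
    ## sum all per-file {ret_type: count} dicts into one overall count dict
    total = Counter()
    for counts in per_file_counts:
        total.update(counts)
    return dict(total)

## e.g. merge_ret_type_counts([{'char': 2, 'int': 1}, {'int': 4}]) == {'char': 2, 'int': 5}
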
def main():
    config = common_stuff_lib.parseArgs()
    print(f'config >{config}<')
    print()
    check_config(config)

    nr_of_cpus = psutil.cpu_count(logical=True)
    print(f'We got >{nr_of_cpus}< CPUs for threading')
    print()

    print(f"Using files in directory >{config['save_dir']}<")
    print()

    pickle_files = common_stuff_lib.get_all_filenames_of_type(
        config['save_dir'], '.pickle')

    all_ret_types_list = set()
    counter = 0
    max_seq_len = 0

    for file in pickle_files:
        cont = pickle_lib.get_pickle_file_content(config['save_dir'] + file)
        counter = 0
        max_seq_len = 0

        for item in cont:
            all_ret_types_list.add(item[1])
            if counter < 1:
                print(
                    f"nr-of-arguments >{item[1]}< from file >{config['save_dir'] + file}<"
                )
                print()
                print(f'text >{item[0]}<\nlabel >{item[1]}<')

            if len(item[0]) > max_seq_len:
                max_seq_len = len(item[0])

            counter += 1

        print(f'Counted >{counter}< text,label elements')
        print(f'longest disassembly got >{max_seq_len}< words')
        print('----------------------------------------')
        print()

    print(f'all_ret_types_list >{all_ret_types_list}<')
def main():
    global vectorize_layer

    AUTOTUNE = tf.data.experimental.AUTOTUNE

    config = parseArgs()

    check_config(config)

    print(f'tensorflow version running now >{tf.__version__}<')

    print(
        f"Build tf.data.dataset with tfrecord files from directory >{config['tfrecord_dir'] + 'train/'}< \
            >{config['tfrecord_dir'] + 'val/'}< >{config['tfrecord_dir'] + 'test/'}<"
    )

    if os.path.isdir(config['tfrecord_dir'] + 'train/'):
        print(
            f"Found directory >{config['tfrecord_dir'] + 'train/'}<, so we don't use the balanced dataset"
        )

        tfrecord_train_dataset = tf.data.Dataset.list_files(
            config['tfrecord_dir'] + 'train/' + '*.tfrecord')
        train_dataset = tf.data.TFRecordDataset(tfrecord_train_dataset)

        tfrecord_val_dataset = tf.data.Dataset.list_files(
            config['tfrecord_dir'] + 'val/' + '*.tfrecord')
        val_dataset = tf.data.TFRecordDataset(tfrecord_val_dataset)

        tfrecord_test_dataset = tf.data.Dataset.list_files(
            config['tfrecord_dir'] + 'test/' + '*.tfrecord')
        test_dataset = tf.data.TFRecordDataset(tfrecord_test_dataset)

        train_dataset = train_dataset.map(_parse_function,
                                          num_parallel_calls=AUTOTUNE)
        val_dataset = val_dataset.map(_parse_function,
                                      num_parallel_calls=AUTOTUNE)
        test_dataset = test_dataset.map(_parse_function,
                                        num_parallel_calls=AUTOTUNE)

    else:
        print(f"Not found directory >{config['tfrecord_dir'] + 'train/'}<")
        print(
            f"We will use balanced dataset from directory >{config['tfrecord_dir']}<"
        )

        tfrecord_all_dataset = tf.data.Dataset.list_files(
            config['tfrecord_dir'] + '*.tfrecord')
        full_dataset = tf.data.TFRecordDataset(tfrecord_all_dataset)

        full_dataset = full_dataset.map(_parse_function,
                                        num_parallel_calls=AUTOTUNE)

        ## cardinality() is unknown for TFRecord datasets, so count the elements directly
        DATASET_SIZE = sum(1 for _ in full_dataset)
        print(f'DATASET_SIZE >{DATASET_SIZE}<')

        train_size = int(0.7 * DATASET_SIZE)
        val_size = int(0.15 * DATASET_SIZE)
        test_size = int(0.15 * DATASET_SIZE)

        print(
            f'Split to train_size >{train_size}< val_size >{val_size}< test_size >{test_size}<'
        )

        #full_dataset = tf.data.TFRecordDataset(FLAGS.input_file)
        ## disable reshuffling between iterations so take()/skip() see the same order
        ## and the train/val/test splits do not overlap
        full_dataset = full_dataset.shuffle(1000, reshuffle_each_iteration=False)
        train_dataset = full_dataset.take(train_size)
        remaining_dataset = full_dataset.skip(train_size)
        val_dataset = remaining_dataset.take(val_size)
        test_dataset = remaining_dataset.skip(val_size)

    for text, label in train_dataset.take(1):
        print(
            f'One example from train_dataset with int-as-label:\nText: >{text}<\n Label: >{label}<'
        )

    ###load return-type-dict
    return_type_dict = pickle_lib.get_pickle_file_content(
        config['return_type_dict_file'])

    ###load max-sequence-length
    max_seq_length = pickle_lib.get_pickle_file_content(
        config['max_seq_length_file'])

    ###load vocabulary list
    vocabulary = pickle_lib.get_pickle_file_content(config['vocabulary_file'])

    #     vectorize_layer = TextVectorization(standardize=None,
    #                                     max_tokens=len(vocabulary)+2,
    #                                     output_mode='int',
    #                                     output_sequence_length=max_seq_length)

    #vectorize_layer.set_vocabulary(vocabulary)

    #vocab = vectorize_layer.get_vocabulary()
    #print(f'10 vocab words >{vocab[:10]}<')

    text_ds = train_dataset.map(lambda x, y: x, num_parallel_calls=AUTOTUNE)
    tmp_ds = val_dataset.map(lambda x, y: x, num_parallel_calls=AUTOTUNE)
    text_ds = text_ds.concatenate(tmp_ds)
    tmp_ds = test_dataset.map(lambda x, y: x, num_parallel_calls=AUTOTUNE)
    text_ds = text_ds.concatenate(tmp_ds)
    print(f'text_ds element_spec >{text_ds.element_spec}<')

    print(
        'Adapting text to the TextVectorization layer, this takes time (~1hour-15min --> 8x V100)'
    )
    #text_ds = text_ds.apply(tf.data.experimental.unique())
    vectorize_layer.adapt(text_ds.batch(64))

    train_dataset = configure_for_performance(train_dataset)
    val_dataset = configure_for_performance(val_dataset)
    test_dataset = configure_for_performance(test_dataset)

    ### vec text
    train_dataset = train_dataset.map(vectorize_text,
                                      num_parallel_calls=AUTOTUNE)
    val_dataset = val_dataset.map(vectorize_text, num_parallel_calls=AUTOTUNE)
    test_dataset = test_dataset.map(vectorize_text,
                                    num_parallel_calls=AUTOTUNE)

    #exit()
    embedding_dim = 64

    #     model = tf.keras.Sequential([tf.keras.Input(shape=(1,), dtype=tf.string),
    #                                  vectorize_layer,
    #                                  tf.keras.layers.Embedding(len(vocabulary)+2, embedding_dim, mask_zero=True,
    #                                     name='embedding'),
    #                                     tf.keras.layers.Dropout(0.2),
    #                                     tf.keras.layers.GlobalAveragePooling1D(),
    #                                     tf.keras.layers.Dropout(0.2),
    #                                     tf.keras.layers.Dense(len(return_type_dict))])

    model = tf.keras.Sequential([
        tf.keras.layers.Embedding(len(vocabulary) + 2,
                                  embedding_dim,
                                  mask_zero=True),
        tf.keras.layers.Bidirectional(
            tf.keras.layers.LSTM(64, return_sequences=True)),
        tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(32)),
        tf.keras.layers.Dense(64, activation='relu'),
        tf.keras.layers.Dropout(0.5),
        tf.keras.layers.Dense(len(return_type_dict))
    ])

    #     model = tf.keras.Sequential([tf.keras.layers.Embedding(len(vocabulary)+2, embedding_dim, mask_zero=True),
    #                                  tf.keras.layers.LSTM(64),
    #                                  tf.keras.layers.Dense(64),
    #                                  tf.keras.layers.Dense(len(return_type_dict))])

    model.summary()

    ## callbacks to save tensorboard-files and model
    tensorboard_callback = tf.keras.callbacks.TensorBoard(
        log_dir=config['tensorboard_log_dir'],
        histogram_freq=1,
        write_graph=False,
        write_images=False)

    model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
        filepath=config['checkpoint_dir'],
        save_weights_only=True,
        monitor='accuracy',
        mode='max',
        save_best_only=True)

    model_checkpoint_callback2 = tf.keras.callbacks.ModelCheckpoint(
        filepath=config['save_model_dir'],
        save_weights_only=False,
        monitor='accuracy',
        mode='max',
        save_best_only=True)

    model.compile(
        loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
        optimizer='adam',
        metrics=['accuracy'])

    history = model.fit(train_dataset,
                        validation_data=val_dataset,
                        epochs=10,
                        callbacks=[
                            tensorboard_callback, model_checkpoint_callback,
                            model_checkpoint_callback2
                        ])

    ### evaluate the model
    loss, accuracy = model.evaluate(test_dataset)
    print("Loss: ", loss)
    print("Accuracy: ", accuracy)

    ### save trained word embeddings
    print('Saving trained word embeddings (meta.tsv, vecs.tsv) '
          '(viewable in TensorBoard -> Projector; use chromium-browser to see it correctly, firefox does not always work)')
    save_trained_word_embeddings(model, config['trained_word_embeddings_dir'],
                                 vectorize_layer, embedding_dim)
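
## _parse_function is used above but not defined anywhere in this dump. A minimal
## sketch of a matching TFRecord parser, assuming each record stores the disassembly
## as a 'text' bytes feature and the label as a 'label' int64 feature (the feature
## names are assumptions and must match whatever tfrecord_lib wrote):
def _parse_function(serialized_example):
    feature_description = {
        'text': tf.io.FixedLenFeature([], tf.string),
        'label': tf.io.FixedLenFeature([], tf.int64),
    }
    parsed = tf.io.parse_single_example(serialized_example, feature_description)
    return parsed['text'], parsed['label']
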
    #out_v.write('\t\n')
    n = 1
    for vec in weights:
        if n == 0:
            n = 1
        else:
            out_v.write('\t'.join([str(x) for x in vec]) + "\n")

    out_v.close()
    out_m.close()


###load vocabulary list
user_home_path = os.path.expanduser('~')
vocabulary = pickle_lib.get_pickle_file_content(user_home_path +
                                                '/arg_three_save_dir/' +
                                                'tfrecord/' +
                                                'vocabulary_list.pickle')

###load max-sequence-length
max_seq_length = pickle_lib.get_pickle_file_content(user_home_path +
                                                    '/arg_three_save_dir/' +
                                                    'tfrecord/' +
                                                    'max_seq_length.pickle')
print(f'len-vocab-from-file >{len(vocabulary)}<')
vectorize_layer = TextVectorization(standardize=None,
                                    max_tokens=len(vocabulary) + 2,
                                    output_mode='int',
                                    output_sequence_length=max_seq_length)


def vectorize_text(text, label):
    text = tf.expand_dims(text, -1)
    return vectorize_layer(text), label
Example n. 15
def main():

    print(f'Tensorflow version is >{tf.version.VERSION}<')

    config = parseArgs()

    check_config(config)

    model = tf.keras.models.load_model(config['checkpoint_dir'] +
                                       'saved_model/')

    model.summary()

    export_model = tf.keras.Sequential(
        [vectorize_layer, model,
         tf.keras.layers.Activation('softmax')])

    #     export_model = tf.keras.Sequential([vectorize_layer,
    #                                           model
    #                                         ])

    examples = ['null x null 1 mov']
    print(f'Example we predict >{examples}<')

    ret = export_model.predict(examples)
    print(f"Prediction: >{ret}<")
    print()  ##just a newline

    user_home_path = os.path.expanduser('~')
    ret_type_dict = pickle_lib.get_pickle_file_content(
        user_home_path + '/Documents/gcp-caller-callee/arg_one/' +
        'return_type_dict.pickle')

    ## invert {type_name: int_label} to {int_label: type_name}
    reverse_ret_type_dict = dict()
    for key in ret_type_dict:
        reverse_ret_type_dict[ret_type_dict[key]] = key

    for item in ret:
        result = 0
        biggest = 0
        biggest_count = 0
        counter = 0
        for i in item:
            if i > biggest:
                biggest = i
                biggest_count = counter

            print(
                f'ret-type >{reverse_ret_type_dict[counter] : <{30}}< got probability of >{i}<'
            )
            counter += 1

            result += i
        ## do not reuse the name 'ret' here, it still holds the prediction array from above
        for ret_type_name in ret_type_dict:
            if ret_type_dict[ret_type_name] == biggest_count:
                print()
                print(f'argument one is of type >{ret_type_name}<')

    print()
    print(f'Do the probabilities sum to 1? Result: >{result}<')

#def main():
#global vectorize_layer

AUTOTUNE = tf.data.experimental.AUTOTUNE

config = common_stuff_lib.parseArgs()
check_config(config)

nr_of_cpus = psutil.cpu_count(logical=True)
print(f'We got >{nr_of_cpus}< CPUs for threading')
print()

###load vocabulary list
vocabulary = pickle_lib.get_pickle_file_content(config['vocabulary_file'])

###load max-sequence-length
max_seq_length = pickle_lib.get_pickle_file_content(
    config['max_seq_length_file'])

print(f'len-vocab-from-file >{len(vocabulary)}<')
vectorize_layer = TextVectorization(standardize=None,
                                    max_tokens=len(vocabulary) + 2,
                                    output_mode='int',
                                    output_sequence_length=max_seq_length)

print(f'tensorflow version running now >{tf.__version__}<')

print(
    f"Build tf.data.dataset with tfrecord files from directory >{config['tfrecord_save_dir'] + 'train/'}< \
        >{config['tfrecord_save_dir'] + 'val/'}< >{config['tfrecord_save_dir'] + 'test/'}<"
)
def main():
    global vectorize_layer

    AUTOTUNE = tf.data.experimental.AUTOTUNE

    config = parseArgs()

    check_config(config)

    print(f'tensorflow version running now >{tf.__version__}<')

    print(
        f"Build tf.data.dataset with tfrecord files from directory >{config['tfrecord_dir'] + 'train/'}< \
            >{config['tfrecord_dir'] + 'val/'}< >{config['tfrecord_dir'] + 'test/'}<"
    )

    tfrecord_train_dataset = tf.data.Dataset.list_files(
        config['tfrecord_dir'] + 'train/' + '*.tfrecord')
    train_dataset = tf.data.TFRecordDataset(tfrecord_train_dataset)

    tfrecord_val_dataset = tf.data.Dataset.list_files(config['tfrecord_dir'] +
                                                      'val/' + '*.tfrecord')
    val_dataset = tf.data.TFRecordDataset(tfrecord_val_dataset)

    tfrecord_test_dataset = tf.data.Dataset.list_files(config['tfrecord_dir'] +
                                                       'test/' + '*.tfrecord')
    test_dataset = tf.data.TFRecordDataset(tfrecord_test_dataset)

    ###de-serialize tfrecord examples to tensors
    train_dataset = train_dataset.map(_parse_function,
                                      num_parallel_calls=AUTOTUNE)
    val_dataset = val_dataset.map(_parse_function, num_parallel_calls=AUTOTUNE)
    test_dataset = test_dataset.map(_parse_function,
                                    num_parallel_calls=AUTOTUNE)

    for text, label in train_dataset.take(1):
        print(
            f'One example from train_dataset with int-as-label:\nText: >{text}<\n Label: >{label}<'
        )

    ###load return-type-dict
    return_type_dict = pickle_lib.get_pickle_file_content(
        config['return_type_dict_file'])

    ###load max-sequence-length
    max_seq_length = pickle_lib.get_pickle_file_content(
        config['max_seq_length_file'])

    ###load vocabulary list
    vocabulary = pickle_lib.get_pickle_file_content(config['vocabulary_file'])

    #     vectorize_layer = TextVectorization(standardize=None,
    #                                     max_tokens=len(vocabulary)+2,
    #                                     output_mode='int',
    #                                     output_sequence_length=max_seq_length)

    vectorize_layer.set_vocabulary(vocabulary)

    vocab = vectorize_layer.get_vocabulary()
    print(f'10 vocab words >{vocab[:10]}<')

    text_ds = train_dataset.map(lambda x, y: x, num_parallel_calls=AUTOTUNE)
    tmp_ds = val_dataset.map(lambda x, y: x, num_parallel_calls=AUTOTUNE)
    text_ds = text_ds.concatenate(tmp_ds)
    tmp_ds = test_dataset.map(lambda x, y: x, num_parallel_calls=AUTOTUNE)
    text_ds = text_ds.concatenate(tmp_ds)
    print(f'text_ds element_spec >{text_ds.element_spec}<')

    #text_ds = text_ds.apply(tf.data.experimental.unique())
    vectorize_layer.adapt(text_ds.batch(64))

    train_dataset = configure_for_performance(train_dataset)
    val_dataset = configure_for_performance(val_dataset)
    test_dataset = configure_for_performance(test_dataset)

    ### vec text
    train_dataset = train_dataset.map(vectorize_text,
                                      num_parallel_calls=AUTOTUNE)
    val_dataset = val_dataset.map(vectorize_text, num_parallel_calls=AUTOTUNE)
    test_dataset = test_dataset.map(vectorize_text,
                                    num_parallel_calls=AUTOTUNE)

    embedding_dim = 8

    #     model = tf.keras.Sequential([tf.keras.Input(shape=(1,), dtype=tf.string),
    #                                  vectorize_layer,
    #                                  tf.keras.layers.Embedding(len(vocabulary)+2, embedding_dim, mask_zero=True,
    #                                     name='embedding'),
    #                                     tf.keras.layers.Dropout(0.2),
    #                                     tf.keras.layers.GlobalAveragePooling1D(),
    #                                     tf.keras.layers.Dropout(0.2),
    #                                     tf.keras.layers.Dense(len(return_type_dict))])

    model = tf.keras.Sequential([
        tf.keras.layers.Embedding(len(vocabulary) + 2,
                                  embedding_dim,
                                  mask_zero=True),
        tf.keras.layers.Dropout(0.2),
        tf.keras.layers.GlobalAveragePooling1D(),
        tf.keras.layers.Dropout(0.2),
        tf.keras.layers.Dense(len(return_type_dict))
    ])

    model.summary()

    ## callbacks to save tensorboard-files and model
    tensorboard_callback = tf.keras.callbacks.TensorBoard(
        log_dir=config['tensorboard_log_dir'],
        histogram_freq=1,
        write_graph=False,
        write_images=False)

    model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
        filepath=config['checkpoint_dir'],
        save_weights_only=True,
        monitor='accuracy',
        mode='max',
        save_best_only=True)

    model_checkpoint_callback2 = tf.keras.callbacks.ModelCheckpoint(
        filepath=config['save_model_dir'],
        save_weights_only=False,
        monitor='accuracy',
        mode='max',
        save_best_only=True)

    model.compile(
        loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
        optimizer='adam',
        metrics=['accuracy'])

    history = model.fit(train_dataset,
                        validation_data=val_dataset,
                        epochs=2,
                        callbacks=[
                            tensorboard_callback, model_checkpoint_callback,
                            model_checkpoint_callback2
                        ])

    ### evaluate the model
    loss, accuracy = model.evaluate(test_dataset)
    print("Loss: ", loss)
    print("Accuracy: ", accuracy)

    ### save trained word embeddings
    print(
        f'Saving trained word embeddings (meta.tsv,vecs.tsv) (usable in tensorboard->Projector)'
    )
    save_trained_word_embeddings(model, config['trained_word_embeddings_dir'],
                                 vectorize_layer)
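
## configure_for_performance() is used in both training scripts but not shown in this
## dump. A minimal sketch following the usual tf.data input-pipeline pattern (the batch
## size and shuffle buffer size are assumptions):
def configure_for_performance(ds, batch_size=64):
    ds = ds.cache()
    ds = ds.shuffle(buffer_size=1000)
    ds = ds.batch(batch_size)
    ds = ds.prefetch(buffer_size=tf.data.experimental.AUTOTUNE)
    return ds
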
Example n. 18
def proc_build(tarbz2_file, work_dir, save_dir, config):

    tarbz2_lib.untar_file_to_path(tarbz2_file, work_dir)
    #untar_one_pickle_file(tarbz2_file, work_dir)

    pickle_file = work_dir + os.path.basename(tarbz2_file).replace(
        '.tar.bz2', '')
    pickle_file_content = pickle_lib.get_pickle_file_content(pickle_file)
    #pickle_file_content = get_pickle_file_content(work_dir + os.path.basename(pickle_file).replace('.tar.bz2', ''))

    binaries = set()
    functions = set()
    for elem in pickle_file_content:
        binaries.add(elem[7])
        functions.add(elem[2])

    print(f'binaries >{binaries}<')

    counter = 0
    dataset_list = list()

    ## 1. get one binary
    ## 2. get one function of this binary
    ## 3. get disassembly of this function
    ## 4. check if this disassembly calls another function
    ## 4.1 filter @plt
    ## 5. if yes: get disassembly of caller function
    ## 6. save caller, callee, nr_of_args
    ## 7. check again, if it calls another function
    ## 8. if yes: get disassembly of caller function
    ## 9. save caller, callee, func_signature
    ##10. get disassembly of next function of this binary
    ##11. check if ....
    for bin in binaries:
        for func in functions:
            ## search for bin and func
            for elem in pickle_file_content:
                ### if we found bin and func
                if elem[7] == bin and elem[2] == func:
                    ## get att disassembly
                    att_dis = elem[4]
                    #print(f'att-dis >{att_dis}<')
                    ## check every line if there is a call
                    for item in att_dis:
                        ## find call in disas
                        if disassembly_lib.find_call_in_disassembly_line(item):
                            ## if found, get callee name
                            callee_name = disassembly_lib.get_callee_name_from_disassembly_line(
                                item)

                            #print(f'callee_name >{callee_name}<')

                            ## search for same bin, but callee func
                            for elem2 in pickle_file_content:
                                ### if we found it, get return type and disassembly
                                if elem2[7] == bin and elem2[2] == callee_name:

                                    ## skip pairs whose disassembly is empty or longer than half the allowed token length
                                    max_half_len = int(config['tokenized_disassembly_length']) / 2
                                    if (len(elem2[4]) > max_half_len
                                            or len(att_dis) > max_half_len
                                            or len(elem2[4]) < 1
                                            or len(att_dis) < 1):
                                        continue

                                    #return_type_func_sign = return_type_lib.get_return_type_from_function_signature(elem2[0])
                                    #return_type = return_type_lib.get_return_type_from_gdb_ptype(elem2[1])

                                    nr_of_args = return_type_lib.get_nr_of_args_from_function_signature(
                                        elem2[0])

                                    ###for debugging, what string is still unknown ?? should show nothing
                                    #                                     if return_type == 'unknown':
                                    #                                         print(f'string_before_func_name: {return_type_func_sign}')
                                    #
                                    #                                     if return_type == 'unknown':
                                    #                                         #print('unknown found')
                                    #                                         #breaker = True
                                    #                                         #break
                                    #                                         pass
                                    #                                     elif return_type == 'delete':
                                    #                                         #print('delete found')
                                    #                                         ### no return type found, so delete this item
                                    #                                         pass
                                    #                                     elif return_type == 'process_further':
                                    #                                         print(f'ERRROOOORRRR---------------')
                                    if nr_of_args == -1:
                                        print('Error: could not determine nr_of_args')
                                    else:
                                        print(f'nr_of_args >{nr_of_args}<',
                                              end='\r')

                                        tmp_att_dis = att_dis
                                        #print(f'len att-dis 1 >{len(tmp_att_dis)}<')
                                        tmp_att_dis = disassembly_lib.clean_att_disassembly_from_comment(
                                            tmp_att_dis)
                                        callee_dis = disassembly_lib.clean_att_disassembly_from_comment(
                                            elem2[4])
                                        #print(f'len att-dis 1 >{len(tmp_att_dis)}<')
                                        #print(f'att-dis >{tmp_att_dis}<')

                                        dis1_str = ' '.join(tmp_att_dis)
                                        #dis2_str = ' '.join(elem2[4])
                                        dis2_str = ' '.join(callee_dis)

                                        dis1_str = disassembly_lib.split_disassembly(
                                            dis1_str)
                                        dis2_str = disassembly_lib.split_disassembly(
                                            dis2_str)
                                        #dis1_str = dis_split(dis1_str)
                                        #dis2_str = dis_split(dis2_str)
                                        #print(f'dis1_str >{dis1_str}<')

                                        ## a tokenized sequence longer than ~6 million blows up memory (>160 GB RAM) in model.fit()
                                        if (len(dis1_str) > max_half_len
                                                or len(dis2_str) > max_half_len
                                                or len(dis1_str) < 1
                                                or len(dis2_str) < 1):
                                            print(
                                                f'tokenized_disassembly_length caller >{len(dis1_str)}<'
                                            )
                                            print(
                                                f'tokenized_disassembly_length callee >{len(dis2_str)}<'
                                            )
                                            #print(f"package >{elem[2]}< bin >{elem[3]}< file >{elem[6]}< func >{elem[7]}<")
                                            #print(f"package >{elem2[2]}< bin >{elem2[3]}< file >{elem2[6]}< func >{elem2[7]}<")

                                        else:
                                            dis_str = dis1_str + dis2_str

                                            #print(f'dis_str >{dis_str}<')

                                            dataset_list.append(
                                                (dis_str, nr_of_args))
                                            counter += 1

                                        break

    if dataset_list:
        if config['save_file_type'] == 'pickle':
            ret_file = open(
                config['save_dir'] +
                os.path.basename(pickle_file).replace('.tar.bz2', ''), 'wb+')
            pickle.dump(dataset_list, ret_file)
            ret_file.close()
        else:
            ## save as tfrecord
            dis_list = list()
            ret_list = list()

            for item in dataset_list:
                dis_list.append(item[0])
                ret_list.append(item[1])

            raw_dataset = tf.data.Dataset.from_tensor_slices(
                (dis_list, ret_list))

            serialized_features_dataset = raw_dataset.map(tf_serialize_example)

            filename = config['save_dir'] + os.path.basename(
                tarbz2_file).replace('.pickle.tar.bz2', '') + '.tfrecord'
            writer = tf.data.experimental.TFRecordWriter(filename)
            writer.write(serialized_features_dataset)

    return counter
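
## tf_serialize_example is used in Dataset.map() above but not included in this dump.
## A minimal sketch of what it typically looks like for (text, label) pairs, following
## the standard tf.train.Example pattern; it assumes an integer label (as in this
## nr_of_args variant) and 'text'/'label' feature names:
def _serialize_example(text, label):
    feature = {
        'text': tf.train.Feature(bytes_list=tf.train.BytesList(value=[text.numpy()])),
        'label': tf.train.Feature(int64_list=tf.train.Int64List(value=[int(label.numpy())])),
    }
    example = tf.train.Example(features=tf.train.Features(feature=feature))
    return example.SerializeToString()

def tf_serialize_example(text, label):
    ## wrap the eager serializer so it can run inside Dataset.map()
    serialized = tf.py_function(_serialize_example, (text, label), tf.string)
    return tf.reshape(serialized, ())
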
def proc_build(tarbz2_file, work_dir, save_dir, config):

    tarbz2_lib.untar_file_to_path(tarbz2_file, work_dir)
    #untar_one_pickle_file(tarbz2_file, work_dir)

    pickle_file = work_dir + os.path.basename(tarbz2_file).replace(
        '.tar.bz2', '')
    pickle_file_content = pickle_lib.get_pickle_file_content(pickle_file)
    #pickle_file_content = get_pickle_file_content(work_dir + os.path.basename(pickle_file).replace('.tar.bz2', ''))

    binaries = set()
    functions = set()
    for elem in pickle_file_content:
        binaries.add(elem[7])
        functions.add(elem[2])

    print(f'binaries >{binaries}<')

    counter = 0
    dataset_list = list()

    ## 1. get one binary
    ## 2. get one function of this binary
    ## 3. get disassembly of this function
    ## 4. check if this disassembly calls another function
    ## 4.1 filter @plt
    ## 5. if yes: get disassembly of caller function
    ## 6. save caller, callee, nr_of_args
    ## 7. check again, if it calls another function
    ## 8. if yes: get disassembly of caller function
    ## 9. save caller, callee, func_signature
    ##10. get disassembly of next function of this binary
    ##11. check if ....
    for bin in binaries:
        for func in functions:
            ## search for bin and func
            for elem in pickle_file_content:
                ### if we found bin and func
                if elem[7] == bin and elem[2] == func:
                    ## get att disassembly
                    att_dis = elem[4]
                    #print(f'att-dis >{att_dis}<')
                    ## check every line if there is a call
                    for item in att_dis:
                        ## find call in disas
                        if disassembly_lib.find_call_in_disassembly_line(item):
                            ## if found, get callee name
                            callee_name = disassembly_lib.get_callee_name_from_disassembly_line(
                                item)

                            #print(f'callee_name >{callee_name}<')

                            ## search for same bin, but callee func
                            for elem2 in pickle_file_content:
                                ### if we found it, get return type and disassembly
                                if elem2[7] == bin and elem2[2] == callee_name:

                                    #return_type_func_sign = return_type_lib.get_return_type_from_function_signature(elem2[0])
                                    #return_type = return_type_lib.get_return_type_from_gdb_ptype(elem2[1])
                                    nr_of_args = return_type_lib.get_nr_of_args_from_function_signature(
                                        elem2[0])
                                    arg_nr_we_want = 3
                                    if nr_of_args < arg_nr_we_want:
                                        #print('function has too few args for us')
                                        break

                                    arg_two = return_type_lib.get_arg_two_name_from_function_signature(
                                        elem2[0])

                                    result = common_stuff_lib.is_type_known(
                                        arg_two)

                                    if result == False:
                                        #print(f'arg_two not a known type')
                                        pass
                                    else:
                                        tmp_att_dis = att_dis
                                        #print(f'len att-dis 1 >{len(tmp_att_dis)}<')
                                        tmp_att_dis = disassembly_lib.clean_att_disassembly_from_comment(
                                            tmp_att_dis)
                                        callee_dis = disassembly_lib.clean_att_disassembly_from_comment(
                                            elem2[4])
                                        #print(f'len att-dis 1 >{len(tmp_att_dis)}<')
                                        #print(f'att-dis >{tmp_att_dis}<')

                                        dis1_str = ' '.join(tmp_att_dis)
                                        #dis2_str = ' '.join(elem2[4])
                                        dis2_str = ' '.join(callee_dis)

                                        dis1_str = disassembly_lib.split_disassembly(
                                            dis1_str)
                                        dis2_str = disassembly_lib.split_disassembly(
                                            dis2_str)
                                        #dis1_str = dis_split(dis1_str)
                                        #dis2_str = dis_split(dis2_str)

                                        ## a tokenized sequence longer than ~6 million blows up memory (>160 GB RAM) in model.fit()
                                        if (len(dis1_str) > 100000
                                                or len(dis2_str) > 100000
                                                or len(dis1_str) < 1
                                                or len(dis2_str) < 1):
                                            print(
                                                f'dis1_str >{len(dis1_str)}<')
                                            print(
                                                f'dis2_str >{len(dis2_str)}<')
                                            #print(f"package >{elem[2]}< bin >{elem[3]}< file >{elem[6]}< func >{elem[7]}<")
                                            #print(f"package >{elem2[2]}< bin >{elem2[3]}< file >{elem2[6]}< func >{elem2[7]}<")

                                        else:
                                            dis_str = dis1_str + dis2_str

                                            #print(f'dis_str >{dis_str}<')

                                            dataset_list.append(
                                                (dis_str, arg_two))
                                            counter += 1

                                        break

    if dataset_list:
        if config['save_file_type'] == 'pickle':
            ret_file = open(
                config['save_dir'] +
                os.path.basename(pickle_file).replace('.tar.bz2', ''), 'wb+')
            pickle.dump(dataset_list, ret_file)
            ret_file.close()
        else:
            ## save as tfrecord
            dis_list = list()
            ret_list = list()

            for item in dataset_list:
                dis_list.append(item[0])
                ret_list.append(item[1])

            raw_dataset = tf.data.Dataset.from_tensor_slices(
                (dis_list, ret_list))

            serialized_features_dataset = raw_dataset.map(tf_serialize_example)

            filename = config['save_dir'] + os.path.basename(
                tarbz2_file).replace('.pickle.tar.bz2', '') + '.tfrecord'
            writer = tf.data.experimental.TFRecordWriter(filename)
            writer.write(serialized_features_dataset)

    return counter
Example n. 20
import tarfile
import os
import sys
import pickle
#import tensorflow as tf
from datetime import datetime
from multiprocessing import Pool
import getopt
from itertools import repeat
import psutil

sys.path.append('../../lib/')
import return_type_lib
import common_stuff_lib
import tarbz2_lib
import pickle_lib
import disassembly_lib
#import tfrecord_lib

user_home_path = os.path.expanduser('~')
ret = pickle_lib.get_pickle_file_content(
    user_home_path + "/backup/save_dir/tfrecord/max_seq_length.pickle")

print(f'max-seq-length >{ret}<')
    out_v.write(out_str)
    #out_v.write('weight1\tweight2\tweight3\tweigth4\tweigth5\tweigth6\tweigth7\tweigth8\n')
    #out_v.write('\t\n')
    n = 1
    for vec in weights:
        if n == 0:
            n = 1
        else:
            out_v.write('\t'.join([str(x) for x in vec]) + "\n")
         
    out_v.close()
    out_m.close()


###load vocabulary list
vocabulary = pickle_lib.get_pickle_file_content('/tmp/save_dir/' + 'tfrecord/' + 'vocabulary_list.pickle')

###load max-sequence-length 
max_seq_length = pickle_lib.get_pickle_file_content('/tmp/save_dir/' + 'tfrecord/' + 'max_seq_length.pickle')
print(f'len-vocab-from-file >{len(vocabulary)}<')
vectorize_layer = TextVectorization(standardize=None,
                                    max_tokens=len(vocabulary)+2,
                                    output_mode='int',
                                    output_sequence_length=max_seq_length)

def vectorize_text(text, label):
    text = tf.expand_dims(text, -1)
    return vectorize_layer(text), label


def main():
Example n. 22
def check_config(config):
    if not os.path.isdir(config['checkpoint_dir']):
        print(
            f"Directory >{config['checkpoint_dir']}< does not exist. Please specify model checkpoint dir, -h for help"
        )
        exit()


#     if not os.path.isdir(config['save_dir']):
#         print(f"Directory >{config['save_dir']}< does not exist. Please specify save_dir dir, -h for help")
#         exit()

###load vocabulary list
user_home_path = os.path.expanduser('~')
vocabulary = pickle_lib.get_pickle_file_content(
    user_home_path + '/Documents/gcp-caller-callee/arg_one/' +
    'vocabulary_list.pickle')

###load max-sequence-length
max_seq_length = pickle_lib.get_pickle_file_content(
    user_home_path + '/Documents/gcp-caller-callee/arg_one/' +
    'max_seq_length.pickle')
print(f'len-vocab-from-file >{len(vocabulary)}<')
vectorize_layer = TextVectorization(standardize=None,
                                    max_tokens=len(vocabulary) + 2,
                                    output_mode='int',
                                    output_sequence_length=max_seq_length)


def main():
def proc_build(tarbz2_file, work_dir, save_dir, config):

    tarbz2_lib.untar_file_to_path(tarbz2_file, work_dir)
    #untar_one_pickle_file(tarbz2_file, work_dir)

    pickle_file = work_dir + os.path.basename(tarbz2_file).replace(
        '.tar.bz2', '')
    pickle_file_content = pickle_lib.get_pickle_file_content(pickle_file)
    #pickle_file_content = get_pickle_file_content(work_dir + os.path.basename(pickle_file).replace('.tar.bz2', ''))

    binaries = set()
    functions = set()
    for elem in pickle_file_content:
        binaries.add(elem[7])
        functions.add(elem[2])

    print(f'binaries >{binaries}<')

    counter = 0
    dataset_list = list()

    ## 1. get one binary
    ## 2. get one function of this binary
    ## 3. get disassembly of this function
    ## 4. check if this disassembly calls another function
    ## 4.1 filter @plt
    ## 5. if yes: get disassembly of caller function
    ## 6. save caller, callee, func_signature
    ## 7. check again, if it calls another function
    ## 8. if yes: get disassembly of caller function
    ## 9. save caller, callee, func_signature
    ##10. get disassembly of next function of this binary
    ##11. check if ....
    for bin in binaries:
        for func in functions:
            ## search for bin and func
            for elem in pickle_file_content:
                ### if we found bin and func
                if elem[7] == bin and elem[2] == func:
                    ## get att disassembly
                    att_dis = elem[4]
                    ## check every line if there is a call
                    for item in att_dis:
                        ## find call in disas
                        if disassembly_lib.find_call_in_disassembly_line(item):
                            ## if found, get callee name
                            callee_name = disassembly_lib.get_callee_name_from_disassembly_line(
                                item)

                            #print(f'callee_name >{callee_name}<')

                            ## search for same bin, but callee func
                            for elem2 in pickle_file_content:
                                ### if we found it, get return type and disassembly
                                if elem2[7] == bin and elem2[2] == callee_name:

                                    return_type_func_sign = return_type_lib.get_return_type_from_function_signature(
                                        elem2[0])
                                    return_type = return_type_lib.get_return_type_from_gdb_ptype(
                                        elem2[1])

                                    ### for debugging: which signature string is still unknown? should print nothing
                                    if return_type == 'unknown':
                                        print(
                                            f'string_before_func_name: {return_type_func_sign}'
                                        )

                                    if return_type == 'unknown':
                                        #print('unknown found')
                                        #breaker = True
                                        #break
                                        pass
                                    elif return_type == 'delete':
                                        #print('delete found')
                                        ### no return type found, so delete this item
                                        pass
                                    elif return_type == 'process_further':
                                        print("ERROR: unexpected return_type 'process_further'")
                                    else:

                                        dis1_str = ' '.join(att_dis)
                                        dis2_str = ' '.join(elem2[4])

                                        dis1_str = disassembly_lib.split_disassembly(
                                            dis1_str)
                                        dis2_str = disassembly_lib.split_disassembly(
                                            dis2_str)
                                        #dis1_str = dis_split(dis1_str)
                                        #dis2_str = dis_split(dis2_str)

                                        dis_str = dis1_str + dis2_str

                                        #print(f'dis_str >{dis_str}<')

                                        dataset_list.append(
                                            (dis_str, return_type))
                                        counter += 1
                                        break

    if dataset_list:
        if config['save_file_type'] == 'pickle':
            ret_file = open(
                config['save_dir'] +
                os.path.basename(pickle_file).replace('.tar.bz2', ''), 'wb+')
            pickle.dump(dataset_list, ret_file)
            ret_file.close()
        else:
            ## save as tfrecord
            dis_list = list()
            ret_list = list()

            for item in dataset_list:
                dis_list.append(item[0])
                ret_list.append(item[1])

            raw_dataset = tf.data.Dataset.from_tensor_slices(
                (dis_list, ret_list))

            serialized_features_dataset = raw_dataset.map(tf_serialize_example)

            filename = config['save_dir'] + os.path.basename(
                tarbz2_file).replace('.pickle.tar.bz2', '') + '.tfrecord'
            writer = tf.data.experimental.TFRecordWriter(filename)
            writer.write(serialized_features_dataset)

    return counter
def main():
    config = parseArgs()

    print(f'config >{config}<')

    check_if_dir_exists(config['pickle_dir'])
    check_if_dir_exists(config['work_dir'])
    check_if_dir_exists(config['save_dir'])
    check_if_dir_exists(config['tfrecord_save_dir'])

    ### get all pickle files
    #pickle_files = get_all_tar_filenames(config['pickle_dir'])
    pickle_files = common_stuff_lib.get_all_filenames_of_type(
        config['pickle_dir'], '.tar.bz2')
    ### print 5 files, check and debug
    pickle_lib.print_X_pickle_filenames(pickle_files, 5)

    ### build
    nr_of_cpus = psutil.cpu_count(logical=True)
    p = Pool(nr_of_cpus)

    pickle_files = [config["pickle_dir"] + "/" + f for f in pickle_files]
    star_list = zip(pickle_files, repeat(config['work_dir']),
                    repeat(config['save_dir']), repeat(config))
    all_ret_types = p.starmap(proc_build, star_list)
    p.close()
    p.join()

    ## build return type dict-file and max-seq-length-file and vocabulary
    pickle_files = common_stuff_lib.get_all_filenames_of_type(
        config['save_dir'], '.pickle')
    print(f'pickle-files >{pickle_files}<')

    print('Building return-type dict, vocabulary and max-sequence-length')
    ret_set = set()
    vocab = set()
    seq_length = 0
    counter = 1
    pickle_count = len(pickle_files)

    for file in pickle_files:
        print(f'File >{file}< >{counter}/{pickle_count}<', end='\r')
        counter += 1
        cont = pickle_lib.get_pickle_file_content(config['save_dir'] + file)
        for item in cont:
            #print(f'item-1 >{item[1]}<')
            ## build ret-type-dict
            ret_set.add(item[1])

            ##build max-seq-length
            if len(item[0]) > seq_length:
                seq_length = len(item[0])

            ## build vocabulary
            for word in item[0].split():
                vocab.add(word)

    print(
        f"Build return-type dict and save it to >{config['return_type_dict_file']}<"
    )
    ## build ret-type-dict and save
    ret_type_dict = dict()
    counter = 0
    for elem in ret_set:
        ret_type_dict[elem] = counter
        counter += 1

    pickle_lib.save_to_pickle_file(ret_type_dict,
                                   config['return_type_dict_file'])

    print(f"Build vocabulary and save it to >{config['vocabulary_file']}<")
    ## build vocabulary list from set and save
    vocab_list = list(vocab)
    pickle_lib.save_to_pickle_file(vocab_list, config['vocabulary_file'])

    ## save max-seq-length
    print(f"Saving max-sequence-length to >{config['max_seq_length_file']}<")
    pickle_lib.save_to_pickle_file(seq_length, config['max_seq_length_file'])

    ### transform dataset ret-types to ints
    print(
        f"Transform return-type to int and save to >{config['tfrecord_save_dir']}<"
    )
    counter = 1
    for file in pickle_files:
        print(f'Transform File >{file}< >{counter}/{pickle_count}<', end='\r')
        counter += 1
        ## start a fresh list per file, otherwise every tfrecord would also
        ## contain the content of all previously processed pickle files
        trans_ds = list()
        cont = pickle_lib.get_pickle_file_content(config['save_dir'] + file)
        for item in cont:
            trans_ds.append((item[0], ret_type_dict[item[1]]))

        tfrecord_lib.save_caller_callee_to_tfrecord(
            trans_ds,
            config['tfrecord_save_dir'] + file.replace('.pickle', '.tfrecord'))

    print("Splitting dataset to train,val,test")
    tfrecord_lib.split_to_train_val_test(config['tfrecord_save_dir'])

    print("Done. Run build_caller_callee_model.py now")