Example #1
def find_max_len(pretrain_probs):

    file_wordvec = '../asmdata/vec_embedding_no_ops.txt'
    max_len_ = 0

    for prob in pretrain_probs:
        file_train = '../asmdata/' + prob + '_Seq_train.txt'
        file_CV = '../asmdata/' + prob + '_Seq_CV.txt'
        file_test = '../asmdata/' + prob + '_Seq_test.txt'

        print("Loading pretrain ", prob)

        wordvec = Data_IO.loadWordEmbedding(file_wordvec)
        y_train, X_train, maxlen_train = Data_IO.load_ASMSeqData(
            file_train, wordvec)
        y_CV, X_CV, maxlen_CV = Data_IO.load_ASMSeqData(file_CV, wordvec)
        y_test, X_test, maxlen_test = Data_IO.load_ASMSeqData(
            file_test, wordvec)

        if nb_classes == 2:
            y_train = [x if x == 0 else 1 for x in y_train]
            y_CV = [x if x == 0 else 1 for x in y_CV]
            y_test = [x if x == 0 else 1 for x in y_test]

        # maxlen: the length of the longest instruction sequence
        maxlen = np.max([maxlen_train, maxlen_CV, maxlen_test])
        if maxlen % 2 == 1:
            maxlen = maxlen + 1
        if maxlen > max_len_:
            max_len_ = maxlen

    return max_len_
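
# Sketch (assumption, not from the original source): typical call pattern for
# find_max_len; the problem names mirror those used elsewhere in this file and
# the result is used as the padding target for every split.
def _example_find_max_len():
    padding_target = find_max_len(["FLOW016", "MNMX", "SUBINC", "SUMTRIAN"])
    print('padding target (even-rounded):', padding_target)
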
def get_entire_data(df):
    '''
    Returns the entire DataFrame ready to be applied to the SVD calculations
    '''

    rows = len(df)
    df = df.iloc[np.random.permutation(rows)].reset_index(drop=True)

    return Data_IO.OneEpochIterator(
        [df['item_a'], df['item_b'], df['similarity']], batch_size=-1)
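
# Sketch (assumption): how the iterator returned by get_entire_data might be
# consumed, assuming Data_IO.OneEpochIterator yields (item_a, item_b, similarity)
# batches and that batch_size=-1 puts the whole shuffled DataFrame in one batch.
def _example_get_entire_data(df):
    full_iter = get_entire_data(df)
    for items_a, items_b, similarities in full_iter:
        print('rows in the single SVD batch:', len(items_a))
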
def get_epoch_data(df):
    '''
    Shuffles the data and separates it into training and testing datasets
    '''

    # One shuffle is enough; sample(frac=1) already returns a random permutation of the rows
    df = df.sample(frac=1).reset_index(drop=True)
    rows = len(df)
    split_index = int(rows * 0.8)

    df_train = df[0:split_index]
    df_test = df[split_index:].reset_index(drop=True)

    # PREPARE BATCH DATA
    iter_train = Data_IO.ShuffleIterator(
        [df_train['item_a'], df_train['item_b'], df_train['similarity']],
        batch_size=BATCH_SIZE)
    iter_test = Data_IO.OneEpochIterator(
        [df_test['item_a'], df_test['item_b'], df_test['similarity']],
        batch_size=-1)
    samples_per_batch = len(df_train) // BATCH_SIZE

    return iter_train, iter_test, samples_per_batch
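
# Sketch (assumption): a minimal loop over the iterators returned by
# get_epoch_data, assuming ShuffleIterator yields (item_a, item_b, similarity)
# batches indefinitely and that samples_per_batch batches make up one epoch.
def _example_epoch_loop(df, nb_epochs=1):
    iter_train, iter_test, samples_per_batch = get_epoch_data(df)
    for _ in range(nb_epochs * samples_per_batch):
        items_a, items_b, similarities = next(iter_train)
        # ...feed the batch to the similarity/SVD model here...
    return iter_test
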
Example #4
def run_net(pretrain_probs, probs, nb_epoch=params.nb_epoch):
    # maxlen: the length of the longest instruction sequence
    maxlen = find_max_len(pretrain_probs)
    print('max number of instructions: ' + str(maxlen))

    file_wordvec = '../asmdata/vec_embedding_no_ops.txt'

    #
    # print('Loading data...\n')
    # print ('Load token-vec: '+ file_wordvec)
    # X_train_ =[]
    # X_CV_ = []
    # X_test_=[]
    # y_train_=[]
    # y_CV_ =[]
    # y_test_=[]
    #
    #
    # for prob in pretrain_probs:
    #     file_train = '../asmdata/' + prob + '_Seq_train.txt'
    #     file_CV = '../asmdata/' + prob + '_Seq_CV.txt'
    #     file_test = '../asmdata/' + prob + '_Seq_test.txt'
    #
    #     print('\nLoad training data: ' + file_train)
    #     print('\nLoad CV data: ' + file_CV)
    #     print('\nLoad test data: ' + file_test)
    #
    #     wordvec = Data_IO.loadWordEmbedding(file_wordvec)
    #     y_train, X_train, maxlen_train = Data_IO.load_ASMSeqData(file_train, wordvec)
    #     y_CV, X_CV, maxlen_CV = Data_IO.load_ASMSeqData(file_CV, wordvec)
    #     y_test, X_test, maxlen_test = Data_IO.load_ASMSeqData(file_test, wordvec)
    #
    #     if nb_classes == 2:
    #         y_train = [x if x == 0 else 1 for x in y_train]
    #         y_CV = [x if x == 0 else 1 for x in y_CV]
    #         y_test = [x if x == 0 else 1 for x in y_test]
    #
    #     y_testnum = y_test
    #
    #     # padding data
    #     Data_IO.paddingASMSeq(X_train, maxlen)
    #     Data_IO.paddingASMSeq(X_CV, maxlen)
    #     Data_IO.paddingASMSeq(X_test, maxlen)
    #
    #
    #     if len(X_train_) == 0:
    #         X_train_ = np.array(X_train)
    #         X_CV_ = np.array(X_CV)
    #         X_test_ = np.array(X_test)
    #
    #         y_train_ = np_utils.to_categorical(y_train, nb_classes)
    #         y_CV_ = np_utils.to_categorical(y_CV, nb_classes)
    #         y_test_ = np_utils.to_categorical(y_test, nb_classes)
    #     else:
    #         X_train = np.array(X_train)
    #         X_CV = np.array(X_CV)
    #         X_test = np.array(X_test)
    #
    #         y_train = np_utils.to_categorical(y_train, nb_classes)
    #         y_CV = np_utils.to_categorical(y_CV, nb_classes)
    #         y_test = np_utils.to_categorical(y_test, nb_classes)
    #
    #         X_train_ = np.concatenate((X_train_, X_train))
    #         X_CV_ = np.concatenate((X_CV_, X_CV))
    #         X_test_ = np.concatenate((X_test_, X_test))
    #
    #         y_train_ = np.concatenate((y_train_, y_train))
    #         y_CV_ = np.concatenate((y_CV_, y_CV))
    #         y_test_ = np.concatenate((y_test_, y_test))
    #
    #
    # print('num train :' + str(len(X_train_)))
    # print('num CV :' + str(len(X_CV_)))
    # print('num test :' + str(len(X_test_)))
    # print('label num train :' + str(len(y_train_)))
    # print('label num CV :' + str(len(y_CV_)))
    # print('label num test :' + str(len(y_test_)))
    #
    # X_train = X_train_
    # X_test = X_test_
    # X_CV = X_CV_
    # y_train = y_train_
    # y_test = y_test_
    # y_CV = y_CV_
    #
    # input = Input(shape=((maxlen, embddingsize)), dtype='float32')
    # # embedding = Embedding(input_dim=len(word2id), output_dim=embddingsize, input_length=max_len)(input)
    #
    # # embedding = Embedding(len(word2id), embddingsize, weights=[embedding_matrix],
    # #                       input_length=max_len, trainable=False)(input)
    #
    # x = Conv1D(filters=conv_filters[0], kernel_size=filter_length, strides=1, padding='same', dilation_rate=1,
    #            use_bias=True, activation=activation)(input)
    # print('filter num =' + str(conv_filters[0]))
    # for conv in range(1, len(conv_filters)):
    #     print('filter num =' + str(conv_filters[conv]))
    #     x = MaxPooling1D(pool_size=pool_size, strides=None, padding='same')(x)
    #     if conv == len(conv_filters) - 1:
    #         x = Conv1D(filters=conv_filters[conv], kernel_size=filter_length, strides=1, padding='same',
    #                    dilation_rate=1, use_bias=True, activation=activation, name='latent_map')(x)
    #     else:
    #         x = Conv1D(filters=conv_filters[conv], kernel_size=filter_length, strides=1, padding='same',
    #                    dilation_rate=1, use_bias=True, activation=activation)(x)
    # x = GlobalMaxPooling1D()(x)
    # x = Dense(num_hid, activation=activation)(x)
    # x = Dropout(0.5)(x)
    # x = Dense(nb_classes, activation='softmax', name='predict_layer')(x)
    #
    # model = Model(inputs=input, outputs=x)
    #
    # print(model.summary())
    #
    # #loss = 'mean_squared_error', 'binary_crossentropy','categorical_crossentropy'
    # from keras import optimizers
    # sgd = optimizers.SGD(lr=0.4, clipnorm=1.)
    # model.compile(loss= losses.categorical_crossentropy,#'mean_squared_error', #binary_crossentropy
    #               optimizer= sgd,
    #               metrics=['accuracy'])
    #
    # print('Training pretrain...')
    # best_weights = ModelCheckpoint('best_cnn_transfer.h5', verbose=1, monitor='val_accuracy', save_best_only=True, mode='auto')
    # hist = model.fit(X_train, y_train, batch_size=batch_size, epochs=nb_epoch, verbose=True, validation_data=(X_CV, y_CV),
    #                  callbacks=[best_weights])
    # # plot_training_process.plot_training_process(hist.history, results_path, prob, data_part, net="cnn_pretrained")
    # import shutil
    # shutil.copyfile('best_cnn_transfer_.h5',
    #                 os.path.join(model_path, "cnn_transfer_" + probs[0] + ".h5"))

    for prob in probs:
        file_train = '../asmdata/' + prob + '_Seq_train.txt'
        file_CV = '../asmdata/' + prob + '_Seq_CV.txt'
        file_test = '../asmdata/' + prob + '_Seq_test.txt'

        print('\nLoad training data: ' + file_train)
        print('\nLoad CV data: ' + file_CV)
        print('\nLoad test data: ' + file_test)

        wordvec = Data_IO.loadWordEmbedding(file_wordvec)
        y_train, X_train, maxlen_train = Data_IO.load_ASMSeqData(
            file_train, wordvec)
        y_CV, X_CV, maxlen_CV = Data_IO.load_ASMSeqData(file_CV, wordvec)
        y_test, X_test, maxlen_test = Data_IO.load_ASMSeqData(
            file_test, wordvec)

        if nb_classes == 2:
            y_train = [x if x == 0 else 1 for x in y_train]
            y_CV = [x if x == 0 else 1 for x in y_CV]
            y_test = [x if x == 0 else 1 for x in y_test]

        y_testnum = y_test

        Data_IO.paddingASMSeq(X_train, maxlen)
        Data_IO.paddingASMSeq(X_CV, maxlen)
        Data_IO.paddingASMSeq(X_test, maxlen)

        train_data = int(data_part * len(X_train))

        X_train = np.array(X_train)
        X_train = X_train[:train_data]
        y_train = y_train[:train_data]
        X_CV = np.array(X_CV)
        X_test = np.array(X_test)

        y_train = np_utils.to_categorical(y_train, nb_classes)
        y_CV = np_utils.to_categorical(y_CV, nb_classes)
        y_test = np_utils.to_categorical(y_test, nb_classes)

        print('num train :' + str(len(X_train)))
        print('num CV :' + str(len(X_CV)))
        print('num test :' + str(len(X_test)))
        print('Build model...')

    model = load_model(
        os.path.join(model_path, "cnn_transfer_" + probs[0] + ".h5"))
    # loss = 'mean_squared_error', 'binary_crossentropy','categorical_crossentropy'
    from keras import optimizers
    sgd = optimizers.SGD(lr=0.4, clipnorm=1.)
    model.compile(loss=losses.categorical_crossentropy,
                  optimizer=sgd,
                  metrics=['accuracy'])

    print('Start finetuning...')
    best_weights = ModelCheckpoint('best_cnn_transfer_.h5',
                                   verbose=1,
                                   monitor='val_accuracy',
                                   save_best_only=True,
                                   mode='auto')
    hist = model.fit(X_train,
                     y_train,
                     batch_size=batch_size,
                     epochs=nb_epoch,
                     verbose=True,
                     validation_data=(X_CV, y_CV),
                     callbacks=[best_weights])
    plot_training_process.plot_training_process(hist.history,
                                                results_path,
                                                prob,
                                                data_part,
                                                net="cnn_pretrained")
    import shutil
    shutil.copyfile(
        'best_cnn_transfer_.h5',
        os.path.join(
            model_path,
            "cnn_transfer_" + prob + "_" + str(int(data_part * 100)) + ".h5"))

    # # save history to json file
    # import json
    # with open('P%s_M%s.json'%(pretrained_name, model_name), 'wb') as f:
    #     json.dump(hist.history, f)
    # save history to pickle file
    # import pickle
    # with open('cnn_' + prob + "_" + str(int(data_part*100)) + '.pkl', 'wb') as f:
    #     pickle.dump(hist.history, f)

    model = load_model(
        os.path.join(
            model_path,
            "cnn_transfer_" + prob + "_" + str(int(data_part * 100)) + ".h5"))
    L = model.predict(X_test, batch_size=batch_size)

    print(L.shape)
    print(y_test.shape)
    y_true = np.argmax(y_test, axis=1)
    y_pred = np.argmax(L, axis=1)

    acc = accuracy_score(y_true, y_pred)
    # f1_macro = f1_score(y_true, y_pred, average='macro')
    # f1_micro = f1_score(y_true, y_pred, average='micro')
    f1_weighted = f1_score(y_true, y_pred, average='weighted')

    print("Accuracy=", acc)
    # print("F1-macro=", f1_macro)
    # print("F1-micro=", f1_micro)
    print("F1-score=", f1_weighted)

    import operator
    count = 0

    fout = open(
        os.path.join(
            results_path,
            'cnn_transfer_' + prob + "_" + str(int(data_part * 100)) + '.roc'),
        'w')
    fout.write('label, ')
    label_list = np.unique(y_testnum)
    fout.write(', '.join([str(i) for i in label_list]))
    fout.write('\n')
    # pred_probs avoids shadowing the probs argument of run_net
    for idx, pred_probs in enumerate(L):
        pred_label, value = max(enumerate(pred_probs), key=operator.itemgetter(1))
        if pred_label == y_testnum[idx]:
            count += 1
        fout.write(str(y_testnum[idx]) + ' ')
        fout.write(' '.join([str(i) for i in pred_probs]))
        fout.write('\n')
    print("ROC's saved to {}".format(
        os.path.join(
            results_path, 'cnn_transfer_' + prob + "_" +
            str(int(data_part * 100)) + '.roc')))
    fout.close()
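
# Sketch (assumption): typical invocation of this transfer-learning variant,
# pretraining on a set of source problems and fine-tuning on a target problem.
def _example_run_transfer():
    run_net(["FLOW016", "MNMX", "SUBINC"], ["SUMTRIAN"], nb_epoch=params.nb_epoch)
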
import os

import numpy as np
import sys

import Data_IO

embddingsize = 30  # the length of word vectors

file_wordvec = '../asmdata/vec_embedding_no_ops.txt'
print('Loading data...\n')
print('Load token-vec: ' + file_wordvec)

max_len = 0
embedding_matrix, word2id = Data_IO.loadWordEmbedding(file_wordvec)
probs = ["FLOW016", "MNMX", "SUBINC", "SUMTRIAN"]
for prob in probs:
    file_train = '../asmdata/' + prob + '_Seq_train.txt'
    file_CV = '../asmdata/' + prob + '_Seq_CV.txt'
    file_test = '../asmdata/' + prob + '_Seq_test.txt'

    print('\nLoad training data: ' + file_train)
    print('\nLoad CV data: ' + file_CV)
    print('\nLoad test data: ' + file_test)

    p_y_train, p_X_train, maxlen_train = Data_IO.load_ASMSeqData(
        file_train, word2id)
    p_y_CV, p_X_CV, maxlen_CV = Data_IO.load_ASMSeqData(file_CV, word2id)
    p_y_test, p_X_test, maxlen_test = Data_IO.load_ASMSeqData(
        file_test, word2id)
def run_net(probs, nb_epoch=params.nb_epoch):
    # probs = problems MNMX, FLOW016, ...
    # pretrained_name = pretrained model, if None then training from scratch
    # finetune = continue training from pretrained model
    # model = name of the model after training
    # input data
    # prob = sys.argv[1]#'MNMX'

    file_wordvec = '../asmdata/vec_embedding_no_ops.txt'

    print('Loading data...\n')
    print('Load token-vec: ' + file_wordvec)
    X_train = []
    X_CV = []
    X_test = []
    y_train = []
    y_CV = []
    y_test = []

    # print (word2id)
    for prob in probs:
        file_train = '../asmdata/' + prob + '_Seq_train.txt'
        file_CV = '../asmdata/' + prob + '_Seq_CV.txt'
        file_test = '../asmdata/' + prob + '_Seq_test.txt'

        print('\nLoad training data: ' + file_train)
        print('\nLoad CV data: ' + file_CV)
        print('\nLoad test data: ' + file_test)

        wordvec = Data_IO.loadWordEmbedding(file_wordvec)
        y_train, X_train, maxlen_train = Data_IO.load_ASMSeqData(
            file_train, wordvec)
        y_CV, X_CV, maxlen_CV = Data_IO.load_ASMSeqData(file_CV, wordvec)
        y_test, X_test, maxlen_test = Data_IO.load_ASMSeqData(
            file_test, wordvec)

        if nb_classes == 2:
            y_train = [x if x == 0 else 1 for x in y_train]
            y_CV = [x if x == 0 else 1 for x in y_CV]
            y_test = [x if x == 0 else 1 for x in y_test]

        y_testnum = y_test

        # maxlen: the length of the longest instruction sequence
        maxlen = np.max([maxlen_train, maxlen_CV, maxlen_test])
        if maxlen % 2 == 1:
            maxlen = maxlen + 1
        print('max number of instructions: ' + str(maxlen))
        # padding data
        Data_IO.paddingASMSeq(X_train, maxlen)
        Data_IO.paddingASMSeq(X_CV, maxlen)
        Data_IO.paddingASMSeq(X_test, maxlen)

        X_train = np.array(X_train)
        X_CV = np.array(X_CV)
        X_test = np.array(X_test)

        y_train = np_utils.to_categorical(y_train, nb_classes)
        y_CV = np_utils.to_categorical(y_CV, nb_classes)
        y_test = np_utils.to_categorical(y_test, nb_classes)

        print('num train :' + str(len(X_train)))
        print('num CV :' + str(len(X_CV)))
        print('num test :' + str(len(X_test)))
        print('Build model...')

    input = Input(shape=(maxlen, embddingsize), dtype='float32')
    x = Conv1D(filters=conv_filters[0],
               kernel_size=filter_length,
               strides=1,
               padding='same',
               dilation_rate=1,
               use_bias=True,
               activation=activation)(input)
    print('filter num =' + str(conv_filters[0]))
    for conv in range(1, len(conv_filters)):
        print('filter num =' + str(conv_filters[conv]))
        x = MaxPooling1D(pool_size=pool_size, strides=None, padding='same')(x)
        if conv == len(conv_filters) - 1:
            x = Conv1D(filters=conv_filters[conv],
                       kernel_size=filter_length,
                       strides=1,
                       padding='same',
                       dilation_rate=1,
                       use_bias=True,
                       activation=activation,
                       name='latent_map')(x)
        else:
            x = Conv1D(filters=conv_filters[conv],
                       kernel_size=filter_length,
                       strides=1,
                       padding='same',
                       dilation_rate=1,
                       use_bias=True,
                       activation=activation)(x)
    encode = x
    # x = GlobalMaxPooling1D()(x)
    # x = Dense(num_hid, activation=activation)(x)
    # x = Dense(nb_classes, activation='softmax')(x)
    # reverse the filter sizes for the decoder without mutating conv_filters in place
    conv_filters_decoder = list(reversed(conv_filters))

    for conv in range(1, len(conv_filters_decoder)):
        print('filter num =' + str(conv_filters_decoder[conv]))
        x = Conv1D(filters=conv_filters_decoder[conv],
                   kernel_size=filter_length,
                   strides=1,
                   padding='same',
                   dilation_rate=1,
                   use_bias=True,
                   activation=activation)(x)
        x = UpSampling1D(size=pool_size)(x)

    # decode equivalent to embedding layer
    x = Conv1D(filters=embddingsize,
               kernel_size=filter_length,
               strides=1,
               padding='same',
               dilation_rate=1,
               use_bias=True,
               activation=activation,
               name='decode_layer')(x)

    model = Model(inputs=input, outputs=x)
    encoder_model = Model(inputs=input, outputs=encode)
    print(model.summary())

    #loss = 'mean_squared_error', 'binary_crossentropy','categorical_crossentropy'
    from keras import optimizers
    sgd = optimizers.SGD(lr=0.4, clipnorm=1.)
    model.compile(
        loss='mean_squared_error',
        optimizer=sgd,
        metrics=['accuracy'])

    print('Training...')
    hist = model.fit(X_train,
                     X_train,
                     batch_size=batch_size,
                     epochs=nb_epoch,
                     verbose=True,
                     validation_data=(X_CV, X_CV))
    plot_training_process.plot_training_process(hist.history,
                                                results_path,
                                                prob,
                                                data_part,
                                                net="ae")
    # save model
    model.save(os.path.join(model_path, "ae_" + prob + ".h5"))
    encoder_model.save(os.path.join(model_path, "ae_encoder_" + prob + ".h5"))
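
# Sketch (assumption): the autoencoder above is typically trained one problem at
# a time, so that an "ae_<prob>.h5" file exists for each problem before the
# multi-task / fine-tuning scripts below are run.
def _example_run_autoencoder():
    for prob in ["FLOW016", "MNMX", "SUBINC", "SUMTRIAN"]:
        run_net([prob], nb_epoch=params.nb_epoch)
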
def run_net(probs, nb_epoch=params.nb_epoch):
    file_wordvec = '../asmdata/vec_embedding_no_ops.txt'

    print('Loading data...\n')
    print('Load token-vec: ' + file_wordvec)
    X_train = []
    X_CV = []
    X_test = []
    y_train = []
    y_CV = []
    y_test = []

    # print (word2id)
    for prob in probs:
        file_train = '../asmdata/' + prob + '_Seq_train.txt'
        file_CV = '../asmdata/' + prob + '_Seq_CV.txt'
        file_test = '../asmdata/' + prob + '_Seq_test.txt'
        # file_train = '../asmdata/debug_Seq_train.txt'
        # file_CV = '../asmdata/debug_Seq_CV.txt'
        # file_test = '../asmdata/debug_Seq_test.txt'

        print('\nLoad training data: ' + file_train)
        print('\nLoad CV data: ' + file_CV)
        print('\nLoad test data: ' + file_test)

        wordvec = Data_IO.loadWordEmbedding(file_wordvec)
        y_train, X_train, maxlen_train = Data_IO.load_ASMSeqData(file_train, wordvec)
        y_CV, X_CV, maxlen_CV = Data_IO.load_ASMSeqData(file_CV, wordvec)
        y_test, X_test, maxlen_test = Data_IO.load_ASMSeqData(file_test, wordvec)

        if nb_classes == 2:
            y_train = [x if x == 0 else 1 for x in y_train]
            y_CV = [x if x == 0 else 1 for x in y_CV]
            y_test = [x if x == 0 else 1 for x in y_test]

        y_testnum = y_test

        # maxlen: the length of the longest instruction sequence
        maxlen = np.max([maxlen_train, maxlen_CV, maxlen_test])
        if maxlen % 2 == 1:
            maxlen = maxlen + 1
        print('max number of instructions: ' + str(maxlen))
        # padding data
        Data_IO.paddingASMSeq(X_train, maxlen)
        Data_IO.paddingASMSeq(X_CV, maxlen)
        Data_IO.paddingASMSeq(X_test, maxlen)

        train_data = int(data_part * len(X_train))
        X_train = np.array(X_train)
        X_train = X_train[:train_data]
        y_train = y_train[:train_data]
        X_CV = np.array(X_CV)
        X_test = np.array(X_test)

        y_train = np_utils.to_categorical(y_train, nb_classes)
        y_CV = np_utils.to_categorical(y_CV, nb_classes)
        y_test = np_utils.to_categorical(y_test, nb_classes)

        print('num train :' + str(len(X_train)))
        print('num CV :' + str(len(X_CV)))
        print('num test :' + str(len(X_test)))
        print('Build model...')


    base_model = load_model(os.path.join(model_path, "ae_" + prob + ".h5"))
    branch_1 = base_model.output
    latent_map_feature = base_model.get_layer('latent_map').output
    branch_2 = GlobalMaxPooling1D()(latent_map_feature)
    branch_2 = Dense(num_hid, activation=activation)(branch_2)
    branch_2 = Dropout(0.5)(branch_2)
    branch_2 = Dense(nb_classes, activation='softmax', name='predict_layer')(branch_2)

    model = Model(inputs=base_model.input, outputs=[branch_1, branch_2])

    print(model.summary())

    # loss = 'mean_squared_error', 'binary_crossentropy','categorical_crossentropy'
    from keras import optimizers
    sgd = optimizers.SGD(lr=0.4, clipnorm=1.)

    model.compile(loss=["mean_squared_error", "categorical_crossentropy"],
                  loss_weights=[1.0, 1.0],
                  optimizer=sgd,
                  metrics=['accuracy'])

    print('Training...')
    best_weights = ModelCheckpoint('best_ae_cnn_multi_task.h5', verbose=1, monitor='val_predict_layer_accuracy',
                                   save_best_only=True, mode='auto')
    hist = model.fit(X_train, [X_train, y_train], batch_size=batch_size, epochs=nb_epoch, verbose=True,
                     validation_data=(X_CV, [X_CV, y_CV]), callbacks=[best_weights])
    # hist = model.fit(X_train, [X_train, y_train], batch_size=batch_size, epochs=nb_epoch, verbose=True, validation_data=(X_CV, [X_CV, y_CV]))
    plot_training_process.plot_training_process(hist.history, results_path, prob, data_part, net="ae_cnn_multi_task")
    # save model
    # model.save(os.path.join(model_path, "ae_cnn_" + prob + "_" + str(int(data_part * 100)) + ".h5"))
    import shutil
    shutil.copyfile('best_ae_cnn_multi_task.h5',
                    os.path.join(model_path, "ae_cnn_multi_task_" + prob + "_" + str(int(data_part * 100)) + ".h5"))

    # # save history to json file
    # import json
    # with open('P%s_M%s.json'%(pretrained_name, model_name), 'wb') as f:
    #     json.dump(hist.history, f)
    # save history to pickle file
    # import pickle
    # with open('cnn_' + prob + "_" + str(int(data_part*100)) + '.pkl', 'wb') as f:
    #     pickle.dump(hist.history, f)

    model = load_model(
        os.path.join(model_path, "ae_cnn_multi_task_" + prob + "_" +
                     str(int(data_part * 100)) + ".h5"))
    # output 0 is the autoencoder branch, output 1 is the classifier branch
    L = model.predict(X_test, batch_size=batch_size)[1]

    print(L.shape)
    print(y_test.shape)
    y_true = np.argmax(y_test, axis=1)
    y_pred = np.argmax(L, axis=1)

    acc = accuracy_score(y_true, y_pred)
    # f1_macro = f1_score(y_true, y_pred, average='macro')
    # f1_micro = f1_score(y_true, y_pred, average='micro')
    f1_weighted = f1_score(y_true, y_pred, average='weighted')

    print("Accuracy=", acc)
    # print("F1-macro=", f1_macro)
    # print("F1-micro=", f1_micro)
    print("F1-score=", f1_weighted)

    import operator
    count = 0

    fout = open(os.path.join(results_path, 'ae_cnn_multi_task_' + prob + "_" + str(int(data_part * 100)) + '.roc'), 'w')
    fout.write('label, ')
    label_list = np.unique(y_testnum)
    fout.write(', '.join([str(i) for i in label_list]))
    fout.write('\n')
    # pred_probs avoids shadowing the probs argument of run_net
    for idx, pred_probs in enumerate(L):
        pred_label, value = max(enumerate(pred_probs), key=operator.itemgetter(1))
        if pred_label == y_testnum[idx]:
            count += 1
        fout.write(str(y_testnum[idx]) + ' ')
        fout.write(' '.join([str(i) for i in pred_probs]))
        fout.write('\n')
    print("ROC's saved to {}".format(
        os.path.join(results_path, 'ae_cnn_multi_task_' + prob + "_" + str(int(data_part * 100)) + '.roc')))
    fout.close()
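
# Sketch (assumption): the multi-task variant loads the "ae_<prob>.h5" weights
# produced by the autoencoder example, so it is usually run afterwards, one
# target problem at a time.
def _example_run_multi_task():
    run_net(["MNMX"], nb_epoch=params.nb_epoch)
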
model_path = './models/ae_models'
model_name = 'ae_' + prob + '.h5'

file_train = '../asmdata/' + prob + '_Seq_train.txt'
file_CV = '../asmdata/' + prob + '_Seq_CV.txt'
file_test = '../asmdata/' + prob + '_Seq_test.txt'

# file_train = "../asmdata/debug_Seq_CV.txt"

print('\nLoad training data: ' + file_train)
print('\nLoad CV data: ' + file_CV)
print('\nLoad test data: ' + file_test)

wordvec = Data_IO.loadWordEmbedding(file_wordvec)
y_train, X_train, maxlen_train = Data_IO.load_ASMSeqData(file_train, wordvec)
y_CV, X_CV, maxlen_CV = Data_IO.load_ASMSeqData(file_CV, wordvec)
y_test, X_test, maxlen_test = Data_IO.load_ASMSeqData(file_test, wordvec)

if nb_classes == 2:
    y_train = [x if x == 0 else 1 for x in y_train]
    y_CV = [x if x == 0 else 1 for x in y_CV]
    y_test = [x if x == 0 else 1 for x in y_test]

y_testnum = y_test

# maxlen: the length of the longest instruction sequence
maxlen = np.max([maxlen_train, maxlen_CV, maxlen_test])
is_padded = 0
# round the padding length up to an even number and flag the adjustment
if maxlen % 2 == 1:
    maxlen = maxlen + 1
    is_padded = 1

def get_data_df(filename):
    '''
    Opens the data .csv file and returns it as a DataFrame
    '''
    return Data_IO.csv_to_df(filename)
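
# Sketch (assumption): the DataFrame helpers in this file are typically chained
# like this; the CSV path is purely illustrative.
def _example_prepare_similarity_data(filename='../data/similarity_pairs.csv'):
    df = get_data_df(filename)
    iter_train, iter_test, samples_per_batch = get_epoch_data(df)
    return iter_train, iter_test, samples_per_batch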