def find_max_len(pretrain_probs):
    file_wordvec = '../asmdata/vec_embedding_no_ops.txt'
    max_len_ = 0
    for prob in pretrain_probs:
        file_train = '../asmdata/' + prob + '_Seq_train.txt'
        file_CV = '../asmdata/' + prob + '_Seq_CV.txt'
        file_test = '../asmdata/' + prob + '_Seq_test.txt'
        print("Loading pretrain ", prob)

        wordvec = Data_IO.loadWordEmbedding(file_wordvec)
        y_train, X_train, maxlen_train = Data_IO.load_ASMSeqData(file_train, wordvec)
        y_CV, X_CV, maxlen_CV = Data_IO.load_ASMSeqData(file_CV, wordvec)
        y_test, X_test, maxlen_test = Data_IO.load_ASMSeqData(file_test, wordvec)

        if nb_classes == 2:
            y_train = [x if x == 0 else 1 for x in y_train]
            y_CV = [x if x == 0 else 1 for x in y_CV]
            y_test = [x if x == 0 else 1 for x in y_test]

        # maxlen: the length of the longest instruction sequence
        maxlen = np.max([maxlen_train, maxlen_CV, maxlen_test])
        # round up to an even length so pooling/upsampling stages line up
        if maxlen % 2 == 1:
            maxlen = maxlen + 1
        if maxlen > max_len_:
            max_len_ = maxlen  # track the running maximum (the original assigned the undefined name `max_len`)
    return max_len_
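
# --- usage sketch (hypothetical) --------------------------------------------
# A minimal illustration of find_max_len. The problem IDs are examples; the
# corresponding <prob>_Seq_{train,CV,test}.txt files are assumed to exist
# under ../asmdata/, and Data_IO, np, and nb_classes must already be in scope.
if __name__ == '__main__':
    maxlen = find_max_len(["FLOW016", "MNMX"])
    print('padded sequence length shared across problems:', maxlen)
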
def get_entire_data(df):
    '''
    Returns the entire DataFrame ready to be applied to the SVD calculations
    '''
    rows = len(df)
    df = df.iloc[np.random.permutation(rows)].reset_index(drop=True)
    return Data_IO.OneEpochIterator(
        [df['item_a'], df['item_b'], df['similarity']], batch_size=-1)
def get_epoch_data(df):
    '''
    Shuffles the data and separates it into training and testing datasets
    '''
    # one shuffle suffices (the original shuffled twice: sample(frac=1)
    # followed by a second np.random.permutation pass)
    df = df.sample(frac=1).reset_index(drop=True)
    rows = len(df)

    split_index = int(rows * 0.8)
    df_train = df[0:split_index]
    df_test = df[split_index:].reset_index(drop=True)

    # PREPARE BATCH DATA
    iter_train = Data_IO.ShuffleIterator(
        [df_train['item_a'], df_train['item_b'], df_train['similarity']],
        batch_size=BATCH_SIZE)
    iter_test = Data_IO.OneEpochIterator(
        [df_test['item_a'], df_test['item_b'], df_test['similarity']],
        batch_size=-1)
    samples_per_batch = len(df_train) // BATCH_SIZE
    return iter_train, iter_test, samples_per_batch
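
# --- consumption sketch (hedged) ---------------------------------------------
# How the two iterators might be consumed in a training loop. The exact
# iterator protocol lives in this repo's Data_IO module; here we assume each
# step yields aligned item_a / item_b / similarity arrays. If ShuffleIterator
# does not implement __next__, replace next(iter_train) with iter_train.next().
def train_one_epoch(df, model_step):
    iter_train, iter_test, samples_per_batch = get_epoch_data(df)
    for _ in range(samples_per_batch):
        items_a, items_b, sims = next(iter_train)  # assumed protocol
        model_step(items_a, items_b, sims)         # hypothetical training step
    return iter_test  # one-epoch iterator over the held-out 20%
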
def run_net(pretrain_probs, probs, nb_epoch=params.nb_epoch):
    # maxlen: the length of the longest instruction sequence
    maxlen = find_max_len(pretrain_probs)
    print('max number of instructions: ' + str(maxlen))

    file_wordvec = '../asmdata/vec_embedding_no_ops.txt'

    # --- pretraining (disabled) ----------------------------------------------
    # The block below builds and trains the CNN on the pretraining problems and
    # saves it as cnn_transfer_<probs[0]>.h5, which the fine-tuning loop further
    # down loads. Kept commented out, as in the original source.
    #
    # print('Loading data...\n')
    # print('Load token-vec: ' + file_wordvec)
    # X_train_ = []
    # X_CV_ = []
    # X_test_ = []
    # y_train_ = []
    # y_CV_ = []
    # y_test_ = []
    #
    # for prob in pretrain_probs:
    #     file_train = '../asmdata/' + prob + '_Seq_train.txt'
    #     file_CV = '../asmdata/' + prob + '_Seq_CV.txt'
    #     file_test = '../asmdata/' + prob + '_Seq_test.txt'
    #
    #     print('\nLoad training data: ' + file_train)
    #     print('\nLoad CV data: ' + file_CV)
    #     print('\nLoad test data: ' + file_test)
    #
    #     wordvec = Data_IO.loadWordEmbedding(file_wordvec)
    #     y_train, X_train, maxlen_train = Data_IO.load_ASMSeqData(file_train, wordvec)
    #     y_CV, X_CV, maxlen_CV = Data_IO.load_ASMSeqData(file_CV, wordvec)
    #     y_test, X_test, maxlen_test = Data_IO.load_ASMSeqData(file_test, wordvec)
    #
    #     if nb_classes == 2:
    #         y_train = [x if x == 0 else 1 for x in y_train]
    #         y_CV = [x if x == 0 else 1 for x in y_CV]
    #         y_test = [x if x == 0 else 1 for x in y_test]
    #
    #     y_testnum = y_test
    #
    #     # padding data
    #     Data_IO.paddingASMSeq(X_train, maxlen)
    #     Data_IO.paddingASMSeq(X_CV, maxlen)
    #     Data_IO.paddingASMSeq(X_test, maxlen)
    #
    #     if len(X_train_) == 0:
    #         X_train_ = np.array(X_train)
    #         X_CV_ = np.array(X_CV)
    #         X_test_ = np.array(X_test)
    #
    #         y_train_ = np_utils.to_categorical(y_train, nb_classes)
    #         y_CV_ = np_utils.to_categorical(y_CV, nb_classes)
    #         y_test_ = np_utils.to_categorical(y_test, nb_classes)
    #     else:
    #         X_train = np.array(X_train)
    #         X_CV = np.array(X_CV)
    #         X_test = np.array(X_test)
    #
    #         y_train = np_utils.to_categorical(y_train, nb_classes)
    #         y_CV = np_utils.to_categorical(y_CV, nb_classes)
    #         y_test = np_utils.to_categorical(y_test, nb_classes)
    #
    #         X_train_ = np.concatenate((X_train_, X_train))
    #         X_CV_ = np.concatenate((X_CV_, X_CV))
    #         X_test_ = np.concatenate((X_test_, X_test))
    #
    #         y_train_ = np.concatenate((y_train_, y_train))
    #         y_CV_ = np.concatenate((y_CV_, y_CV))
    #         y_test_ = np.concatenate((y_test_, y_test))
    #
    # print('num train :' + str(len(X_train_)))
    # print('num CV :' + str(len(X_CV_)))
    # print('num test :' + str(len(X_test_)))
    # print('label num train :' + str(len(y_train_)))
    # print('label num CV :' + str(len(y_CV_)))
    # print('label num test :' + str(len(y_test_)))
    #
    # X_train = X_train_
    # X_test = X_test_
    # X_CV = X_CV_
    # y_train = y_train_
    # y_test = y_test_
    # y_CV = y_CV_
    #
    # input = Input(shape=(maxlen, embddingsize), dtype='float32')
    # # embedding = Embedding(input_dim=len(word2id), output_dim=embddingsize,
    # #                       input_length=max_len)(input)
    # # embedding = Embedding(len(word2id), embddingsize, weights=[embedding_matrix],
    # #                       input_length=max_len, trainable=False)(input)
    #
    # x = Conv1D(filters=conv_filters[0], kernel_size=filter_length, strides=1,
    #            padding='same', dilation_rate=1, use_bias=True,
    #            activation=activation)(input)
    # print('filter num =' + str(conv_filters[0]))
    # for conv in range(1, len(conv_filters)):
    #     print('filter num =' + str(conv_filters[conv]))
    #     x = MaxPooling1D(pool_size=pool_size, strides=None, padding='same')(x)
    #     if conv == len(conv_filters) - 1:
    #         x = Conv1D(filters=conv_filters[conv], kernel_size=filter_length,
    #                    strides=1, padding='same', dilation_rate=1, use_bias=True,
    #                    activation=activation, name='latent_map')(x)
    #     else:
    #         x = Conv1D(filters=conv_filters[conv], kernel_size=filter_length,
    #                    strides=1, padding='same', dilation_rate=1, use_bias=True,
    #                    activation=activation)(x)
    #
    # x = GlobalMaxPooling1D()(x)
    # x = Dense(num_hid, activation=activation)(x)
    # x = Dropout(0.5)(x)
    # x = Dense(nb_classes, activation='softmax', name='predict_layer')(x)
    #
    # model = Model(inputs=input, outputs=x)
    # print(model.summary())
    #
    # # loss = 'mean_squared_error', 'binary_crossentropy', 'categorical_crossentropy'
    # from keras import optimizers
    # sgd = optimizers.SGD(lr=0.4, clipnorm=1.)
    # model.compile(loss=losses.categorical_crossentropy,
    #               optimizer=sgd,
    #               metrics=['accuracy'])
    #
    # print('Training pretrain...')
    # # checkpoint filename must match the copyfile source below (the original
    # # saved 'best_cnn_transfer.h5' but copied 'best_cnn_transfer_.h5')
    # best_weights = ModelCheckpoint('best_cnn_transfer_.h5', verbose=1,
    #                                monitor='val_accuracy', save_best_only=True,
    #                                mode='auto')
    # hist = model.fit(X_train, y_train, batch_size=batch_size, epochs=nb_epoch,
    #                  verbose=True, validation_data=(X_CV, y_CV),
    #                  callbacks=[best_weights])
    #
    # plot_training_process.plot_training_process(hist.history, results_path, prob,
    #                                             data_part, net="cnn_pretrained")
    # import shutil
    # shutil.copyfile('best_cnn_transfer_.h5',
    #                 os.path.join(model_path, "cnn_transfer_" + probs[0] + ".h5"))

    for prob in probs:
        file_train = '../asmdata/' + prob + '_Seq_train.txt'
        file_CV = '../asmdata/' + prob + '_Seq_CV.txt'
        file_test = '../asmdata/' + prob + '_Seq_test.txt'

        print('\nLoad training data: ' + file_train)
        print('\nLoad CV data: ' + file_CV)
        print('\nLoad test data: ' + file_test)

        wordvec = Data_IO.loadWordEmbedding(file_wordvec)
        y_train, X_train, maxlen_train = Data_IO.load_ASMSeqData(file_train, wordvec)
        y_CV, X_CV, maxlen_CV = Data_IO.load_ASMSeqData(file_CV, wordvec)
        y_test, X_test, maxlen_test = Data_IO.load_ASMSeqData(file_test, wordvec)

        if nb_classes == 2:
            y_train = [x if x == 0 else 1 for x in y_train]
            y_CV = [x if x == 0 else 1 for x in y_CV]
            y_test = [x if x == 0 else 1 for x in y_test]

        y_testnum = y_test

        # pad to the pretraining maxlen so shapes match the loaded model
        Data_IO.paddingASMSeq(X_train, maxlen)
        Data_IO.paddingASMSeq(X_CV, maxlen)
        Data_IO.paddingASMSeq(X_test, maxlen)

        # keep only the first data_part fraction of the training set
        train_data = int(data_part * len(X_train))
        X_train = np.array(X_train)
        X_train = X_train[:train_data]
        y_train = y_train[:train_data]
        X_CV = np.array(X_CV)
        X_test = np.array(X_test)

        y_train = np_utils.to_categorical(y_train, nb_classes)
        y_CV = np_utils.to_categorical(y_CV, nb_classes)
        y_test = np_utils.to_categorical(y_test, nb_classes)

        print('num train :' + str(len(X_train)))
        print('num CV :' + str(len(X_CV)))
        print('num test :' + str(len(X_test)))

        print('Build model...')
        model = load_model(os.path.join(model_path, "cnn_transfer_" + probs[0] + ".h5"))

        # loss = 'mean_squared_error', 'binary_crossentropy', 'categorical_crossentropy'
        from keras import optimizers
        sgd = optimizers.SGD(lr=0.4, clipnorm=1.)
        model.compile(loss=losses.categorical_crossentropy,
                      optimizer=sgd,
                      metrics=['accuracy'])

        print('Start finetuning...')
        best_weights = ModelCheckpoint('best_cnn_transfer_.h5', verbose=1,
                                       monitor='val_accuracy', save_best_only=True,
                                       mode='auto')
        hist = model.fit(X_train, y_train, batch_size=batch_size, epochs=nb_epoch,
                         verbose=True, validation_data=(X_CV, y_CV),
                         callbacks=[best_weights])

        plot_training_process.plot_training_process(hist.history, results_path, prob,
                                                    data_part, net="cnn_pretrained")

        import shutil
        shutil.copyfile('best_cnn_transfer_.h5',
                        os.path.join(model_path,
                                     "cnn_transfer_" + prob + "_" +
                                     str(int(data_part * 100)) + ".h5"))

        # # save history to json file
        # import json
        # with open('P%s_M%s.json' % (pretrained_name, model_name), 'wb') as f:
        #     json.dump(hist.history, f)
        # # save history to pickle file
        # import pickle
        # with open('cnn_' + prob + "_" + str(int(data_part * 100)) + '.pkl', 'wb') as f:
        #     pickle.dump(hist.history, f)

        # reload the best checkpoint for evaluation
        model = load_model(os.path.join(model_path,
                                        "cnn_transfer_" + prob + "_" +
                                        str(int(data_part * 100)) + ".h5"))

        L = model.predict(X_test, batch_size=batch_size)
        print(L.shape)
        print(y_test.shape)
        y_true = np.argmax(y_test, axis=1)
        y_pred = np.argmax(L, axis=1)
        acc = accuracy_score(y_true, y_pred)
        # f1_macro = f1_score(y_true, y_pred, average='macro')
        # f1_micro = f1_score(y_true, y_pred, average='micro')
        f1_weighted = f1_score(y_true, y_pred, average='weighted')
        print("Accuracy=", acc)
        # print("F1-macro=", f1_macro)
        # print("F1-micro=", f1_micro)
        print("F1-score=", f1_weighted)

        import operator
        count = 0
        roc_file = os.path.join(results_path,
                                'cnn_transfer_' + prob + "_" +
                                str(int(data_part * 100)) + '.roc')
        fout = open(roc_file, 'w')
        fout.write('label, ')
        label_list = np.unique(y_testnum)
        fout.write(', '.join([str(i) for i in label_list]))
        fout.write('\n')
        # `pred_probs` must not shadow the `probs` argument (the original
        # reused the name, which broke the outer loop after one iteration)
        for idx, pred_probs in enumerate(L):
            pred_label, value = max(enumerate(pred_probs), key=operator.itemgetter(1))
            if pred_label == y_testnum[idx]:
                count += 1
            fout.write(str(y_testnum[idx]) + ' ')
            fout.write(' '.join([str(i) for i in pred_probs]))
            fout.write('\n')
        print("ROC's saved to {}".format(roc_file))
        fout.close()
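
# --- usage sketch (hypothetical) ---------------------------------------------
# A hedged driver for the transfer pipeline above. The problem IDs match the
# four used elsewhere in this repo, but splitting them this way is just an
# example. Note that run_net loads "cnn_transfer_" + probs[0] + ".h5" from
# model_path, so a pretrained checkpoint with that name must already exist
# (e.g. produced by the commented-out pretraining block inside run_net).
if __name__ == '__main__':
    pretrain_probs = ["FLOW016", "MNMX", "SUBINC"]
    finetune_probs = ["SUMTRIAN"]
    run_net(pretrain_probs, finetune_probs, nb_epoch=params.nb_epoch)
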
import os
import sys

import numpy as np

import Data_IO

embddingsize = 30  # the length of word vectors
file_wordvec = '../asmdata/vec_embedding_no_ops.txt'

print('Loading data...\n')
print('Load token-vec: ' + file_wordvec)

max_len = 0
embedding_matrix, word2id = Data_IO.loadWordEmbedding(file_wordvec)

probs = ["FLOW016", "MNMX", "SUBINC", "SUMTRIAN"]
for prob in probs:
    file_train = '../asmdata/' + prob + '_Seq_train.txt'
    file_CV = '../asmdata/' + prob + '_Seq_CV.txt'
    file_test = '../asmdata/' + prob + '_Seq_test.txt'

    print('\nLoad training data: ' + file_train)
    print('\nLoad CV data: ' + file_CV)
    print('\nLoad test data: ' + file_test)

    p_y_train, p_X_train, maxlen_train = Data_IO.load_ASMSeqData(file_train, word2id)
    p_y_CV, p_X_CV, maxlen_CV = Data_IO.load_ASMSeqData(file_CV, word2id)
    p_y_test, p_X_test, maxlen_test = Data_IO.load_ASMSeqData(file_test, word2id)

    # track the longest sequence across all problems (mirrors find_max_len;
    # the original fragment initialized max_len but ended before updating it)
    max_len = max(max_len, maxlen_train, maxlen_CV, maxlen_test)
def run_net(probs, nb_epoch=params.nb_epoch):
    # probs = problems MNMX, FLOW016, ...
    # pretrained_name = pretrained model, if None then training from scratch
    # finetune = continue training from pretrained model
    # model = name of the model after training

    # input data
    # prob = sys.argv[1]  # 'MNMX'
    file_wordvec = '../asmdata/vec_embedding_no_ops.txt'

    print('Loading data...\n')
    print('Load token-vec: ' + file_wordvec)

    X_train = []
    X_CV = []
    X_test = []
    y_train = []
    y_CV = []
    y_test = []

    # print(word2id)
    for prob in probs:
        file_train = '../asmdata/' + prob + '_Seq_train.txt'
        file_CV = '../asmdata/' + prob + '_Seq_CV.txt'
        file_test = '../asmdata/' + prob + '_Seq_test.txt'

        print('\nLoad training data: ' + file_train)
        print('\nLoad CV data: ' + file_CV)
        print('\nLoad test data: ' + file_test)

        wordvec = Data_IO.loadWordEmbedding(file_wordvec)
        y_train, X_train, maxlen_train = Data_IO.load_ASMSeqData(file_train, wordvec)
        y_CV, X_CV, maxlen_CV = Data_IO.load_ASMSeqData(file_CV, wordvec)
        y_test, X_test, maxlen_test = Data_IO.load_ASMSeqData(file_test, wordvec)

        if nb_classes == 2:
            y_train = [x if x == 0 else 1 for x in y_train]
            y_CV = [x if x == 0 else 1 for x in y_CV]
            y_test = [x if x == 0 else 1 for x in y_test]

        y_testnum = y_test

        # maxlen: the length of the longest instruction sequence
        maxlen = np.max([maxlen_train, maxlen_CV, maxlen_test])
        if maxlen % 2 == 1:
            maxlen = maxlen + 1
        print('max number of instructions: ' + str(maxlen))

        # padding data
        Data_IO.paddingASMSeq(X_train, maxlen)
        Data_IO.paddingASMSeq(X_CV, maxlen)
        Data_IO.paddingASMSeq(X_test, maxlen)

        X_train = np.array(X_train)
        X_CV = np.array(X_CV)
        X_test = np.array(X_test)

        y_train = np_utils.to_categorical(y_train, nb_classes)
        y_CV = np_utils.to_categorical(y_CV, nb_classes)
        y_test = np_utils.to_categorical(y_test, nb_classes)

        print('num train :' + str(len(X_train)))
        print('num CV :' + str(len(X_CV)))
        print('num test :' + str(len(X_test)))

        print('Build model...')
        input = Input(shape=(maxlen, embddingsize), dtype='float32')

        # encoder: stacked Conv1D + MaxPooling1D, ending in 'latent_map'
        x = Conv1D(filters=conv_filters[0], kernel_size=filter_length, strides=1,
                   padding='same', dilation_rate=1, use_bias=True,
                   activation=activation)(input)
        print('filter num =' + str(conv_filters[0]))
        for conv in range(1, len(conv_filters)):
            print('filter num =' + str(conv_filters[conv]))
            x = MaxPooling1D(pool_size=pool_size, strides=None, padding='same')(x)
            if conv == len(conv_filters) - 1:
                x = Conv1D(filters=conv_filters[conv], kernel_size=filter_length,
                           strides=1, padding='same', dilation_rate=1, use_bias=True,
                           activation=activation, name='latent_map')(x)
            else:
                x = Conv1D(filters=conv_filters[conv], kernel_size=filter_length,
                           strides=1, padding='same', dilation_rate=1, use_bias=True,
                           activation=activation)(x)
        encode = x
        # x = GlobalMaxPooling1D()(x)
        # x = Dense(num_hid, activation=activation)(x)
        # x = Dense(nb_classes, activation='softmax')(x)

        # decoder: mirrored filter sizes; use a reversed copy so the encoder's
        # filter list is not mutated (the original reversed it in place)
        conv_filters_decoder = list(reversed(conv_filters))
        for conv in range(1, len(conv_filters_decoder)):
            print('filter num =' + str(conv_filters_decoder[conv]))
            x = Conv1D(filters=conv_filters_decoder[conv], kernel_size=filter_length,
                       strides=1, padding='same', dilation_rate=1, use_bias=True,
                       activation=activation)(x)
            x = UpSampling1D(size=pool_size)(x)

        # decode back to the embedding dimensionality
        x = Conv1D(filters=embddingsize, kernel_size=filter_length, strides=1,
                   padding='same', dilation_rate=1, use_bias=True,
                   activation=activation, name='decode_layer')(x)

        model = Model(inputs=input, outputs=x)
        encoder_model = Model(inputs=input, outputs=encode)
        print(model.summary())

        # loss = 'mean_squared_error', 'binary_crossentropy', 'categorical_crossentropy'
        from keras import optimizers
        sgd = optimizers.SGD(lr=0.4, clipnorm=1.)
        model.compile(loss='mean_squared_error',
                      optimizer=sgd,
                      metrics=['accuracy'])

        print('Training...')
        hist = model.fit(X_train, X_train, batch_size=batch_size, epochs=nb_epoch,
                         verbose=True, validation_data=(X_CV, X_CV))
        plot_training_process.plot_training_process(hist.history, results_path, prob,
                                                    data_part, net="ae")
        # save model
        model.save(os.path.join(model_path, "ae_" + prob + ".h5"))
        encoder_model.save(os.path.join(model_path, "ae_encoder_" + prob + ".h5"))
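
# --- shape-check sketch (illustrative, not part of the pipeline) --------------
# Why the even maxlen matters: each MaxPooling1D(pool_size=2, padding='same')
# halves the time axis and each UpSampling1D(size=2) doubles it, so the
# reconstruction only matches the input length when the length divides cleanly.
# The even rounding above covers a single pooling stage; deeper stacks would
# need maxlen divisible by pool_size once per stage. The hyperparameters below
# are toy values, not this repo's conv_filters/pool_size settings.
def _toy_ae_shape_check():
    from keras.layers import Input, Conv1D, MaxPooling1D, UpSampling1D
    from keras.models import Model
    maxlen, emb = 16, 30  # toy values; the real maxlen comes from the data
    inp = Input(shape=(maxlen, emb))
    h = Conv1D(8, 3, padding='same', activation='relu')(inp)
    h = MaxPooling1D(2, padding='same')(h)  # time axis: 16 -> 8
    h = Conv1D(4, 3, padding='same', activation='relu', name='latent_map')(h)
    h = Conv1D(8, 3, padding='same', activation='relu')(h)
    h = UpSampling1D(2)(h)                  # 8 -> 16; exact only because maxlen is even
    out = Conv1D(emb, 3, padding='same', activation='relu')(h)
    assert Model(inp, out).output_shape == (None, maxlen, emb)
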
def run_net(probs, nb_epoch=params.nb_epoch):
    file_wordvec = '../asmdata/vec_embedding_no_ops.txt'

    print('Loading data...\n')
    print('Load token-vec: ' + file_wordvec)

    X_train = []
    X_CV = []
    X_test = []
    y_train = []
    y_CV = []
    y_test = []

    # print(word2id)
    for prob in probs:
        file_train = '../asmdata/' + prob + '_Seq_train.txt'
        file_CV = '../asmdata/' + prob + '_Seq_CV.txt'
        file_test = '../asmdata/' + prob + '_Seq_test.txt'
        # file_train = '../asmdata/debug_Seq_train.txt'
        # file_CV = '../asmdata/debug_Seq_CV.txt'
        # file_test = '../asmdata/debug_Seq_test.txt'

        print('\nLoad training data: ' + file_train)
        print('\nLoad CV data: ' + file_CV)
        print('\nLoad test data: ' + file_test)

        wordvec = Data_IO.loadWordEmbedding(file_wordvec)
        y_train, X_train, maxlen_train = Data_IO.load_ASMSeqData(file_train, wordvec)
        y_CV, X_CV, maxlen_CV = Data_IO.load_ASMSeqData(file_CV, wordvec)
        y_test, X_test, maxlen_test = Data_IO.load_ASMSeqData(file_test, wordvec)

        if nb_classes == 2:
            y_train = [x if x == 0 else 1 for x in y_train]
            y_CV = [x if x == 0 else 1 for x in y_CV]
            y_test = [x if x == 0 else 1 for x in y_test]

        y_testnum = y_test

        # maxlen: the length of the longest instruction sequence
        maxlen = np.max([maxlen_train, maxlen_CV, maxlen_test])
        if maxlen % 2 == 1:
            maxlen = maxlen + 1
        print('max number of instructions: ' + str(maxlen))

        # padding data
        Data_IO.paddingASMSeq(X_train, maxlen)
        Data_IO.paddingASMSeq(X_CV, maxlen)
        Data_IO.paddingASMSeq(X_test, maxlen)

        # keep only the first data_part fraction of the training set
        train_data = int(data_part * len(X_train))
        X_train = np.array(X_train)
        X_train = X_train[:train_data]
        y_train = y_train[:train_data]
        X_CV = np.array(X_CV)
        X_test = np.array(X_test)

        y_train = np_utils.to_categorical(y_train, nb_classes)
        y_CV = np_utils.to_categorical(y_CV, nb_classes)
        y_test = np_utils.to_categorical(y_test, nb_classes)

        print('num train :' + str(len(X_train)))
        print('num CV :' + str(len(X_CV)))
        print('num test :' + str(len(X_test)))

        print('Build model...')
        base_model = load_model(os.path.join(model_path, "ae_" + prob + ".h5"))

        # branch 1: the autoencoder's reconstruction output
        branch_1 = base_model.output
        # branch 2: a classifier head on top of the encoder's latent map
        latent_map_feature = base_model.get_layer('latent_map').output
        branch_2 = GlobalMaxPooling1D()(latent_map_feature)
        branch_2 = Dense(num_hid, activation=activation)(branch_2)
        branch_2 = Dropout(0.5)(branch_2)
        branch_2 = Dense(nb_classes, activation='softmax', name='predict_layer')(branch_2)

        model = Model(inputs=base_model.input, outputs=[branch_1, branch_2])
        print(model.summary())

        # loss = 'mean_squared_error', 'binary_crossentropy', 'categorical_crossentropy'
        from keras import optimizers
        sgd = optimizers.SGD(lr=0.4, clipnorm=1.)
        model.compile(loss=["mean_squared_error", "categorical_crossentropy"],
                      loss_weights=[1.0, 1.0],
                      optimizer=sgd,
                      metrics=['accuracy'])

        print('Training...')
        best_weights = ModelCheckpoint('best_ae_cnn_multi_task.h5', verbose=1,
                                       monitor='val_predict_layer_accuracy',
                                       save_best_only=True, mode='auto')
        hist = model.fit(X_train, [X_train, y_train], batch_size=batch_size,
                         epochs=nb_epoch, verbose=True,
                         validation_data=(X_CV, [X_CV, y_CV]),
                         callbacks=[best_weights])

        plot_training_process.plot_training_process(hist.history, results_path, prob,
                                                    data_part, net="ae_cnn_multi_task")

        # save the best checkpoint under its final name
        # model.save(os.path.join(model_path, "ae_cnn_" + prob + "_" + str(int(data_part * 100)) + ".h5"))
        import shutil
        shutil.copyfile('best_ae_cnn_multi_task.h5',
                        os.path.join(model_path,
                                     "ae_cnn_multi_task_" + prob + "_" +
                                     str(int(data_part * 100)) + ".h5"))

        # # save history to json file
        # import json
        # with open('P%s_M%s.json' % (pretrained_name, model_name), 'wb') as f:
        #     json.dump(hist.history, f)
        # # save history to pickle file
        # import pickle
        # with open('cnn_' + prob + "_" + str(int(data_part * 100)) + '.pkl', 'wb') as f:
        #     pickle.dump(hist.history, f)

        model = load_model(os.path.join(model_path,
                                        "ae_cnn_multi_task_" + prob + "_" +
                                        str(int(data_part * 100)) + ".h5"))

        # index 0 is the autoencoder branch, index 1 the classifier branch
        L = model.predict(X_test, batch_size=batch_size)[1]
        print(L.shape)
        print(y_test.shape)
        y_true = np.argmax(y_test, axis=1)
        y_pred = np.argmax(L, axis=1)
        acc = accuracy_score(y_true, y_pred)
        # f1_macro = f1_score(y_true, y_pred, average='macro')
        # f1_micro = f1_score(y_true, y_pred, average='micro')
        f1_weighted = f1_score(y_true, y_pred, average='weighted')
        print("Accuracy=", acc)
        # print("F1-macro=", f1_macro)
        # print("F1-micro=", f1_micro)
        print("F1-score=", f1_weighted)

        import operator
        count = 0
        roc_file = os.path.join(results_path,
                                'ae_cnn_multi_task_' + prob + "_" +
                                str(int(data_part * 100)) + '.roc')
        fout = open(roc_file, 'w')
        fout.write('label, ')
        label_list = np.unique(y_testnum)
        fout.write(', '.join([str(i) for i in label_list]))
        fout.write('\n')
        # `pred_probs` must not shadow the `probs` argument (the original
        # reused the name, which broke the outer loop after one iteration)
        for idx, pred_probs in enumerate(L):
            pred_label, value = max(enumerate(pred_probs), key=operator.itemgetter(1))
            if pred_label == y_testnum[idx]:
                count += 1
            fout.write(str(y_testnum[idx]) + ' ')
            fout.write(' '.join([str(i) for i in pred_probs]))
            fout.write('\n')
        print("ROC's saved to {}".format(roc_file))
        fout.close()
model_path = './models/ae_models'
model_name = 'ae_' + prob + '.h5'

file_train = '../asmdata/' + prob + '_Seq_train.txt'
file_CV = '../asmdata/' + prob + '_Seq_CV.txt'
file_test = '../asmdata/' + prob + '_Seq_test.txt'
# file_train = "../asmdata/debug_Seq_CV.txt"

print('\nLoad training data: ' + file_train)
print('\nLoad CV data: ' + file_CV)
print('\nLoad test data: ' + file_test)

wordvec = Data_IO.loadWordEmbedding(file_wordvec)
y_train, X_train, maxlen_train = Data_IO.load_ASMSeqData(file_train, wordvec)
y_CV, X_CV, maxlen_CV = Data_IO.load_ASMSeqData(file_CV, wordvec)
y_test, X_test, maxlen_test = Data_IO.load_ASMSeqData(file_test, wordvec)

if nb_classes == 2:
    y_train = [x if x == 0 else 1 for x in y_train]
    y_CV = [x if x == 0 else 1 for x in y_CV]
    y_test = [x if x == 0 else 1 for x in y_test]

y_testnum = y_test

# maxlen: the length of the longest instruction sequence
maxlen = np.max([maxlen_train, maxlen_CV, maxlen_test])
is_padded = 0
if maxlen % 2 == 1:
    maxlen = maxlen + 1  # round up to even, as in the other scripts
def get_data_df(filename):
    '''
    Opens the data .csv and returns it as a DataFrame
    '''
    return Data_IO.csv_to_df(filename)
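
# --- usage sketch (hypothetical) ---------------------------------------------
# Tying the CSV helpers together. The item_a/item_b/similarity column names
# follow the iterator calls above; the path is illustrative.
if __name__ == '__main__':
    df = get_data_df('../asmdata/similarity_pairs.csv')  # hypothetical path
    iter_train, iter_test, samples_per_batch = get_epoch_data(df)
    all_data = get_entire_data(df)  # single batch over every row (batch_size=-1)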