import os
import time

import numpy as np
from sklearn.metrics import confusion_matrix


def evaluation(method, dataset, user, device_source):
    log_name = 'log/cnn_%s_%s_evaluation.txt' % (dataset, method)
    if os.path.exists(log_name):
        os.remove(log_name)
    if user == 'mlsnrs':
        root_dir_prefix = '/home/mlsnrs/apks'
    elif user == 'shellhand':
        root_dir_prefix = '/mnt'
    save_feature_path = '%s/%s/mamadroid/%s/%s/%s_save_feature_list.csv' % (
        root_dir_prefix, device_source, dataset, method, method)
    save_feature_dict = get_save_feature_dict(save_feature_path)
    print('have read save_feature_dict: %d' % len(save_feature_dict))
    x_train, y_train = get_train_data(dataset, method, save_feature_dict,
                                      root_dir_prefix, device_source)
    print('x_train shape: %s y_train shape: %s' % (str(x_train.shape), str(y_train.shape)))
    start = time.time()
    print('start train')
    clf = CNN(layer_num=3, kernel_size=3, gpu_id=2)
    clf.fit(x_train, y_train, epoch=5, batch_size=500, lr=0.01)
    end = time.time()
    print('Training model time used: %f s' % (end - start))
    # torch.cuda.empty_cache()
    print(x_train.shape)
    y_pred = clf.predict(x_train, batch_size=20)
    print(y_pred.shape)
    cm = confusion_matrix(y_train, np.int32(y_pred >= 0.5))
    TP = cm[1][1]
    FP = cm[0][1]
    TN = cm[0][0]
    FN = cm[1][0]
    F1 = float(2 * TP) / (2 * TP + FN + FP)
    print('train data TP FP TN FN F1: %d %d %d %d %.4f' % (TP, FP, TN, FN, F1))
    with open(log_name, 'a') as f:
        f.write('train data TP FP TN FN F1: %d %d %d %d %.4f\n' % (TP, FP, TN, FN, F1))
    x_train = []
    y_train = []
    for test_id in range(0, 1):  # 13):
        x_test, y_test = get_test_data(dataset, test_id, method, save_feature_dict,
                                       root_dir_prefix, device_source)
        print('x_test shape: %s y_test shape: %s' % (str(x_test.shape), str(y_test.shape)))
        y_pred = clf.predict(x_test, batch_size=500)
        # y_pred = classify(y_pred)
        # threshold the scores as in the training-set evaluation above
        cm = confusion_matrix(y_test, np.int32(y_pred >= 0.5))
        TP = cm[1][1]
        FP = cm[0][1]
        TN = cm[0][0]
        FN = cm[1][0]
        F1 = float(2 * TP) / (2 * TP + FN + FP)
        print('test_id %d TP FP TN FN F1: %d %d %d %d %.4f' % (test_id, TP, FP, TN, FN, F1))
        with open(log_name, 'a') as f:
            f.write('test_id %d TP FP TN FN F1: %d %d %d %d %.4f\n' %
                    (test_id, TP, FP, TN, FN, F1))
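
# The TP/FP/TN/FN/F1 bookkeeping above is repeated in every evaluation loop in
# this file. A minimal sketch of a helper that factors it out; the name
# `cm_stats` is ours, not from the original code, and it assumes binary labels
# with both classes present in y_true (otherwise the confusion matrix is not 2x2).
def cm_stats(y_true, y_score, threshold=0.5):
    cm = confusion_matrix(y_true, np.int32(y_score >= threshold))
    TN, FP, FN, TP = cm.ravel()  # sklearn orders a binary cm as [[TN, FP], [FN, TP]]
    F1 = float(2 * TP) / (2 * TP + FN + FP)
    return TP, FP, TN, FN, F1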
# Imports assumed by this snippet:
from keras import optimizers
from keras.callbacks import EarlyStopping


def train_cnn_model(emb_layer, x_train, y_train, x_val, y_val, opt):
    model = CNN(embedding_layer=emb_layer,
                num_words=opt.n_words,
                embedding_dim=opt.embed_dim,
                filter_sizes=opt.cnn_filter_shapes,
                feature_maps=opt.filter_sizes,
                max_seq_length=opt.sent_len,
                dropout_rate=opt.dropout_ratio,
                hidden_units=200,
                nb_classes=2).build_model()
    model.compile(loss='categorical_crossentropy',
                  optimizer=optimizers.Adam(),
                  metrics=['accuracy'])
    # y_train = y_train.reshape(-1, 1)
    # model = build_model(emb_layer, opt)
    print(model.summary())
    early_stopping = EarlyStopping(monitor='val_loss', patience=2)
    history = model.fit(x_train, y_train,
                        epochs=opt.cnn_epoch,
                        batch_size=opt.batch_size,
                        verbose=1,
                        validation_data=(x_val, y_val),
                        callbacks=[early_stopping])
    with open("CNN_train_history.txt", "w") as f:
        print(history.history, file=f)
    return model
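
# Example invocation, assuming `opt` is an argparse-style namespace. The field
# values below are illustrative only, not the original configuration; with
# categorical_crossentropy and nb_classes=2, y_train/y_val must be one-hot encoded.
from argparse import Namespace

opt = Namespace(n_words=20000, embed_dim=300,
                cnn_filter_shapes=[3, 4, 5],      # kernel widths per conv branch
                filter_sizes=[100, 100, 100],     # feature maps per branch
                sent_len=200, dropout_ratio=0.5,
                cnn_epoch=10, batch_size=64)
# model = train_cnn_model(emb_layer, x_train, y_train, x_val, y_val, opt)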
# Imports assumed by this snippet:
from time import time
from keras.callbacks import TensorBoard, ModelCheckpoint, EarlyStopping


def train_baseline_cnn(emb_layer, x_train, y_train, x_val, y_val, opt):
    model = CNN(embedding_layer=emb_layer,
                num_words=opt.transfer_n_words,
                embedding_dim=opt.baseline_embed_dim,
                filter_sizes=opt.cnn_filter_shapes,
                feature_maps=opt.filter_sizes,
                max_seq_length=opt.baseline_sent_len,
                dropout_rate=opt.baseline_drop_out_ratio,
                hidden_units=200,
                nb_classes=2).build_model()
    model.compile(loss='categorical_crossentropy',
                  optimizer=optimizers.Adam(),
                  metrics=['accuracy'])
    # y_train = y_train.reshape(-1, 1)
    # model = build_model(emb_layer, opt)
    print(model.summary())
    tb_call_back = TensorBoard(log_dir=f'{opt.tbpath}/baseline_cnn_{time()}',
                               histogram_freq=1,
                               write_graph=True,
                               write_images=True)
    checkpoint = ModelCheckpoint("baseline_cnn.h5",
                                 monitor='val_acc',
                                 verbose=1,
                                 save_best_only=True,
                                 save_weights_only=False,
                                 mode='auto',
                                 period=1)
    early_stopping = EarlyStopping(monitor='val_loss', patience=2)
    history = model.fit(x_train, y_train,
                        epochs=opt.baseline_epochs,
                        batch_size=opt.baseline_batchsize,
                        verbose=1,
                        validation_data=(x_val, y_val),
                        callbacks=[early_stopping, tb_call_back, checkpoint])
    with open("CNN_train_baseline_history.txt", "w") as f:
        print(history.history, file=f)
    return model
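
# Since ModelCheckpoint keeps the best weights (by val_acc) in baseline_cnn.h5,
# the best model can be reloaded after training. A sketch assuming the standard
# Keras API; if build_model() uses custom layers, load_model would also need a
# custom_objects argument.
from keras.models import load_model

best_model = load_model("baseline_cnn.h5")
# loss, acc = best_model.evaluate(x_val, y_val, verbose=0)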
    # NOTE: this fragment begins mid-statement; the `model = CNN(...` constructor
    # arguments preceding ).build_model() are elided in the source, as is the
    # enclosing cross-validation loop that defines i, X_train, X_val, and histories.
    ).build_model()
    model.compile(
        loss='categorical_crossentropy',
        optimizer=keras.optimizers.Adam(),
        metrics=['accuracy']
    )
    # model.summary()
    history = model.fit(
        X_train, y_train,
        epochs=NB_EPOCHS,
        batch_size=BATCH_SIZE,
        validation_data=(X_val, y_val),
        callbacks=[
            keras.callbacks.ModelCheckpoint(
                'model-%i.h5' % (i + 1),
                monitor='val_loss',
                verbose=1,
                save_best_only=True,
                mode='min'
            ),
            # keras.callbacks.TensorBoard(log_dir='./logs/temp', write_graph=True)
        ]
    )
    print()
    histories.append(history.history)


# EVALUATION -------------------------------------------------------------------
def get_avg(histories, his_key):
    tmp = []
    for history in histories:
        tmp.append(history[his_key][np.argmin(history['val_loss'])])
    return np.mean(tmp)  # assumed: average of the metric at each fold's best (lowest val_loss) epoch
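
# Usage sketch for get_avg: report each metric averaged across folds, taken at
# the epoch where that fold's val_loss was lowest. The accuracy key depends on
# the Keras version ('val_acc' in older releases, 'val_accuracy' in newer ones).
for key in ('val_loss', 'val_acc'):
    print('%s: %.4f' % (key, get_avg(histories, key)))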
def optimize_para(method, dataset, user, device_source):
    log_name = 'log/optimize_cnn_%s_%s_evaluation_v2.txt' % (dataset, method)
    # if os.path.exists(log_name):
    #     os.remove(log_name)
    if user == 'mlsnrs':
        root_dir_prefix = '/home/mlsnrs/apks'
    elif user == 'shellhand':
        root_dir_prefix = '/mnt'
    save_feature_path = '%s/%s/mamadroid/%s/%s/%s_save_feature_list.csv' % (
        root_dir_prefix, device_source, dataset, method, method)
    save_feature_dict = get_save_feature_dict(save_feature_path)
    print('have read save_feature_dict: %d' % len(save_feature_dict))
    # dataset, train_year, method, save_feature_dict, root_dir_prefix
    x_train, y_train = get_train_data(dataset, 2012, method, save_feature_dict, root_dir_prefix)
    print('x_train shape: %s y_train shape: %s' % (str(x_train.shape), str(y_train.shape)))
    start = time.time()
    print('start train')
    for b in range(50, 501, 50):
        for k in [5]:  # 3, 5
            for lr in [0.01, 0.1, 0.001]:
                clf = CNN(layer_num=3, kernel_size=k, gpu_id=2)
                step_size = 10
                for e in range(10, 501, step_size):
                    # each iteration continues training the same model for step_size more epochs,
                    # so e is the cumulative epoch count
                    clf.fit(x_train, y_train, epoch=step_size, batch_size=b, lr=lr)
                    end = time.time()
                    # print('Training batch_size=%d kernel_size=%d lr=%g epoch=%d time used: %f s' % (b, k, lr, e, end - start))
                    # torch.cuda.empty_cache()
                    y_pred = clf.predict(x_train, batch_size=20)
                    cm = confusion_matrix(y_train, np.int32(y_pred >= 0.5))
                    TP = cm[1][1]
                    FP = cm[0][1]
                    TN = cm[0][0]
                    FN = cm[1][0]
                    F1 = float(2 * TP) / (2 * TP + FN + FP)
                    # %g (rather than %.2f) keeps lr=0.001 from being logged as 0.00
                    print('train data batch_size=%d kernel_size=%d lr=%g epoch=%d TP FP TN FN F1: %d %d %d %d %.4f'
                          % (b, k, lr, e, TP, FP, TN, FN, F1))
                    with open(log_name, 'a') as f:
                        f.write('train data batch_size=%d kernel_size=%d lr=%g epoch=%d TP FP TN FN F1: %d %d %d %d %.4f\n'
                                % (b, k, lr, e, TP, FP, TN, FN, F1))
                    for test_id in range(0, 1):  # 13):
                        # dataset, test_year, test_month, method, save_feature_dict, root_dir_prefix
                        x_test, y_test = get_test_data(dataset, 2013, 0, method,
                                                       save_feature_dict, root_dir_prefix)
                        # print('x_test shape: %s y_test shape: %s' % (str(x_test.shape), str(y_test.shape)))
                        y_pred = clf.predict(x_test, batch_size=20)
                        # y_pred = classify(y_pred)
                        cm = confusion_matrix(y_test, np.int32(y_pred >= 0.5))
                        TP = cm[1][1]
                        FP = cm[0][1]
                        TN = cm[0][0]
                        FN = cm[1][0]
                        F1 = float(2 * TP) / (2 * TP + FN + FP)
                        print('test_id %d batch_size=%d kernel_size=%d lr=%g epoch=%d TP FP TN FN F1: %d %d %d %d %.4f'
                              % (test_id, b, k, lr, e, TP, FP, TN, FN, F1))
                        with open(log_name, 'a') as f:
                            f.write('test_id %d batch_size=%d kernel_size=%d lr=%g epoch=%d TP FP TN FN F1: %d %d %d %d %.4f\n'
                                    % (test_id, b, k, lr, e, TP, FP, TN, FN, F1))
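
# A minimal sketch (ours, not part of the original script) for picking the best
# hyper-parameters out of the grid-search log written above: keep the test line
# with the highest F1. The regex mirrors the f.write format used in optimize_para.
import re

def best_config(log_name):
    pat = re.compile(r'test_id \d+ batch_size=(\d+) kernel_size=(\d+) '
                     r'lr=([\d.]+) epoch=(\d+) TP FP TN FN F1: '
                     r'\d+ \d+ \d+ \d+ ([\d.]+)')
    best = None
    with open(log_name) as f:
        for line in f:
            m = pat.match(line)
            if m and (best is None or float(m.group(5)) > float(best[4])):
                best = m.groups()
    return best  # (batch_size, kernel_size, lr, epoch, F1) as strings, or None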
def evaluation(method, dataset, user, device_source):
    if user == 'mlsnrs':
        root_dir_prefix = '/home/mlsnrs/apks'
    elif user == 'shellhand':
        root_dir_prefix = '/mnt'
    save_feature_path = '%s/ssd_1T/mamadroid/%s/%s/%s_save_feature_list.csv' % (
        root_dir_prefix, dataset, method, method)
    save_feature_dict = get_save_feature_dict(save_feature_path)
    print('have read save_feature_dict: %d' % len(save_feature_dict))
    for train_year in range(2012, 2018):
        log_name = 'log/cnn_%s_%s_%dtrain_evaluation.txt' % (dataset, method, train_year)
        if os.path.exists(log_name):
            os.remove(log_name)
        x_train, y_train = get_train_data(dataset, train_year, method,
                                          save_feature_dict, root_dir_prefix)
        print('x_train shape: %s y_train shape: %s' % (str(x_train.shape), str(y_train.shape)))
        start = time.time()
        print('start train')
        clf = CNN(layer_num=3, kernel_size=5, gpu_id=3)
        clf.fit(x_train, y_train, epoch=260, batch_size=350, lr=0.01)  # 260
        end = time.time()
        print('Training model time used: %f s' % (end - start))
        print(x_train.shape)
        len_x = x_train.shape[0]
        if (len_x % 20) != 1:
            y_pred = clf.predict(x_train, batch_size=20)
        else:
            y_pred = clf.predict(x_train, batch_size=21)
        print(y_pred.shape)
        cm = confusion_matrix(y_train, np.int32(y_pred >= 0.5))
        TP = cm[1][1]
        FP = cm[0][1]
        TN = cm[0][0]
        FN = cm[1][0]
        F1 = float(2 * TP) / (2 * TP + FN + FP)
        print('train %d data TP FP TN FN F1: %d %d %d %d %.4f' % (train_year, TP, FP, TN, FN, F1))
        with open(log_name, 'a') as f:
            f.write('train %d data TP FP TN FN F1: %d %d %d %d %.4f\n'
                    % (train_year, TP, FP, TN, FN, F1))
        x_train = []
        y_train = []
        for test_year in range(train_year + 1, 2019):
            for test_month in range(0, 12):
                x_test, y_test = get_test_data(dataset, test_year, test_month, method,
                                               save_feature_dict, root_dir_prefix)
                print('%d-%02d x_test shape: %s y_test shape: %s'
                      % (test_year, test_month + 1, str(x_test.shape), str(y_test.shape)))
                len_x = x_test.shape[0]
                if (len_x % 20) != 1:
                    y_pred = clf.predict(x_test, batch_size=20)
                else:
                    y_pred = clf.predict(x_test, batch_size=21)
                cm = confusion_matrix(y_test, np.int32(y_pred >= 0.5))
                TP = cm[1][1]
                FP = cm[0][1]
                TN = cm[0][0]
                FN = cm[1][0]
                F1 = float(2 * TP) / (2 * TP + FN + FP)
                print('test %d-%02d TP FP TN FN F1: %d %d %d %d %.4f'
                      % (test_year, test_month + 1, TP, FP, TN, FN, F1))
                with open(log_name, 'a') as f:
                    f.write('test %d-%02d TP FP TN FN F1: %d %d %d %d %.4f\n'
                            % (test_year, test_month + 1, TP, FP, TN, FN, F1))
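
# The batch_size 20-vs-21 switch in evaluation() above avoids a final predict
# batch of size 1, which can break layers (e.g. BatchNorm) or code paths that
# squeeze the batch dimension. A sketch of the same guard as a reusable helper;
# the name `safe_batch_size` is ours, not from the original code.
def safe_batch_size(n_samples, preferred=20):
    # bump the batch size by one whenever the last batch would hold exactly one sample
    return preferred if n_samples % preferred != 1 else preferred + 1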