def best_model_test(XD, XD_STRU, XT, XT_STRU, Y, label_row_inds, label_col_inds,
                    FLAGS, measure, test_sets, bestpointer, bestpar_list):
    """Reload the best saved Keras model and evaluate it on the test pairs.

    The checkpoint filename mirrors the `model.save(f'{pointer}_{ep}_model.h5')`
    pattern used during the parameter search, keyed by `bestpointer`.
    Returns (prf, loss): `measure`'s score and the evaluate() loss (MSE per the
    log format string).
    """
    # Rebuild the model; cindex_score is a custom metric so it must be
    # registered via custom_objects for load_model to deserialize it.
    model = keras.models.load_model(
        f'{bestpar_list[bestpointer]}_{bestpointer+1}_model.h5',
        custom_objects={'cindex_score': cindex_score})
    logging(
        "LoadModel: pointer = %f, ep = %f" %
        (bestpar_list[bestpointer], bestpointer + 1), FLAGS)
    # Map flat test-set indices back to (drug row, target column) pairs.
    terows = label_row_inds[test_sets]
    tecols = label_col_inds[test_sets]
    test_drugs, test_drugs_stru, test_prots, test_prots_stru, test_Y = prepare_interaction_pairs(
        XD, XD_STRU, XT, XT_STRU, Y, terows, tecols)
    # Four parallel inputs: drug sequence, drug structure, protein sequence,
    # protein structure — assumes the model was built with this input order.
    pred = model.predict([
        np.array(test_drugs),
        np.array(test_drugs_stru),
        np.array(test_prots),
        np.array(test_prots_stru)
    ])
    # evaluate() returns (loss, first compiled metric) — presumably the
    # CI metric given the "CI-ii" label below; confirm against model.compile.
    loss, prf2 = model.evaluate(([
        np.array(test_drugs),
        np.array(test_drugs_stru),
        np.array(test_prots),
        np.array(test_prots_stru)
    ]), np.array(test_Y), verbose=0)
    prf = measure(test_Y, pred)
    logging("TestSets: CI-i = %f, CI-ii = %f, MSE = %f" % (prf, prf2, loss),
            FLAGS)
    return prf, loss
def experiment(FLAGS, perfmeasure, deepmethod, foldcount=6):  #5-fold cross validation + test
    """Run the single train/test experiment using separate train/test paths.

    NOTE(review): this module defines `experiment` several times; the last
    definition wins at import time — confirm which variant is intended.
    """
    #Input
    #XD: [drugs, features] sized array (features may also be similarities with other drugs
    #XT: [targets, features] sized array (features may also be similarities with other targets
    #Y: interaction values, can be real values or binary (+1, -1), insert value float("nan") for unknown entries
    #perfmeasure: function that takes as input a list of correct and predicted outputs, and returns performance
    #higher values should be better, so if using error measures use instead e.g. the inverse -error(Y, P)
    #foldcount: number of cross-validation folds for settings 1-3, setting 4 always runs 3x3 cross-validation
    dataset = DataSet(fpath=FLAGS.train_path,
                      fpath_test=FLAGS.test_path,
                      setting_no=FLAGS.problem_type,
                      seqlen=FLAGS.max_seq_len,
                      smilen=FLAGS.max_smi_len,
                      need_shuffle=False)
    # set character set size (vocabulary sizes for sequence/SMILES encoders)
    FLAGS.charseqset_size = dataset.charseqset_size
    FLAGS.charsmiset_size = dataset.charsmiset_size
    #XD, XT, Y = dataset.parse_data(fpath = FLAGS.dataset_path)
    tr_XD, tr_XT, tr_Y, te_XD, te_XT, te_Y = dataset.parse_train_test_data(
        FLAGS)
    tr_XD = np.asarray(tr_XD)
    tr_XT = np.asarray(tr_XT)
    tr_Y = np.asarray(tr_Y)
    te_XD = np.asarray(te_XD)
    te_XT = np.asarray(te_XT)
    te_Y = np.asarray(te_Y)
    tr_drugcount = tr_XD.shape[0]
    print("train drugs: ", tr_drugcount)
    tr_targetcount = tr_XT.shape[0]
    print("train targets: ", tr_targetcount)
    te_drugcount = te_XD.shape[0]
    print("test drugs: ", te_drugcount)
    te_targetcount = te_XT.shape[0]
    print("test targets: ", te_targetcount)
    # Only the TRAIN counts are recorded on FLAGS (used downstream by the model builder).
    FLAGS.drug_count = tr_drugcount
    FLAGS.target_count = tr_targetcount
    # figdir is a module-level path defined elsewhere in the file.
    if not os.path.exists(figdir):
        os.makedirs(figdir)
    print(FLAGS.log_dir)
    S1_avgperf, S1_avgloss, S1_teststd = nfold_1_2_3_setting_sample(
        tr_XD, tr_XT, tr_Y, te_XD, te_XT, te_Y, perfmeasure, deepmethod,
        FLAGS, dataset)
    logging("Setting " + str(FLAGS.problem_type), FLAGS)
    logging(
        "avg_perf = %.5f, avg_mse = %.5f, std = %.5f" %
        (S1_avgperf, S1_avgloss, S1_teststd),
        FLAGS)
def experiment(FLAGS, perfmeasure, deepmethod, foldcount=6):  #5-fold cross validation + test
    """Run nested cross-validation over a single dataset file.

    NOTE(review): duplicate definition of `experiment` in this module; later
    definitions shadow this one.
    """
    #Input
    #XD: [drugs, features] sized array (features may also be similarities with other drugs
    #XT: [targets, features] sized array (features may also be similarities with other targets
    #Y: interaction values, can be real values or binary (+1, -1), insert value float("nan") for unknown entries
    #perfmeasure: function that takes as input a list of correct and predicted outputs, and returns performance
    #higher values should be better, so if using error measures use instead e.g. the inverse -error(Y, P)
    #foldcount: number of cross-validation folds for settings 1-3, setting 4 always runs 3x3 cross-validation
    dataset = DataSet(
        fpath=FLAGS.dataset_path,  ### update this in args too (translated)
        setting_no=FLAGS.problem_type,  ## add this to args (translated)
        seqlen=FLAGS.max_seq_len,
        smilen=FLAGS.max_smi_len,
        need_shuffle=False)
    # set character set size
    FLAGS.charseqset_size = dataset.charseqset_size
    FLAGS.charsmiset_size = dataset.charsmiset_size
    XD, XT, Y = dataset.parse_data(fpath=FLAGS.dataset_path)
    XD = np.asarray(XD)
    XT = np.asarray(XT)
    Y = np.asarray(Y)
    drugcount = XD.shape[0]
    print(drugcount)
    targetcount = XT.shape[0]
    print(targetcount)
    FLAGS.drug_count = drugcount
    FLAGS.target_count = targetcount
    label_row_inds, label_col_inds = np.where(np.isnan(
        Y) == False)  #basically finds the point address of affinity [x,y]
    if not os.path.exists(figdir):
        os.makedirs(figdir)
    print(FLAGS.log_dir)
    S1_avgperf, S1_avgloss, S1_teststd = nfold_1_2_3_setting_sample(
        XD, XT, Y, label_row_inds, label_col_inds, perfmeasure, deepmethod,
        FLAGS, dataset)
    logging("Setting " + str(FLAGS.problem_type), FLAGS)
    logging(
        "avg_perf = %.5f, avg_mse = %.5f, std = %.5f" %
        (S1_avgperf, S1_avgloss, S1_teststd), FLAGS)
def experiment(FLAGS, perfmeasure, foldcount=6):  # 5-fold cross validation + test
    """Cross-validation driver (no `deepmethod` arg — the model is built
    inside `nfold_1_2_3_setting_sample` in this variant).

    NOTE(review): duplicate definition of `experiment`; later definitions
    shadow this one.
    """
    # Input
    # XD: [drugs, features] sized array (features may also be similarities with other drugs
    # XT: [targets, features] sized array (features may also be similarities with other targets
    # Y: interaction values, can be real values or binary (+1, -1), insert value float("nan") for unknown entries
    # perfmeasure: function that takes as input a list of correct and predicted outputs, and returns performance
    # higher values should be better, so if using error measures use instead e.g. the inverse -error(Y, P)
    # foldcount: number of cross-validation folds for settings 1-3, setting 4 always runs 3x3 cross-validation
    dataset = DataSet(fpath=FLAGS.dataset_path,  # update this in args (translated)
                      setting_no=FLAGS.problem_type,  # add this to args (translated)
                      seqlen=FLAGS.max_seq_len,
                      smilen=FLAGS.max_smi_len,
                      need_shuffle=False)
    # set character set size
    FLAGS.charseqset_size = dataset.charseqset_size
    FLAGS.charsmiset_size = dataset.charsmiset_size
    XD, XT, Y = dataset.parse_data(fpath=FLAGS.dataset_path)
    XD = np.asarray(XD)
    XT = np.asarray(XT)
    Y = np.asarray(Y)
    drugcount = XD.shape[0]
    print(drugcount)
    targetcount = XT.shape[0]
    print(targetcount)
    FLAGS.drug_count = drugcount
    FLAGS.target_count = targetcount
    label_row_inds, label_col_inds = np.where(
        np.isnan(Y) == False)  # basically finds the point address of affinity [x,Y]
    print("Logdir: " + FLAGS.log_dir)
    s1_avgperf, s1_avgloss, s1_teststd = nfold_1_2_3_setting_sample(XD, XT, Y,
                                                                    label_row_inds,
                                                                    label_col_inds,
                                                                    perfmeasure,
                                                                    FLAGS,
                                                                    dataset)
    logging("Setting " + str(FLAGS.problem_type), FLAGS)
    logging("avg_perf = %.5f, avg_mse = %.5f, std = %.5f" %
            (s1_avgperf, s1_avgloss, s1_teststd), FLAGS)
    print("Setting " + str(FLAGS.problem_type))
    print("avg_perf = %.5f, avg_mse = %.5f, std = %.5f" %
          (s1_avgperf, s1_avgloss, s1_teststd))
def experiment(FLAGS, perfmeasure, foldcount=6):  # 5-fold cross validation + test
    """Cross-validation driver (variant with translated Vietnamese comments).

    NOTE(review): duplicate definition of `experiment`; later definitions
    shadow this one.
    """
    dataset = DataSet(
        fpath=FLAGS.dataset_path,  # dataset loader class (translated)
        setting_no=FLAGS.problem_type,
        seqlen=FLAGS.max_seq_len,
        smilen=FLAGS.max_smi_len,
        need_shuffle=False)
    # set character set size
    FLAGS.charseqset_size = dataset.charseqset_size
    FLAGS.charsmiset_size = dataset.charsmiset_size
    XD, XT, Y = dataset.parse_data(fpath=FLAGS.dataset_path)
    XD = np.asarray(XD)  # convert to array (translated)
    XT = np.asarray(XT)  # convert to array (translated)
    Y = np.asarray(Y)  # convert to array (translated)
    drugcount = XD.shape[0]  # number of XD entries (translated)
    print(drugcount)
    targetcount = XT.shape[0]  # number of XT entries (translated)
    print(targetcount)
    FLAGS.drug_count = drugcount
    FLAGS.target_count = targetcount
    # Indices of all known (non-NaN) affinity entries in Y.
    label_row_inds, label_col_inds = np.where(np.isnan(Y) == False)
    if not os.path.exists(figdir):
        os.makedirs(figdir)
    print(FLAGS.log_dir)
    S1_avgperf, S1_avgloss, S1_teststd = nfold_1_2_3_setting_sample(
        XD, XT, Y, label_row_inds, label_col_inds, perfmeasure, FLAGS,
        dataset)  # training (translated)
    logging("Setting " + str(FLAGS.problem_type), FLAGS)
    logging(
        "avg_perf = %.5f, avg_mse = %.5f, std = %.5f" %
        (S1_avgperf, S1_avgloss, S1_teststd), FLAGS)
def general_nfold_cv(tr_XD, tr_XT, tr_Y, te_XD, te_XT, te_Y, prfmeasure,
                     runmethod, FLAGS, labeled_sets,
                     val_sets):  ## FLAGS needed here too???? (translated)
    """Grid-search over (num_windows x smi_window x seq_window) on a single
    train/validation split (no folds — see 'MODIFIED FOR SINGLE TRAIN').

    Returns (bestpointer, best_param_list, bestperf, all_predictions, all_losses).
    """
    paramset1 = FLAGS.num_windows  #[32]#[32, 512] #[32, 128] # filter numbers
    paramset2 = FLAGS.smi_window_lengths  #[4, 8]#[4, 32] #[4, 8] #filter length smi
    paramset3 = FLAGS.seq_window_lengths  #[8, 12]#[64, 256] #[64, 192]#[8, 192, 384]
    epoch = FLAGS.num_epoch  #100
    batchsz = FLAGS.batch_size  #256
    logging("---Parameter Search-----", FLAGS)
    ### MODIFIED FOR SINGLE TRAIN
    # Flat (not per-fold) result slots, one per parameter combination.
    h = len(paramset1) * len(paramset2) * len(paramset3)
    all_predictions = [0 for y in range(h)]
    all_losses = [0 for y in range(h)]
    valinds = val_sets
    labeledinds = labeled_sets
    tr_label_row_inds, tr_label_col_inds = np.where(np.isnan(
        tr_Y) == False)  #basically finds the point address of affinity [x,y]
    te_label_row_inds, te_label_col_inds = np.where(np.isnan(
        te_Y) == False)  #basically finds the point address of affinity [x,y]
    Y_train = np.mat(np.copy(tr_Y))  # NOTE(review): unused below
    params = {}  # NOTE(review): unused below
    XD_train = tr_XD
    XT_train = tr_XT
    trrows = tr_label_row_inds[labeledinds]
    trcols = tr_label_col_inds[labeledinds]
    #print("trrows", str(trrows), str(len(trrows)))
    #print("trcols", str(trcols), str(len(trcols)))
    XD_train = tr_XD[trrows]
    XT_train = tr_XT[trcols]
    train_drugs, train_prots, train_Y = prepare_interaction_pairs(
        tr_XD, tr_XT, tr_Y, trrows, trcols)
    terows = te_label_row_inds[valinds]
    tecols = te_label_col_inds[valinds]
    #print("terows", str(terows), str(len(terows)))
    #print("tecols", str(tecols), str(len(tecols)))
    val_drugs, val_prots, val_Y = prepare_interaction_pairs(
        te_XD, te_XT, te_Y, terows, tecols)
    pointer = 0
    for param1ind in range(len(paramset1)):  #hidden neurons
        param1value = paramset1[param1ind]
        for param2ind in range(len(paramset2)):  #learning rate
            param2value = paramset2[param2ind]
            for param3ind in range(len(paramset3)):
                param3value = paramset3[param3ind]
                gridmodel = runmethod(FLAGS, param1value, param2value,
                                      param3value)
                # Set callback functions to early stop training and save the best model so far
                # NOTE(review): `callbacks` is built but never passed to
                # fit(), and validation_data is commented out — so early
                # stopping is effectively disabled here. Confirm intent.
                callbacks = [EarlyStopping(monitor='val_loss', patience=15)]
                gridres = gridmodel.fit(
                    ([np.array(train_drugs), np.array(train_prots)]),
                    np.array(train_Y),
                    batch_size=batchsz,
                    epochs=epoch,
                    shuffle=False)  #validation_data=( ([np.array(val_drugs), np.array(val_prots) ]), np.array(val_Y)),
                predicted_labels = gridmodel.predict(
                    [np.array(val_drugs), np.array(val_prots)])
                # Persist per-combination predictions for offline analysis.
                json.dump(
                    predicted_labels.tolist(),
                    open("predicted_labels_" + str(pointer) + ".txt", "w"))
                loss, rperf2 = gridmodel.evaluate(
                    ([np.array(val_drugs), np.array(val_prots)]),
                    np.array(val_Y),
                    verbose=0)
                rperf = prfmeasure(val_Y, predicted_labels)
                #rperf = rperf[0]
                logging(
                    "P1 = %d, P2 = %d, P3 = %d, CI-i = %f, CI-ii = %f, MSE = %f"
                    % (param1ind, param2ind, param3ind, rperf, rperf2, loss),
                    FLAGS)
                #plotLoss(gridres, param1ind, param2ind, param3ind, "1")
                all_predictions[
                    pointer] = rperf  #TODO FOR EACH VAL SET allpredictions[pointer][foldind]
                all_losses[pointer] = loss
                pointer += 1
    bestperf = -float('Inf')
    bestpointer = None
    best_param_list = []
    ##Take average according to folds, then chooose best params
    pointer = 0
    for param1ind in range(len(paramset1)):
        for param2ind in range(len(paramset2)):
            for param3ind in range(len(paramset3)):
                avgperf = 0.
                foldperf = all_predictions[pointer]
                avgperf += foldperf
                #avgperf /= len(val_sets)
                #print(epoch, batchsz, avgperf)
                if avgperf > bestperf:
                    bestperf = avgperf
                    bestpointer = pointer
                    best_param_list = [param1ind, param2ind, param3ind]
                pointer += 1
    return bestpointer, best_param_list, bestperf, all_predictions, all_losses
def nfold_1_2_3_setting_sample(tr_XD, tr_XT, tr_Y, te_XD, te_XT, te_Y,
                               measure, runmethod, FLAGS, dataset):
    """Single train/test evaluation wrapper (no inner CV folds in this
    variant — see 'MODIFIED FOR SINGLE TRAIN AND TEST').

    Returns (avgperf, avgloss, teststd); with one split, avgperf is just the
    best combination's score and teststd is 0.
    """
    bestparamlist = []
    test_set, outer_train_sets = dataset.read_sets(FLAGS)
    ### MODIFIED FOR SINGLE TRAIN AND TEST #####
    train_set = outer_train_sets
    #train_set = [item for sublist in outer_train_sets for item in sublist]
    bestparamind, best_param_list, bestperf, all_predictions, all_losses = general_nfold_cv(
        tr_XD, tr_XT, tr_Y, te_XD, te_XT, te_Y, measure, runmethod, FLAGS,
        train_set, test_set)
    testperf = all_predictions[bestparamind]  ##pointer pos
    logging("---FINAL RESULTS-----", FLAGS)
    logging("best param index = %s" % bestparamind, FLAGS)
    testperfs = []
    testloss = []
    avgperf = 0.
    # Only one "fold" here, so the averages below are over a single value.
    foldperf = all_predictions[bestparamind]
    foldloss = all_losses[bestparamind]
    testperfs.append(foldperf)
    testloss.append(foldloss)
    avgperf += foldperf
    avgperf = avgperf / 1
    avgloss = np.mean(testloss)
    teststd = np.std(testperfs)
    logging("Test Performance CI", FLAGS)
    logging(testperfs, FLAGS)
    logging("Test Performance MSE", FLAGS)
    logging(testloss, FLAGS)
    return avgperf, avgloss, teststd
def general_nfold_cv(XD, XT, Y, label_row_inds, label_col_inds, prfmeasure,
                     FLAGS, labeled_sets,
                     val_sets):  ## FLAGS needed here too???? (translated)
    """Torch-variant grid search skeleton.

    NOTE(review): this variant looks unfinished/debug-only — it builds the
    model but never trains or evaluates it, so `all_predictions`/`all_losses`
    stay all zeros and the best-parameter selection degenerates to the first
    combination. Confirm before relying on its output.
    """
    paramset1 = FLAGS.num_windows  # [32]#[32, 512] #[32, 128] # filter numbers
    paramset2 = FLAGS.smi_window_lengths  # [4, 8]#[4, 32] #[4, 8] #filter length smi
    paramset3 = FLAGS.seq_window_lengths  # [8, 12]#[64, 256] #[64, 192]#[8, 192, 384]
    epoch = FLAGS.num_epoch  # 100
    batchsz = FLAGS.batch_size  # 256
    logging("---Parameter Search-----", FLAGS)
    w = len(val_sets)
    h = len(paramset1) * len(paramset2) * len(paramset3)
    # One result slot per (parameter combination, fold).
    all_predictions = [[0 for x in range(w)] for y in range(h)]
    all_losses = [[0 for x in range(w)] for y in range(h)]
    print(all_predictions)
    for foldind in range(len(val_sets)):
        valinds = val_sets[foldind]
        labeledinds = labeled_sets[foldind]
        Y_train = np.mat(np.copy(Y))  # NOTE(review): unused below
        params = {}  # NOTE(review): unused below
        XD_train = XD
        XT_train = XT
        trrows = label_row_inds[labeledinds]
        trcols = label_col_inds[labeledinds]
        # print("trrows", str(trrows), str(len(trrows)))
        # print("trcols", str(trcols), str(len(trcols)))
        XD_train = XD[trrows]
        XT_train = XT[trcols]
        train_drugs, train_prots, train_Y = prepare_interaction_pairs(
            XD, XT, Y, trrows, trcols)
        train_drugs = np.array(train_drugs)
        train_prots = np.array(train_prots)
        train_Y = np.array(train_Y)
        # NOTE(review): data truncated to the first 1000 pairs — looks like a
        # debugging shortcut; remove for real runs.
        train_drugs = train_drugs[:1000]
        train_prots = train_prots[:1000]
        train_Y = train_Y[:1000]
        terows = label_row_inds[valinds]
        tecols = label_col_inds[valinds]
        # print("terows", str(terows), str(len(terows)))
        # print("tecols", str(tecols), str(len(tecols)))
        val_drugs, val_prots, val_Y = prepare_interaction_pairs(
            XD, XT, Y, terows, tecols)
        val_drugs = np.array(val_drugs)
        val_prots = np.array(val_prots)
        val_Y = np.array(val_Y)
        val_drugs = val_drugs[:1000]
        val_prots = val_prots[:1000]
        val_Y = val_Y[:1000]
        pointer = 0
        print(paramset1)
        print(paramset2)
        print(paramset3)
        for param1ind in range(len(paramset1)):  # hidden neurons
            param1value = paramset1[param1ind]
            for param2ind in range(len(paramset2)):  # learning rate
                param2value = paramset2[param2ind]
                for param3ind in range(len(paramset3)):
                    # NOTE(review): debug loop printing 100M integers — this
                    # effectively stalls the run; almost certainly leftover
                    # debugging code to be deleted.
                    for jj in range(100000000):
                        print(jj)
                    param3value = paramset3[param3ind]
                    torch.cuda.empty_cache()
                    model = Net(param2value, param1value, param3value)
    bestperf = -float('Inf')
    bestpointer = None
    best_param_list = []
    ##Take average according to folds, then chooose best params
    pointer = 0
    for param1ind in range(len(paramset1)):
        for param2ind in range(len(paramset2)):
            for param3ind in range(len(paramset3)):
                avgperf = 0.
                for foldind in range(len(val_sets)):
                    foldperf = all_predictions[pointer][foldind]
                    avgperf += foldperf
                avgperf /= len(val_sets)
                # print(epoch, batchsz, avgperf)
                if avgperf > bestperf:
                    bestperf = avgperf
                    bestpointer = pointer
                    best_param_list = [param1ind, param2ind, param3ind]
                pointer += 1
    return bestpointer, best_param_list, bestperf, all_predictions, all_losses
def experiment(FLAGS, foldcount=6):  # 5-fold cross validation + test
    """Torch-variant driver: builds folds per problem_type and aggregates
    CI/MSE/AUC/AUPR over repetitions.

    NOTE(review): duplicate definition of `experiment`; later definitions
    shadow this one.
    """
    # Input
    # XD: [drugs, features] sized array (features may also be similarities with other drugs
    # XT: [targets, features] sized array (features may also be similarities with other targets
    # Y: interaction values, can be real values or binary (+1, -1), insert value float("nan") for unknown entries
    # perfmeasure: function that takes as input a list of correct and predicted outputs, and returns performance
    # higher values should be better, so if using error measures use instead e.g. the inverse -error(Y, P)
    # foldcount: number of cross-validation folds for settings 1-3, setting 4 always runs 3x3 cross-validation
    dataset = DataSet(
        fpath=FLAGS.dataset_path,  ### update this in args too (translated)
        setting_no=FLAGS.problem_type,  ## add this to args (translated)
        seqlen=FLAGS.max_seq_len,
        smilen=FLAGS.max_smi_len,
        need_shuffle=False)
    # set character set size
    FLAGS.charseqset_size = dataset.charseqset_size
    FLAGS.charsmiset_size = dataset.charsmiset_size
    XD, XT, Y = dataset.parse_data(FLAGS)
    XD = np.asarray(XD)
    XT = np.asarray(XT)
    Y = np.asarray(Y)
    drugcount = XD.shape[0]
    print(drugcount)
    targetcount = XT.shape[0]
    print(targetcount)
    FLAGS.drug_count = drugcount
    FLAGS.target_count = targetcount
    # Indices of all known (non-NaN) affinity entries in Y.
    label_row_inds, label_col_inds = np.where(np.isnan(Y) == False)
    if not os.path.exists(figdir):
        os.makedirs(figdir)
    perf = []
    mseloss = []
    auc = []
    aupr = []
    # Single repetition (range(1)); seed offset keeps folds reproducible
    # per repetition index.
    for i in range(1):
        random.seed(i + 1000)
        # problem_type: 1 = random split, 2 = drug-wise, 3 = target-wise.
        if FLAGS.problem_type == 1:
            nfolds = get_random_folds(len(label_row_inds), foldcount)
        if FLAGS.problem_type == 2:
            nfolds = get_drugwise_folds(label_row_inds, label_col_inds,
                                        drugcount, foldcount)
        if FLAGS.problem_type == 3:
            nfolds = get_targetwise_folds(label_row_inds, label_col_inds,
                                          targetcount, foldcount)
        avgperf, avgloss, teststd, lossstd, avg_auc, auc_std, avg_aupr, aupr_std = nfold_setting_sample(
            XD, XT, Y, label_row_inds, label_col_inds, get_cindex, FLAGS,
            dataset, nfolds, i)
        logging("Setting " + str(FLAGS.problem_type), FLAGS)
        logging(
            "avg_perf = %.5f, avg_mse = %.5f, std = %.5f, loss_std = %.5f, auc = %.5f, auc_std = %.5f, aupr =%.5f, aupr_std = %.5f"
            % (avgperf, avgloss, teststd, lossstd, avg_auc, auc_std,
               avg_aupr, aupr_std), FLAGS)
        perf.append(avgperf)
        mseloss.append(avgloss)
        auc.append(avg_auc)
        aupr.append(avg_aupr)
    print(FLAGS.log_dir)
    logging(("Finally"), FLAGS)
    logging(
        "avg_perf = %.5f, avg_mse = %.5f, std = %.5f, loss_std = %.5f,auc = %.5f, auc_std = %.5f, aupr =%.5f, aupr_std = %.5f"
        % (np.mean(perf), np.mean(mseloss), np.std(perf), np.std(mseloss),
           np.mean(auc), np.std(auc), np.mean(aupr), np.std(aupr)), FLAGS)
def general_nfold_cv_test(XD, XT, Y, label_row_inds, label_col_inds,
                          prfmeasure, FLAGS, labeled_sets, val_sets, get_rm2,
                          best_param_list, i):
    """Retrain with the chosen hyper-parameters on each fold, keep the best
    checkpoint per fold, and report CI/MSE/rm2/AUC/AUPR on the fold's test set.

    Returns (best_param_list, best_perf, all_predictions, all_losses,
    all_auc, all_aupr).
    """
    param1value = best_param_list[0]
    param2value = best_param_list[1]
    param3value = best_param_list[2]
    lamda = best_param_list[3]
    batchsz = FLAGS.batch_size  # 256
    logging("---Parameter Search-----", FLAGS)
    w = len(val_sets)
    all_predictions = [0 for x in range(w)]
    all_losses = [0 for x in range(w)]
    all_auc = [0 for x in range(w)]
    all_aupr = [0 for x in range(w)]
    all_preaffinities = []
    all_affinities = []
    for foldind in range(len(val_sets)):
        valinds = val_sets[foldind]
        labeledinds = labeled_sets[foldind]
        trrows = label_row_inds[labeledinds]
        trcols = label_col_inds[labeledinds]
        train_dataset = prepare_interaction_pairs(XD, XT, Y, trrows, trcols)
        terows = label_row_inds[valinds]
        tecols = label_col_inds[valinds]
        test_dataset = prepare_interaction_pairs(XD, XT, Y, terows, tecols)
        train_loader = DataLoader(dataset=train_dataset,
                                  batch_size=batchsz,
                                  shuffle=True)
        test_loader = DataLoader(dataset=test_dataset, batch_size=batchsz)
        model = net(FLAGS, param1value, param2value, param3value).cuda()
        model.apply(weights_init)
        rperf_list = []
        for epochind in range(FLAGS.num_epoch):
            model = train(train_loader, model, FLAGS, param1value,
                          param2value, param3value, lamda)
            # Evaluate every 2nd epoch; checkpoint when CI is at its best,
            # stop early when it drops 0.1 below the best seen.
            if (epochind + 1) % 2 == 0:
                rperf, loss, rm2, auc = test(model, test_loader, FLAGS,
                                             param1value, param2value,
                                             param3value, lamda)
                rperf_list.append(rperf)
                print(
                    'test: epoch:{},p1:{},p2:{},p3:{},loss:{:.5f},rperf:{:.5f}, rm2:{:.5f}'
                    .format(epochind, param1value, param2value, param3value,
                            loss, rperf, rm2))
                if rperf >= max(rperf_list):
                    torch.save(model, 'checkpoint.pth')
                if rperf < max(rperf_list) - 0.1:
                    break
        loss_func = nn.MSELoss()
        affinities = []
        pre_affinities = []
        # Reload the best checkpoint for final scoring on this fold.
        model = torch.load('checkpoint.pth')
        model.eval()
        for drug_SMILES, target_protein, affinity in test_loader:
            pre_affinity, _, _, _, _, _, _, _, _ = model(
                drug_SMILES, target_protein, FLAGS, param1value, param2value,
                param3value)
            pre_affinities += pre_affinity.cpu().detach().numpy().tolist()
            affinities += affinity.cpu().detach().numpy().tolist()
        pre_affinities = np.array(pre_affinities)
        affinities = np.array(affinities)
        # Dataset-specific binarization thresholds (Davis: pKd > 7.0,
        # KIBA: score > 12.1) for AUC/AUPR.
        if 'davis' in FLAGS.dataset_path:
            pre_label = pre_affinities
            label = np.int32(affinities > 7.0)
            auc = roc_auc_score(label, pre_label)
            aupr = get_aupr(label, pre_label)
        if 'kiba' in FLAGS.dataset_path:
            pre_label = pre_affinities
            label = np.int32(affinities > 12.1)
            auc = roc_auc_score(label, pre_label)
            aupr = get_aupr(label, pre_label)
        rperf = prfmeasure(affinities, pre_affinities)
        rm2 = get_rm2(affinities, pre_affinities)
        loss = loss_func(torch.Tensor(pre_affinities),
                         torch.Tensor(affinities))
        print('best: p1:{},p2:{},p3:{},loss:{:.5f},rperf:{:.5f}, rm2:{:.5f}'.
              format(param1value, param2value, param3value, loss, rperf,
                     rm2))
        logging(
            "best: P1 = %d, P2 = %d, P3 = %d, Fold = %d, CI-i = %f, MSE = %f, auc = %f, aupr = %f"
            % (param1value, param2value, param3value, foldind, rperf, loss,
               auc, aupr), FLAGS)
        all_predictions[
            foldind] = rperf  # TODO FOR EACH VAL SET allpredictions[pointer][foldind]
        all_losses[foldind] = loss
        all_auc[foldind] = auc
        all_aupr[foldind] = aupr
        all_affinities.append(affinities)
        all_preaffinities.append(pre_affinities)
    # save affinities and preaffinites for further analysis
    # (placed after the fold loop so all folds are written together — the
    # flattened source is ambiguous here; confirm intended indentation)
    np.savetxt("./result/iter" + str(i) + "affinities.txt",
               np.array(all_affinities))
    np.savetxt("./result/iter" + str(i) + "preaffinities.txt",
               np.array(all_preaffinities))
    best_param_list = [param1value, param2value, param3value, lamda]
    best_perf = np.mean(all_predictions)
    return best_param_list, best_perf, all_predictions, all_losses, all_auc, all_aupr
def nfold_1_2_3_setting_sample(
        XD, XT, Y, label_row_inds, label_col_inds, measure, FLAGS, dataset
):  # build training, validation and test data per k-fold (translated)
    """K-fold driver: grid-search on train/val folds, then re-score the
    chosen parameters on train/test, and average over test folds."""
    bestparamlist = []
    test_set, outer_train_sets = dataset.read_sets(
        FLAGS.dataset_path, FLAGS.problem_type)  # training and test data (translated)
    foldinds = len(outer_train_sets)
    test_sets = []
    ## TRAIN AND VAL
    val_sets = []
    train_sets = []
    # split the data into training and testing (translated)
    for val_foldind in range(foldinds):
        val_fold = outer_train_sets[val_foldind]
        val_sets.append(val_fold)
        # All remaining folds form this round's training set.
        otherfolds = deepcopy(outer_train_sets)
        otherfolds.pop(val_foldind)
        otherfoldsinds = [item for sublist in otherfolds for item in sublist]
        train_sets.append(otherfoldsinds)
        test_sets.append(test_set)
        print("val set", str(len(val_fold)))
        print("train set", str(len(otherfoldsinds)))
    # pick the best hyperparameters using the training and validation sets (translated)
    bestparamind, best_param_list, bestperf, all_predictions_not_need, losses_not_need = general_nfold_cv(
        XD, XT, Y, label_row_inds, label_col_inds, measure, FLAGS,
        train_sets, val_sets)
    # print("Test Set len", str(len(test_set)))
    # print("Outer Train Set len", str(len(outer_train_sets)))
    # re-run the search on the training and test sets (translated)
    bestparam, best_param_list, bestperf, all_predictions, all_losses = general_nfold_cv(
        XD, XT, Y, label_row_inds, label_col_inds, measure, FLAGS,
        train_sets, test_sets)
    testperf = all_predictions[bestparamind]  ##pointer pos
    logging("---FINAL RESULTS-----", FLAGS)
    logging(
        "best param index = %s, best param = %.5f" %  # best parameters found on train/validation (translated)
        (bestparamind, bestparam),
        FLAGS)
    testperfs = []
    testloss = []
    avgperf = 0.
    for test_foldind in range(len(test_sets)):
        foldperf = all_predictions[bestparamind][
            test_foldind]  # CI score of each fold for the best combination (translated)
        foldloss = all_losses[bestparamind][
            test_foldind]  # MSE loss of each fold for the best combination (translated)
        testperfs.append(foldperf)
        testloss.append(foldloss)
        avgperf += foldperf  # accumulate CI over all folds (translated)
    avgperf = avgperf / len(test_sets)  # mean CI (translated)
    avgloss = np.mean(testloss)  # mean MSE loss (translated)
    teststd = np.std(testperfs)  # standard deviation (translated)
    logging("Test Performance CI", FLAGS)
    logging(testperfs, FLAGS)
    logging("Test Performance MSE", FLAGS)
    logging(testloss, FLAGS)
    return avgperf, avgloss, teststd
def general_nfold_cv(XD, XT, Y, label_row_inds, label_col_inds, prfmeasure,
                     runmethod, FLAGS, labeled_sets,
                     val_sets):  ## FLAGS needed here too???? (translated)
    """K-fold grid search over (num_windows x smi_window x seq_window):
    train a Keras model per combination per fold, record CI and MSE, then
    choose the combination with the best fold-averaged CI.

    Returns (bestpointer, best_param_list, bestperf, all_predictions, all_losses).
    """
    paramset1 = FLAGS.num_windows  #[32]#[32, 512] #[32, 128] # filter numbers
    paramset2 = FLAGS.smi_window_lengths  #[4, 8]#[4, 32] #[4, 8] #filter length smi
    paramset3 = FLAGS.seq_window_lengths  #[8, 12]#[64, 256] #[64, 192]#[8, 192, 384]
    epoch = FLAGS.num_epoch  #100
    batchsz = FLAGS.batch_size  #256
    logging("---Parameter Search-----", FLAGS)
    w = len(val_sets)
    h = len(paramset1) * len(paramset2) * len(paramset3)
    # One result slot per (parameter combination, fold).
    all_predictions = [[0 for x in range(w)] for y in range(h)]
    all_losses = [[0 for x in range(w)] for y in range(h)]
    print(all_predictions)
    for foldind in range(len(val_sets)):
        valinds = val_sets[foldind]
        labeledinds = labeled_sets[foldind]
        Y_train = np.mat(np.copy(Y))  # NOTE(review): unused below
        params = {}  # NOTE(review): unused below
        XD_train = XD
        XT_train = XT
        trrows = label_row_inds[labeledinds]
        trcols = label_col_inds[labeledinds]
        #print("trrows", str(trrows), str(len(trrows)))
        #print("trcols", str(trcols), str(len(trcols)))
        XD_train = XD[trrows]
        XT_train = XT[trcols]
        train_drugs, train_prots, train_Y = prepare_interaction_pairs(
            XD, XT, Y, trrows, trcols)
        terows = label_row_inds[valinds]
        tecols = label_col_inds[valinds]
        #print("terows", str(terows), str(len(terows)))
        #print("tecols", str(tecols), str(len(tecols)))
        val_drugs, val_prots, val_Y = prepare_interaction_pairs(
            XD, XT, Y, terows, tecols)
        pointer = 0
        for param1ind in range(len(paramset1)):  #hidden neurons
            param1value = paramset1[param1ind]
            for param2ind in range(len(paramset2)):  #learning rate
                param2value = paramset2[param2ind]
                for param3ind in range(len(paramset3)):
                    param3value = paramset3[param3ind]
                    gridmodel = runmethod(FLAGS, param1value, param2value,
                                          param3value)
                    gridres = gridmodel.fit(
                        ([np.array(train_drugs), np.array(train_prots)]),
                        np.array(train_Y),
                        batch_size=batchsz,
                        epochs=epoch,
                        validation_data=(([
                            np.array(val_drugs), np.array(val_prots)
                        ]), np.array(val_Y)),
                        shuffle=False)
                    predicted_labels = gridmodel.predict(
                        [np.array(val_drugs), np.array(val_prots)])
                    loss, rperf2 = gridmodel.evaluate(
                        ([np.array(val_drugs), np.array(val_prots)]),
                        np.array(val_Y),
                        verbose=0)
                    rperf = prfmeasure(val_Y, predicted_labels)
                    # prfmeasure returns a sequence here; keep the scalar.
                    rperf = rperf[0]
                    logging(
                        "P1 = %d, P2 = %d, P3 = %d, Fold = %d, CI-i = %f, CI-ii = %f, MSE = %f"
                        % (param1ind, param2ind, param3ind, foldind, rperf,
                           rperf2, loss), FLAGS)
                    plotLoss(gridres, param1ind, param2ind, param3ind,
                             foldind)
                    all_predictions[pointer][
                        foldind] = rperf  #TODO FOR EACH VAL SET allpredictions[pointer][foldind]
                    all_losses[pointer][foldind] = loss
                    pointer += 1
    bestperf = -float('Inf')
    bestpointer = None
    best_param_list = []
    ##Take average according to folds, then chooose best params
    pointer = 0
    for param1ind in range(len(paramset1)):
        for param2ind in range(len(paramset2)):
            for param3ind in range(len(paramset3)):
                avgperf = 0.
                for foldind in range(len(val_sets)):
                    foldperf = all_predictions[pointer][foldind]
                    avgperf += foldperf
                avgperf /= len(val_sets)
                #print(epoch, batchsz, avgperf)
                if avgperf > bestperf:
                    bestperf = avgperf
                    bestpointer = pointer
                    best_param_list = [param1ind, param2ind, param3ind]
                pointer += 1
    return bestpointer, best_param_list, bestperf, all_predictions, all_losses
perfmeasure = get_cindex
deepmethod = build_combined_categorical

# NOTE(review): this call executes at import time and relies on a
# module-level FLAGS defined earlier in the file — confirm against the
# full script (the __main__ block below defines its own FLAGS).
experiment(FLAGS, perfmeasure, deepmethod)


class CustomStopper(keras.callbacks.EarlyStopping):
    """EarlyStopping that only starts monitoring after `start_epoch`.

    Useful when the first epochs are too noisy for a meaningful
    val_loss-based stop decision.
    """

    def __init__(self,
                 monitor='val_loss',
                 min_delta=0,
                 patience=0,
                 verbose=0,
                 mode='auto',
                 start_epoch=100):  # add argument for starting epoch
        # Bug fix: the original called super().__init__() with NO arguments,
        # so monitor/min_delta/patience/verbose/mode passed by callers were
        # silently discarded and Keras defaults were used instead.
        super(CustomStopper, self).__init__(monitor=monitor,
                                            min_delta=min_delta,
                                            patience=patience,
                                            verbose=verbose,
                                            mode=mode)
        self.start_epoch = start_epoch

    def on_epoch_end(self, epoch, logs=None):
        # Delegate to EarlyStopping only once the grace period has passed.
        if epoch > self.start_epoch:
            super().on_epoch_end(epoch, logs)


if __name__ == "__main__":
    FLAGS = argparser()
    # Unique per-run log directory keyed by wall-clock time.
    FLAGS.log_dir = FLAGS.log_dir + str(time.time()) + "/"
    if not os.path.exists(FLAGS.log_dir):
        os.makedirs(FLAGS.log_dir)
    logging(str(FLAGS), FLAGS)
    run_regression(FLAGS)
def general_nfold_cv(XD, XD_STRU, XT, XT_STRU, Y, label_row_inds,
                     label_col_inds, prfmeasure, runmethod, FLAGS,
                     labeled_sets, val_sets, ep):
    """K-fold grid search for the structure-augmented model (four inputs:
    drug seq, drug structure, protein seq, protein structure).

    Each trained model is saved as '{pointer}_{ep}_model.h5' so the best one
    can be reloaded later (see best_model_test). Returns (bestpointer,
    best_param_list, bestperf, all_predictions, all_losses).
    """
    NumWinParam = FLAGS.num_windows  #paramset1
    SmiWinLen = FLAGS.smi_window_lengths  #paramset2
    SeqWinLen = FLAGS.seq_window_lengths  #paramset3
    epoch = FLAGS.num_epoch
    batchsz = FLAGS.batch_size
    w = len(val_sets)
    h = len(NumWinParam) * len(SmiWinLen) * len(SeqWinLen)
    # One result slot per (parameter combination, fold).
    all_predictions = [[0 for x in range(w)] for y in range(h)]
    all_losses = [[0 for x in range(w)] for y in range(h)]
    for foldind in range(len(val_sets)):
        valinds = val_sets[foldind]
        labeledinds = labeled_sets[foldind]
        trrows = label_row_inds[labeledinds]
        trcols = label_col_inds[labeledinds]
        train_drugs, train_drugs_stru, train_prots, train_prots_stru, train_Y = prepare_interaction_pairs(
            XD, XD_STRU, XT, XT_STRU, Y, trrows, trcols)
        terows = label_row_inds[valinds]
        tecols = label_col_inds[valinds]
        val_drugs, val_drugs_stru, val_prots, val_prots_stru, val_Y = prepare_interaction_pairs(
            XD, XD_STRU, XT, XT_STRU, Y, terows, tecols)
        pointer = 0
        for numwinvalue in NumWinParam:
            for smiwinlenvalue in SmiWinLen:
                for seqwinlenvalue in SeqWinLen:
                    model = runmethod(FLAGS, numwinvalue, smiwinlenvalue,
                                      seqwinlenvalue)
                    # Early stopping on val_loss plus TensorBoard logging;
                    # unlike the single-train variant, these ARE wired into fit().
                    my_callbacks = [
                        EarlyStopping(monitor='val_loss',
                                      mode='min',
                                      verbose=1,
                                      patience=15),
                        TensorBoard(log_dir=FLAGS.log_dir)
                    ]
                    model.fit(([
                        np.array(train_drugs),
                        np.array(train_drugs_stru),
                        np.array(train_prots),
                        np.array(train_prots_stru)
                    ]),
                              np.array(train_Y),
                              batch_size=batchsz,
                              epochs=epoch,
                              validation_data=(([
                                  np.array(val_drugs),
                                  np.array(val_drugs_stru),
                                  np.array(val_prots),
                                  np.array(val_prots_stru)
                              ]), np.array(val_Y)),
                              shuffle=False,
                              callbacks=my_callbacks)
                    # Checkpoint keyed by (grid pointer, outer epoch/run id).
                    model.save(f'{pointer}_{ep}_model.h5')
                    pred = model.predict([
                        np.array(val_drugs),
                        np.array(val_drugs_stru),
                        np.array(val_prots),
                        np.array(val_prots_stru)
                    ])
                    loss, prf2 = model.evaluate(([
                        np.array(val_drugs),
                        np.array(val_drugs_stru),
                        np.array(val_prots),
                        np.array(val_prots_stru)
                    ]),
                                                np.array(val_Y),
                                                verbose=0)
                    prf = prfmeasure(val_Y, pred)
                    logging(
                        "ValidSets: P1 = %d, P2 = %d, P3 = %d, Fold = %d, CI-i = %f, CI-ii = %f, MSE = %f"
                        % (numwinvalue, smiwinlenvalue, seqwinlenvalue,
                           foldind, prf, prf2, loss), FLAGS)
                    all_predictions[pointer][
                        foldind] = prf  #TODO FOR EACH VAL SET allpredictions[pointer][foldind]
                    all_losses[pointer][foldind] = loss
                    pointer += 1
    bestperf = -float('Inf')
    bestpointer = None
    best_param_list = []
    # Average CI across folds, then pick the best combination.
    pointer = 0
    for numwinvalue in NumWinParam:
        for smiwinlenvalue in SmiWinLen:
            for seqwinlenvalue in SeqWinLen:
                avgperf = 0.
                for foldind in range(len(val_sets)):
                    foldperf = all_predictions[pointer][foldind]
                    avgperf += foldperf
                avgperf /= len(val_sets)
                if avgperf > bestperf:
                    bestperf = avgperf
                    bestpointer = pointer
                    best_param_list = [
                        numwinvalue, smiwinlenvalue, seqwinlenvalue
                    ]
                pointer += 1
    return bestpointer, best_param_list, bestperf, all_predictions, all_losses
def general_nfold_cv(XD, XT, Y, label_row_inds, label_col_inds, prfmeasure,
                     FLAGS, labeled_sets, val_sets):
    """PyTorch variant of the n-fold hyperparameter grid search.

    Trains a ``DtaNet`` on CUDA for every (filters, smi-kernel, seq-kernel)
    combination and every fold, using manual mini-batching with Adam + MSE.
    Predictions for scoring are only collected on the final epoch.
    Returns the flat best-combination index, the best parameter *indices*,
    the best average performance, and the full prediction/loss tables.
    """
    # Hyperparameter grids pulled from FLAGS.
    paramset1 = FLAGS.num_windows  # [32]#[32, 512] #[32, 128] # filter numbers
    paramset2 = FLAGS.smi_window_lengths  # [4, 8]#[4, 32] #[4, 8] #filter length smi
    paramset3 = FLAGS.seq_window_lengths  # [8, 12]#[64, 256] #[64, 192]#[8, 192, 384]
    epoch = FLAGS.num_epoch  # 100
    batchsz = FLAGS.batch_size  # 256

    logging("---Parameter Search-----", FLAGS)
    print("---Parameter Search-----")

    # Result tables: h = number of parameter combinations, w = folds.
    w = len(val_sets)
    h = len(paramset1) * len(paramset2) * len(paramset3)
    all_predictions = [[0 for x in range(w)] for y in range(h)]
    all_losses = [[0 for x in range(w)] for y in range(h)]

    for foldind in range(len(val_sets)):
        valinds = val_sets[foldind]
        labeledinds = labeled_sets[foldind]

        # Training pairs for this fold.
        trrows = label_row_inds[labeledinds]
        trcols = label_col_inds[labeledinds]
        train_drugs, train_prots, train_Y = prepare_interaction_pairs(XD, XT, Y, trrows, trcols)

        # Validation pairs for this fold.
        terows = label_row_inds[valinds]
        tecols = label_col_inds[valinds]
        val_drugs, val_prots, val_Y = prepare_interaction_pairs(XD, XT, Y, terows, tecols)

        # 'pointer' flattens the 3-nested grid into a row index of the tables.
        pointer = 0
        for param1ind in range(len(paramset1)):
            param1value = paramset1[param1ind]
            for param2ind in range(len(paramset2)):
                param2value = paramset2[param2ind]
                for param3ind in range(len(paramset3)):
                    param3value = paramset3[param3ind]
                    # Fresh model on GPU for this combination.
                    dta_model = DtaNet(param2value, param1value, param3value)
                    dta_model.cuda()
                    loss_func = nn.MSELoss()
                    optimizer = optim.Adam(dta_model.parameters(), lr=0.0005)
                    # Filled only during the last epoch's validation pass.
                    predicted_labels = []
                    for i in range(epoch):
                        loss_epoch = 0
                        dta_model.train()
                        train_drugs_count = len(train_drugs)
                        # Manual mini-batch training loop.
                        for j in range(0, train_drugs_count, batchsz):
                            torch.cuda.empty_cache()
                            optimizer.zero_grad()
                            k = min(j + batchsz, train_drugs_count)
                            sub_train_drugs = train_drugs[j:k]
                            sub_train_prots = train_prots[j:k]
                            target = train_Y[j:k]
                            target = torch.FloatTensor(target)
                            target = target.cuda()
                            # Token indices must be long for embedding layers
                            # — assumed from dtype=torch.long; confirm in DtaNet.
                            sub_train_drugs = torch.tensor(sub_train_drugs, dtype=torch.long)
                            sub_train_drugs = sub_train_drugs.cuda()
                            sub_train_prots = torch.tensor(sub_train_prots, dtype=torch.long)
                            sub_train_prots = sub_train_prots.cuda()
                            output = dta_model(sub_train_drugs, sub_train_prots)
                            loss = loss_func(output, target)
                            loss.backward()
                            optimizer.step()
                            # Weight batch loss by batch size for a true epoch sum.
                            loss_epoch += loss.item() * len(sub_train_drugs)
                        # Validation pass for this epoch.
                        dta_model.eval()
                        loss_eval = 0
                        for j in range(0, int(len(val_drugs)), batchsz):
                            torch.cuda.empty_cache()
                            k = min(j + batchsz, len(val_drugs))
                            sub_train_drugs = val_drugs[j:k]
                            sub_train_prots = val_prots[j:k]
                            target = val_Y[j:k]
                            target = torch.FloatTensor(target)
                            target = target.cuda()
                            sub_train_drugs = torch.tensor(sub_train_drugs, dtype=torch.long)
                            sub_train_drugs = sub_train_drugs.cuda()
                            sub_train_prots = torch.tensor(sub_train_prots, dtype=torch.long)
                            sub_train_prots = sub_train_prots.cuda()
                            output = dta_model(sub_train_drugs, sub_train_prots)
                            loss = loss_func(output, target)
                            loss_eval += loss.item() * len(sub_train_drugs)
                            # Only the final epoch's predictions are kept for scoring.
                            if i == epoch - 1:
                                if len(predicted_labels) == 0:
                                    predicted_labels = output.cpu().detach().numpy()
                                else:
                                    predicted_labels = np.concatenate(
                                        (predicted_labels, output.cpu().detach().numpy()), 0)
                        print("epoch #", i + 1, ", train loss",
                              loss_epoch * 1.0 / len(train_drugs),
                              ", validation loss", loss_eval / len(val_drugs))
                        logging("epoch #" + str(i + 1) + ", train loss " + str(
                            loss_epoch * 1.0 / len(train_drugs)) + ", validation loss " + str(
                            loss_eval / len(val_drugs)), FLAGS)

                    # Score the last-epoch predictions; prfmeasure returns a
                    # sequence whose first element is the CI.
                    rperf = prfmeasure(val_Y, predicted_labels)
                    rperf = rperf[0]
                    logging("P1 = %d, P2 = %d, P3 = %d, Fold = %d, CI-i = %f, MSE = %f" %
                            (param1ind, param2ind, param3ind, foldind, rperf,
                             loss_eval / len(val_drugs)), FLAGS)
                    print("P1 = %d, P2 = %d, P3 = %d, Fold = %d, CI-i = %f, MSE = %f" %
                          (param1ind, param2ind, param3ind, foldind, rperf,
                           loss_eval / len(val_drugs)))
                    # Record performance and mean validation loss for this cell.
                    all_predictions[pointer][
                        foldind] = rperf
                    all_losses[pointer][foldind] = loss_eval / len(val_drugs)
                    pointer += 1

    # Average each combination's performance over folds; keep the best.
    bestperf = -float('Inf')
    bestpointer = None
    best_param_list = []
    pointer = 0
    for param1ind in range(len(paramset1)):
        for param2ind in range(len(paramset2)):
            for param3ind in range(len(paramset3)):
                avgperf = 0.
                for foldind in range(len(val_sets)):
                    foldperf = all_predictions[pointer][foldind]
                    avgperf += foldperf
                avgperf /= len(val_sets)
                if avgperf > bestperf:
                    bestperf = avgperf
                    bestpointer = pointer
                    # NOTE: stores parameter *indices*, not values.
                    best_param_list = [param1ind, param2ind, param3ind]
                pointer += 1
    return bestpointer, best_param_list, bestperf, all_predictions, all_losses
def nfold_1_2_3_setting_sample(XD, XT, Y, label_row_inds, label_col_inds,
                               measure, FLAGS, dataset):
    """Leave-one-fold-out hyperparameter search followed by test evaluation.

    Reads the pre-defined test split and outer training folds, builds one
    (train, val, test) triple per held-out fold, selects hyperparameters with
    ``general_nfold_cv`` on the validation folds, re-runs it on the test
    folds, and aggregates the best combination's per-fold CI and MSE.

    Returns (average CI, average MSE, CI standard deviation).
    """
    test_set, outer_train_sets = dataset.read_sets(FLAGS.dataset_path,
                                                   FLAGS.problem_type)

    # One entry per held-out fold: that fold is validation, the remaining
    # folds (flattened) are training, and the fixed test split is repeated.
    val_sets, train_sets, test_sets = [], [], []
    for held_out_idx, held_out_fold in enumerate(outer_train_sets):
        remaining = deepcopy(outer_train_sets)
        remaining.pop(held_out_idx)
        flattened = [idx for fold in remaining for idx in fold]
        val_sets.append(held_out_fold)
        train_sets.append(flattened)
        test_sets.append(test_set)
        print("val set", str(len(held_out_fold)))
        print("train set", str(len(flattened)))

    # Hyperparameter selection on the validation folds (results discarded).
    bestparamind, best_param_list, bestperf, _unused_preds, _unused_losses = general_nfold_cv(
        XD, XT, Y, label_row_inds, label_col_inds, measure, FLAGS,
        train_sets, val_sets)

    # Re-run the same grid against the test folds to obtain test-set tables.
    bestparam, best_param_list, bestperf, all_predictions, all_losses = general_nfold_cv(
        XD, XT, Y, label_row_inds, label_col_inds, measure, FLAGS,
        train_sets, test_sets)

    logging("---FINAL RESULTS-----", FLAGS)
    logging("best param index = %s, best param = %.5f" % (bestparamind, bestparam),
            FLAGS)
    print("---FINAL RESULTS-----")
    print("best param index = %s, best param = %.5f" % (bestparamind, bestparam))

    # Collect the best validation combination's row from the test tables.
    fold_count = len(test_sets)
    testperfs = [all_predictions[bestparamind][k] for k in range(fold_count)]
    testloss = [all_losses[bestparamind][k] for k in range(fold_count)]

    avgperf = sum(testperfs) / fold_count
    avgloss = np.mean(testloss)
    teststd = np.std(testperfs)

    logging("Test Performance CI", FLAGS)
    logging(testperfs, FLAGS)
    logging("Test Performance MSE", FLAGS)
    logging(testloss, FLAGS)

    return avgperf, avgloss, teststd
label_row_inds, label_col_inds = np.where( np.isnan(Y) == False) # basically finds the point address of affinity [x,Y] print("Logdir: " + FLAGS.log_dir) s1_avgperf, s1_avgloss, s1_teststd = nfold_1_2_3_setting_sample(XD, XT, Y, label_row_inds, label_col_inds, perfmeasure, FLAGS, dataset) logging("Setting " + str(FLAGS.problem_type), FLAGS) logging("avg_perf = %.5f, avg_mse = %.5f, std = %.5f" % (s1_avgperf, s1_avgloss, s1_teststd), FLAGS) print("Setting " + str(FLAGS.problem_type)) print("avg_perf = %.5f, avg_mse = %.5f, std = %.5f" % (s1_avgperf, s1_avgloss, s1_teststd)) def run_regression(FLAGS): perfmeasure = get_cindex experiment(FLAGS, perfmeasure) if __name__ == "__main__": FLAGS = argparser() FLAGS.log_dir = FLAGS.log_dir + str(time.time()) + "/" if not os.path.exists(FLAGS.log_dir): os.makedirs(FLAGS.log_dir) logging(str(FLAGS), FLAGS) print(str(FLAGS)) run_regression(FLAGS)
def nfold_1_2_3_setting_sample(XD, XT, Y, label_row_inds, label_col_inds, measure, runmethod, FLAGS, dataset): bestparamlist = [] test_set, outer_train_sets = dataset.read_sets(FLAGS.dataset_path, FLAGS.problem_type) foldinds = len(outer_train_sets) test_sets = [] ## TRAIN AND VAL val_sets = [] train_sets = [] #logger.info('Start training') for val_foldind in range(foldinds): val_fold = outer_train_sets[val_foldind] val_sets.append(val_fold) otherfolds = deepcopy(outer_train_sets) otherfolds.pop(val_foldind) otherfoldsinds = [item for sublist in otherfolds for item in sublist] train_sets.append(otherfoldsinds) test_sets.append(test_set) print("val set", str(len(val_fold))) print("train set", str(len(otherfoldsinds))) bestparamind, best_param_list, bestperf, all_predictions_not_need, losses_not_need = general_nfold_cv( XD, XT, Y, label_row_inds, label_col_inds, measure, runmethod, FLAGS, train_sets, val_sets) #print("Test Set len", str(len(test_set))) #print("Outer Train Set len", str(len(outer_train_sets))) bestparam, best_param_list, bestperf, all_predictions, all_losses = general_nfold_cv( XD, XT, Y, label_row_inds, label_col_inds, measure, runmethod, FLAGS, train_sets, test_sets) testperf = all_predictions[bestparamind] ##pointer pos logging("---FINAL RESULTS-----", FLAGS) logging( "best param index = %s, best param = %.5f" % (bestparamind, bestparam), FLAGS) testperfs = [] testloss = [] avgperf = 0. for test_foldind in range(len(test_sets)): foldperf = all_predictions[bestparamind][test_foldind] foldloss = all_losses[bestparamind][test_foldind] testperfs.append(foldperf) testloss.append(foldloss) avgperf += foldperf avgperf = avgperf / len(test_sets) avgloss = np.mean(testloss) teststd = np.std(testperfs) logging("Test Performance CI", FLAGS) logging(testperfs, FLAGS) logging("Test Performance MSE", FLAGS) logging(testloss, FLAGS) return avgperf, avgloss, teststd
def nfold_setting_sample(XD, XT, Y, label_row_inds, label_col_inds, measure,
                         FLAGS, dataset, nfolds, i):
    """Cross-validation driver taking pre-computed folds.

    Uses fold 5 of ``nfolds`` as the fixed test split and folds 0-4 as outer
    training folds, runs ``general_nfold_cv`` to pick hyperparameters and
    ``general_nfold_cv_test`` to evaluate them, then aggregates per-fold CI,
    MSE, AUC, and AUPR statistics.

    NOTE(review): hard-codes exactly 6 folds (indices 0-5) — confirm callers
    always pass that shape.
    """
    test_set = nfolds[5]
    outer_train_sets = nfolds[0:5]
    # test_set, outer_train_sets=dataset.read_sets(FLAGS)
    # if FLAGS.problem_type==1:
    #     test_set, outer_train_sets = dataset.read_sets(FLAGS)
    foldinds = len(outer_train_sets)

    ## TRAIN AND VAL: one (train, val, test) triple per held-out fold.
    val_sets = []
    train_sets = []
    test_sets = []
    for val_foldind in range(foldinds):
        val_fold = outer_train_sets[val_foldind]
        val_sets.append(val_fold)
        # Flatten every fold except the held-out one into the training set.
        otherfolds = deepcopy(outer_train_sets)
        otherfolds.pop(val_foldind)
        otherfoldsinds = [item for sublist in otherfolds for item in sublist]
        train_sets.append(otherfoldsinds)
        test_sets.append(test_set)
        # NOTE(review): sibling variants print len(val_fold) /
        # len(otherfoldsinds) (fold sizes); here the running list lengths are
        # printed instead — possibly unintended.
        print("val set", str(len(val_sets)))
        print("train set", str(len(train_sets)))

    # Hyperparameter search — note this variant searches directly against the
    # test folds, and passes extra metric helpers (get_aupr, get_rm2).
    bestparamind, best_param_list, bestperf, all_predictions, all_losses = general_nfold_cv(
        XD, XT, Y, label_row_inds, label_col_inds, measure, FLAGS, train_sets,
        test_sets, get_aupr, get_rm2)

    # Final evaluation of the selected parameters; returns flat per-fold lists.
    best_param, bestperf, all_predictions, all_losses, all_auc, all_aupr = general_nfold_cv_test(
        XD, XT, Y, label_row_inds, label_col_inds, measure, FLAGS, train_sets,
        test_sets, get_rm2, best_param_list, i)

    logging("---FINAL RESULTS-----", FLAGS)
    logging("best param = %s" % best_param_list, FLAGS)

    # Aggregate per-fold metrics (here the tables are indexed by fold only).
    testperfs = []
    testloss = []
    testauc = []
    testaupr = []
    avgperf = 0.
    for test_foldind in range(len(test_sets)):
        foldperf = all_predictions[test_foldind]
        foldloss = all_losses[test_foldind]
        testperfs.append(foldperf)
        testloss.append(foldloss)
        testauc.append(all_auc[test_foldind])
        testaupr.append(all_aupr[test_foldind])
        avgperf += foldperf
    avgperf = avgperf / len(test_sets)
    avgloss = np.mean(testloss)
    perf_std = np.std(testperfs)
    loss_std = np.std(testloss)
    avg_auc = np.mean(testauc)
    auc_std = np.std(testauc)
    avg_aupr = np.mean(testaupr)
    aupr_std = np.std(testaupr)

    logging("Test Performance CI", FLAGS)
    logging(testperfs, FLAGS)
    logging("Test Performance MSE", FLAGS)
    logging(testloss, FLAGS)
    print(best_param_list)
    print('averaged performance', avgperf)
    return avgperf, avgloss, perf_std, loss_std, avg_auc, auc_std, avg_aupr, aupr_std
def general_nfold_cv(XD, XT, Y, label_row_inds, label_col_inds, prfmeasure,
                     FLAGS, labeled_sets, val_sets, get_aupr, get_rm2):
    """PyTorch grid search with an extra regularisation weight (lamda),
    DataLoader-based batching, and early stopping on CI regression.

    For every fold and every (p1, p2, p3, lamda) combination a fresh ``net``
    is trained via the external ``train``/``test`` helpers; a checkpoint is
    saved whenever the current CI matches the running best, and training
    stops early if CI drops more than 0.1 below the best seen.

    Returns the flat best-combination index, the best parameter values, the
    best average performance, and the full prediction/loss tables.
    """
    paramset1 = FLAGS.num_windows
    paramset2 = FLAGS.smi_window_lengths
    paramset3 = FLAGS.seq_window_lengths
    lamda_set = FLAGS.lamda
    batchsz = FLAGS.batch_size  # 256

    logging("---Parameter Search-----", FLAGS)

    # Result tables: h = number of combinations (4-way grid), w = folds.
    w = len(val_sets)
    h = len(paramset1) * len(paramset2) * len(paramset3) * len(lamda_set)
    all_predictions = [[0 for x in range(w)] for y in range(h)]
    all_losses = [[0 for x in range(w)] for y in range(h)]

    for foldind in range(len(val_sets)):
        valinds = val_sets[foldind]
        labeledinds = labeled_sets[foldind]

        trrows = label_row_inds[labeledinds]
        trcols = label_col_inds[labeledinds]
        train_dataset = prepare_interaction_pairs(XD, XT, Y, trrows, trcols)

        terows = label_row_inds[valinds]
        tecols = label_col_inds[valinds]
        test_dataset = prepare_interaction_pairs(XD, XT, Y, terows, tecols)

        # 'pointer' flattens the 4-nested grid into a table row index.
        pointer = 0
        train_loader = DataLoader(dataset=train_dataset,
                                  batch_size=batchsz,
                                  shuffle=True)
        test_loader = DataLoader(dataset=test_dataset, batch_size=batchsz)
        for param1value in paramset1:  # hidden neurons
            for param2value in paramset2:  # learning rate
                for param3value in paramset3:
                    for lamda in lamda_set:
                        model = net(FLAGS, param1value, param2value,
                                    param3value).cuda()
                        model.apply(weights_init)
                        rperf_list = []
                        for epochind in range(FLAGS.num_epoch):
                            # External helpers do one epoch of training and a
                            # full evaluation pass respectively.
                            model = train(train_loader, model, FLAGS,
                                          param1value, param2value,
                                          param3value, lamda)
                            rperf, loss, rm2, auc = test(
                                model, test_loader, FLAGS, param1value,
                                param2value, param3value, lamda)
                            rperf_list.append(rperf)
                            ##Set the conditions for early stopping
                            # Progress report every 5 epochs.
                            if (epochind + 1) % 5 == 0:
                                print(
                                    'val: epoch:{},p1:{},p2:{},p3:{},loss:{:.5f},rperf:{:.5f}, rm2:{:.5f}'
                                    .format(epochind, param1value, param2value,
                                            param3value, loss, rperf, rm2))
                            # NOTE(review): the nesting of the two checks
                            # below relative to the %5 report is ambiguous in
                            # the original formatting — confirm intent.
                            # rperf is already in rperf_list, so >= max holds
                            # exactly when this epoch ties or sets the best CI.
                            if rperf >= max(rperf_list):
                                torch.save(model, 'checkpoint.pth')
                            # Stop once CI falls 0.1 below the best seen.
                            if rperf < max(rperf_list) - 0.1:
                                print(
                                    'The program is stopped early for better performance.'
                                )
                                break
                        # Record the *last* epoch's metrics for this cell.
                        logging(
                            "P1 = %d, P2 = %d, P3 = %d, Fold = %d, CI-i = %f, MSE = %f, rm2 = %f"
                            % (param1value, param2value, param3value, foldind,
                               rperf, loss, rm2), FLAGS)
                        all_predictions[pointer][
                            foldind] = rperf  # TODO FOR EACH VAL SET allpredictions[pointer][foldind]
                        all_losses[pointer][foldind] = loss
                        pointer += 1

    bestperf = -float('Inf')
    bestpointer = None
    best_param_list = []
    ##Take average according to folds, then chooose best params
    pointer = 0
    for param1value in paramset1:
        for param2value in paramset2:
            for param3value in paramset3:
                for lamda in lamda_set:
                    avgperf = 0.
                    for foldind in range(len(val_sets)):
                        foldperf = all_predictions[pointer][foldind]
                        avgperf += foldperf
                    avgperf /= len(val_sets)
                    if avgperf > bestperf:
                        bestperf = avgperf
                        bestpointer = pointer
                        best_param_list = [
                            param1value,
                            param2value,
                            param3value,
                            lamda,
                        ]
                    pointer += 1
    return bestpointer, best_param_list, bestperf, all_predictions, all_losses
def general_nfold_cv(XD, XT, Y, label_row_inds, label_col_inds, prfmeasure,
                     FLAGS, labeled_sets, val_sets):  ## need FLAGS here too????
    """PyTorch (CPU) grid search with best-epoch model checkpointing.

    For every fold and (filters, smi-kernel, seq-kernel) combination a fresh
    ``Net`` is trained with Adam + MSE; after each epoch the validation loss
    is compared to the running best, and the state dict plus predictions of
    the best epoch are kept. The best epoch's predictions are scored with
    ``prfmeasure`` (CI). Returns the flat best-combination index, the best
    parameter *indices*, the best average performance, and the tables.
    """
    paramset1 = FLAGS.num_windows  # [32]#[32, 512] #[32, 128] # filter numbers
    paramset2 = FLAGS.smi_window_lengths  # [4, 8]#[4, 32] #[4, 8] #filter length smi
    paramset3 = FLAGS.seq_window_lengths  # [8, 12]#[64, 256] #[64, 192]#[8, 192, 384]
    epoch = FLAGS.num_epoch  # 100
    batchsz = FLAGS.batch_size  # 256

    logging("---Parameter Search-----", FLAGS)

    # Result tables: h = number of parameter combinations, w = folds.
    w = len(val_sets)
    h = len(paramset1) * len(paramset2) * len(paramset3)
    all_predictions = [[0 for x in range(w)] for y in range(h)]
    all_losses = [[0 for x in range(w)] for y in range(h)]
    print(all_predictions)

    for foldind in range(len(val_sets)):
        valinds = val_sets[foldind]
        labeledinds = labeled_sets[foldind]

        trrows = label_row_inds[labeledinds]
        trcols = label_col_inds[labeledinds]
        train_drugs, train_prots, train_Y = prepare_interaction_pairs(
            XD, XT, Y, trrows, trcols)
        train_drugs = np.array(train_drugs)  # convert to numpy arrays
        train_prots = np.array(train_prots)  # convert to numpy arrays
        train_Y = np.array(train_Y)  # convert to numpy arrays
        # train_drugs =train_drugs[:1000]
        # train_prots =train_prots[:1000]
        # train_Y= train_Y[:1000]

        terows = label_row_inds[valinds]
        tecols = label_col_inds[valinds]
        val_drugs, val_prots, val_Y = prepare_interaction_pairs(
            XD, XT, Y, terows, tecols)
        val_drugs = np.array(val_drugs)  # convert to numpy arrays
        val_prots = np.array(val_prots)  # convert to numpy arrays
        val_Y = np.array(val_Y)  # convert to numpy arrays
        # val_drugs= val_drugs[:1000]
        # val_prots= val_prots[:1000]
        # val_Y= val_Y[:1000]

        # 'pointer' flattens the 3-nested grid into a table row index.
        pointer = 0
        print(paramset1)
        print(paramset2)
        print(paramset3)
        for param1ind in range(
                len(paramset1)):  # filter count for the conv layers
            param1value = paramset1[param1ind]
            for param2ind in range(
                    len(paramset2)
            ):  # kernel_size for the SMILES-encoding conv layers
                param2value = paramset2[param2ind]
                for param3ind in range(
                        len(paramset3)
                ):  # kernel_size for the protein-encoding conv layers
                    # Checkpoint path encoding fold and parameter indices.
                    save_model_path = "../model_weight/fold" + str(
                        foldind) + "_param" + str(
                            param1ind) + "_param2ind" + str(param2ind) + str(
                                param3ind) + ".pt"
                    param3value = paramset3[param3ind]
                    model = Net(param2value, param1value, param3value)  # build model
                    print("param ", param2value, " ", param1value, " ",
                          param3value)  # show parameters
                    criterion = nn.MSELoss()  # mean squared error loss
                    optimizer = optim.Adam(
                        model.parameters(), lr=0.001
                    )  # Adam optimizer with learning rate 0.001
                    best_predicted_labels = [
                    ]  # predictions from the best validation epoch
                    best_vali_loss = 100000000  # best (lowest) validation loss so far
                    for i in range(epoch):  # epoch loop
                        loss_epoch = 0  # accumulated training loss this epoch
                        model.train()  # switch to training mode
                        for j in range(0, int(len(train_drugs)),
                                       batchsz):  # mini-batch loop
                            optimizer.zero_grad()  # reset gradients
                            end = min(j + batchsz, len(train_drugs))
                            train_drug_batch = train_drugs[
                                j:end]  # drug batch for this step
                            train_prot_batch = train_prots[
                                j:
                                end]  # protein batch for this step
                            target = train_Y[j:end]  # labels for this step
                            target = torch.FloatTensor(
                                target)  # numpy -> tensor
                            train_drug_batch = torch.tensor(
                                train_drug_batch, dtype=torch.long
                            )  # numpy -> long tensor (token indices)
                            train_prot_batch = torch.tensor(
                                train_prot_batch, dtype=torch.long
                            )  # numpy -> long tensor (token indices)
                            output = model(
                                train_drug_batch,
                                train_prot_batch)  # forward pass
                            loss = criterion(output, target)  # batch loss
                            loss.backward(
                            )  # back-propagate gradients
                            optimizer.step()  # apply parameter update
                            loss_epoch += loss.item() * len(
                                train_drug_batch
                            )  # weight batch loss by batch size
                        model.eval(
                        )  # evaluation mode: disables training behaviour (dropout etc.)
                        loss_eval = 0
                        predicted_labels = []
                        for j in range(0, int(len(val_drugs)), batchsz):
                            end = min(j + batchsz, len(val_drugs))
                            vali_drug_batch = val_drugs[
                                j:end]  # validation drug batch
                            vali_prot_batch = val_prots[
                                j:
                                end]  # validation protein batch
                            target = val_Y[j:end]  # labels for this step
                            target = torch.FloatTensor(
                                target)  # numpy -> tensor
                            vali_drug_batch = torch.tensor(
                                vali_drug_batch, dtype=torch.long
                            )  # numpy -> long tensor
                            vali_prot_batch = torch.tensor(
                                vali_prot_batch, dtype=torch.long
                            )  # numpy -> long tensor
                            output = model(
                                vali_drug_batch,
                                vali_prot_batch)  # forward pass
                            loss = criterion(output, target)  # batch loss
                            loss_eval += loss.item() * len(
                                vali_drug_batch
                            )  # weight batch loss by batch size
                            if len(predicted_labels
                                   ) == 0:  # accumulate all predictions
                                predicted_labels = output.cpu().detach().numpy(
                                )
                            else:
                                predicted_labels = np.concatenate(
                                    (predicted_labels,
                                     output.cpu().detach().numpy()), 0)
                        if best_vali_loss > (
                                loss_eval / len(val_drugs)
                        ):  # new best epoch: checkpoint model and keep predictions
                            torch.save(model.state_dict(),
                                       save_model_path)  # save model
                            best_vali_loss = loss_eval / len(
                                val_drugs)  # record best validation loss
                            best_predicted_labels = predicted_labels  # record best predictions
                        print("epoch ", i, " , train loss ",
                              loss_epoch * 1.0 / len(train_drugs),
                              " , vali loss ", loss_eval / len(val_drugs))

                    # Score the best epoch's predictions; prfmeasure returns a
                    # sequence whose first element is the CI.
                    rperf = prfmeasure(val_Y, best_predicted_labels)
                    rperf = rperf[0]
                    logging(
                        "P1 = %d, P2 = %d, P3 = %d, Fold = %d, CI-i = %f, MSE = %f" %  # write log
                        (param1ind, param2ind, param3ind, foldind, rperf,
                         best_vali_loss), FLAGS)
                    all_predictions[pointer][
                        foldind] = rperf  # record CI for this fold/parameter cell
                    all_losses[pointer][
                        foldind] = best_vali_loss  # record loss for this fold/parameter cell
                    pointer += 1

    bestperf = -float('Inf')
    bestpointer = None  # pick the best hyperparameter combination
    best_param_list = []
    ##Take average according to folds, then chooose best params
    pointer = 0
    for param1ind in range(len(paramset1)):
        for param2ind in range(len(paramset2)):
            for param3ind in range(len(paramset3)):
                avgperf = 0.  # mean performance over the k folds
                for foldind in range(len(val_sets)):
                    foldperf = all_predictions[pointer][foldind]
                    avgperf += foldperf
                avgperf /= len(val_sets)
                # print(epoch, batchsz, avgperf)
                if avgperf > bestperf:  # keep the best average and its parameters
                    bestperf = avgperf  # best average performance
                    bestpointer = pointer  # best flat grid index
                    best_param_list = [param1ind, param2ind, param3ind
                                       ]  # best parameter *indices*
                pointer += 1
    # return the best combination and its CI statistics
    return bestpointer, best_param_list, bestperf, all_predictions, all_losses