Beispiel #1
0
def best_model_test(XD, XD_STRU, XT, XT_STRU, Y, label_row_inds,
                    label_col_inds, FLAGS, measure, test_sets, bestpointer,
                    bestpar_list):
    """Reload the best saved model and score it on the held-out test pairs.

    Returns (prf, loss): the external measure on the predictions and the
    model's evaluated loss (MSE).
    """
    # Restore the checkpoint written during the hyper-parameter search.
    model = keras.models.load_model(
        f'{bestpar_list[bestpointer]}_{bestpointer+1}_model.h5',
        custom_objects={'cindex_score': cindex_score})
    logging(
        "LoadModel: pointer = %f, ep = %f" %
        (bestpar_list[bestpointer], bestpointer + 1), FLAGS)

    rows = label_row_inds[test_sets]
    cols = label_col_inds[test_sets]
    test_drugs, test_drugs_stru, test_prots, test_prots_stru, test_Y = prepare_interaction_pairs(
        XD, XD_STRU, XT, XT_STRU, Y, rows, cols)

    # Assemble the four-branch network input once and reuse it for both
    # prediction and evaluation.
    inputs = [
        np.array(test_drugs),
        np.array(test_drugs_stru),
        np.array(test_prots),
        np.array(test_prots_stru),
    ]
    pred = model.predict(inputs)
    loss, prf2 = model.evaluate(inputs, np.array(test_Y), verbose=0)
    prf = measure(test_Y, pred)
    logging("TestSets: CI-i = %f, CI-ii = %f, MSE = %f" % (prf, prf2, loss),
            FLAGS)

    return prf, loss
Beispiel #2
0
def experiment(FLAGS,
               perfmeasure,
               deepmethod,
               foldcount=6):  # 5-fold cross validation + test
    """Run one train/test experiment driven by FLAGS.

    perfmeasure: callable(correct, predicted) -> score; higher is better,
        so wrap error measures as e.g. -error(Y, P).
    deepmethod: model-building function forwarded to the CV routine.
    foldcount: number of CV folds for settings 1-3 (not used directly here).
    """
    dataset = DataSet(fpath=FLAGS.train_path,
                      fpath_test=FLAGS.test_path,
                      setting_no=FLAGS.problem_type,
                      seqlen=FLAGS.max_seq_len,
                      smilen=FLAGS.max_smi_len,
                      need_shuffle=False)
    # Propagate the vocabulary sizes discovered while parsing the data.
    FLAGS.charseqset_size = dataset.charseqset_size
    FLAGS.charsmiset_size = dataset.charsmiset_size

    tr_XD, tr_XT, tr_Y, te_XD, te_XT, te_Y = dataset.parse_train_test_data(
        FLAGS)

    tr_XD, tr_XT, tr_Y = np.asarray(tr_XD), np.asarray(tr_XT), np.asarray(tr_Y)
    te_XD, te_XT, te_Y = np.asarray(te_XD), np.asarray(te_XT), np.asarray(te_Y)

    tr_drugcount = tr_XD.shape[0]
    print("train drugs: ", tr_drugcount)
    tr_targetcount = tr_XT.shape[0]
    print("train targets: ", tr_targetcount)

    te_drugcount = te_XD.shape[0]
    print("test drugs: ", te_drugcount)
    te_targetcount = te_XT.shape[0]
    print("test targets: ", te_targetcount)

    # Counts exposed to the model builder come from the training split.
    FLAGS.drug_count = tr_drugcount
    FLAGS.target_count = tr_targetcount

    if not os.path.exists(figdir):
        os.makedirs(figdir)

    print(FLAGS.log_dir)
    S1_avgperf, S1_avgloss, S1_teststd = nfold_1_2_3_setting_sample(
        tr_XD, tr_XT, tr_Y, te_XD, te_XT, te_Y, perfmeasure, deepmethod, FLAGS,
        dataset)

    logging("Setting " + str(FLAGS.problem_type), FLAGS)
    logging(
        "avg_perf = %.5f,  avg_mse = %.5f, std = %.5f" %
        (S1_avgperf, S1_avgloss, S1_teststd), FLAGS)
Beispiel #3
0
def experiment(FLAGS,
               perfmeasure,
               deepmethod,
               foldcount=6):  # 5-fold cross validation + test
    """Parse the dataset named by FLAGS, run the n-fold sampling routine
    and log the averaged results.

    perfmeasure: callable(correct, predicted) -> score; higher is better,
        so wrap error measures as e.g. -error(Y, P).
    deepmethod: model-building function forwarded to the CV routine.
    foldcount: number of CV folds for settings 1-3 (not used directly here).
    """
    dataset = DataSet(
        fpath=FLAGS.dataset_path,  # TODO: keep in sync with args
        setting_no=FLAGS.problem_type,  # TODO: add this to args
        seqlen=FLAGS.max_seq_len,
        smilen=FLAGS.max_smi_len,
        need_shuffle=False)
    # Propagate the vocabulary sizes discovered by the parser.
    FLAGS.charseqset_size = dataset.charseqset_size
    FLAGS.charsmiset_size = dataset.charsmiset_size

    XD, XT, Y = dataset.parse_data(fpath=FLAGS.dataset_path)

    XD, XT, Y = np.asarray(XD), np.asarray(XT), np.asarray(Y)

    drugcount = XD.shape[0]
    print(drugcount)
    targetcount = XT.shape[0]
    print(targetcount)

    FLAGS.drug_count = drugcount
    FLAGS.target_count = targetcount

    # Row/column coordinates of every known (non-NaN) affinity value.
    label_row_inds, label_col_inds = np.where(~np.isnan(Y))

    if not os.path.exists(figdir):
        os.makedirs(figdir)

    print(FLAGS.log_dir)
    S1_avgperf, S1_avgloss, S1_teststd = nfold_1_2_3_setting_sample(
        XD, XT, Y, label_row_inds, label_col_inds, perfmeasure, deepmethod,
        FLAGS, dataset)

    logging("Setting " + str(FLAGS.problem_type), FLAGS)
    logging(
        "avg_perf = %.5f,  avg_mse = %.5f, std = %.5f" %
        (S1_avgperf, S1_avgloss, S1_teststd), FLAGS)
Beispiel #4
0
def experiment(FLAGS, perfmeasure, foldcount=6):  # 5-fold cross validation + test
    """Load the dataset named by FLAGS, run the n-fold sampling routine and
    both log and print the aggregate performance.

    perfmeasure: callable(correct, predicted) -> score; higher is better,
        so wrap error measures as e.g. -error(Y, P).
    foldcount: number of CV folds for settings 1-3 (not used directly here).
    """
    dataset = DataSet(fpath=FLAGS.dataset_path,  # TODO: keep in sync with args
                      setting_no=FLAGS.problem_type,  # TODO: add this to args
                      seqlen=FLAGS.max_seq_len,
                      smilen=FLAGS.max_smi_len,
                      need_shuffle=False)
    # Propagate the vocabulary sizes discovered by the parser.
    FLAGS.charseqset_size = dataset.charseqset_size
    FLAGS.charsmiset_size = dataset.charsmiset_size

    XD, XT, Y = dataset.parse_data(fpath=FLAGS.dataset_path)

    XD, XT, Y = np.asarray(XD), np.asarray(XT), np.asarray(Y)

    drugcount = XD.shape[0]
    print(drugcount)
    targetcount = XT.shape[0]
    print(targetcount)

    FLAGS.drug_count = drugcount
    FLAGS.target_count = targetcount

    # Row/column coordinates of every known (non-NaN) affinity value.
    label_row_inds, label_col_inds = np.where(~np.isnan(Y))

    print("Logdir: " + FLAGS.log_dir)
    s1_avgperf, s1_avgloss, s1_teststd = nfold_1_2_3_setting_sample(
        XD, XT, Y, label_row_inds, label_col_inds, perfmeasure, FLAGS, dataset)

    summary = "avg_perf = %.5f,  avg_mse = %.5f, std = %.5f" % (
        s1_avgperf, s1_avgloss, s1_teststd)
    logging("Setting " + str(FLAGS.problem_type), FLAGS)
    logging(summary, FLAGS)
    print("Setting " + str(FLAGS.problem_type))
    print(summary)
Beispiel #5
0
def experiment(FLAGS,
               perfmeasure,
               foldcount=6):  # 5-fold cross validation + test
    """Parse the dataset, then train/evaluate via nfold_1_2_3_setting_sample
    and log the averaged results.

    perfmeasure: callable(correct, predicted) -> score; higher is better.
    foldcount: number of CV folds for settings 1-3 (not used directly here).
    """
    # DataSet wraps reading and encoding of the raw affinity files.
    dataset = DataSet(
        fpath=FLAGS.dataset_path,
        setting_no=FLAGS.problem_type,
        seqlen=FLAGS.max_seq_len,
        smilen=FLAGS.max_smi_len,
        need_shuffle=False)
    # Propagate the vocabulary sizes discovered by the parser.
    FLAGS.charseqset_size = dataset.charseqset_size
    FLAGS.charsmiset_size = dataset.charsmiset_size

    XD, XT, Y = dataset.parse_data(fpath=FLAGS.dataset_path)

    # Convert everything to numpy arrays up front.
    XD, XT, Y = np.asarray(XD), np.asarray(XT), np.asarray(Y)

    drugcount = XD.shape[0]  # number of drugs
    print(drugcount)
    targetcount = XT.shape[0]  # number of targets
    print(targetcount)

    FLAGS.drug_count = drugcount
    FLAGS.target_count = targetcount

    # Row/column coordinates of every known (non-NaN) affinity value.
    label_row_inds, label_col_inds = np.where(~np.isnan(Y))

    if not os.path.exists(figdir):
        os.makedirs(figdir)

    print(FLAGS.log_dir)
    # Training and evaluation.
    S1_avgperf, S1_avgloss, S1_teststd = nfold_1_2_3_setting_sample(
        XD, XT, Y, label_row_inds, label_col_inds, perfmeasure, FLAGS,
        dataset)

    logging("Setting " + str(FLAGS.problem_type), FLAGS)
    logging(
        "avg_perf = %.5f,  avg_mse = %.5f, std = %.5f" %
        (S1_avgperf, S1_avgloss, S1_teststd), FLAGS)
Beispiel #6
0
def general_nfold_cv(tr_XD, tr_XT, tr_Y, te_XD, te_XT, te_Y, prfmeasure,
                     runmethod, FLAGS, labeled_sets,
                     val_sets):
    """Grid-search CNN hyper-parameters on a single train/validation split.

    tr_* / te_*: training and test drug matrices, target matrices and
        affinity matrices (NaN marks unknown pairs).
    prfmeasure: callable(true, predicted) -> performance; higher is better.
    runmethod: model factory called as runmethod(FLAGS, p1, p2, p3).
    labeled_sets / val_sets: flat index lists into the non-NaN entries of
        tr_Y / te_Y respectively (single split, not per-fold lists).

    Returns (bestpointer, best_param_list, bestperf, all_predictions,
    all_losses); `pointer` enumerates the parameter grid in row-major order.
    """
    paramset1 = FLAGS.num_windows  # filter counts
    paramset2 = FLAGS.smi_window_lengths  # SMILES filter lengths
    paramset3 = FLAGS.seq_window_lengths  # protein-sequence filter lengths
    epoch = FLAGS.num_epoch
    batchsz = FLAGS.batch_size

    logging("---Parameter Search-----", FLAGS)

    ### MODIFIED FOR SINGLE TRAIN
    h = len(paramset1) * len(paramset2) * len(paramset3)
    all_predictions = [0 for _ in range(h)]
    all_losses = [0 for _ in range(h)]

    # Coordinates of every known (non-NaN) affinity entry.
    tr_label_row_inds, tr_label_col_inds = np.where(np.isnan(tr_Y) == False)
    te_label_row_inds, te_label_col_inds = np.where(np.isnan(te_Y) == False)

    trrows = tr_label_row_inds[labeled_sets]
    trcols = tr_label_col_inds[labeled_sets]
    train_drugs, train_prots, train_Y = prepare_interaction_pairs(
        tr_XD, tr_XT, tr_Y, trrows, trcols)

    terows = te_label_row_inds[val_sets]
    tecols = te_label_col_inds[val_sets]
    val_drugs, val_prots, val_Y = prepare_interaction_pairs(
        te_XD, te_XT, te_Y, terows, tecols)

    # Convert once: these arrays are invariant across the whole grid search,
    # so there is no need to rebuild them per parameter combination.
    train_inputs = [np.array(train_drugs), np.array(train_prots)]
    train_targets = np.array(train_Y)
    val_inputs = [np.array(val_drugs), np.array(val_prots)]
    val_targets = np.array(val_Y)

    pointer = 0
    for param1ind, param1value in enumerate(paramset1):
        for param2ind, param2value in enumerate(paramset2):
            for param3ind, param3value in enumerate(paramset3):
                gridmodel = runmethod(FLAGS, param1value, param2value,
                                      param3value)

                # NOTE(review): the original built this callback list but
                # never passed it to fit(), so early stopping is inactive;
                # behavior kept (fit() gets no validation data to monitor).
                callbacks = [EarlyStopping(monitor='val_loss', patience=15)]

                gridmodel.fit(train_inputs,
                              train_targets,
                              batch_size=batchsz,
                              epochs=epoch,
                              shuffle=False)

                predicted_labels = gridmodel.predict(val_inputs)
                # Close the dump file deterministically (was leaked before).
                with open("predicted_labels_" + str(pointer) + ".txt",
                          "w") as outfile:
                    json.dump(predicted_labels.tolist(), outfile)
                loss, rperf2 = gridmodel.evaluate(val_inputs,
                                                  val_targets,
                                                  verbose=0)
                rperf = prfmeasure(val_Y, predicted_labels)

                logging(
                    "P1 = %d,  P2 = %d, P3 = %d,  CI-i = %f, CI-ii = %f, MSE = %f"
                    % (param1ind, param2ind, param3ind, rperf, rperf2, loss),
                    FLAGS)

                all_predictions[pointer] = rperf
                all_losses[pointer] = loss
                pointer += 1

    # Pick the grid point with the best validation performance.
    bestperf = -float('Inf')
    bestpointer = None
    best_param_list = []
    pointer = 0
    for param1ind in range(len(paramset1)):
        for param2ind in range(len(paramset2)):
            for param3ind in range(len(paramset3)):
                if all_predictions[pointer] > bestperf:
                    bestperf = all_predictions[pointer]
                    bestpointer = pointer
                    best_param_list = [param1ind, param2ind, param3ind]
                pointer += 1

    return bestpointer, best_param_list, bestperf, all_predictions, all_losses
Beispiel #7
0
def nfold_1_2_3_setting_sample(tr_XD, tr_XT, tr_Y, te_XD, te_XT, te_Y, measure,
                               runmethod, FLAGS, dataset):
    """Run one train/test round via general_nfold_cv and report the metrics
    of the best parameter setting.

    Returns (avgperf, avgloss, teststd).
    """
    test_set, outer_train_sets = dataset.read_sets(FLAGS)

    ### MODIFIED FOR SINGLE TRAIN AND TEST #####
    train_set = outer_train_sets

    bestparamind, best_param_list, bestperf, all_predictions, all_losses = general_nfold_cv(
        tr_XD, tr_XT, tr_Y, te_XD, te_XT, te_Y, measure, runmethod, FLAGS,
        train_set, test_set)

    logging("---FINAL RESULTS-----", FLAGS)
    logging("best param index = %s" % bestparamind, FLAGS)

    # Single split: the "averages" below collapse to the chosen setting's
    # scores, and the standard deviation of one value is always 0.
    testperfs = [all_predictions[bestparamind]]
    testloss = [all_losses[bestparamind]]

    avgperf = testperfs[0] / 1
    avgloss = np.mean(testloss)
    teststd = np.std(testperfs)

    logging("Test Performance CI", FLAGS)
    logging(testperfs, FLAGS)
    logging("Test Performance MSE", FLAGS)
    logging(testloss, FLAGS)

    return avgperf, avgloss, teststd
Beispiel #8
0
def general_nfold_cv(XD, XT, Y, label_row_inds, label_col_inds, prfmeasure,
                     FLAGS, labeled_sets,
                     val_sets):  ## is FLAGS needed here too????
    """Per-fold hyper-parameter grid loop (torch variant).

    NOTE(review): this function appears to be abandoned debug code — the
    inner `for jj in range(100000000)` loop below only rebuilds the model,
    nothing is trained or recorded, so `all_predictions`/`all_losses` stay
    zero-filled and the selection pass at the bottom always returns
    pointer 0 with bestperf 0.
    """
    paramset1 = FLAGS.num_windows  # [32]#[32,  512] #[32, 128]  # filter numbers
    paramset2 = FLAGS.smi_window_lengths  # [4, 8]#[4,  32] #[4,  8] #filter length smi
    paramset3 = FLAGS.seq_window_lengths  # [8, 12]#[64,  256] #[64, 192]#[8, 192, 384]
    epoch = FLAGS.num_epoch  # 100
    batchsz = FLAGS.batch_size  # 256

    logging("---Parameter Search-----", FLAGS)

    # Result grid: one row per parameter combination, one column per fold.
    w = len(val_sets)
    h = len(paramset1) * len(paramset2) * len(paramset3)

    all_predictions = [[0 for x in range(w)] for y in range(h)]
    all_losses = [[0 for x in range(w)] for y in range(h)]
    print(all_predictions)

    for foldind in range(len(val_sets)):
        valinds = val_sets[foldind]
        labeledinds = labeled_sets[foldind]

        # NOTE(review): Y_train, params, and the XD_train/XT_train
        # assignments below are computed but never used afterwards.
        Y_train = np.mat(np.copy(Y))

        params = {}
        XD_train = XD
        XT_train = XT
        trrows = label_row_inds[labeledinds]
        trcols = label_col_inds[labeledinds]

        # print("trrows", str(trrows), str(len(trrows)))
        # print("trcols", str(trcols), str(len(trcols)))

        XD_train = XD[trrows]
        XT_train = XT[trcols]

        train_drugs, train_prots, train_Y = prepare_interaction_pairs(
            XD, XT, Y, trrows, trcols)
        train_drugs = np.array(train_drugs)
        train_prots = np.array(train_prots)
        train_Y = np.array(train_Y)
        # Debug subsample: only the first 1000 training pairs are kept.
        train_drugs = train_drugs[:1000]
        train_prots = train_prots[:1000]
        train_Y = train_Y[:1000]
        terows = label_row_inds[valinds]
        tecols = label_col_inds[valinds]
        # print("terows", str(terows), str(len(terows)))
        # print("tecols", str(tecols), str(len(tecols)))

        val_drugs, val_prots, val_Y = prepare_interaction_pairs(
            XD, XT, Y, terows, tecols)
        val_drugs = np.array(val_drugs)
        val_prots = np.array(val_prots)
        val_Y = np.array(val_Y)
        # Same debug subsample on the validation side.
        val_drugs = val_drugs[:1000]
        val_prots = val_prots[:1000]
        val_Y = val_Y[:1000]
        pointer = 0
        print(paramset1)
        print(paramset2)
        print(paramset3)
        for param1ind in range(len(paramset1)):  # hidden neurons
            param1value = paramset1[param1ind]
            for param2ind in range(len(paramset2)):  # learning rate
                param2value = paramset2[param2ind]
                for param3ind in range(len(paramset3)):
                    # NOTE(review): debug leftover — rebuilds the model up to
                    # 100,000,000 times, never trains/evaluates it, and never
                    # writes into all_predictions/all_losses.
                    for jj in range(100000000):
                        print(jj)
                        param3value = paramset3[param3ind]
                        torch.cuda.empty_cache()
                        model = Net(param2value, param1value, param3value)

    bestperf = -float('Inf')
    bestpointer = None

    best_param_list = []
    ##Take average according to folds, then chooose best params
    pointer = 0
    for param1ind in range(len(paramset1)):
        for param2ind in range(len(paramset2)):
            for param3ind in range(len(paramset3)):

                avgperf = 0.
                for foldind in range(len(val_sets)):
                    foldperf = all_predictions[pointer][foldind]
                    avgperf += foldperf
                avgperf /= len(val_sets)
                # print(epoch, batchsz, avgperf)
                if avgperf > bestperf:
                    bestperf = avgperf
                    bestpointer = pointer
                    best_param_list = [param1ind, param2ind, param3ind]

                pointer += 1

    return bestpointer, best_param_list, bestperf, all_predictions, all_losses
Beispiel #9
0
def experiment(FLAGS, foldcount=6):  # 5-fold cross validation + test
    """Run repeated n-fold cross-validation experiments and log aggregates.

    FLAGS: parsed argument namespace (paths, problem_type, sequence lengths,
        log_dir); mutated in place with dataset-derived sizes/counts.
    foldcount: number of cross-validation folds for settings 1-3.
    """
    dataset = DataSet(
        fpath=FLAGS.dataset_path,  # TODO: keep in sync with args
        setting_no=FLAGS.problem_type,  # TODO: add this to args
        seqlen=FLAGS.max_seq_len,
        smilen=FLAGS.max_smi_len,
        need_shuffle=False)
    # Propagate the vocabulary sizes discovered by the parser.
    FLAGS.charseqset_size = dataset.charseqset_size
    FLAGS.charsmiset_size = dataset.charsmiset_size

    XD, XT, Y = dataset.parse_data(FLAGS)

    XD = np.asarray(XD)
    XT = np.asarray(XT)
    Y = np.asarray(Y)

    drugcount = XD.shape[0]
    print(drugcount)
    targetcount = XT.shape[0]
    print(targetcount)

    FLAGS.drug_count = drugcount
    FLAGS.target_count = targetcount

    # Row/column coordinates of every known (non-NaN) affinity value.
    label_row_inds, label_col_inds = np.where(np.isnan(Y) == False)

    if not os.path.exists(figdir):
        os.makedirs(figdir)
    perf = []
    mseloss = []
    auc = []
    aupr = []
    for i in range(1):
        random.seed(i + 1000)  # reproducible fold assignment per iteration
        # Fold construction depends on the evaluation setting.
        if FLAGS.problem_type == 1:
            nfolds = get_random_folds(len(label_row_inds), foldcount)
        elif FLAGS.problem_type == 2:
            nfolds = get_drugwise_folds(label_row_inds, label_col_inds,
                                        drugcount, foldcount)
        elif FLAGS.problem_type == 3:
            nfolds = get_targetwise_folds(label_row_inds, label_col_inds,
                                          targetcount, foldcount)
        else:
            # Fail fast instead of hitting a NameError on 'nfolds' below.
            raise ValueError(
                "Unsupported problem_type: %s" % FLAGS.problem_type)
        avgperf, avgloss, teststd, lossstd, avg_auc, auc_std, avg_aupr, aupr_std = nfold_setting_sample(
            XD, XT, Y, label_row_inds, label_col_inds, get_cindex, FLAGS,
            dataset, nfolds, i)
        logging("Setting " + str(FLAGS.problem_type), FLAGS)

        logging(
            "avg_perf = %.5f,  avg_mse = %.5f, std = %.5f, loss_std = %.5f, auc = %.5f, auc_std = %.5f, aupr =%.5f, aupr_std = %.5f"
            % (avgperf, avgloss, teststd, lossstd, avg_auc, auc_std, avg_aupr,
               aupr_std), FLAGS)

        perf.append(avgperf)
        mseloss.append(avgloss)
        auc.append(avg_auc)
        aupr.append(avg_aupr)
    print(FLAGS.log_dir)

    logging(("Finally"), FLAGS)

    # Aggregate statistics across iterations.
    logging(
        "avg_perf = %.5f,  avg_mse = %.5f, std = %.5f, loss_std = %.5f,auc = %.5f, auc_std = %.5f, aupr =%.5f, aupr_std = %.5f"
        % (np.mean(perf), np.mean(mseloss), np.std(perf), np.std(mseloss),
           np.mean(auc), np.std(auc), np.mean(aupr), np.std(aupr)), FLAGS)
Beispiel #10
0
def general_nfold_cv_test(XD, XT, Y, label_row_inds, label_col_inds,
                          prfmeasure, FLAGS, labeled_sets, val_sets, get_rm2,
                          best_param_list, i):
    """Retrain with the chosen hyper-parameters and score every test fold.

    best_param_list: [param1value, param2value, param3value, lamda] picked
        by the preceding search.
    i: iteration index, used only to name the result files.

    Returns (best_param_list, best_perf, all_predictions, all_losses,
    all_auc, all_aupr) with one entry per fold.

    NOTE(review): `auc`/`aupr` are only assigned when FLAGS.dataset_path
    contains 'davis' or 'kiba' (auc may also come from test() in the epoch
    loop) — other dataset names would hit a NameError at the logging call.
    Also assumes a './result/' directory already exists — confirm.
    """
    param1value = best_param_list[0]
    param2value = best_param_list[1]
    param3value = best_param_list[2]
    lamda = best_param_list[3]
    batchsz = FLAGS.batch_size  # 256

    logging("---Parameter Search-----", FLAGS)

    w = len(val_sets)

    # Per-fold result slots.
    all_predictions = [0 for x in range(w)]
    all_losses = [0 for x in range(w)]
    all_auc = [0 for x in range(w)]
    all_aupr = [0 for x in range(w)]
    all_preaffinities = []
    all_affinities = []
    for foldind in range(len(val_sets)):
        valinds = val_sets[foldind]
        labeledinds = labeled_sets[foldind]

        trrows = label_row_inds[labeledinds]
        trcols = label_col_inds[labeledinds]

        train_dataset = prepare_interaction_pairs(XD, XT, Y, trrows, trcols)

        terows = label_row_inds[valinds]
        tecols = label_col_inds[valinds]

        test_dataset = prepare_interaction_pairs(XD, XT, Y, terows, tecols)

        train_loader = DataLoader(dataset=train_dataset,
                                  batch_size=batchsz,
                                  shuffle=True)
        test_loader = DataLoader(dataset=test_dataset, batch_size=batchsz)
        model = net(FLAGS, param1value, param2value, param3value).cuda()
        model.apply(weights_init)
        rperf_list = []
        for epochind in range(FLAGS.num_epoch):
            model = train(train_loader, model, FLAGS, param1value, param2value,
                          param3value, lamda)
            # Evaluate every second epoch; checkpoint whenever CI matches or
            # beats the best seen, stop early if CI drops more than 0.1
            # below the best.
            if (epochind + 1) % 2 == 0:
                rperf, loss, rm2, auc = test(model, test_loader, FLAGS,
                                             param1value, param2value,
                                             param3value, lamda)
                rperf_list.append(rperf)
                print(
                    'test: epoch:{},p1:{},p2:{},p3:{},loss:{:.5f},rperf:{:.5f}, rm2:{:.5f}'
                    .format(epochind, param1value, param2value, param3value,
                            loss, rperf, rm2))

                if rperf >= max(rperf_list):
                    torch.save(model, 'checkpoint.pth')
                if rperf < max(rperf_list) - 0.1:
                    break
        loss_func = nn.MSELoss()
        affinities = []
        pre_affinities = []
        # Reload the best checkpoint for final scoring on this fold.
        model = torch.load('checkpoint.pth')
        model.eval()
        for drug_SMILES, target_protein, affinity in test_loader:
            pre_affinity, _, _, _, _, _, _, _, _ = model(
                drug_SMILES, target_protein, FLAGS, param1value, param2value,
                param3value)
            pre_affinities += pre_affinity.cpu().detach().numpy().tolist()
            affinities += affinity.cpu().detach().numpy().tolist()

        pre_affinities = np.array(pre_affinities)
        affinities = np.array(affinities)
        # Binarize affinities at the dataset-specific threshold for AUC/AUPR.
        if 'davis' in FLAGS.dataset_path:
            pre_label = pre_affinities
            label = np.int32(affinities > 7.0)
            auc = roc_auc_score(label, pre_label)
            aupr = get_aupr(label, pre_label)
        if 'kiba' in FLAGS.dataset_path:
            pre_label = pre_affinities
            label = np.int32(affinities > 12.1)
            auc = roc_auc_score(label, pre_label)
            aupr = get_aupr(label, pre_label)
        rperf = prfmeasure(affinities, pre_affinities)
        rm2 = get_rm2(affinities, pre_affinities)
        # loss here is a 0-dim torch tensor; presumably torch supports float
        # formatting of 0-dim tensors in print/%f below — TODO confirm.
        loss = loss_func(torch.Tensor(pre_affinities),
                         torch.Tensor(affinities))
        print('best: p1:{},p2:{},p3:{},loss:{:.5f},rperf:{:.5f}, rm2:{:.5f}'.
              format(param1value, param2value, param3value, loss, rperf, rm2))

        logging(
            "best: P1 = %d,  P2 = %d, P3 = %d, Fold = %d, CI-i = %f, MSE = %f, auc = %f, aupr = %f"
            % (param1value, param2value, param3value, foldind, rperf, loss,
               auc, aupr), FLAGS)

        all_predictions[
            foldind] = rperf  # TODO FOR EACH VAL SET allpredictions[pointer][foldind]
        all_losses[foldind] = loss
        all_auc[foldind] = auc
        all_aupr[foldind] = aupr
        all_affinities.append(affinities)
        all_preaffinities.append(pre_affinities)
    # save affinities and preaffinites for further analysis
    np.savetxt("./result/iter" + str(i) + "affinities.txt",
               np.array(all_affinities))
    np.savetxt("./result/iter" + str(i) + "preaffinities.txt",
               np.array(all_preaffinities))

    best_param_list = [param1value, param2value, param3value, lamda]
    best_perf = np.mean(all_predictions)

    return best_param_list, best_perf, all_predictions, all_losses, all_auc, all_aupr
Beispiel #11
0
def nfold_1_2_3_setting_sample(
    XD, XT, Y, label_row_inds, label_col_inds, measure, FLAGS, dataset
):
    """Build k-fold train/validation/test splits, pick the best
    hyper-parameters on the validation folds, then report test performance
    for that choice.

    Returns (avgperf, avgloss, teststd) over the test folds.
    """
    test_set, outer_train_sets = dataset.read_sets(
        FLAGS.dataset_path, FLAGS.problem_type)  # train/test index splits

    foldinds = len(outer_train_sets)

    val_sets = []
    train_sets = []
    test_sets = []
    # For each fold: that fold is validation, the remaining folds form the
    # train set, and the fixed test split is repeated once per fold so the
    # CV helper receives parallel lists.
    for val_foldind in range(foldinds):
        val_fold = outer_train_sets[val_foldind]
        val_sets.append(val_fold)
        otherfolds = deepcopy(outer_train_sets)
        otherfolds.pop(val_foldind)
        otherfoldsinds = [item for sublist in otherfolds for item in sublist]
        train_sets.append(otherfoldsinds)
        test_sets.append(test_set)
        print("val set", str(len(val_fold)))
        print("train set", str(len(otherfoldsinds)))

    # Select the best hyper-parameters using train/validation data.
    bestparamind, best_param_list, bestperf, all_predictions_not_need, losses_not_need = general_nfold_cv(
        XD, XT, Y, label_row_inds, label_col_inds, measure, FLAGS, train_sets,
        val_sets)

    # Re-run the grid against the test folds to obtain test-set scores.
    bestparam, best_param_list, bestperf, all_predictions, all_losses = general_nfold_cv(
        XD, XT, Y, label_row_inds, label_col_inds, measure, FLAGS, train_sets,
        test_sets)

    logging("---FINAL RESULTS-----", FLAGS)
    logging(
        "best param index = %s,  best param = %.5f" %
        (bestparamind, bestparam),
        FLAGS)

    testperfs = []
    testloss = []
    avgperf = 0.
    for test_foldind in range(len(test_sets)):
        foldperf = all_predictions[bestparamind][test_foldind]  # CI per fold
        foldloss = all_losses[bestparamind][test_foldind]  # MSE per fold
        testperfs.append(foldperf)
        testloss.append(foldloss)
        avgperf += foldperf

    avgperf = avgperf / len(test_sets)  # mean CI across test folds
    avgloss = np.mean(testloss)  # mean MSE
    teststd = np.std(testperfs)  # CI standard deviation

    logging("Test Performance CI", FLAGS)
    logging(testperfs, FLAGS)
    logging("Test Performance MSE", FLAGS)
    logging(testloss, FLAGS)

    return avgperf, avgloss, teststd
Beispiel #12
0
def general_nfold_cv(XD, XT, Y, label_row_inds, label_col_inds, prfmeasure,
                     runmethod, FLAGS, labeled_sets,
                     val_sets):
    """Grid-search the CNN window hyperparameters with n-fold cross-validation.

    For every fold, trains one Keras model per (num_windows,
    smi_window_length, seq_window_length) combination, records the fold's
    CI (first element of ``prfmeasure``'s result) and MSE loss, then
    selects the combination whose CI averaged over folds is highest.

    Returns:
        (bestpointer, best_param_list, bestperf, all_predictions, all_losses):
        flat index of the winning combination, its parameter indices
        ``[param1ind, param2ind, param3ind]``, its average CI, and the
        per-combination/per-fold CI and loss grids.
    """
    paramset1 = FLAGS.num_windows  # filter numbers
    paramset2 = FLAGS.smi_window_lengths  # filter lengths (SMILES)
    paramset3 = FLAGS.seq_window_lengths  # filter lengths (protein sequence)
    epoch = FLAGS.num_epoch
    batchsz = FLAGS.batch_size

    logging("---Parameter Search-----", FLAGS)

    # One row per hyperparameter combination, one column per fold.
    w = len(val_sets)
    h = len(paramset1) * len(paramset2) * len(paramset3)

    all_predictions = [[0 for x in range(w)] for y in range(h)]
    all_losses = [[0 for x in range(w)] for y in range(h)]

    for foldind in range(len(val_sets)):
        valinds = val_sets[foldind]
        labeledinds = labeled_sets[foldind]

        # NOTE: the unused Y_train/XD_train/XT_train/params locals of the
        # original were removed; training pairs come directly from XD/XT/Y.
        trrows = label_row_inds[labeledinds]
        trcols = label_col_inds[labeledinds]

        train_drugs, train_prots, train_Y = prepare_interaction_pairs(
            XD, XT, Y, trrows, trcols)

        terows = label_row_inds[valinds]
        tecols = label_col_inds[valinds]

        val_drugs, val_prots, val_Y = prepare_interaction_pairs(
            XD, XT, Y, terows, tecols)

        pointer = 0

        for param1ind in range(len(paramset1)):  # filter numbers
            param1value = paramset1[param1ind]
            for param2ind in range(len(paramset2)):  # SMILES window lengths
                param2value = paramset2[param2ind]

                for param3ind in range(len(paramset3)):  # sequence window lengths
                    param3value = paramset3[param3ind]

                    gridmodel = runmethod(FLAGS, param1value, param2value,
                                          param3value)
                    gridres = gridmodel.fit(
                        ([np.array(train_drugs),
                          np.array(train_prots)]),
                        np.array(train_Y),
                        batch_size=batchsz,
                        epochs=epoch,
                        validation_data=(([
                            np.array(val_drugs),
                            np.array(val_prots)
                        ]), np.array(val_Y)),
                        shuffle=False)

                    predicted_labels = gridmodel.predict(
                        [np.array(val_drugs),
                         np.array(val_prots)])
                    loss, rperf2 = gridmodel.evaluate(
                        ([np.array(val_drugs),
                          np.array(val_prots)]),
                        np.array(val_Y),
                        verbose=0)
                    # prfmeasure returns a sequence; its first element is the CI.
                    rperf = prfmeasure(val_Y, predicted_labels)
                    rperf = rperf[0]

                    logging(
                        "P1 = %d,  P2 = %d, P3 = %d, Fold = %d, CI-i = %f, CI-ii = %f, MSE = %f"
                        % (param1ind, param2ind, param3ind, foldind, rperf,
                           rperf2, loss), FLAGS)

                    plotLoss(gridres, param1ind, param2ind, param3ind, foldind)

                    all_predictions[pointer][foldind] = rperf
                    all_losses[pointer][foldind] = loss

                    pointer += 1

    bestperf = -float('Inf')
    bestpointer = None

    best_param_list = []
    # Average the CI over folds for each combination, then keep the best.
    pointer = 0
    for param1ind in range(len(paramset1)):
        for param2ind in range(len(paramset2)):
            for param3ind in range(len(paramset3)):

                avgperf = 0.
                for foldind in range(len(val_sets)):
                    foldperf = all_predictions[pointer][foldind]
                    avgperf += foldperf
                avgperf /= len(val_sets)
                if avgperf > bestperf:
                    bestperf = avgperf
                    bestpointer = pointer
                    best_param_list = [param1ind, param2ind, param3ind]

                pointer += 1

    return bestpointer, best_param_list, bestperf, all_predictions, all_losses
Beispiel #13
0
    perfmeasure = get_cindex
    deepmethod = build_combined_categorical

    experiment(FLAGS, perfmeasure, deepmethod)


class CustomStopper(keras.callbacks.EarlyStopping):
    """EarlyStopping that only starts monitoring after ``start_epoch``.

    Lets the model train unconditionally for the first ``start_epoch``
    epochs (early epochs are often noisy), then behaves like a normal
    Keras ``EarlyStopping`` callback.
    """

    def __init__(self,
                 monitor='val_loss',
                 min_delta=0,
                 patience=0,
                 verbose=0,
                 mode='auto',
                 start_epoch=100):  # epoch at which monitoring begins
        # BUG FIX: the configuration arguments were previously discarded —
        # super().__init__() was called with no arguments, so monitor,
        # min_delta, patience, verbose and mode never took effect.
        super(CustomStopper, self).__init__(monitor=monitor,
                                            min_delta=min_delta,
                                            patience=patience,
                                            verbose=verbose,
                                            mode=mode)
        self.start_epoch = start_epoch

    def on_epoch_end(self, epoch, logs=None):
        # Ignore the stopping criterion until start_epoch has passed.
        if epoch > self.start_epoch:
            super().on_epoch_end(epoch, logs)


if __name__ == "__main__":
    # Parse command-line flags and give this run its own timestamped log dir.
    FLAGS = argparser()
    FLAGS.log_dir = "{}{}/".format(FLAGS.log_dir, time.time())

    if not os.path.exists(FLAGS.log_dir):
        os.makedirs(FLAGS.log_dir)

    logging(str(FLAGS), FLAGS)
    run_regression(FLAGS)
Beispiel #14
0
def general_nfold_cv(XD, XD_STRU, XT, XT_STRU, Y, label_row_inds,
                     label_col_inds, prfmeasure, runmethod, FLAGS,
                     labeled_sets, val_sets, ep):
    """Cross-validated grid search over the CNN window hyperparameters.

    Trains one model per (num_windows, smi_window_length,
    seq_window_length) combination on every fold — each with drug/protein
    sequence and structure inputs — saves it as '<pointer>_<ep>_model.h5',
    and returns the combination whose CI averaged over folds is highest,
    together with the per-combination/per-fold CI and loss grids.
    """
    NumWinParam = FLAGS.num_windows  # paramset1
    SmiWinLen = FLAGS.smi_window_lengths  # paramset2
    SeqWinLen = FLAGS.seq_window_lengths  # paramset3
    epoch = FLAGS.num_epoch
    batchsz = FLAGS.batch_size

    n_folds = len(val_sets)
    n_combos = len(NumWinParam) * len(SmiWinLen) * len(SeqWinLen)
    all_predictions = [[0] * n_folds for _ in range(n_combos)]
    all_losses = [[0] * n_folds for _ in range(n_combos)]

    for foldind in range(n_folds):
        valinds = val_sets[foldind]
        labeledinds = labeled_sets[foldind]

        trrows = label_row_inds[labeledinds]
        trcols = label_col_inds[labeledinds]
        train_drugs, train_drugs_stru, train_prots, train_prots_stru, train_Y = prepare_interaction_pairs(
            XD, XD_STRU, XT, XT_STRU, Y, trrows, trcols)

        terows = label_row_inds[valinds]
        tecols = label_col_inds[valinds]
        val_drugs, val_drugs_stru, val_prots, val_prots_stru, val_Y = prepare_interaction_pairs(
            XD, XD_STRU, XT, XT_STRU, Y, terows, tecols)

        # Assemble the four-channel model inputs once per fold.
        train_inputs = [
            np.array(train_drugs),
            np.array(train_drugs_stru),
            np.array(train_prots),
            np.array(train_prots_stru)
        ]
        train_targets = np.array(train_Y)
        val_inputs = [
            np.array(val_drugs),
            np.array(val_drugs_stru),
            np.array(val_prots),
            np.array(val_prots_stru)
        ]
        val_targets = np.array(val_Y)

        pointer = 0
        for numwinvalue in NumWinParam:
            for smiwinlenvalue in SmiWinLen:
                for seqwinlenvalue in SeqWinLen:
                    model = runmethod(FLAGS, numwinvalue, smiwinlenvalue,
                                      seqwinlenvalue)
                    # Fresh callbacks per fit: EarlyStopping keeps state.
                    my_callbacks = [
                        EarlyStopping(monitor='val_loss',
                                      mode='min',
                                      verbose=1,
                                      patience=15),
                        TensorBoard(log_dir=FLAGS.log_dir)
                    ]

                    model.fit(train_inputs,
                              train_targets,
                              batch_size=batchsz,
                              epochs=epoch,
                              validation_data=(val_inputs, val_targets),
                              shuffle=False,
                              callbacks=my_callbacks)

                    model.save(f'{pointer}_{ep}_model.h5')
                    pred = model.predict(val_inputs)
                    loss, prf2 = model.evaluate(val_inputs,
                                                val_targets,
                                                verbose=0)
                    prf = prfmeasure(val_Y, pred)
                    logging(
                        "ValidSets: P1 = %d,  P2 = %d, P3 = %d, Fold = %d, CI-i = %f, CI-ii = %f, MSE = %f"
                        % (numwinvalue, smiwinlenvalue, seqwinlenvalue,
                           foldind, prf, prf2, loss), FLAGS)

                    all_predictions[pointer][foldind] = prf
                    all_losses[pointer][foldind] = loss
                    pointer += 1

    # Keep the combination with the best CI averaged over folds.
    bestperf = -float('Inf')
    bestpointer = None
    best_param_list = []
    pointer = 0
    for numwinvalue in NumWinParam:
        for smiwinlenvalue in SmiWinLen:
            for seqwinlenvalue in SeqWinLen:
                avgperf = 0.
                for foldind in range(n_folds):
                    avgperf += all_predictions[pointer][foldind]
                avgperf /= n_folds
                if avgperf > bestperf:
                    bestperf = avgperf
                    bestpointer = pointer
                    best_param_list = [
                        numwinvalue, smiwinlenvalue, seqwinlenvalue
                    ]

                pointer += 1
    return bestpointer, best_param_list, bestperf, all_predictions, all_losses
Beispiel #15
0
def general_nfold_cv(XD, XT, Y, label_row_inds, label_col_inds, prfmeasure, FLAGS, labeled_sets,
                     val_sets):
    """Grid-search the CNN hyperparameters with n-fold CV using a PyTorch model.

    For every fold and every (num_windows, smi_window_length,
    seq_window_length) combination, trains a DtaNet for FLAGS.num_epoch
    epochs with Adam + MSE loss on the GPU, evaluates on the fold's
    validation pairs after every epoch, and records the CI (via
    ``prfmeasure``) computed from the last epoch's predictions together
    with the mean validation MSE.

    Returns:
        (bestpointer, best_param_list, bestperf, all_predictions, all_losses):
        flat index of the best combination (highest CI averaged over folds),
        its parameter indices, the best average CI, and the
        per-combination/per-fold CI and loss grids.
    """
    # Hyperparameter grids and training settings taken from FLAGS.
    paramset1 = FLAGS.num_windows  # candidate filter counts
    paramset2 = FLAGS.smi_window_lengths  # candidate SMILES filter lengths
    paramset3 = FLAGS.seq_window_lengths  # candidate protein-sequence filter lengths
    epoch = FLAGS.num_epoch
    batchsz = FLAGS.batch_size

    logging("---Parameter Search-----", FLAGS)
    print("---Parameter Search-----")

    # Result grids: one row per hyperparameter combination, one column per fold.
    w = len(val_sets)
    h = len(paramset1) * len(paramset2) * len(paramset3)

    all_predictions = [[0 for x in range(w)] for y in range(h)]
    all_losses = [[0 for x in range(w)] for y in range(h)]

    for foldind in range(len(val_sets)):
        valinds = val_sets[foldind]
        labeledinds = labeled_sets[foldind]

        # Interaction (row, col) indices of this fold's labelled training pairs.
        trrows = label_row_inds[labeledinds]
        trcols = label_col_inds[labeledinds]

        train_drugs, train_prots, train_Y = prepare_interaction_pairs(XD, XT, Y, trrows, trcols)

        # Interaction (row, col) indices of this fold's validation pairs.
        terows = label_row_inds[valinds]
        tecols = label_col_inds[valinds]

        val_drugs, val_prots, val_Y = prepare_interaction_pairs(XD, XT, Y, terows, tecols)

        # Flat index of the current hyperparameter combination.
        pointer = 0

        for param1ind in range(len(paramset1)):
            param1value = paramset1[param1ind]

            for param2ind in range(len(paramset2)):
                param2value = paramset2[param2ind]

                for param3ind in range(len(paramset3)):
                    param3value = paramset3[param3ind]

                    # Fresh model, loss and optimizer for every combination/fold.
                    dta_model = DtaNet(param2value, param1value, param3value)
                    dta_model.cuda()
                    loss_func = nn.MSELoss()
                    optimizer = optim.Adam(dta_model.parameters(), lr=0.0005)
                    # Validation predictions, collected on the final epoch only.
                    predicted_labels = []

                    for i in range(epoch):
                        loss_epoch = 0
                        dta_model.train()
                        train_drugs_count = len(train_drugs)
                        # Mini-batch training pass over the fold's training pairs.
                        for j in range(0, train_drugs_count, batchsz):
                            torch.cuda.empty_cache()
                            optimizer.zero_grad()
                            # Batch slice [j:k], clipped at the dataset end.
                            k = min(j + batchsz, train_drugs_count)
                            sub_train_drugs = train_drugs[j:k]
                            sub_train_prots = train_prots[j:k]
                            target = train_Y[j:k]
                            target = torch.FloatTensor(target)
                            target = target.cuda()
                            # Token-index inputs must be long tensors for embeddings.
                            sub_train_drugs = torch.tensor(sub_train_drugs, dtype=torch.long)
                            sub_train_drugs = sub_train_drugs.cuda()
                            sub_train_prots = torch.tensor(sub_train_prots, dtype=torch.long)
                            sub_train_prots = sub_train_prots.cuda()
                            output = dta_model(sub_train_drugs, sub_train_prots)
                            loss = loss_func(output, target)
                            loss.backward()
                            optimizer.step()
                            # Accumulate the summed per-sample loss for reporting.
                            loss_epoch += loss.item() * len(sub_train_drugs)

                        # Validation pass after every epoch (no gradient updates).
                        dta_model.eval()
                        loss_eval = 0
                        for j in range(0, int(len(val_drugs)), batchsz):
                            torch.cuda.empty_cache()
                            k = min(j + batchsz, len(val_drugs))
                            # NOTE: these locals are reused for validation batches
                            # despite the 'train' in their names.
                            sub_train_drugs = val_drugs[j:k]
                            sub_train_prots = val_prots[j:k]
                            target = val_Y[j:k]
                            target = torch.FloatTensor(target)
                            target = target.cuda()
                            sub_train_drugs = torch.tensor(sub_train_drugs, dtype=torch.long)
                            sub_train_drugs = sub_train_drugs.cuda()
                            sub_train_prots = torch.tensor(sub_train_prots, dtype=torch.long)
                            sub_train_prots = sub_train_prots.cuda()
                            output = dta_model(sub_train_drugs, sub_train_prots)
                            loss = loss_func(output, target)
                            loss_eval += loss.item() * len(sub_train_drugs)
                            # On the last epoch, keep the predictions for the CI metric.
                            if i == epoch - 1:
                                if len(predicted_labels) == 0:
                                    predicted_labels = output.cpu().detach().numpy()
                                else:
                                    predicted_labels = np.concatenate((predicted_labels, output.cpu().detach().numpy()),
                                                                      0)
                        # Per-epoch mean train/validation loss to stdout and log file.
                        print("epoch #", i + 1, ", train loss", loss_epoch * 1.0 / len(train_drugs),
                              ", validation loss", loss_eval / len(val_drugs))
                        logging("epoch #" + str(i + 1) + ", train loss " + str(
                            loss_epoch * 1.0 / len(train_drugs)) + ", validation loss " + str(
                            loss_eval / len(val_drugs)), FLAGS)

                    # Concordance index on the last epoch's validation predictions;
                    # prfmeasure returns a sequence whose first element is the CI.
                    rperf = prfmeasure(val_Y, predicted_labels)
                    rperf = rperf[0]

                    logging("P1 = %d,  P2 = %d, P3 = %d, Fold = %d, CI-i = %f, MSE = %f" %
                            (param1ind, param2ind, param3ind, foldind, rperf, loss_eval / len(val_drugs)), FLAGS)
                    print("P1 = %d,  P2 = %d, P3 = %d, Fold = %d, CI-i = %f, MSE = %f" %
                          (param1ind, param2ind, param3ind, foldind, rperf, loss_eval / len(val_drugs)))

                    # Record this combination's CI and mean MSE for the fold.
                    all_predictions[pointer][
                        foldind] = rperf
                    all_losses[pointer][foldind] = loss_eval / len(val_drugs)
                    pointer += 1
    # Pick the combination with the best CI averaged over folds.
    bestperf = -float('Inf')
    bestpointer = None

    best_param_list = []
    pointer = 0

    for param1ind in range(len(paramset1)):
        for param2ind in range(len(paramset2)):
            for param3ind in range(len(paramset3)):

                avgperf = 0.
                # Average this combination's CI over all folds.
                for foldind in range(len(val_sets)):
                    foldperf = all_predictions[pointer][foldind]
                    avgperf += foldperf
                avgperf /= len(val_sets)
                if avgperf > bestperf:
                    bestperf = avgperf
                    bestpointer = pointer
                    best_param_list = [param1ind, param2ind, param3ind]
                pointer += 1
    return bestpointer, best_param_list, bestperf, all_predictions, all_losses
Beispiel #16
0
def nfold_1_2_3_setting_sample(XD, XT, Y, label_row_inds, label_col_inds, measure, FLAGS, dataset):
    """Nested cross-validation driver.

    Builds train/validation/test fold index lists from the dataset's
    predefined splits, selects the best hyperparameter combination on the
    validation folds, re-evaluates every combination on the test folds,
    and reports the chosen combination's CI and MSE averaged over folds.
    """
    # Predefined held-out test split and outer training folds.
    test_set, outer_train_sets = dataset.read_sets(FLAGS.dataset_path, FLAGS.problem_type)
    foldinds = len(outer_train_sets)

    test_sets = []
    val_sets = []
    train_sets = []

    # Rotate each outer fold into the validation role; the remaining folds
    # are merged into the training set. The test split is shared by all.
    for val_foldind in range(foldinds):
        val_fold = outer_train_sets[val_foldind]
        val_sets.append(val_fold)
        remaining = deepcopy(outer_train_sets)
        remaining.pop(val_foldind)
        merged_train = [item for sublist in remaining for item in sublist]
        train_sets.append(merged_train)
        test_sets.append(test_set)
        print("val set", str(len(val_fold)))
        print("train set", str(len(merged_train)))

    # Hyperparameter search on the train/validation folds.
    bestparamind, best_param_list, bestperf, all_predictions_not_need, losses_not_need = general_nfold_cv(
        XD, XT, Y, label_row_inds, label_col_inds, measure, FLAGS,
        train_sets, val_sets)

    # Re-run every combination on the train/test folds.
    bestparam, best_param_list, bestperf, all_predictions, all_losses = general_nfold_cv(
        XD, XT, Y, label_row_inds, label_col_inds, measure, FLAGS,
        train_sets, test_sets)

    logging("---FINAL RESULTS-----", FLAGS)
    logging("best param index = %s,  best param = %.5f" %
            (bestparamind, bestparam), FLAGS)
    print("---FINAL RESULTS-----")
    print("best param index = %s,  best param = %.5f" %
          (bestparamind, bestparam))

    # Per-fold test CI/MSE of the combination chosen on the validation folds.
    testperfs = [all_predictions[bestparamind][f] for f in range(len(test_sets))]
    testloss = [all_losses[bestparamind][f] for f in range(len(test_sets))]

    avgperf = sum(testperfs) / len(test_sets)  # mean CI over folds
    avgloss = np.mean(testloss)  # mean MSE over folds
    teststd = np.std(testperfs)  # CI standard deviation

    logging("Test Performance CI", FLAGS)
    logging(testperfs, FLAGS)
    logging("Test Performance MSE", FLAGS)
    logging(testloss, FLAGS)

    return avgperf, avgloss, teststd
Beispiel #17
0
    label_row_inds, label_col_inds = np.where(
        np.isnan(Y) == False)  # basically finds the point address of affinity [x,Y]

    print("Logdir: " + FLAGS.log_dir)
    s1_avgperf, s1_avgloss, s1_teststd = nfold_1_2_3_setting_sample(XD, XT, Y, label_row_inds, label_col_inds,
                                                                    perfmeasure, FLAGS, dataset)

    logging("Setting " + str(FLAGS.problem_type), FLAGS)
    logging("avg_perf = %.5f,  avg_mse = %.5f, std = %.5f" %
            (s1_avgperf, s1_avgloss, s1_teststd), FLAGS)
    print("Setting " + str(FLAGS.problem_type))
    print("avg_perf = %.5f,  avg_mse = %.5f, std = %.5f" %
          (s1_avgperf, s1_avgloss, s1_teststd))


def run_regression(FLAGS):
    """Run the regression experiment using the concordance index as the metric."""
    experiment(FLAGS, get_cindex)


if __name__ == "__main__":
    # Parse flags, create a unique timestamped log directory, then run.
    FLAGS = argparser()
    FLAGS.log_dir = "{}{}/".format(FLAGS.log_dir, time.time())
    if not os.path.exists(FLAGS.log_dir):
        os.makedirs(FLAGS.log_dir)

    logging(str(FLAGS), FLAGS)
    print(str(FLAGS))
    run_regression(FLAGS)
Beispiel #18
0
def nfold_1_2_3_setting_sample(XD, XT, Y, label_row_inds, label_col_inds,
                               measure, runmethod, FLAGS, dataset):
    """Nested cross-validation driver.

    Builds train/validation/test fold index lists from the dataset's
    predefined splits, selects the best hyperparameter combination on the
    validation folds, re-evaluates every combination on the test folds,
    and returns the chosen combination's averaged CI, averaged MSE, and
    CI standard deviation.
    """
    # NOTE: the unused `bestparamlist` and `testperf` locals were removed.
    test_set, outer_train_sets = dataset.read_sets(FLAGS.dataset_path,
                                                   FLAGS.problem_type)

    foldinds = len(outer_train_sets)

    test_sets = []
    ## TRAIN AND VAL
    val_sets = []
    train_sets = []

    # Rotate each outer fold into the validation role; the remaining folds
    # are merged into the training set. The test split is shared by all.
    for val_foldind in range(foldinds):
        val_fold = outer_train_sets[val_foldind]
        val_sets.append(val_fold)
        otherfolds = deepcopy(outer_train_sets)
        otherfolds.pop(val_foldind)
        otherfoldsinds = [item for sublist in otherfolds for item in sublist]
        train_sets.append(otherfoldsinds)
        test_sets.append(test_set)
        print("val set", str(len(val_fold)))
        print("train set", str(len(otherfoldsinds)))

    # Hyperparameter search on the train/validation folds.
    bestparamind, best_param_list, bestperf, all_predictions_not_need, losses_not_need = general_nfold_cv(
        XD, XT, Y, label_row_inds, label_col_inds, measure, runmethod, FLAGS,
        train_sets, val_sets)

    # Re-run every combination on the train/test folds.
    bestparam, best_param_list, bestperf, all_predictions, all_losses = general_nfold_cv(
        XD, XT, Y, label_row_inds, label_col_inds, measure, runmethod, FLAGS,
        train_sets, test_sets)

    logging("---FINAL RESULTS-----", FLAGS)
    logging(
        "best param index = %s,  best param = %.5f" %
        (bestparamind, bestparam), FLAGS)

    testperfs = []
    testloss = []

    avgperf = 0.

    # Collect the chosen combination's per-fold test CI and MSE.
    for test_foldind in range(len(test_sets)):
        foldperf = all_predictions[bestparamind][test_foldind]
        foldloss = all_losses[bestparamind][test_foldind]
        testperfs.append(foldperf)
        testloss.append(foldloss)
        avgperf += foldperf

    avgperf = avgperf / len(test_sets)  # mean CI over folds
    avgloss = np.mean(testloss)  # mean MSE over folds
    teststd = np.std(testperfs)  # CI standard deviation

    logging("Test Performance CI", FLAGS)
    logging(testperfs, FLAGS)
    logging("Test Performance MSE", FLAGS)
    logging(testloss, FLAGS)

    return avgperf, avgloss, teststd
Beispiel #19
0
def nfold_setting_sample(XD, XT, Y, label_row_inds, label_col_inds, measure,
                         FLAGS, dataset, nfolds, i):
    """Cross-validation driver using precomputed folds.

    Fold 5 of ``nfolds`` is the held-out test split; folds 0-4 form the
    outer training folds. Selects the best hyperparameters via
    ``general_nfold_cv``, then evaluates them with
    ``general_nfold_cv_test`` and returns averaged CI, MSE, their standard
    deviations, and averaged AUC/AUPR with their standard deviations.
    """
    test_set = nfolds[5]
    outer_train_sets = nfolds[0:5]

    foldinds = len(outer_train_sets)
    ## TRAIN AND VAL
    val_sets = []
    train_sets = []
    test_sets = []
    # Rotate each outer fold into the validation role; the remaining folds
    # are merged into the training set. The test split is shared by all.
    for val_foldind in range(foldinds):
        val_fold = outer_train_sets[val_foldind]
        val_sets.append(val_fold)
        otherfolds = deepcopy(outer_train_sets)
        otherfolds.pop(val_foldind)
        otherfoldsinds = [item for sublist in otherfolds for item in sublist]
        train_sets.append(otherfoldsinds)
        test_sets.append(test_set)
        # BUG FIX: previously printed the cumulative number of folds built so
        # far (len(val_sets)/len(train_sets)) instead of this fold's sizes,
        # unlike every sibling implementation in this project.
        print("val set", str(len(val_fold)))
        print("train set", str(len(otherfoldsinds)))

    # Hyperparameter search (CI plus AUPR/RM2 metrics passed through).
    bestparamind, best_param_list, bestperf, all_predictions, all_losses = general_nfold_cv(
        XD, XT, Y, label_row_inds, label_col_inds, measure, FLAGS, train_sets,
        test_sets, get_aupr, get_rm2)

    # Evaluate the selected hyperparameters on the test folds.
    best_param, bestperf, all_predictions, all_losses, all_auc, all_aupr = general_nfold_cv_test(
        XD, XT, Y, label_row_inds, label_col_inds, measure, FLAGS, train_sets,
        test_sets, get_rm2, best_param_list, i)

    logging("---FINAL RESULTS-----", FLAGS)
    logging("best param = %s" % best_param_list, FLAGS)

    testperfs = []
    testloss = []
    testauc = []
    testaupr = []
    avgperf = 0.

    # Collect per-fold CI, MSE, AUC and AUPR.
    for test_foldind in range(len(test_sets)):
        foldperf = all_predictions[test_foldind]
        foldloss = all_losses[test_foldind]
        testperfs.append(foldperf)
        testloss.append(foldloss)
        testauc.append(all_auc[test_foldind])
        testaupr.append(all_aupr[test_foldind])
        avgperf += foldperf

    avgperf = avgperf / len(test_sets)  # mean CI over folds
    avgloss = np.mean(testloss)
    perf_std = np.std(testperfs)
    loss_std = np.std(testloss)
    avg_auc = np.mean(testauc)
    auc_std = np.std(testauc)
    avg_aupr = np.mean(testaupr)
    aupr_std = np.std(testaupr)

    logging("Test Performance CI", FLAGS)
    logging(testperfs, FLAGS)
    logging("Test Performance MSE", FLAGS)
    logging(testloss, FLAGS)

    print(best_param_list)
    print('averaged performance', avgperf)
    return avgperf, avgloss, perf_std, loss_std, avg_auc, auc_std, avg_aupr, aupr_std
Beispiel #20
0
def general_nfold_cv(XD, XT, Y, label_row_inds, label_col_inds, prfmeasure,
                     FLAGS, labeled_sets, val_sets, get_aupr, get_rm2):
    """Hyper-parameter grid search with n-fold cross-validation.

    For every fold and every combination of (num_windows, smi_window_length,
    seq_window_length, lamda) a fresh model is trained via the module-level
    train()/test() helpers; the combination with the highest CI averaged over
    the folds wins.

    Args:
        XD, XT: drug / target feature collections (indexed by the row/col inds).
        Y: interaction (affinity) values.
        label_row_inds, label_col_inds: map flat pair indices to (drug, target).
        prfmeasure: performance callable (unused here; metrics come from
            test() — NOTE(review): confirm this is intentional).
        FLAGS: parsed configuration (num_windows, smi/seq window lengths,
            lamda, batch_size, num_epoch).
        labeled_sets: per-fold lists of training pair indices.
        val_sets: per-fold lists of validation pair indices.
        get_aupr, get_rm2: metric callables (unused in this variant).

    Returns:
        (bestpointer, best_param_list, bestperf, all_predictions, all_losses)
    """
    paramset1 = FLAGS.num_windows         # candidate filter counts
    paramset2 = FLAGS.smi_window_lengths  # candidate SMILES kernel sizes
    paramset3 = FLAGS.seq_window_lengths  # candidate protein kernel sizes
    lamda_set = FLAGS.lamda               # candidate regularization weights
    batchsz = FLAGS.batch_size  # 256

    logging("---Parameter Search-----", FLAGS)

    # One row per hyper-parameter combination, one column per fold.
    w = len(val_sets)
    h = len(paramset1) * len(paramset2) * len(paramset3) * len(lamda_set)
    all_predictions = [[0 for _ in range(w)] for _ in range(h)]
    all_losses = [[0 for _ in range(w)] for _ in range(h)]

    for foldind in range(len(val_sets)):
        valinds = val_sets[foldind]
        labeledinds = labeled_sets[foldind]

        trrows = label_row_inds[labeledinds]
        trcols = label_col_inds[labeledinds]
        train_dataset = prepare_interaction_pairs(XD, XT, Y, trrows, trcols)

        terows = label_row_inds[valinds]
        tecols = label_col_inds[valinds]
        test_dataset = prepare_interaction_pairs(XD, XT, Y, terows, tecols)

        train_loader = DataLoader(dataset=train_dataset,
                                  batch_size=batchsz,
                                  shuffle=True)
        test_loader = DataLoader(dataset=test_dataset, batch_size=batchsz)

        # `pointer` enumerates hyper-parameter combinations; it must advance
        # once per innermost (lamda) iteration so that the row indices match
        # the selection loop below.
        pointer = 0
        for param1value in paramset1:  # filter numbers
            for param2value in paramset2:  # SMILES window lengths
                for param3value in paramset3:  # sequence window lengths
                    for lamda in lamda_set:
                        model = net(FLAGS, param1value, param2value,
                                    param3value).cuda()
                        model.apply(weights_init)
                        rperf_list = []
                        for epochind in range(FLAGS.num_epoch):
                            model = train(train_loader, model, FLAGS,
                                          param1value, param2value,
                                          param3value, lamda)
                            rperf, loss, rm2, auc = test(
                                model, test_loader, FLAGS, param1value,
                                param2value, param3value, lamda)
                            rperf_list.append(rperf)
                            # Early-stopping check every 5 epochs.
                            if (epochind + 1) % 5 == 0:
                                print(
                                    'val: epoch:{},p1:{},p2:{},p3:{},loss:{:.5f},rperf:{:.5f}, rm2:{:.5f}'
                                    .format(epochind, param1value, param2value,
                                            param3value, loss, rperf, rm2))
                                # Checkpoint when this epoch is the best CI so far.
                                if rperf >= max(rperf_list):
                                    torch.save(model, 'checkpoint.pth')
                                # Stop if CI fell more than 0.1 below the best seen.
                                if rperf < max(rperf_list) - 0.1:
                                    print(
                                        'The program is stopped early for better performance.'
                                    )
                                    break
                        logging(
                            "P1 = %d,  P2 = %d, P3 = %d, Fold = %d, CI-i = %f, MSE = %f, rm2 = %f"
                            % (param1value, param2value, param3value, foldind,
                               rperf, loss, rm2), FLAGS)

                        all_predictions[pointer][foldind] = rperf
                        all_losses[pointer][foldind] = loss
                        # BUG FIX: advance per lamda (innermost) combination.
                        # Previously this increment ran outside the lamda loop,
                        # so every lamda overwrote the same row and `pointer`
                        # no longer matched the selection loop below.
                        pointer += 1

    bestperf = -float('Inf')
    bestpointer = None
    best_param_list = []
    # Average each combination's CI over the folds, then pick the best.
    pointer = 0
    for param1value in paramset1:
        for param2value in paramset2:
            for param3value in paramset3:
                for lamda in lamda_set:
                    avgperf = 0.
                    for foldind in range(len(val_sets)):
                        avgperf += all_predictions[pointer][foldind]
                    avgperf /= len(val_sets)

                    if avgperf > bestperf:
                        bestperf = avgperf
                        bestpointer = pointer
                        best_param_list = [
                            param1value,
                            param2value,
                            param3value,
                            lamda,
                        ]
                    pointer += 1

    return bestpointer, best_param_list, bestperf, all_predictions, all_losses
Beispiel #21
0
def general_nfold_cv(XD, XT, Y, label_row_inds, label_col_inds, prfmeasure,
                     FLAGS, labeled_sets,
                     val_sets):  ## FLAGS is needed here too????
    """Grid-search CNN hyper-parameters with k-fold cross-validation.

    For every fold and every (num_windows, smi_window_length,
    seq_window_length) combination this trains a Net from scratch with
    Adam + MSE, checkpoints the epoch with the lowest validation loss, and
    records that epoch's CI (via prfmeasure) and loss. The combination with
    the highest CI averaged over the folds is returned.

    Args:
        XD: drug feature matrix, indexed through label_row_inds.
        XT: target feature matrix, indexed through label_col_inds.
        Y: interaction/affinity values.
        label_row_inds, label_col_inds: map flat pair indices to (drug, target).
        prfmeasure: callable(labels, predictions) -> sequence; element 0 is
            used as the CI score.
        FLAGS: configuration (num_windows, smi/seq window lengths, num_epoch,
            batch_size).
        labeled_sets: per-fold lists of training pair indices.
        val_sets: per-fold lists of validation pair indices.

    Returns:
        (bestpointer, best_param_list, bestperf, all_predictions, all_losses),
        where best_param_list holds *indices* into the three parameter sets,
        not the values themselves.
    """
    paramset1 = FLAGS.num_windows  # [32]#[32,  512] #[32, 128]  # filter numbers
    paramset2 = FLAGS.smi_window_lengths  # [4, 8]#[4,  32] #[4,  8] #filter length smi
    paramset3 = FLAGS.seq_window_lengths  # [8, 12]#[64,  256] #[64, 192]#[8, 192, 384]
    epoch = FLAGS.num_epoch  # 100
    batchsz = FLAGS.batch_size  # 256

    logging("---Parameter Search-----", FLAGS)

    # One row per hyper-parameter combination, one column per fold.
    w = len(val_sets)
    h = len(paramset1) * len(paramset2) * len(paramset3)

    all_predictions = [[0 for x in range(w)] for y in range(h)]
    all_losses = [[0 for x in range(w)] for y in range(h)]
    print(all_predictions)

    for foldind in range(len(val_sets)):
        valinds = val_sets[foldind]
        labeledinds = labeled_sets[foldind]
        trrows = label_row_inds[labeledinds]
        trcols = label_col_inds[labeledinds]

        train_drugs, train_prots, train_Y = prepare_interaction_pairs(
            XD, XT, Y, trrows, trcols)
        train_drugs = np.array(train_drugs)  # convert to a numpy array
        train_prots = np.array(train_prots)  # convert to a numpy array
        train_Y = np.array(train_Y)  # convert to a numpy array
        # train_drugs =train_drugs[:1000]
        # train_prots =train_prots[:1000]
        # train_Y= train_Y[:1000]
        terows = label_row_inds[valinds]
        tecols = label_col_inds[valinds]
        val_drugs, val_prots, val_Y = prepare_interaction_pairs(
            XD, XT, Y, terows, tecols)
        val_drugs = np.array(val_drugs)  # convert to a numpy array
        val_prots = np.array(val_prots)  # convert to a numpy array
        val_Y = np.array(val_Y)  # convert to a numpy array
        # val_drugs= val_drugs[:1000]
        # val_prots= val_prots[:1000]
        # val_Y= val_Y[:1000]
        pointer = 0
        print(paramset1)
        print(paramset2)
        print(paramset3)
        for param1ind in range(
                len(paramset1)):  # number of conv filters
            param1value = paramset1[param1ind]
            for param2ind in range(
                    len(paramset2)
            ):  # kernel size for the SMILES-encoder convolutions
                param2value = paramset2[param2ind]
                for param3ind in range(
                        len(paramset3)
                ):  # kernel size for the protein-encoder convolutions
                    save_model_path = "../model_weight/fold" + str(
                        foldind) + "_param" + str(
                            param1ind) + "_param2ind" + str(param2ind) + str(
                                param3ind) + ".pt"  # checkpoint path for this fold/param combo
                    param3value = paramset3[param3ind]
                    model = Net(param2value, param1value,
                                param3value)  # build the model
                    print("param ", param2value, " ", param1value, " ",
                          param3value)  # print hyper-parameters
                    criterion = nn.MSELoss(
                    )  # mean-squared-error loss
                    optimizer = optim.Adam(
                        model.parameters(), lr=0.001
                    )  # Adam optimizer with learning rate 0.001
                    best_predicted_labels = [
                    ]  # validation predictions from the best epoch
                    best_vali_loss = 100000000  # best validation loss so far (sentinel)
                    for i in range(epoch):  # epoch loop
                        loss_epoch = 0  # accumulated training loss for this epoch
                        model.train()  # switch to training mode
                        for j in range(0, int(len(train_drugs)),
                                       batchsz):  # mini-batch loop
                            optimizer.zero_grad()  # reset gradients
                            end = min(j + batchsz, len(train_drugs))
                            train_drug_batch = train_drugs[
                                j:end]  # drug batch for this step
                            train_prot_batch = train_prots[
                                j:
                                end]  # protein batch for this step
                            target = train_Y[j:end]  # labels for this batch
                            target = torch.FloatTensor(
                                target)  # numpy array -> float tensor
                            train_drug_batch = torch.tensor(
                                train_drug_batch, dtype=torch.long
                            )  # numpy array -> long tensor
                            train_prot_batch = torch.tensor(
                                train_prot_batch, dtype=torch.long
                            )  # numpy array -> long tensor
                            output = model(
                                train_drug_batch,
                                train_prot_batch)  # forward pass
                            # NOTE(review): if output is (N, 1) and target is
                            # (N,), MSELoss broadcasts — confirm Net returns a
                            # flat tensor.
                            loss = criterion(output, target)  # batch loss
                            loss.backward(
                            )  # backpropagate gradients
                            optimizer.step()  # apply the parameter update
                            loss_epoch += loss.item() * len(
                                train_drug_batch
                            )  # accumulate batch loss weighted by batch size
                        model.eval(
                        )  # eval mode: disables dropout etc. for validation
                        loss_eval = 0
                        predicted_labels = []
                        for j in range(0, int(len(val_drugs)), batchsz):
                            end = min(j + batchsz, len(val_drugs))
                            vali_drug_batch = val_drugs[
                                j:end]  # drug batch for this step
                            vali_prot_batch = val_prots[
                                j:
                                end]  # protein batch for this step
                            target = val_Y[j:end]  # labels for this batch
                            target = torch.FloatTensor(
                                target)  # numpy array -> float tensor
                            vali_drug_batch = torch.tensor(
                                vali_drug_batch, dtype=torch.long
                            )  # numpy array -> long tensor
                            vali_prot_batch = torch.tensor(
                                vali_prot_batch, dtype=torch.long
                            )  # numpy array -> long tensor
                            output = model(
                                vali_drug_batch,
                                vali_prot_batch)  # forward pass
                            loss = criterion(output, target)  # batch loss
                            loss_eval += loss.item() * len(
                                vali_drug_batch
                            )  # accumulate batch loss weighted by batch size
                            if len(predicted_labels
                                   ) == 0:  # collect all validation predictions
                                predicted_labels = output.cpu().detach().numpy(
                                )
                            else:
                                predicted_labels = np.concatenate(
                                    (predicted_labels,
                                     output.cpu().detach().numpy()), 0)
                        if best_vali_loss > (
                                loss_eval / len(val_drugs)
                        ):  # checkpoint whenever mean validation loss improves
                            torch.save(model.state_dict(),
                                       save_model_path)  # save model weights
                            best_vali_loss = loss_eval / len(
                                val_drugs)  # record new best validation loss
                            best_predicted_labels = predicted_labels  # keep this epoch's predictions
                            print("epoch ", i, " , train loss ",
                                  loss_epoch * 1.0 / len(train_drugs),
                                  "  , vali loss ", loss_eval / len(val_drugs))

                    rperf = prfmeasure(val_Y,
                                       best_predicted_labels)  # concordance index (CI)
                    rperf = rperf[0]

                    logging(
                        "P1 = %d,  P2 = %d, P3 = %d, Fold = %d, CI-i = %f, MSE = %f"
                        %  # log fold result
                        (param1ind, param2ind, param3ind, foldind, rperf,
                         best_vali_loss),
                        FLAGS)

                    all_predictions[pointer][
                        foldind] = rperf  # store CI for this (param combo, fold)
                    all_losses[pointer][
                        foldind] = best_vali_loss  # store best validation loss for this (param combo, fold)
                    pointer += 1
    bestperf = -float('Inf')
    bestpointer = None  # index of the best hyper-parameter combination
    best_param_list = []
    ##Take average according to folds, then chooose best params
    pointer = 0
    for param1ind in range(len(paramset1)):
        for param2ind in range(len(paramset2)):
            for param3ind in range(len(paramset3)):
                avgperf = 0.
                # average this combination's CI over the k folds
                for foldind in range(len(val_sets)):
                    foldperf = all_predictions[pointer][foldind]
                    avgperf += foldperf
                avgperf /= len(val_sets)
                # print(epoch, batchsz, avgperf)
                if avgperf > bestperf:  # keep the best-averaging combination
                    bestperf = avgperf  # record the best average CI
                    bestpointer = pointer  # record the best combination's index
                    best_param_list = [param1ind, param2ind, param3ind
                                       ]  # record its parameter indices
                pointer += 1
    # return the best parameter set and its CI
    return bestpointer, best_param_list, bestperf, all_predictions, all_losses