Example #1
def train_fold(train_ind, test_ind, val_ind, graph_feat, features, y, y_data,
               params, subject_IDs):
    """
        train_ind       : indices of the training samples
        test_ind        : indices of the test samples
        val_ind         : indices of the validation samples
        graph_feat      : population graph computed from phenotypic measures (num_subjects x num_subjects)
        features        : feature vectors (num_subjects x num_features)
        y               : ground truth labels (num_subjects x 1)
        y_data          : ground truth labels - different representation (num_subjects x 2)
        params          : dictionary of GCN parameters
        subject_IDs     : list of subject IDs

    returns:

        test_acc    : average accuracy over the test samples using GCNs
        test_auc    : average area under curve over the test samples using GCNs
        lin_acc     : average accuracy over the test samples using the linear classifier
        lin_auc     : average area under curve over the test samples using the linear classifier
        fold_size   : number of test samples
    """

    print(len(train_ind))

    # selection of a subset of data if running experiments with a subset of the training set
    labeled_ind = Reader.site_percentage(train_ind, params['num_training'],
                                         subject_IDs)

    # feature selection/dimensionality reduction step
    x_data = Reader.feature_selection(features, y, labeled_ind,
                                      params['num_features'])

    fold_size = len(test_ind)

    # Calculate all pairwise distances
    distv = distance.pdist(x_data, metric='correlation')
    # Convert to a square symmetric distance matrix
    dist = distance.squareform(distv)
    sigma = np.mean(dist)
    # Get affinity from similarity matrix
    sparse_graph = np.exp(-dist**2 / (2 * sigma**2))
    final_graph = graph_feat * sparse_graph

    # Linear classifier
    clf = RidgeClassifier()
    clf.fit(x_data[train_ind, :], y[train_ind].ravel())
    # Compute the accuracy
    lin_acc = clf.score(x_data[test_ind, :], y[test_ind].ravel())
    # Compute the AUC
    pred = clf.decision_function(x_data[test_ind, :])
    lin_auc = sklearn.metrics.roc_auc_score(y[test_ind] - 1, pred)

    print("Linear Accuracy: " + str(lin_acc))

    # Classification with GCNs
    test_acc, test_auc = Train.run_training(final_graph,
                                            sparse.coo_matrix(x_data).tolil(),
                                            y_data, train_ind, val_ind,
                                            test_ind, params)

    print(test_acc)

    # return number of correctly classified samples instead of percentage
    test_acc = int(round(test_acc * len(test_ind)))
    lin_acc = int(round(lin_acc * len(test_ind)))

    return test_acc, test_auc, lin_acc, lin_auc, fold_size
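
A minimal, self-contained sketch of the graph-construction step used in this example: a Gaussian kernel over pairwise correlation distances between subjects' feature vectors, multiplied element-wise with the phenotypic population graph. The array sizes and random data below are purely illustrative.

import numpy as np
from scipy.spatial import distance

# Illustrative data: 20 subjects with 50 features, plus a stand-in phenotypic graph.
rng = np.random.default_rng(0)
x_data = rng.standard_normal((20, 50))
graph_feat = rng.integers(0, 3, size=(20, 20)).astype(float)

# Pairwise correlation distances between subjects, as a square symmetric matrix.
dist = distance.squareform(distance.pdist(x_data, metric='correlation'))

# Gaussian (RBF) kernel with sigma set to the mean pairwise distance.
sigma = np.mean(dist)
sparse_graph = np.exp(-dist**2 / (2 * sigma**2))

# Element-wise product: phenotypic edges re-weighted by feature similarity.
final_graph = graph_feat * sparse_graph
print(final_graph.shape)  # (20, 20)
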
Example #2
def train_fold(train_ind, test_ind, val_ind, graph_feat, features, y, y_data, params, subject_IDs, pathToSave, i, subject_labels, idx):
    """
        train_ind       : indices of the training samples
        test_ind        : indices of the test samples
        val_ind         : indices of the validation samples
        graph_feat      : population graph computed from phenotypic measures (num_subjects x num_subjects)
        features        : feature vectors (num_subjects x num_features)
        y               : ground truth labels (num_subjects x 1)
        y_data          : ground truth labels - different representation (num_subjects x 2)
        params          : dictionary of GCN parameters
        subject_IDs     : list of subject IDs

    returns:

        test_acc    : average accuracy over the test samples using GCNs
        test_auc    : average area under curve over the test samples using GCNs
        lin_acc     : average accuracy over the test samples using the linear classifier
        lin_auc     : average area under curve over the test samples using the linear classifier
        fold_size   : number of test samples
    """

    print(len(train_ind))
    tf.reset_default_graph()
    tf.app.flags._global_parser = argparse.ArgumentParser()

    # selection of a subset of data if running experiments with a subset of the training set
    # labeled_ind = Reader.site_percentage(train_ind, params['num_training'], subject_IDs)
    num_nodes = np.size(graph_feat, 0)
    #print features[0,:],"features"
    x_data_1 = features.astype(float)  # Reader.feature_selection(features, y, labeled_ind, params['num_features'])
    # Round every entry to 4 decimal places.
    x_data_1 = np.round(x_data_1, 4)
    fold_size = len(test_ind)
    x_data_1[np.where(np.isnan(x_data_1))] = 0
    distv = distance.pdist(x_data_1, metric='correlation')

    dist = distance.squareform(distv)
    sigma = np.mean(dist)
    # Get affinity from similarity matrix
    sparse_graph = np.exp(- dist ** 2 / (2 * sigma ** 2))
    # plt.matshow(sparse_graph)
    # plt.savefig('features_sparsegraph.png', bbox_inches='tight')
    # exit()
    graph = Reader.get_affinity(sparse_graph, idx)

    x_data = features.astype(float)  # np.identity(num_nodes)
    # Round every entry to 4 decimal places.
    x_data = np.round(x_data, 4)
    np.savetxt("x_data.csv", x_data, delimiter=',')
    x_data[np.where(np.isnan(x_data))] = 0
    print(np.where(np.isnan(x_data)))
    #exit()
    # Linear classifier
    clf = RidgeClassifier()
    clf.fit(x_data[train_ind, :], y[train_ind].ravel())
    # Compute the accuracy
    lin_acc = clf.score(x_data[test_ind, :], y[test_ind].ravel())
    # Compute the AUC
    pred = clf.decision_function(x_data[test_ind, :])

    y_one_hot = label_binarize(y[test_ind], classes=np.arange(3))
    lin_auc = sklearn.metrics.roc_auc_score(y_one_hot, pred)

    # np.savetxt("x_data.csv", x_data, delimiter = ',')
    # Classification with GCNs
    test_acc, test_auc, weights, confusion = Train.run_training(graph, sparse.coo_matrix(x_data).tolil(), y_data, train_ind, val_ind,
                                            test_ind, params, pathToSave, i)

    # print(test_acc)
    scores_acc = [test_acc]
    scores_auc = [test_auc]
    scores_lin = [lin_acc]
    scores_auc_lin = [lin_auc]
    fold_size = [fold_size]
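    # `FLAGS`, `trial` and `prediction` are not defined in this function; they are
    # assumed to be module-level globals in the original script (the TensorFlow flags
    # object, a trial/run identifier used in the output file names, and a list that
    # collects the per-fold result DataFrames, respectively).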
    if FLAGS.model == 'gcn_cheby':
        weights_0 = weights[0]
        weights_1 = weights[1]
        weights_2 = weights[2]

    scores_lin_ = np.sum(scores_lin)
    scores_auc_lin_ = np.mean(scores_auc_lin)
    scores_acc_ = int(np.sum(scores_acc) * len(test_ind))
    scores_auc_ = np.mean(scores_auc)

    if not os.path.exists(pathToSave + 'excel/'):
        os.makedirs(pathToSave + 'excel/')
    pathToSave2 = pathToSave + 'excel/'
    result_name = 'ABIDE_classification.mat'
    if FLAGS.model == 'gcn_cheby':
        sio.savemat(pathToSave2 + str(trial) + result_name,
                    {'lin': scores_lin_, 'lin_auc': scores_auc_lin_,
                     'acc': scores_acc_, 'auc': scores_auc_, 'folds': num_nodes, 'weights_0': weights_0,
                     'weights_1': weights_1, 'weights_2': weights_2})
        df = pd.DataFrame({'scores_acc': [scores_acc_], 'scores_auc': [scores_auc_], 'scores_lin': [scores_lin_],
                           'scores_auc_lin': [scores_auc_lin_], 'weights_0': weights_0, 'weights_1': weights_1,
                           'weights_2':weights_2, 'confusion_matrix': [confusion]})
    else:
        sio.savemat(pathToSave2 + str(trial) + result_name,
                    {'lin': scores_lin_, 'lin_auc': scores_auc_lin_,
                     'acc': scores_acc_, 'auc': scores_auc_, 'folds': num_nodes})
        df = pd.DataFrame({'scores_acc': [scores_acc_], 'scores_auc': [scores_auc_], 'scores_lin': [scores_lin_],
                           'scores_auc_lin': [scores_auc_lin_], 'confusion_matrix': [confusion]})

    prediction.append(df)

    # Create a Pandas Excel writer using XlsxWriter as the engine.
    writer_n = pd.ExcelWriter(pathToSave2 + str(test_ind[0]) + '.xlsx', engine='xlsxwriter')
    # Convert the dataframe to an XlsxWriter Excel object.
    df.to_excel(writer_n, sheet_name='Sheet1')
    # Close the Pandas Excel writer and output the Excel file.
    writer_n.save()

    # return number of correctly classified samples instead of percentage
    test_acc = int(round(test_acc * len(test_ind)))
    lin_acc = int(round(lin_acc * len(test_ind)))
    return test_acc, test_auc, lin_acc, lin_auc, fold_size, len(test_ind)
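
Example #2 scores the ridge baseline on a 3-class problem by one-hot encoding the test labels with label_binarize and passing the decision scores to roc_auc_score. A small sketch of that evaluation on synthetic, balanced labels (the class count of 3 matches the example; everything else is illustrative):

import numpy as np
import sklearn.metrics
from sklearn.linear_model import RidgeClassifier
from sklearn.preprocessing import label_binarize

rng = np.random.default_rng(0)
X = rng.standard_normal((60, 10))
y = np.tile([0, 1, 2], 20)                 # balanced labels in {0, 1, 2}
train_ind, test_ind = np.arange(40), np.arange(40, 60)

clf = RidgeClassifier()
clf.fit(X[train_ind], y[train_ind])

# For 3 classes, decision_function returns an (n_samples, 3) score matrix.
pred = clf.decision_function(X[test_ind])

# One-hot encode the test labels and compute a macro-averaged one-vs-rest AUC.
y_one_hot = label_binarize(y[test_ind], classes=np.arange(3))
lin_auc = sklearn.metrics.roc_auc_score(y_one_hot, pred)
print(lin_auc)
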
Example #3
def train_fold(train_ind, test_ind, val_ind, graph_feat, graph_feat2, features, y, y_data, idx, lr, params, subject_IDs,
               pathToSave, i):
    """
        train_ind       : indices of the training samples
        test_ind        : indices of the test samples
        val_ind         : indices of the validation samples
        graph_feat      : population graph computed from phenotypic measures (num_subjects x num_subjects)
        features        : feature vectors (num_subjects x num_features)
        y               : ground truth labels (num_subjects x 1)
        y_data          : ground truth labels - different representation (num_subjects x 2)
        params          : dictionary of GCN parameters
        subject_IDs     : list of subject IDs

    returns:

        test_acc    : average accuracy over the test samples using GCNs
        test_auc    : average area under curve over the test samples using GCNs
        lin_acc     : average accuracy over the test samples using the linear classifier
        lin_auc     : average area under curve over the test samples using the linear classifier
        fold_size   : number of test samples
    """
    tf.reset_default_graph()
    tf.app.flags._global_parser = argparse.ArgumentParser()
    print(len(train_ind))
    # selection of a subset of data if running experiments with a subset of the training set
    #labeled_ind = Reader.site_percentage(train_ind, params['num_training'], subject_IDs)
    labeled_ind = reader.site_percentage(train_ind,1.0)
    # feature selection/dimensionality reduction step
    x_data = Reader.feature_selection(features, y, labeled_ind,  params['num_features'])
    fold_size = len(test_ind)

    # Calculate all pairwise distances
    distv = distance.pdist(x_data, metric='correlation')
    # Convert to a square symmetric distance matrix
    dist = distance.squareform(distv)
    sigma = np.mean(dist)
    # Get affinity from similarity matrix
    sparse_graph = np.exp(- dist ** 2 / (2 * sigma ** 2))
    num_nodes = 662
    final_graph = graph_feat * sparse_graph # Gender

    final_graph2 = graph_feat2 * sparse_graph # Age



    # Linear classifier
    clf = RidgeClassifier()
    clf.fit(x_data[train_ind, :], y[train_ind].ravel())
    # Compute the accuracy
    lin_acc = clf.score(x_data[test_ind, :], y[test_ind].ravel())
    # Compute the AUC
    pred = clf.decision_function(x_data[test_ind, :])
    lin_auc = sklearn.metrics.roc_auc_score(y[test_ind] - 1, pred)

    print("Linear Accuracy: " + str(lin_acc))
    # Classification with GCNs
    test_acc, test_auc, weights = Train.run_training(final_graph, final_graph2, sparse.coo_matrix(x_data).tolil(), y_data,
                                            train_ind, val_ind,
                                            test_ind, idx, lr, params, pathToSave, i)
    # return number of correctly classified samples instead of percentage
    # test_acc = int(round(test_acc * len(test_ind)))
    # lin_acc = int(round(lin_acc * len(test_ind)))
    scores_acc = [test_acc]
    scores_auc = [test_auc]
    scores_lin = [lin_acc]
    scores_auc_lin = [lin_auc]
    fold_size = [fold_size]
    weights_0 = weights[0]
    weights_1 = weights[1]

    scores_lin_ = np.sum(scores_lin)
    scores_auc_lin_ = np.mean(scores_auc_lin)
    scores_acc_ = np.sum(scores_acc)
    scores_auc_ = np.mean(scores_auc)

    if not os.path.exists(pathToSave + 'excel/'):
        os.makedirs(pathToSave + 'excel/')
    pathToSave2 = pathToSave + 'excel/'
    result_name = 'ABIDE_classification.mat'
    sio.savemat(pathToSave2 + str(trial) + result_name,
                {'lin': scores_lin_, 'lin_auc': scores_auc_lin_,
                 'acc': scores_acc_, 'auc': scores_auc_, 'folds': num_nodes, 'weights_0': weights_0, 'weights_1': weights_1})
    df = pd.DataFrame({'scores_acc': [scores_acc_], 'scores_auc': [scores_auc_], 'scores_lin': [scores_lin_],
                       'scores_auc_lin': [scores_auc_lin_], 'weights_0': weights_0, 'weights_1': weights_1})

    prediction.append(df)

    # Create a Pandas Excel writer using XlsxWriter as the engine.
    writer_n = pd.ExcelWriter(pathToSave2 + str(test_ind[0]) + '.xlsx', engine='xlsxwriter')
    # Convert the dataframe to an XlsxWriter Excel object.
    df.to_excel(writer_n, sheet_name='Sheet1')
    # Close the Pandas Excel writer and output the Excel file.
    writer_n.save()

    # return number of correctly classified samples instead of percentage
    test_acc = int(round(test_acc * len(test_ind)))
    lin_acc = int(round(lin_acc * len(test_ind)))
    return test_acc, test_auc, lin_acc, lin_auc, fold_size
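
Examples #1 and #3 evaluate the ridge baseline with class labels coded as 1/2, which is why the labels are shifted by one (`y[test_ind] - 1`) before computing the ROC AUC. A minimal sketch of that evaluation on synthetic data, including the final conversion of the accuracy fraction into a count of correctly classified test samples (the 1/2 label coding is carried over from the examples; the data itself is made up):

import numpy as np
import sklearn.metrics
from sklearn.linear_model import RidgeClassifier

rng = np.random.default_rng(0)
x_data = rng.standard_normal((80, 30))
y = (np.arange(80) % 2 + 1).reshape(-1, 1)   # labels coded as 1 / 2, shape (n, 1)
train_ind, test_ind = np.arange(60), np.arange(60, 80)

clf = RidgeClassifier()
clf.fit(x_data[train_ind, :], y[train_ind].ravel())

# Accuracy on the held-out fold.
lin_acc = clf.score(x_data[test_ind, :], y[test_ind].ravel())

# AUC from the signed decision scores; labels are shifted from {1, 2} to {0, 1}.
pred = clf.decision_function(x_data[test_ind, :])
lin_auc = sklearn.metrics.roc_auc_score(y[test_ind].ravel() - 1, pred)

# Number of correctly classified test samples instead of a percentage.
n_correct = int(round(lin_acc * len(test_ind)))
print(lin_acc, lin_auc, n_correct)
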
Example #4
def train_fold(cv, train_ind, test_ind, val_ind, graph_feat, features, y,
               y_data, params, subject_IDs, cur_time):
    """
        train_ind       : indices of the training samples
        test_ind        : indices of the test samples
        val_ind         : indices of the validation samples
        graph_feat      : population graph computed from phenotypic measures (num_subjects x num_subjects)
        features        : feature vectors (num_subjects x num_features)
        y               : ground truth labels (num_subjects x 1)
        y_data          : ground truth labels - different representation (num_subjects x 2)
        params          : dictionary of GCN parameters
        subject_IDs     : list of subject IDs

    returns:

        test_acc    : average accuracy over the test samples using GCNs
        test_auc    : average area under curve over the test samples using GCNs
        lin_acc     : average accuracy over the test samples using the linear classifier
        lin_auc     : average area under curve over the test samples using the linear classifier
        fold_size   : number of test samples
    """

    # feature selection/dimensionality reduction step
    # x_data = features
    x_data = Reader.lasso_feature_selection(features, y, train_ind, cv)
    # x_data = Reader.feature_selection(features, y, labeled_ind, params['num_features'])
    # x_data = Reader.feature_selection(features, y, train_ind, params['num_features'])  # no need to consider site info.
    # x_data = Reader.ttest_feature_selection(cur_time, cv, features, y, train_ind)
    # x_data = Reader.bagging_based_ttest_feature_selection(cv, features, y, train_ind)
    # x_data = Reader.ElasticNet_feature_selection(features, y, train_ind)
    # x_data = Reader.bagging_based_ElasticNet_feature_selection(features, y, train_ind)
    # x_data = Reader.bagging_based_lasso_feature_selection(features, y, train_ind)
    print('fold: ' + str(cv) + ', shape: ', np.shape(x_data))

    # Calculate all pairwise distances
    distv = distance.pdist(x_data, metric='correlation')

    # Convert to a square symmetric distance matrix
    dist = distance.squareform(distv)
    sigma = np.mean(dist)

    # Get affinity from similarity matrix
    sparse_graph = np.exp(-dist**2 / (2 * sigma**2))
    final_graph = graph_feat * sparse_graph

    # Classification by BrainNetCNN
    # import tensorflow as tf
    # sess = tf.Session()

    # brainnetcnn = Reader.BrainNetCNN(np.reshape(x_data, [x_data.shape[0], 114, -1, 1]))
    # test_auc, test_accuracy, test_sensitivity, test_specificity, pred, lab = Reader.calculate_performance(eval(brainnetcnn), y_data, train_ind, val_ind, test_ind)

    # outs_val = sess.run(Reader.BrainNetCNN, feed_dict=np.reshape(x_data, [x_data.shape[0], 114, -1, 1]))

    # Classification by MLP
    # test_auc, test_accuracy, test_sensitivity, test_specificity, pred, lab = Reader.MLP_classification(x_data, y_data, train_ind, val_ind, test_ind)

    # Classification with SVM
    # test_auc, test_accuracy, test_sensitivity, test_specificity, pred, lab = Reader.SVM_classification(x_data, y_data, train_ind, val_ind, test_ind)

    # Classification by GCNs
    test_auc, test_accuracy, test_sensitivity, test_specificity, pred, lab = Train.run_training(
        cv, final_graph,
        sparse.coo_matrix(x_data).tolil(), y_data, train_ind, val_ind,
        test_ind, params, cur_time)

    return test_auc, test_accuracy, test_sensitivity, test_specificity, pred, lab
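
Example #4 delegates dimensionality reduction to Reader.lasso_feature_selection, whose implementation is not shown. A hypothetical stand-in based on scikit-learn's Lasso wrapped in SelectFromModel, fit on the training fold only, could look like the sketch below; the function name and arguments mirror the call in the example, but the body, the alpha value, and the toy data are assumptions.

import numpy as np
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import Lasso

def lasso_feature_selection(features, y, train_ind, alpha=0.01):
    """Keep the features with non-zero LASSO coefficients, fitting on the training fold only."""
    selector = SelectFromModel(Lasso(alpha=alpha))
    selector.fit(features[train_ind, :], y[train_ind].ravel())
    return selector.transform(features)      # reduced feature matrix for all subjects

# Illustrative usage with random data.
rng = np.random.default_rng(0)
features = rng.standard_normal((100, 500))
y = (np.arange(100) % 2).astype(float).reshape(-1, 1)
train_ind = np.arange(70)

x_data = lasso_feature_selection(features, y, train_ind)
print(x_data.shape)
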