Ejemplo n.º 1
0
def dotheclassification(ds,
                        classifier,
                        bilateral
                        ):
    """Run a leave-one-participant-out crossvalidation on a dataset.

    Input: the dataset on which to perform the crossvalidation. Folds are
    built from the 'participant' sample attribute.
    Specify: the classifier to be used:
                 'gnb'   - Gaussian naive Bayes with common variance,
                 'l-sgd' - linear SGD (hinge loss, l2 penalty) wrapped in a
                           MulticlassClassifier for 1 vs 1 decisions
             whether the dataset has ROIs combined across hemisphere
             (bilateral): labels are then taken from 'bilat_ROIs', otherwise
             from 'all_ROIs'
    Returns: the CrossValidation object after it was run on the dataset; its
    'stats' conditional attribute is enabled.
    Raises: ValueError if the classifier name is not supported.

    Note: for 'l-sgd' the dataset is modified in place, a 'targets' sample
    attribute is added (or overwritten).
    """
    # fail early and explicitly instead of hitting an unbound `clf` below
    if classifier not in ('gnb', 'l-sgd'):
        raise ValueError(
            "Unknown classifier %r; expected 'gnb' or 'l-sgd'." % (classifier,))

    if classifier == 'gnb':
        # GNB can be pointed at the label attribute directly via `space`
        targets = 'bilat_ROIs' if bilateral else 'all_ROIs'
        clf = mv.GNB(common_variance=True,
                     prior='ratio',
                     space=targets)
    else:
        # set up the dataset: the Stochastic Gradient Descent wants to have
        # unique labels in a sample attribute called 'targets' and could not be
        # convinced to look for them somewhere else, so cater to its demands
        ds.sa['targets'] = ds.sa.bilat_ROIs if bilateral else ds.sa.all_ROIs

        # imported lazily so that the 'gnb' path does not require scikit-learn
        from sklearn.linear_model import SGDClassifier

        # get a stochastic gradient descent into pymvpa by using the
        # SKLLearnerAdapter. Get it to perform 1 vs 1 decisions (instead of
        # one vs all) with the MulticlassClassifier
        clf = mv.MulticlassClassifier(
            mv.SKLLearnerAdapter(
                SGDClassifier(loss='hinge',
                              penalty='l2',
                              class_weight='balanced')))

    cv = mv.CrossValidation(clf, mv.NFoldPartitioner(attr='participant'),
                            errorfx=mv.mean_match_accuracy,
                            enable_ca=['stats'])
    # the per-fold accuracies are not needed here; callers read cv.ca.stats
    cv(ds)
    return cv
Ejemplo n.º 2
0
def dotheclassification(ds_movie,
                        ds_loc,
                        classifier,
                        bilateral):
    """Crossvalidate a classifier on both experiments and compare their hits.

    For the movie and the localizer dataset in turn, a leave-one-participant-out
    crossvalidation is run (folds are built from the 'participant' sample
    attribute). In every fold the classifier's predictions for the samples
    handed to the callback are stored together with targets, partitions,
    participant and voxel indices; the collected folds are saved to
    results_dir + 'cv_classification_results_<movie|loc>.hdf5'.

    Input: ds_movie, ds_loc: the two datasets. Both are modified in place: a
           'targets' sample attribute is added (or overwritten).
    Specify: the classifier to be used:
                 'gnb'   - Gaussian naive Bayes with common variance,
                 'sgd'   - linear SGD (hinge loss, l2 penalty),
                 'l-sgd' - the same SGD wrapped in a MulticlassClassifier for
                           1 vs 1 decisions
             whether the dataset has ROIs combined across hemisphere
             (bilateral): labels are then taken from 'bilat_ROIs', otherwise
             from 'all_ROIs'
    Returns: a tuple (compare_exp, all_testing, ROIS):
             compare_exp - dataframe with one row per voxel and participant:
                           'hits' summed over both experiments (0, 1 or 2),
                           'targets', and the predictions of each experiment
                           in 'pred_movie' and 'pred_loc'
             all_testing - dataframe with all rows from testing partitions
             ROIS        - the unique target labels of ds_movie
    Raises: ValueError if the classifier name is not supported.
    """
    # fail early and explicitly, before any dataset is touched, instead of
    # hitting an unbound `clf` inside the loop
    supported = ('gnb', 'sgd', 'l-sgd')
    if classifier not in supported:
        raise ValueError("Unknown classifier %r; expected one of %s."
                         % (classifier, ', '.join(supported)))

    ds_types = ['movie', 'loc']
    dfs = []
    for ds_type, ds in zip(ds_types, [ds_movie, ds_loc]):
        ds.sa['targets'] = ds.sa.bilat_ROIs if bilateral else ds.sa.all_ROIs

        # build a fresh classifier for every dataset
        if classifier == 'gnb':
            clf = mv.GNB(common_variance=True,
                         prior='ratio')
        else:
            # imported lazily so that the 'gnb' path does not require sklearn
            from sklearn.linear_model import SGDClassifier
            # get a stochastic gradient descent into pymvpa by using the
            # SKLLearnerAdapter
            clf = mv.SKLLearnerAdapter(SGDClassifier(loss='hinge',
                                                     penalty='l2',
                                                     class_weight='balanced'))
            if classifier == 'l-sgd':
                # get it to perform 1 vs 1 decisions (instead of one vs all)
                clf = mv.MulticlassClassifier(clf)

        # prepare for callback of prediction extraction within CrossValidation
        classifications = []

        def store_class(data, node, result):
            # called by CrossValidation for every fold; `clf` and
            # `classifications` are the objects of the current loop iteration
            class_ds = mv.Dataset(samples=data.sa.voxel_indices)
            class_ds.sa['targets'] = data.sa.targets
            class_ds.sa['partitions'] = data.sa.partitions
            class_ds.sa['predictions'] = clf.predict(data)
            class_ds.sa['participant'] = data.sa.participant
            classifications.append(class_ds)

        # do a crossvalidation classification; the interesting output is
        # collected by the callback, the returned accuracies are not needed
        cv = mv.CrossValidation(clf, mv.NFoldPartitioner(attr='participant'),
                                errorfx=mv.mean_match_accuracy,
                                enable_ca=['stats'],
                                callback=store_class)
        cv(ds)

        # save classification results as a list of Datasets
        mv.h5save(results_dir + 'cv_classification_results_{}.hdf5'.format(ds_type), classifications)
        print('Saved the classification results obtained during crossvalidation.')

        # get the classification list into pandas dataframes, one per fold
        for classification in classifications:
            df = pd.DataFrame(data={'voxel_indices': list(classification.samples),
                                    'targets': list(classification.sa.targets),
                                    'predictions': list(classification.sa.predictions),
                                    'partitions': list(classification.sa.partitions),
                                    'participants': list(classification.sa.participant),
                                    'ds_type': [ds_type] * len(classification.sa.predictions)
                                    }
                              )
            dfs.append(df)

    # get all folds into one dataframe, disregard the index
    all_classifications = pd.concat(dfs, ignore_index=True)
    # compute hits (1/0) as correspondence between target and prediction;
    # vectorized, a row-wise apply is very slow on voxel-level data
    all_classifications['hits'] = (
        all_classifications['predictions'] == all_classifications['targets']
    ).astype(int)
    # assign string labels to training and testing partitions (instead of 1, 2);
    # any other partition value ends up as a missing value
    all_classifications['parts'] = all_classifications['partitions'].map(
        {1: 'train', 2: 'test'})
    # transform voxel coordinates from arrays (unhashable) into tuples
    all_classifications['voxel_indices'] = all_classifications['voxel_indices'].apply(tuple)

    # subset the dataset to contain only the testing data
    all_testing = all_classifications[all_classifications.parts == "test"]
    # check that every participant is in the data (hard-coded to the 15
    # participants expected here)
    assert len(all_testing.participants.unique()) == 15
    # to check for correspondence between the sum of the two experiments confusion matrices,
    # do sth like this: len(all_testing[(all_testing['predictions'] == 'PPA') & (all_testing['targets'] == 'VIS')])

    # this counts hits per fold across experiments (2 if both experiments classified correctly,
    # 1 if 1 experiment classified correctly, 0 if none did). Also, append the targets per voxel.
    # we use 'min' here because aggregate needs any function, but targets are the same between
    # the experiments
    compare_exp = all_testing.groupby(['voxel_indices', 'participants']).agg(
        {'hits': 'sum', 'targets': 'min'}).reset_index().sort_values(['voxel_indices', 'participants'])
    all_testing_movie = all_testing[all_testing.ds_type == 'movie'].sort_values(
        ['voxel_indices', 'participants']).reset_index()
    all_testing_loc = all_testing[all_testing.ds_type == 'loc'].sort_values(
        ['voxel_indices', 'participants']).reset_index()
    # append movie and loc predictions to the dataframe.
    # NOTE(review): the assignment aligns on the fresh 0..n-1 indices, i.e. it
    # presumably relies on every (voxel, participant) pair occurring exactly
    # once per experiment and on identical sort order - confirm
    compare_exp['pred_movie'] = all_testing_movie.predictions
    compare_exp['pred_loc'] = all_testing_loc.predictions

    # get the ROIS from the classification
    ROIS = np.unique(ds_movie.sa.targets)

    # there can't be values greater than two or lower than zero
    assert compare_exp.hits.max() <= 2
    assert compare_exp.hits.min() >= 0
    return compare_exp, all_testing, ROIS
Ejemplo n.º 3
0
def dotheclassification(ds, bilateral, store_sens=True):
    """ Dotheclassification does the classification. It builds a linear
    stochastic gradient descent classifier (1 vs 1 decisions), performs a
    leave-one-participant-out crossvalidation and stores the sensitivities
    from the SGD classifier of each fold in a list for further use in a glm.

    Input: ds: the dataset to classify. It is modified in place: a 'targets'
           sample attribute is added (or overwritten), taken from
           'bilat_ROIs' if bilateral is true and from 'all_ROIs' otherwise.
    If store_sens == False, the sensitivities are not stored, and only a
    classification is performed.

    The confusion statistics are appended to results_dir + 'avmovie_clf.txt';
    with store_sens, the sensitivities are saved to
    results_dir + 'sensitivities_nfold.hdf5'.

    Returns: a tuple (sensitivities, cv). sensitivities is a list with one
    sensitivity dataset per fold (empty if store_sens is False), cv is the
    CrossValidation object after it was run on the dataset."""
    # set up the dataset: If I understand the sourcecode correctly, the
    # MulticlassClassifier wants to have unique labels in a sample attribute
    # called 'targets' and is quite stubborn with this name - I could not
    # convince it to look for targets somewhere else, so cater to its demands
    ds.sa['targets'] = ds.sa.bilat_ROIs if bilateral else ds.sa.all_ROIs

    # necessary I believe regardless of the SKLLearnerAdapter
    from sklearn.linear_model import SGDClassifier

    # get a stochastic gradient descent into pymvpa by using the SKLLearnerAdapter.
    # Get it to perform 1 vs 1 decisions (instead of one vs all) with the MulticlassClassifier
    clf = mv.MulticlassClassifier(
        mv.SKLLearnerAdapter(
            SGDClassifier(loss='hinge', penalty='l2',
                          class_weight='balanced')))

    sensitivities = []
    cv_kwargs = {'errorfx': mv.mean_match_accuracy,
                 'enable_ca': ['stats']}
    if store_sens:

        # callback for sensitivity extraction within CrossValidation; named
        # differently from the `store_sens` flag so the flag is not shadowed
        def collect_sensitivities(data, node, result):
            sens = node.measure.get_sensitivity_analyzer(
                force_train=False)(data)
            # we also need to manually append the time attributes to the sens ds
            sens.fa['time_coords'] = data.fa['time_coords']
            sens.fa['chunks'] = data.fa['chunks']
            sensitivities.append(sens)

        cv_kwargs['callback'] = collect_sensitivities

    # do a crossvalidation classification
    cv = mv.CrossValidation(clf,
                            mv.NFoldPartitioner(attr='participant'),
                            **cv_kwargs)
    # the returned per-fold accuracies are not used; everything needed below
    # is read from cv.ca.stats
    cv(ds)

    # save classification results
    with open(results_dir + 'avmovie_clf.txt', 'a') as f:
        f.write(cv.ca.stats.as_string(description=True))

    # label order for the confusion matrix
    if bilateral:
        desired_order = ['VIS', 'LOC', 'OFA', 'FFA', 'EBA', 'PPA']
    else:
        desired_order = [
            'brain', 'VIS', 'left LOC', 'right LOC', 'left OFA', 'right OFA',
            'left FFA', 'right FFA', 'left EBA', 'right EBA', 'left PPA',
            'right PPA'
        ]
    # NOTE(review): `labels` is currently unused. It is only needed for
    # plotting the confusion matrix, which is disabled here because pymvpa's
    # build-in plot function currently fails. The call is kept because the
    # helper is defined elsewhere - confirm it can be dropped.
    labels = get_known_labels(desired_order, cv.ca.stats.labels)

    print('Saved the crossvalidation results.')
    if store_sens:
        mv.h5save(results_dir + 'sensitivities_nfold.hdf5', sensitivities)
        print('Saved the sensitivities.')
    # cv.ca.stats holds the confusion statistics of the crossvalidation.
    # sensitivities contains a dataset for each participant with the
    # sensitivities as samples and class-pairings as attributes
    return sensitivities, cv
Ejemplo n.º 4
0
def dotheclassification(ds, bilateral):
    """Classify ROIs with a stochastic gradient descent in a one-vs-all
    fashion, using a leave-one-participant-out crossvalidation.

    The dataset is modified in place: a 'targets' sample attribute is added
    (or overwritten), taken from 'bilat_ROIs' if bilateral is true and from
    'all_ROIs' otherwise. The confusion statistics are appended to
    results_dir + 'SGD_clf.txt', the confusion matrix is plotted to
    results_dir + 'confusion_matrix.png' (and, if the module-level `niceplot`
    is set, to 'confusion_matrix_SGD.svg'), and the crossvalidation results
    are saved to results_dir + 'SGD_cv_classification_results.hdf5'.

    Returns the CrossValidation object after it was run on the dataset.

    Future TODO: Selection of alpha may be better performed via
    GridSearchCV. To quote sklearns documentation: 'Finding a reasonable
    regularization term is best done using GridSearchCV, usually in the range
    10.0**-np.arange(1,7).'"""

    # the SGDclassifier insists on finding its labels in a sample attribute
    # called 'targets', so copy the requested ROI labels there
    ds.sa['targets'] = ds.sa.bilat_ROIs if bilateral else ds.sa.all_ROIs

    # wrap sklearn's SGD for use within pymvpa
    sgd = SGDClassifier(loss='hinge', penalty='l2', class_weight='balanced')
    learner = mv.SKLLearnerAdapter(sgd)

    # one fold per participant
    cv = mv.CrossValidation(learner,
                            mv.NFoldPartitioner(attr='participant'),
                            errorfx=mv.mean_match_accuracy,
                            enable_ca=['stats'])
    fold_results = cv(ds)

    # append the confusion statistics to the text report
    with open(results_dir + 'SGD_clf.txt', 'a') as report:
        report.write(cv.ca.stats.as_string(description=True))

    # order in which the ROIs should appear in the confusion matrix
    roi_orders = {
        True: ['brain', 'VIS', 'LOC', 'OFA', 'FFA', 'EBA', 'PPA'],
        False: [
            'brain', 'VIS', 'left LOC', 'right LOC', 'left OFA', 'right OFA',
            'left FFA', 'right FFA', 'left EBA', 'right EBA', 'left PPA',
            'right PPA'
        ],
    }
    labels = get_known_labels(roi_orders[bool(bilateral)], cv.ca.stats.labels)

    # confusion matrix via pymvpas build in function
    cv.ca.stats.plot(labels=labels, numbers=True, cmap='gist_heat_r')
    plt.savefig(results_dir + 'confusion_matrix.png')

    # confusion matrix via matplotlib
    if niceplot:
        mean_accuracy = cv.ca.stats.stats['mean(ACC)']
        plot_confusion(cv,
                       labels,
                       fn=results_dir + 'confusion_matrix_SGD.svg',
                       figsize=(9, 9),
                       vmax=100,
                       cmap='Blues',
                       ACC='%.2f' % mean_accuracy)

    mv.h5save(results_dir + 'SGD_cv_classification_results.hdf5', fold_results)
    print('Saved the crossvalidation results.')

    return cv