def dotheclassification(ds, classifier, bilateral):
    """Cross-validate a classifier of choice on a dataset.

    The dataset is partitioned on its ``participant`` sample attribute
    (N-fold, i.e. one participant is left out per fold).

    Parameters
    ----------
    ds
        Dataset to classify. It must carry the sample attributes
        ``participant`` and ``bilat_ROIs`` (if ``bilateral``) or
        ``all_ROIs`` (otherwise). For ``'l-sgd'`` the chosen ROI attribute
        is copied into ``ds.sa['targets']``, i.e. ``ds`` is modified in place.
    classifier : {'gnb', 'l-sgd'}
        ``'gnb'``: Gaussian Naive Bayes with common variance (linear GNB).
        ``'l-sgd'``: stochastic gradient descent (hinge loss, l2 penalty),
        wrapped in a ``MulticlassClassifier`` to get one-vs-one decisions.
    bilateral : bool
        Whether the ROIs of the dataset are combined across hemispheres.

    Returns
    -------
    The ``CrossValidation`` object after it has been run on ``ds``; the
    confusion statistics are available via ``cv.ca.stats``.

    Raises
    ------
    ValueError
        If ``classifier`` is not one of the supported names.
    """
    supported = ('gnb', 'l-sgd')
    # fail early: without this check an unknown name would leave ``clf``
    # unbound and crash later with an unhelpful UnboundLocalError
    if classifier not in supported:
        raise ValueError(
            "Unknown classifier {!r}; expected one of {}".format(
                classifier, ', '.join(supported)))

    if classifier == 'gnb':
        # GNB can be pointed at an arbitrary sample attribute via ``space``
        targets = 'bilat_ROIs' if bilateral else 'all_ROIs'
        clf = mv.GNB(common_variance=True, prior='ratio', space=targets)
    else:
        # The SGD classifier insists on finding its labels in a sample
        # attribute called 'targets', so copy the chosen ROI labels there.
        ds.sa['targets'] = ds.sa.bilat_ROIs if bilateral else ds.sa.all_ROIs
        from sklearn.linear_model import SGDClassifier
        # Bring scikit-learn's SGD into PyMVPA via the SKLLearnerAdapter and
        # have the MulticlassClassifier perform one-vs-one decisions
        # (instead of one-vs-all).
        clf = mv.MulticlassClassifier(
            mv.SKLLearnerAdapter(
                SGDClassifier(loss='hinge',
                              penalty='l2',
                              class_weight='balanced')))

    cv = mv.CrossValidation(clf,
                            mv.NFoldPartitioner(attr='participant'),
                            errorfx=mv.mean_match_accuracy,
                            enable_ca=['stats'])
    # the per-fold accuracies returned by the call are not needed here;
    # callers read everything from the returned cv object
    cv(ds)
    return cv
def dotheclassification(ds_movie, ds_loc, classifier, bilateral):
    """Cross-validate a classifier on two datasets and compare the outcomes.

    For the movie and the localizer dataset in turn, a cross-validation
    partitioned on the ``participant`` sample attribute is run. In every
    fold the predictions for all samples are collected through a callback,
    saved to ``results_dir`` as HDF5 and gathered in a pandas dataframe.
    Finally, the predictions on the testing partitions of both experiments
    are compared per voxel and participant.

    Parameters
    ----------
    ds_movie, ds_loc
        Datasets of the movie and the localizer experiment. Both need the
        sample attributes ``participant``, ``voxel_indices`` and
        ``bilat_ROIs`` (if ``bilateral``) or ``all_ROIs`` (otherwise). The
        chosen ROI attribute is copied into ``ds.sa['targets']``, i.e. both
        datasets are modified in place.
    classifier : {'gnb', 'sgd', 'l-sgd'}
        ``'gnb'``: Gaussian Naive Bayes with common variance (linear GNB).
        ``'sgd'``: stochastic gradient descent (hinge loss, l2 penalty).
        ``'l-sgd'``: the same SGD wrapped in a ``MulticlassClassifier`` to
        get one-vs-one decisions.
    bilateral : bool
        Whether the ROIs of the datasets are combined across hemispheres.

    Returns
    -------
    compare_exp : pandas.DataFrame
        One row per (voxel_indices, participants) combination of the
        testing data with the columns ``hits`` (number of experiments that
        classified correctly, 0-2), ``targets``, ``pred_movie`` and
        ``pred_loc``.
    all_testing : pandas.DataFrame
        All rows of both experiments that belong to a testing partition.
    ROIS : numpy.ndarray
        The unique target labels of ``ds_movie``.

    Raises
    ------
    ValueError
        If ``classifier`` is not one of the supported names.
    """
    supported = ('gnb', 'sgd', 'l-sgd')
    # fail early: without this check an unknown name would leave ``clf``
    # unbound and crash inside the loop with an UnboundLocalError
    if classifier not in supported:
        raise ValueError(
            "Unknown classifier {!r}; expected one of {}".format(
                classifier, ', '.join(supported)))

    dfs = []
    for ds_name, ds in zip(('movie', 'loc'), (ds_movie, ds_loc)):
        ds.sa['targets'] = ds.sa.bilat_ROIs if bilateral else ds.sa.all_ROIs

        # a fresh classifier is set up for every dataset
        if classifier == 'gnb':
            clf = mv.GNB(common_variance=True, prior='ratio')
        else:
            from sklearn.linear_model import SGDClassifier
            # bring scikit-learn's SGD into PyMVPA via the SKLLearnerAdapter
            clf = mv.SKLLearnerAdapter(
                SGDClassifier(loss='hinge',
                              penalty='l2',
                              class_weight='balanced'))
            if classifier == 'l-sgd':
                # perform one-vs-one decisions (instead of one-vs-all)
                clf = mv.MulticlassClassifier(clf)

        # callback that stores the predictions of each fold during the
        # cross-validation
        classifications = []

        def store_class(data, node, result):
            class_ds = mv.Dataset(samples=data.sa.voxel_indices)
            class_ds.sa['targets'] = data.sa.targets
            class_ds.sa['partitions'] = data.sa.partitions
            class_ds.sa['predictions'] = clf.predict(data)
            class_ds.sa['participant'] = data.sa.participant
            classifications.append(class_ds)

        cv = mv.CrossValidation(clf,
                                mv.NFoldPartitioner(attr='participant'),
                                errorfx=mv.mean_match_accuracy,
                                enable_ca=['stats'],
                                callback=store_class)
        # only the datasets gathered by the callback are used further
        cv(ds)

        # save the per-fold classification results
        mv.h5save(results_dir
                  + 'cv_classification_results_{}.hdf5'.format(ds_name),
                  classifications)
        print('Saved the classification results obtained during '
              'crossvalidation.')

        # one dataframe per fold
        for classification in classifications:
            predictions = list(classification.sa.predictions)
            dfs.append(pd.DataFrame(data={
                'voxel_indices': list(classification.samples),
                'targets': list(classification.sa.targets),
                'predictions': predictions,
                'partitions': list(classification.sa.partitions),
                'participants': list(classification.sa.participant),
                'ds_type': [ds_name] * len(predictions),
            }))

    # get all folds of both experiments into one dataframe, disregard index
    all_classifications = pd.concat(dfs, ignore_index=True)
    # a hit (1) is a correspondence between target and prediction
    all_classifications['hits'] = (
        all_classifications['predictions'] == all_classifications['targets']
    ).astype(int)
    # string labels for training (1) and testing (2) partitions
    all_classifications['parts'] = all_classifications['partitions'].map(
        {1: 'train', 2: 'test'})
    # transform voxel coordinates from arrays (unhashable) into tuples so
    # that they can serve as grouping and sorting keys
    all_classifications['voxel_indices'] = \
        all_classifications['voxel_indices'].apply(tuple)

    # subset the data to contain only the testing partitions
    all_testing = all_classifications[all_classifications.parts == "test"]
    # check that every participant is in the data
    # NOTE: the number of participants (15) is hard-coded
    assert len(all_testing.participants.unique()) == 15

    # Count hits per voxel and participant across experiments (2 if both
    # experiments classified correctly, 1 if one did, 0 if none did) and
    # keep the target. 'min' is used because aggregate needs some function;
    # the targets are the same between the experiments.
    # To check for correspondence with the experiments' confusion matrices:
    # len(all_testing[(all_testing['predictions'] == 'PPA')
    #                 & (all_testing['targets'] == 'VIS')])
    sort_keys = ['voxel_indices', 'participants']
    compare_exp = all_testing.groupby(sort_keys).agg(
        {'hits': 'sum', 'targets': 'min'}
    ).reset_index().sort_values(sort_keys)
    all_testing_movie = all_testing[
        all_testing.ds_type == 'movie'].sort_values(sort_keys).reset_index()
    all_testing_loc = all_testing[
        all_testing.ds_type == 'loc'].sort_values(sort_keys).reset_index()
    # append movie and localizer predictions; the assignment aligns on the
    # index, which is why all three frames were sorted identically and
    # re-indexed above
    compare_exp['pred_movie'] = all_testing_movie.predictions
    compare_exp['pred_loc'] = all_testing_loc.predictions

    # get the ROIs from the classification
    ROIS = np.unique(ds_movie.sa.targets)
    # with two experiments, hit counts outside of [0, 2] are impossible
    assert compare_exp.hits.max() <= 2
    assert compare_exp.hits.min() >= 0
    return compare_exp, all_testing, ROIS
def dotheclassification(ds, bilateral, store_sens=True):
    """Cross-validate a one-vs-one SGD classifier, optionally keeping sensitivities.

    A stochastic gradient descent classifier (hinge loss, l2 penalty) is
    wrapped in a ``MulticlassClassifier`` and cross-validated with
    partitions built from the ``participant`` sample attribute. The
    classification statistics are appended to ``avmovie_clf.txt`` in
    ``results_dir``.

    Parameters
    ----------
    ds
        Dataset to classify. It needs the sample attributes ``participant``
        and ``bilat_ROIs`` (if ``bilateral``) or ``all_ROIs`` (otherwise),
        and, if ``store_sens`` is set, the feature attributes
        ``time_coords`` and ``chunks``. The chosen ROI attribute is copied
        into ``ds.sa['targets']``, i.e. ``ds`` is modified in place.
    bilateral : bool
        Whether the ROIs of the dataset are combined across hemispheres.
    store_sens : bool
        If true, the sensitivities of every fold are collected and saved to
        ``sensitivities_nfold.hdf5`` in ``results_dir`` for further use
        (e.g. in a GLM). If false, only the classification is performed.

    Returns
    -------
    sensitivities : list
        One sensitivity dataset per fold (empty if ``store_sens`` is false).
    cv
        The ``CrossValidation`` object after it has been run on ``ds``.
    """
    # The MulticlassClassifier insists on finding its labels in a sample
    # attribute called 'targets', so copy the chosen ROI labels there.
    ds.sa['targets'] = ds.sa.bilat_ROIs if bilateral else ds.sa.all_ROIs

    from sklearn.linear_model import SGDClassifier
    # Bring scikit-learn's SGD into PyMVPA via the SKLLearnerAdapter and
    # have the MulticlassClassifier perform one-vs-one decisions (instead
    # of one-vs-all).
    clf = mv.MulticlassClassifier(
        mv.SKLLearnerAdapter(
            SGDClassifier(loss='hinge',
                          penalty='l2',
                          class_weight='balanced')))

    sensitivities = []

    # The callback deliberately has its own name: calling it `store_sens`
    # would shadow the boolean parameter that is checked again below.
    def _collect_sensitivities(data, node, result):
        sens = node.measure.get_sensitivity_analyzer(
            force_train=False)(data)
        # the time attributes need to be appended manually to the sens ds
        sens.fa['time_coords'] = data.fa['time_coords']
        sens.fa['chunks'] = data.fa['chunks']
        sensitivities.append(sens)

    cv_kwargs = {'errorfx': mv.mean_match_accuracy, 'enable_ca': ['stats']}
    if store_sens:
        # extract the sensitivities within the cross-validation
        cv_kwargs['callback'] = _collect_sensitivities
    cv = mv.CrossValidation(clf,
                            mv.NFoldPartitioner(attr='participant'),
                            **cv_kwargs)
    cv(ds)

    # save classification results
    with open(results_dir + 'avmovie_clf.txt', 'a') as f:
        f.write(cv.ca.stats.as_string(description=True))

    if bilateral:
        desired_order = ['VIS', 'LOC', 'OFA', 'FFA', 'EBA', 'PPA']
    else:
        desired_order = [
            'brain', 'VIS', 'left LOC', 'right LOC', 'left OFA', 'right OFA',
            'left FFA', 'right FFA', 'left EBA', 'right EBA', 'left PPA',
            'right PPA'
        ]
    # NOTE: plotting the confusion matrix with PyMVPA's built-in plot
    # function currently fails here, so no figure is produced and the
    # labels are not used any further in this function.
    labels = get_known_labels(desired_order, cv.ca.stats.labels)
    print('Saved the crossvalidation results.')

    if store_sens:
        mv.h5save(results_dir + 'sensitivities_nfold.hdf5', sensitivities)
        print('Saved the sensitivities.')
    # cv.ca.stats holds the classification statistics; sensitivities
    # contains one dataset per fold with the sensitivities as samples
    return sensitivities, cv
def dotheclassification(ds, bilateral):
    """Classify ROIs one-vs-all with a stochastic gradient descent.

    The classifier is cross-validated with partitions built from the
    ``participant`` sample attribute. The classification statistics are
    appended to ``SGD_clf.txt``, the confusion matrix is plotted, and the
    cross-validation results are saved as HDF5, all into ``results_dir``.
    The chosen ROI attribute is copied into ``ds.sa['targets']``, i.e.
    ``ds`` is modified in place. Returns the ``CrossValidation`` object.

    Future TODO: Selection of alpha may be better performed via
    GridSearchCV. To quote sklearn's documentation: 'Finding a reasonable
    regularization term is best done using GridSearchCV, usually in the
    range 10.0**-np.arange(1,7).'
    """
    # The SGD classifier insists on finding its labels in a sample
    # attribute called 'targets', so copy the chosen ROI labels there.
    ds.sa['targets'] = ds.sa.bilat_ROIs if bilateral else ds.sa.all_ROIs

    sgd = SGDClassifier(loss='hinge', penalty='l2', class_weight='balanced')
    learner = mv.SKLLearnerAdapter(sgd)
    partitioner = mv.NFoldPartitioner(attr='participant')
    cv = mv.CrossValidation(learner,
                            partitioner,
                            errorfx=mv.mean_match_accuracy,
                            enable_ca=['stats'])
    results = cv(ds)

    # append the classification statistics to the results file
    stats_text = cv.ca.stats.as_string(description=True)
    with open(results_dir + 'SGD_clf.txt', 'a') as stats_file:
        stats_file.write(stats_text)

    # order in which the ROIs should show up in the confusion matrix
    rois = ('LOC', 'OFA', 'FFA', 'EBA', 'PPA')
    desired_order = ['brain', 'VIS']
    if bilateral:
        desired_order.extend(rois)
    else:
        desired_order.extend('{} {}'.format(side, roi)
                             for roi in rois
                             for side in ('left', 'right'))
    labels = get_known_labels(desired_order, cv.ca.stats.labels)

    # confusion matrix with PyMVPA's built-in function
    cv.ca.stats.plot(labels=labels, numbers=True, cmap='gist_heat_r')
    plt.savefig(results_dir + 'confusion_matrix.png')

    # confusion matrix with matplotlib
    if niceplot:
        ACC = cv.ca.stats.stats['mean(ACC)']
        plot_confusion(cv,
                       labels,
                       fn=results_dir + 'confusion_matrix_SGD.svg',
                       figsize=(9, 9),
                       vmax=100,
                       cmap='Blues',
                       ACC='%.2f' % ACC)

    mv.h5save(results_dir + 'SGD_cv_classification_results.hdf5', results)
    print('Saved the crossvalidation results.')
    return cv