def dotheclassification(ds, classifier, bilateral):
    """Run a leave-one-participant-out cross-validation on ``ds``.

    Parameters
    ----------
    ds : PyMVPA dataset
        Dataset to classify. It is partitioned on the sample attribute
        ``participant`` and must carry the ROI labels in the sample
        attribute ``bilat_ROIs`` (if ``bilateral``) or ``all_ROIs``.
        For the SGD classifiers the chosen labels are copied into
        ``ds.sa['targets']``, i.e. ``ds`` is modified in place.
    classifier : str
        One of ``'gnb'`` (GNB with common variance), ``'sgd'`` (scikit-learn
        SGDClassifier used directly) or ``'l-sgd'`` (the same SGDClassifier
        wrapped in PyMVPA's ``MulticlassClassifier``).
    bilateral : bool
        Whether the ROIs are combined across hemispheres.

    Returns
    -------
    The ``CrossValidation`` measure after it has been run on ``ds``; its
    ``stats`` conditional attribute is enabled.

    Raises
    ------
    ValueError
        If ``classifier`` is not one of the supported names.
    """
    supported = ('gnb', 'sgd', 'l-sgd')
    # Fail early and explicitly instead of running into an unbound ``clf``.
    if classifier not in supported:
        raise ValueError(
            'Unknown classifier %r, expected one of %s'
            % (classifier, ', '.join(supported))
        )

    roi_attr = 'bilat_ROIs' if bilateral else 'all_ROIs'

    if classifier == 'gnb':
        clf = mv.GNB(common_variance=True, prior='ratio', space=roi_attr)
    else:
        # The SGD classifier insists on finding its labels in a sample
        # attribute called 'targets', so cater to that demand.
        ds.sa['targets'] = getattr(ds.sa, roi_attr)
        # Imported here so that scikit-learn is only required for SGD.
        from sklearn.linear_model import SGDClassifier
        sgd = mv.SKLLearnerAdapter(
            SGDClassifier(loss='hinge',
                          penalty='l2',
                          class_weight='balanced')
        )
        # 'l-sgd' wraps the learner in a MulticlassClassifier to get
        # 1-vs-1 decisions (instead of one-vs-all).
        clf = mv.MulticlassClassifier(sgd) if classifier == 'l-sgd' else sgd

    cv = mv.CrossValidation(clf,
                            mv.NFoldPartitioner(attr='participant'),
                            errorfx=mv.mean_match_accuracy,
                            enable_ca=['stats'])
    cv(ds)
    return cv
def dotheclassification(ds_movie, ds_loc, classifier, bilateral):
    """Classify the movie and localizer datasets and compare them per voxel.

    For each of the two datasets a leave-one-participant-out
    cross-validation is run. A callback stores, per fold, the targets,
    predictions, partition labels and participants of every sample; these
    are saved to ``results_dir`` as hdf5 and gathered in a dataframe.

    Parameters
    ----------
    ds_movie, ds_loc : PyMVPA datasets
        Datasets of the movie and the localizer experiment. Both need the
        sample attributes ``participant``, ``voxel_indices`` and
        ``bilat_ROIs`` (if ``bilateral``) or ``all_ROIs``. The chosen ROI
        labels are copied into ``sa['targets']``, i.e. both datasets are
        modified in place.
    classifier : str
        One of ``'gnb'``, ``'sgd'`` or ``'l-sgd'``.
    bilateral : bool
        Whether the ROIs are combined across hemispheres.

    Returns
    -------
    compare_exp : pandas.DataFrame
        One row per (voxel, participant) of the testing data with the
        summed hits across both experiments (0, 1 or 2), the target and
        the predictions of either experiment.
    all_testing : pandas.DataFrame
        All testing-partition rows of both experiments.
    ROIS : array
        Unique targets of ``ds_movie``.

    Raises
    ------
    ValueError
        If ``classifier`` is not one of the supported names.
    """
    supported = ('gnb', 'sgd', 'l-sgd')
    # Validate before either dataset is touched; otherwise an unknown name
    # only surfaces later as an unbound ``clf``.
    if classifier not in supported:
        raise ValueError(
            'Unknown classifier %r, expected one of %s'
            % (classifier, ', '.join(supported))
        )

    ds_type = ['movie', 'loc']
    dfs = []
    for idx, ds in enumerate([ds_movie, ds_loc]):
        if bilateral:
            ds.sa['targets'] = ds.sa.bilat_ROIs
        else:
            ds.sa['targets'] = ds.sa.all_ROIs

        # set up a fresh classifier for every dataset
        if classifier == 'gnb':
            clf = mv.GNB(common_variance=True, prior='ratio')
        else:
            # Imported here so that scikit-learn is only required for SGD.
            from sklearn.linear_model import SGDClassifier
            sgd = mv.SKLLearnerAdapter(
                SGDClassifier(loss='hinge',
                              penalty='l2',
                              class_weight='balanced')
            )
            # 'l-sgd' wraps the learner in a MulticlassClassifier to get
            # 1-vs-1 decisions (instead of one-vs-all).
            if classifier == 'l-sgd':
                clf = mv.MulticlassClassifier(sgd)
            else:
                clf = sgd

        # callback that stores the classification of every fold; it is
        # only invoked while ``cv(ds)`` below runs, so ``clf`` and
        # ``classifications`` refer to the current iteration's objects
        classifications = []

        def store_class(data, node, result):
            class_ds = mv.Dataset(samples=data.sa.voxel_indices)
            class_ds.sa['targets'] = data.sa.targets
            class_ds.sa['partitions'] = data.sa.partitions
            class_ds.sa['predictions'] = clf.predict(data)
            class_ds.sa['participant'] = data.sa.participant
            classifications.append(class_ds)

        # do a crossvalidation classification and store the results
        cv = mv.CrossValidation(clf,
                                mv.NFoldPartitioner(attr='participant'),
                                errorfx=mv.mean_match_accuracy,
                                enable_ca=['stats'],
                                callback=store_class)
        cv(ds)

        # save classification results as a list of Datasets
        mv.h5save(results_dir + 'cv_classification_results_{}.hdf5'.format(ds_type[idx]),
                  classifications)
        print('Saved the classification results obtained during crossvalidation.')

        # get the classification list into pandas dataframes
        for classification in classifications:
            df = pd.DataFrame(data={
                'voxel_indices': list(classification.samples),
                'targets': list(classification.sa.targets),
                'predictions': list(classification.sa.predictions),
                'partitions': list(classification.sa.partitions),
                'participants': list(classification.sa.participant),
                'ds_type': [ds_type[idx]] * len(classification.sa.predictions),
            })
            dfs.append(df)

    # get all folds into one dataframe, disregard the index
    all_classifications = pd.concat(dfs, ignore_index=True)
    # compute hits as correspondence between target and prediction
    all_classifications['hits'] = (
        all_classifications['predictions'] == all_classifications['targets']
    ).astype(int)
    # assign string labels to training and testing partitions (instead of
    # 1, 2); any other partition value ends up without a label
    all_classifications['parts'] = all_classifications['partitions'].map(
        {1: 'train', 2: 'test'})
    # transform voxel coordinates from arrays (unhashable) into tuples
    all_classifications['voxel_indices'] = \
        all_classifications['voxel_indices'].apply(tuple)

    # subset the data to contain only the testing data
    all_testing = all_classifications[all_classifications.parts == "test"]
    # check that every participant is in the data
    assert len(all_testing.participants.unique()) == 15
    # to check for correspondence between the sum of the two experiments'
    # confusion matrices, do something like this:
    # len(all_testing[(all_testing['predictions'] == 'PPA')
    #                 & (all_testing['targets'] == 'VIS')])

    # This counts hits per fold across experiments (2 if both experiments
    # classified correctly, 1 if one experiment classified correctly, 0 if
    # none did). Also, append the targets per voxel. 'min' is used because
    # aggregate needs some function, but the targets are the same between
    # the experiments.
    compare_exp = (
        all_testing
        .groupby(['voxel_indices', 'participants'])
        .agg({'hits': 'sum', 'targets': 'min'})
        .reset_index()
        .sort_values(['voxel_indices', 'participants'])
    )
    all_testing_movie = all_testing[all_testing.ds_type == 'movie'].sort_values(
        ['voxel_indices', 'participants']).reset_index()
    all_testing_loc = all_testing[all_testing.ds_type == 'loc'].sort_values(
        ['voxel_indices', 'participants']).reset_index()
    # append movie and loc predictions to the dataframe; the assignment
    # aligns on the (reset) positional index of the identically sorted frames
    compare_exp['pred_movie'] = all_testing_movie.predictions
    compare_exp['pred_loc'] = all_testing_loc.predictions

    # get the ROIs from the classification
    ROIS = np.unique(ds_movie.sa.targets)

    # there can't be values greater than two or lower than zero
    assert compare_exp.hits.max() <= 2
    assert compare_exp.hits.min() >= 0
    return compare_exp, all_testing, ROIS
def dotheclassification(ds, bilateral, store_sens=True):
    """Classify ``ds`` with a GNB and optionally collect sensitivities.

    Builds a GNB classifier with common variance, performs a
    leave-one-participant-out cross-validation and, if requested, stores
    the sensitivities of the classifier of each fold for further use
    (e.g. in a GLM).

    Parameters
    ----------
    ds : PyMVPA dataset
        Dataset to classify. It is partitioned on the sample attribute
        ``participant``; the labels are read from ``bilat_ROIs`` (if
        ``bilateral``) or ``all_ROIs``. When sensitivities are stored, the
        feature attributes ``time_coords`` and ``chunks`` are required.
    bilateral : bool
        Whether the ROIs are combined across hemispheres.
    store_sens : bool, optional
        If false, only the classification is performed and no
        sensitivities are computed or saved.

    Returns
    -------
    sensitivities : list
        One sensitivity dataset per fold (empty if ``store_sens`` is false).
    cv : the ``CrossValidation`` measure after it has been run.

    Notes
    -----
    Relies on the module-level names ``results_dir`` and ``niceplot`` and
    on the helpers ``get_known_labels`` and ``plot_confusion``, none of
    which are defined in this function.
    """
    # set up classifier
    targets = 'bilat_ROIs' if bilateral else 'all_ROIs'
    gnb = mv.GNB(common_variance=True, prior='ratio', space=targets)

    # prepare for callback of sensitivity extraction within CrossValidation
    sensitivities = []

    # Deliberately not called ``store_sens``: reusing that name would
    # rebind the boolean parameter to a function object.
    def _collect_sens(data, node, result):
        sens = node.measure.get_sensitivity_analyzer(force_train=False)(data)
        # we also need to manually append the time attributes to the sens ds
        sens.fa['time_coords'] = data.fa['time_coords']
        sens.fa['chunks'] = data.fa['chunks']
        sensitivities.append(sens)

    # do a crossvalidation classification; the callback is only registered
    # when sensitivities are requested
    cv_kwargs = {'errorfx': mv.mean_match_accuracy, 'enable_ca': ['stats']}
    if store_sens:
        cv_kwargs['callback'] = _collect_sens
    cv = mv.CrossValidation(gnb,
                            mv.NFoldPartitioner(attr='participant'),
                            **cv_kwargs)
    results = cv(ds)

    # save classification results (appends to an existing file)
    with open(results_dir + 'avmovie_clf.txt', 'a') as f:
        f.write(cv.ca.stats.as_string(description=True))

    # label order for the confusion matrix
    if bilateral:
        desired_order = ['VIS', 'LOC', 'OFA', 'FFA', 'EBA', 'PPA']
    else:
        desired_order = [
            'brain', 'VIS', 'left LOC', 'right LOC', 'left OFA', 'right OFA',
            'left FFA', 'right FFA', 'left EBA', 'right EBA', 'left PPA',
            'right PPA'
        ]
    labels = get_known_labels(desired_order, cv.ca.stats.labels)

    # Plotting the confusion matrix with PyMVPA's built-in plot function
    # (cv.ca.stats.plot) currently fails, hence the custom plot below.
    # NOTE(review): ``niceplot`` is not a parameter; presumably a
    # module-level flag set by the calling script - confirm.
    if niceplot:
        ACC = cv.ca.stats.stats['mean(ACC)']
        plot_confusion(cv,
                       labels,
                       fn=results_dir + 'confusion_matrix_avmovie.svg',
                       figsize=(9, 9),
                       vmax=100,
                       cmap='Blues',
                       ACC='%.2f' % ACC)

    mv.h5save(results_dir + 'gnb_cv_classification_results.hdf5', results)
    print('Saved the crossvalidation results.')
    if store_sens:
        mv.h5save(results_dir + 'sensitivities_nfold.hdf5', sensitivities)
        print('Saved the sensitivities.')
    # results now has the overall accuracy. results.samples gives the
    # accuracy per participant.
    # sensitivities contains a dataset for each participant with the
    # sensitivities as samples and class-pairings as attributes
    return sensitivities, cv
# Reduce the targets to their two-letter class label and keep only the
# conditions of interest (Japanese vs. English language switches).
dataset.targets = [t[0:2] for t in dataset.targets]
dataset = dataset[N.array([label in ['jj', 'je', 'ej', 'ee']
                           for label in dataset.sa.targets], dtype='bool')]
# Single-argument print(...) with %-formatting emits the same text under
# Python 2 and Python 3.
print('... and only %s cases of interest (Language Switch between Japanese vs English)'
      % (dataset.shape[0],))
dataset = M.datasets.miscfx.remove_invariant_features(dataset)

print('saving as compressed file %s' % (trimmedCache,))
pickleFile = gzip.open(trimmedCache, 'wb', 5)
try:
    pickle.dump(dataset, pickleFile)
finally:
    # Close explicitly so the compressed stream is flushed to disk even if
    # pickling fails.
    pickleFile.close()

# NOTE: despite the variable names, the classifier is a GNB (not SMLR),
# trained on the 500 features with the highest one-way ANOVA scores.
anovaSelectedSMLR = M.FeatureSelectionClassifier(
    M.GNB(common_variance=True),
    M.SensitivityBasedFeatureSelection(
        M.OneWayAnova(),
        M.FixedNElementTailSelector(500, mode='select', tail='upper')
    ),
)
foldwiseCvedAnovaSelectedSMLR = M.CrossValidation(
    anovaSelectedSMLR,
    M.NFoldPartitioner(),
    enable_ca=['samples_error', 'stats', 'calling_time', 'confusion']
)

# run classifier
print('learning on detrended, normalised, averaged, Language Switch ... %s'
      % (datetime.datetime.now(),))
results = foldwiseCvedAnovaSelectedSMLR(dataset)
print('... done %s' % (datetime.datetime.now(),))
# assumes ``results`` holds per-fold error rates (no errorfx is passed to
# CrossValidation above), so 100 - mean * 100 is the accuracy in percent
print('accuracy %s %% %s'
      % (N.round(100 - N.mean(results) * 100, 1), datetime.datetime.now()))