def dotheclassification(ds, classifier, bilateral):
    """Cross-validate the chosen classifier across participants.

    The classifier is wrapped in an ``mv.CrossValidation`` that partitions
    the data on the ``participant`` sample attribute and scores each fold
    with ``mv.mean_match_accuracy``.

    Parameters
    ----------
    ds : dataset
        Dataset to classify. It needs a ``participant`` sample attribute and
        the ROI labels in ``bilat_ROIs`` (when ``bilateral`` is true) or
        ``all_ROIs``. For the SGD variants ``ds.sa['targets']`` is
        overwritten in place with those labels.
    classifier : {'gnb', 'sgd', 'l-sgd'}
        ``'gnb'``: Gaussian naive Bayes with a common variance.
        ``'sgd'``: stochastic gradient descent classifier used directly.
        ``'l-sgd'``: the same SGD classifier wrapped in
        ``mv.MulticlassClassifier``.
    bilateral : bool
        Whether the ROI labels are combined across hemispheres.

    Returns
    -------
    cv
        The ``mv.CrossValidation`` instance after it has been run on ``ds``;
        the confusion statistics are available in ``cv.ca.stats``.

    Raises
    ------
    ValueError
        If ``classifier`` is not one of the supported names.
    """
    known = ('gnb', 'sgd', 'l-sgd')
    # Fail early and explicitly instead of hitting an unbound ``clf`` below.
    if classifier not in known:
        raise ValueError("unknown classifier %r; expected one of %s"
                         % (classifier, ', '.join(known)))

    roi_attr = 'bilat_ROIs' if bilateral else 'all_ROIs'

    if classifier == 'gnb':
        # GNB can be pointed at the label attribute directly via ``space``.
        clf = mv.GNB(common_variance=True, prior='ratio', space=roi_attr)
    else:
        # According to the original author, the SGD-based learners insist on
        # finding their labels in a sample attribute named 'targets', so the
        # ROI labels are copied there.
        ds.sa['targets'] = getattr(ds.sa, roi_attr)
        from sklearn.linear_model import SGDClassifier
        clf = mv.SKLLearnerAdapter(
            SGDClassifier(loss='hinge',
                          penalty='l2',
                          class_weight='balanced'))
        if classifier == 'l-sgd':
            # Original intent: pairwise (1-vs-1) decisions instead of
            # one-vs-all.
            clf = mv.MulticlassClassifier(clf)

    cv = mv.CrossValidation(clf,
                            mv.NFoldPartitioner(attr='participant'),
                            errorfx=mv.mean_match_accuracy,
                            enable_ca=['stats'])
    # Running the measure fills cv.ca.stats; the per-fold result itself is
    # not needed by the caller.
    cv(ds)
    return cv
def __call__(self, valsTrain, labelsTrain, valsTest, doAncestralCV=True):
    """Train on the ancestral population and classify the admixed population.

    Optionally also cross-validates the classifier on the ancestral samples.

    Arguments:
    - `valsTrain`: numpy array (nSamples x nFeatures) of training samples
    - `labelsTrain`: list of nSamples labels
    - `valsTest`: numpy array (nSamples2 x nFeatures) of test samples
    - `doAncestralCV`: if true, additionally return a cross-validation
      result for the ancestral samples

    Returns:
    - `admixedClass` alone when `doAncestralCV` is false;
    - otherwise a tuple `(cv_result, admixedClass)`, where `cv_result` is the
      confusion matrix `cvte.ca.stats.matrix` in the normal case.

    NOTE: when training raises `pymvpa.DegenerateInputError`, every test
    sample gets class 0 and `cv_result` is the scalar
    1 / (number of unique training labels) rather than a matrix, so callers
    have to cope with both shapes.
    """
    # Build the training dataset.
    ds = pymvpa.Dataset(valsTrain)
    ds.sa['targets'] = labelsTrain
    # Deal the samples round-robin (0, 1, 2, 0, 1, 2, ...) into three groups
    # that serve as cross-validation folds; kept as floats as before.
    ds.sa['runtype'] = (np.arange(valsTrain.shape[0]) % 3).astype(float)

    try:
        # Train on ancestral, predict admixed.
        self.classifier.train(ds)
        admixedClass = self.classifier.predict(valsTest)
    except pymvpa.DegenerateInputError:
        # valsTrain is too small to contain information.
        # Parenthesised single-argument print behaves the same on
        # Python 2 and Python 3.
        print("WARNING: Window is degenerate; guessing ancestry")
        # Just assign ancestry to the first population.
        admixedClass = np.zeros(valsTest.shape[0])
        if doAncestralCV:
            # Report chance-level success for equally likely classes.
            return 1. / len(np.unique(labelsTrain)), admixedClass
        return admixedClass

    if not doAncestralCV:
        return admixedClass

    # Cross-validate on the ancestral population using the three groups.
    hspl = pymvpa.NGroupPartitioner(3, attr='runtype')
    cvte = pymvpa.CrossValidation(self.classifier, hspl, enable_ca='stats')
    cvte(ds)
    return cvte.ca.stats.matrix, admixedClass
dtype='bool')] print '... and only', dataset.shape[ 0], 'cases of interest (Keep vs Switch Language)' dataset = M.datasets.miscfx.remove_invariant_features(dataset) print 'saving as compressed file', trimmedCache pickleFile = gzip.open(trimmedCache, 'wb', 5) pickle.dump(dataset, pickleFile) anovaSelectedSMLR = M.FeatureSelectionClassifier( M.PLR(), M.SensitivityBasedFeatureSelection( M.OneWayAnova(), M.FixedNElementTailSelector(500, mode='select', tail='upper')), ) foldwiseCvedAnovaSelectedSMLR = M.CrossValidation( anovaSelectedSMLR, M.NFoldPartitioner(), enable_ca=['samples_error', 'stats', 'calling_time', 'confusion']) # run classifier print 'learning on detrended, normalised, averaged, Keep vs Switch ...', datetime.datetime.now( ) results = foldwiseCvedAnovaSelectedSMLR(dataset) print '... done', datetime.datetime.now() print 'accuracy', N.round(100 - N.mean(results) * 100, 1), '%', datetime.datetime.now() #New lines for out putting the result into a csv file. precision = N.round(100 - N.mean(results) * 100, 1) st = str(boldDelay) + ',' + str(stimulusWidth) + ',' + str(precision) + '\n' f = open("withinPredictionResult.csv", "a") f.write(st) f.close
#del ds_q2.sa['intents'] del ds_q2.sa['stats'] mv.zscore(ds_q2, chunks_attr='chunks') n_medial = {'lh': 3486, 'rh': 3491} medial_wall = np.where(np.sum(ds_q2.samples == 0, axis=0) == 200)[0].tolist() cortical_vertices = np.where( np.sum(ds_q2.samples == 0, axis=0) < 200)[0].tolist() assert len(medial_wall) == n_medial[hemisphere] n_vertices = ds_q2.fa.node_indices.shape[0] assert len(medial_wall) + len(cortical_vertices) == n_vertices # 2. cross validation __________________________________________________________________ # setting up classifier clf = mv.LinearCSVMC(space='targets') cv = mv.CrossValidation(clf, mv.NFoldPartitioner(attr='chunks')) cv_within = cv(ds_q2) cv_within np.mean(cv_within) # why is the mean lower? # 3. searchlight _______________________________________________________________________ fsaverage_gii = os.path.join(main_dir, 'fs_templates', hemisphere + '.pial.gii') surf = mv.surf.read(fsaverage_gii) # note: surf.vertices.shape (81920, 3) and surf.faces.shape (40962, 3) surface = surf, qe = mv.SurfaceQueryEngine(surf, radius=radii, distance_metric='dijkstra') sl = mv.Searchlight(cv, queryengine=qe, roi_ids=cortical_vertices) sl_q2 = sl(ds_q2) # 4. save output _______________________________________________________________________
# Script fragment: build ds_q3, cross-validate 'beh' labels across 'tax'
# groups on the whole dataset, then run the same cross-validation as a
# surface searchlight. `sub_name` and `main_dir` are defined outside this
# fragment.
hemisphere = sys.argv[2]  # second command-line argument
task_list = ['beh', 'tax']
radii = 10.0  # passed as `radius` to the surface query engine below
# 1. create pymvpa dataset ____________________________________________________________
ds_q3 = generate_dataset.create_dataset(sub_name, main_dir, task_list, hemisphere)
# Use the 'tax' attribute as chunks and the 'beh' attribute as targets.
ds_q3.sa['chunks'] = ds_q3.sa['tax']
ds_q3.sa['targets'] = ds_q3.sa['beh']
del ds_q3.sa['intents']
# z-score with 'chunks' as the grouping attribute (result is not reassigned,
# so this presumably works in place -- confirm against PyMVPA docs).
mv.zscore(ds_q3, chunks_attr='chunks')
# 2. cross validation __________________________________________________________________
# setting up classifier
# No `space`/`attr` arguments are given, so the PyMVPA defaults apply
# (presumably 'targets' and 'chunks' -- confirm).
clf = mv.LinearCSVMC()
cv = mv.CrossValidation(clf, mv.NFoldPartitioner())
cv_within = cv(ds_q3)
# The next two bare expressions have no effect when run as a script
# (presumably left over from interactive use).
cv_within
np.mean(cv_within)  # why is the mean lower?
# 3. searchlight _______________________________________________________________________
fsaverage_gii = os.path.join(main_dir, 'fs_templates', hemisphere + '.pial.gii')
surf = mv.surf.read(fsaverage_gii)
# note: surf.vertices.shape (81920, 3) and surf.faces.shape (40962, 3)
# NOTE(review): those two shapes look swapped (a triangle mesh usually has
# about twice as many faces as vertices) -- confirm.
# NOTE(review): the trailing comma makes `surface` a 1-tuple; it is not used
# in the visible code.
surface = surf,
qe = mv.SurfaceQueryEngine(surf, radius=radii, distance_metric='dijkstra')
# No roi_ids given here; the searchlight is run with 4 processes.
sl = mv.Searchlight(cv, queryengine=qe, nproc=4)
sl_q3 = sl(ds_q3)
# 4. save output _______________________________________________________________________
def dotheclassification(ds, bilateral, store_sens=True):
    """Cross-validate a linear SGD classifier and optionally keep sensitivities.

    An ``SGDClassifier`` (hinge loss, l2 penalty, balanced class weights) is
    wrapped in ``mv.SKLLearnerAdapter`` and ``mv.MulticlassClassifier`` and
    cross-validated with folds defined by the ``participant`` sample
    attribute. The confusion statistics are appended to
    ``results_dir + 'avmovie_clf.txt'``.

    Parameters
    ----------
    ds : dataset
        Dataset to classify. ``ds.sa['targets']`` is overwritten in place
        with ``ds.sa.bilat_ROIs`` or ``ds.sa.all_ROIs``. When sensitivities
        are stored, ``ds`` must also provide the feature attributes
        ``time_coords`` and ``chunks``.
    bilateral : bool
        Whether ROI labels are combined across hemispheres.
    store_sens : bool, optional
        If true (default), the sensitivities of every fold are collected and
        saved to ``results_dir + 'sensitivities_nfold.hdf5'``. If false, only
        the classification is performed.

    Returns
    -------
    sensitivities : list
        One sensitivity dataset per fold (empty if ``store_sens`` is false).
    cv
        The ``mv.CrossValidation`` instance after the run; confusion
        statistics are in ``cv.ca.stats``.

    Notes
    -----
    Relies on the module-level name ``results_dir``. Confusion-matrix
    plotting is currently disabled here (the original author noted that
    PyMVPA's built-in plot function failed), so no labels are prepared.
    """
    # According to the original author, MulticlassClassifier insists on
    # finding the labels in a sample attribute named 'targets'.
    ds.sa['targets'] = ds.sa.bilat_ROIs if bilateral else ds.sa.all_ROIs

    from sklearn.linear_model import SGDClassifier
    # Original intent: bring SGD into PyMVPA via SKLLearnerAdapter and let
    # MulticlassClassifier make pairwise (1-vs-1) decisions.
    clf = mv.MulticlassClassifier(
        mv.SKLLearnerAdapter(
            SGDClassifier(loss='hinge',
                          penalty='l2',
                          class_weight='balanced')))

    sensitivities = []

    # The callback gets its own name so it no longer rebinds the
    # ``store_sens`` flag, which is tested again further down.
    def _collect_sensitivities(data, node, result):
        sens = node.measure.get_sensitivity_analyzer(force_train=False)(data)
        # The time attributes have to be copied over to the sensitivity
        # dataset by hand.
        sens.fa['time_coords'] = data.fa['time_coords']
        sens.fa['chunks'] = data.fa['chunks']
        sensitivities.append(sens)

    cv_kwargs = {'errorfx': mv.mean_match_accuracy, 'enable_ca': ['stats']}
    if store_sens:
        cv_kwargs['callback'] = _collect_sensitivities
    cv = mv.CrossValidation(clf,
                            mv.NFoldPartitioner(attr='participant'),
                            **cv_kwargs)
    # Running the measure fills cv.ca.stats (and the sensitivities list).
    cv(ds)

    # Save classification results.
    with open(results_dir + 'avmovie_clf.txt', 'a') as f:
        f.write(cv.ca.stats.as_string(description=True))
    print('Saved the crossvalidation results.')

    if store_sens:
        mv.h5save(results_dir + 'sensitivities_nfold.hdf5', sensitivities)
        print('Saved the sensitivities.')

    # sensitivities holds one dataset per fold, with the sensitivities as
    # samples and the class pairings as attributes.
    return sensitivities, cv
d.sa['conditions'] = conditions d.sa['taxonomy'] = taxonomy d.sa['behavior'] = behavior if ds is None: ds = d else: ds = mv.vstack((ds, d)) ds.fa['node_indices'] = range(ds.shape[1]) # zscore all of our samples mv.zscore(ds, chunks_attr='chunks', dtype='float32') # load in surgace and get searchlight query radius = 10 surface = mv.surf.read(join(data_path, '{0}.pial.gii'.format(hemi))) # this is an arbitrary radius and distance metric! query = mv.SurfaceQueryEngine(surface, radius, distance_metric='dijkstra') # based off PyMVPA tutorial clf = mv.LinearNuSVMC(space=predict) cv = mv.CrossValidation(clf, mv.NFoldPartitioner(attr=train_on), errorfx=lambda p, t: np.mean(p == t), enable_ca=['stats']) searchlights = mv.Searchlight(cv, queryengine=query, postproc=mv.mean_sample(), roi_ids=None) sl_clf_results = searchlights(ds) outstr = save_path + 'results/sub' + sub + '_sl_clf_' + predict + '_' + hemi res = np.array(sl_clf_results) np.save(outstr, res)
def dotheclassification(ds, bilateral, store_sens=True):
    """Cross-validate a linear GNB classifier and optionally keep sensitivities.

    A Gaussian naive Bayes classifier with a common variance is
    cross-validated with folds defined by the ``participant`` sample
    attribute. The confusion statistics are appended to
    ``results_dir + 'avmovie_clf.txt'`` and the cross-validation result is
    saved to ``results_dir + 'gnb_cv_classification_results.hdf5'``.

    Parameters
    ----------
    ds : dataset
        Dataset to classify, with ROI labels in ``bilat_ROIs`` or
        ``all_ROIs``. When sensitivities are stored, ``ds`` must also provide
        the feature attributes ``time_coords`` and ``chunks``.
    bilateral : bool
        Whether ROI labels are combined across hemispheres.
    store_sens : bool, optional
        If true (default), the sensitivities of every fold are collected and
        saved to ``results_dir + 'sensitivities_nfold.hdf5'``. If false, only
        the classification is performed.

    Returns
    -------
    sensitivities : list
        One sensitivity dataset per fold (empty if ``store_sens`` is false).
    cv
        The ``mv.CrossValidation`` instance after the run; confusion
        statistics are in ``cv.ca.stats``.

    Notes
    -----
    Relies on the module-level names ``results_dir``, ``niceplot``,
    ``get_known_labels`` and ``plot_confusion``.
    """
    # GNB reads its labels from the attribute named in ``space``.
    targets = 'bilat_ROIs' if bilateral else 'all_ROIs'
    gnb = mv.GNB(common_variance=True, prior='ratio', space=targets)

    sensitivities = []

    # The callback gets its own name so it no longer rebinds the
    # ``store_sens`` flag, which is tested again further down.
    def _collect_sensitivities(data, node, result):
        sens = node.measure.get_sensitivity_analyzer(force_train=False)(data)
        # The time attributes have to be copied over to the sensitivity
        # dataset by hand.
        sens.fa['time_coords'] = data.fa['time_coords']
        sens.fa['chunks'] = data.fa['chunks']
        sensitivities.append(sens)

    cv_kwargs = {'errorfx': mv.mean_match_accuracy, 'enable_ca': ['stats']}
    if store_sens:
        cv_kwargs['callback'] = _collect_sensitivities
    cv = mv.CrossValidation(gnb,
                            mv.NFoldPartitioner(attr='participant'),
                            **cv_kwargs)
    results = cv(ds)

    # Save classification results.
    with open(results_dir + 'avmovie_clf.txt', 'a') as f:
        f.write(cv.ca.stats.as_string(description=True))

    # Label order for the confusion matrix.
    if bilateral:
        desired_order = ['VIS', 'LOC', 'OFA', 'FFA', 'EBA', 'PPA']
    else:
        desired_order = [
            'brain', 'VIS', 'left LOC', 'right LOC', 'left OFA', 'right OFA',
            'left FFA', 'right FFA', 'left EBA', 'right EBA', 'left PPA',
            'right PPA'
        ]
    labels = get_known_labels(desired_order, cv.ca.stats.labels)

    # PyMVPA's built-in confusion-matrix plot is not used (the original
    # author noted that it currently fails); only the custom plot remains.
    if niceplot:
        ACC = cv.ca.stats.stats['mean(ACC)']
        plot_confusion(cv,
                       labels,
                       fn=results_dir + 'confusion_matrix_avmovie.svg',
                       figsize=(9, 9),
                       vmax=100,
                       cmap='Blues',
                       ACC='%.2f' % ACC)

    mv.h5save(results_dir + 'gnb_cv_classification_results.hdf5', results)
    print('Saved the crossvalidation results.')

    if store_sens:
        mv.h5save(results_dir + 'sensitivities_nfold.hdf5', sensitivities)
        print('Saved the sensitivities.')

    # results has the overall accuracy; results.samples gives the accuracy
    # per participant. sensitivities holds one dataset per fold, with the
    # sensitivities as samples and the class pairings as attributes.
    return sensitivities, cv
np.sum(ds.samples == 0, axis=0) < n_conditions * 5)[0].tolist() assert len(medial_wall) == n_medial[hemi] assert len(medial_wall) + len(cortical_vertices) == n_vertices #np.save(join(mvpa_dir, 'cortical_vertices_{0}.npy'.format(hemi)), cortical_vertices) #cortical_vertices = = np.load(join(mvpa_dir, 'cortical_vertices_{0}.npy').tolist() # Z-score features across samples #mv.zscore(ds, chunks_attr='runs') ds.samples = ((ds.samples - np.mean(ds.samples, axis=1)[:, None]) / np.std(ds.samples, axis=1)[:, None]) clf = mv.LinearCSVMC(space=targets) cv = mv.CrossValidation(clf, mv.NFoldPartitioner(attr=chunks), errorfx=mv.mean_match_accuracy) sl = mv.Searchlight(cv, queryengine=qe, enable_ca=['roi_sizes'], nproc=1, roi_ids=cortical_vertices) #sl = mv.Searchlight(cv_rsa, queryengine=qe, enable_ca=['roi_sizes'], # nproc=1, results_backend='native', roi_ids=cortical_vertices) #tmp_prefix='/local/tmp/sam_sl_p{0}_{1}_'.format(participant_id, hemi) mv.debug.active += ['SLC'] sl_result = sl(ds) # Average across folds and finalize result on surface print("Average searchlight size = {0}".format(np.mean(sl.ca.roi_sizes)))
def _make_classifier(classifier):
    """Return a fresh, untrained classifier for 'gnb', 'sgd' or 'l-sgd'."""
    if classifier == 'gnb':
        return mv.GNB(common_variance=True, prior='ratio')
    from sklearn.linear_model import SGDClassifier
    sgd = mv.SKLLearnerAdapter(
        SGDClassifier(loss='hinge', penalty='l2', class_weight='balanced'))
    if classifier == 'l-sgd':
        # Original intent: pairwise (1-vs-1) decisions instead of one-vs-all.
        return mv.MulticlassClassifier(sgd)
    return sgd


def _classification_to_frame(classification, ds_type):
    """Turn one per-fold classification dataset into a pandas DataFrame."""
    predictions = list(classification.sa.predictions)
    return pd.DataFrame(data={
        'voxel_indices': list(classification.samples),
        'targets': list(classification.sa.targets),
        'predictions': predictions,
        'partitions': list(classification.sa.partitions),
        'participants': list(classification.sa.participant),
        'ds_type': [ds_type] * len(predictions),
    })


def dotheclassification(ds_movie, ds_loc, classifier, bilateral,
                        n_participants=15):
    """Classify two datasets and compare the per-voxel predictions.

    For each of the movie and the localizer dataset a cross-validation with
    folds defined by the ``participant`` sample attribute is run. A callback
    records targets, predictions, partition membership and participant for
    every fold. The per-fold records are saved to
    ``results_dir + 'cv_classification_results_<movie|loc>.hdf5'`` and then
    combined into pandas DataFrames.

    Parameters
    ----------
    ds_movie, ds_loc : dataset
        Datasets to classify. Each needs the sample attributes
        ``participant``, ``voxel_indices`` and ``bilat_ROIs``/``all_ROIs``;
        ``sa['targets']`` is overwritten in place with the ROI labels.
    classifier : {'gnb', 'sgd', 'l-sgd'}
        Classifier to use (linear GNB, SGD, or SGD wrapped in
        ``mv.MulticlassClassifier``).
    bilateral : bool
        Whether ROI labels are combined across hemispheres.
    n_participants : int, optional
        Number of distinct participants expected in the test data
        (default 15, the previously hard-coded value).

    Returns
    -------
    compare_exp : pandas.DataFrame
        One row per (voxel, participant) with the summed ``hits`` across
        both experiments (0, 1 or 2), the ``targets`` label and the two
        predictions ``pred_movie`` and ``pred_loc``.
    all_testing : pandas.DataFrame
        All test-partition rows of both experiments.
    ROIS : numpy.ndarray
        Unique target labels of the movie dataset.

    Raises
    ------
    ValueError
        If ``classifier`` is not one of the supported names.
    AssertionError
        If the participant count or the range of ``hits`` is not as expected.
    """
    known = ('gnb', 'sgd', 'l-sgd')
    # Fail early and explicitly instead of hitting an unbound ``clf`` later.
    if classifier not in known:
        raise ValueError("unknown classifier %r; expected one of %s"
                         % (classifier, ', '.join(known)))

    keys = ['voxel_indices', 'participants']
    dfs = []
    for ds_type, ds in zip(('movie', 'loc'), (ds_movie, ds_loc)):
        ds.sa['targets'] = ds.sa.bilat_ROIs if bilateral else ds.sa.all_ROIs
        clf = _make_classifier(classifier)

        # Filled by the callback, one entry per cross-validation fold.
        classifications = []

        def store_class(data, node, result):
            class_ds = mv.Dataset(samples=data.sa.voxel_indices)
            class_ds.sa['targets'] = data.sa.targets
            class_ds.sa['partitions'] = data.sa.partitions
            class_ds.sa['predictions'] = clf.predict(data)
            class_ds.sa['participant'] = data.sa.participant
            classifications.append(class_ds)

        cv = mv.CrossValidation(clf,
                                mv.NFoldPartitioner(attr='participant'),
                                errorfx=mv.mean_match_accuracy,
                                enable_ca=['stats'],
                                callback=store_class)
        # Only the callback's side effect is needed here.
        cv(ds)

        # Save the per-fold classification records.
        mv.h5save(
            results_dir
            + 'cv_classification_results_{}.hdf5'.format(ds_type),
            classifications)
        print('Saved the classification results obtained during crossvalidation.')

        dfs.extend(_classification_to_frame(classification, ds_type)
                   for classification in classifications)

    # All folds of both experiments in one frame; the index is irrelevant.
    all_classifications = pd.concat(dfs, ignore_index=True)
    # A hit is a prediction that matches its target.
    all_classifications['hits'] = (
        all_classifications['predictions']
        == all_classifications['targets']).astype(int)
    # String labels for the training (1) and testing (2) partitions.
    all_classifications['parts'] = all_classifications['partitions'].map(
        {1: 'train', 2: 'test'})
    # Arrays are unhashable, so voxel coordinates become tuples.
    all_classifications['voxel_indices'] = (
        all_classifications['voxel_indices'].apply(tuple))

    # Keep only the testing data.
    all_testing = all_classifications[all_classifications.parts == "test"]
    # Every participant has to be present in the data.
    assert len(all_testing.participants.unique()) == n_participants

    # Hits per voxel and participant across experiments: 2 if both
    # experiments classified correctly, 1 if one did, 0 if none did.
    # 'min' is used for the targets only because aggregate needs some
    # function; the targets are the same in both experiments.
    compare_exp = all_testing.groupby(keys).agg(
        {'hits': 'sum', 'targets': 'min'}).reset_index().sort_values(keys)

    all_testing_movie = all_testing[
        all_testing.ds_type == 'movie'].sort_values(keys).reset_index(
            drop=True)
    all_testing_loc = all_testing[
        all_testing.ds_type == 'loc'].sort_values(keys).reset_index(
            drop=True)
    # The assignments align on the fresh 0..n-1 indices; this relies on
    # every (voxel, participant) pair occurring exactly once per experiment.
    compare_exp['pred_movie'] = all_testing_movie.predictions
    compare_exp['pred_loc'] = all_testing_loc.predictions

    # The ROIs that were classified.
    ROIS = np.unique(ds_movie.sa.targets)

    # There can't be values greater than two or lower than zero.
    assert compare_exp.hits.max() <= 2
    assert compare_exp.hits.min() >= 0
    return compare_exp, all_testing, ROIS
if hyperalign: ds = mappers[i][participant].forward(ds) print("Hyperaligned participant {0}".format(participant)) if zscore_features: mv.zscore(ds, chunks_attr=None) ds.fa['node_indices'] = range(ds.shape[1]) ds.fa['center_ids'] = range(ds.shape[1]) ds_all = mv.vstack((ds1, ds2, ds3, ds4), fa='update') rsa.PDist(**kwargs) #variant_ids = mv.remove_invariant_features(ds_both).fa.center_ids.tolist() # Set up cross-validated RSA cv_rsa_ = mv.CrossValidation(mv.CDist(pairwise_metric='correlation'), mv.HalfPartitioner(attr='sessions'), errorfx=None) # cv_rsa above would return all kinds of .sa which are important # but must be the same across searchlights. so we first apply it # to the entire ds to capture them cv_rsa_out = cv_rsa_(ds_all) target_sa = cv_rsa_out.sa.copy(deep=True) # And now create a postproc which would verify and strip them off # to just return samples from mvpa2.testing.tools import assert_collections_equal from mvpa2.base.collections import SampleAttributesCollection from mvpa2.base.node import Node def lean_errorfx(ds):#Node): #def __call__(self, ds):
def dotheclassification(ds, bilateral):
    """Classify one-vs-all with a stochastic gradient descent classifier.

    The classifier is cross-validated with folds defined by the
    ``participant`` sample attribute. The confusion statistics are appended
    to ``results_dir + 'SGD_clf.txt'``, the confusion matrix is plotted, and
    the cross-validation result is saved as HDF5. Returns the
    ``mv.CrossValidation`` instance.

    Future TODO: selecting alpha may be better done via GridSearchCV. To
    quote sklearn's documentation: 'Finding a reasonable regularization term
    is best done using GridSearchCV, usually in the range
    10.0**-np.arange(1,7).'
    """
    # According to the original author, the SGD classifier insists on
    # finding unique labels in a sample attribute called 'targets', so the
    # ROI labels are copied there.
    if bilateral:
        ds.sa['targets'] = ds.sa.bilat_ROIs
    else:
        ds.sa['targets'] = ds.sa.all_ROIs

    sgd = SGDClassifier(loss='hinge', penalty='l2', class_weight='balanced')
    partitioner = mv.NFoldPartitioner(attr='participant')
    cv = mv.CrossValidation(mv.SKLLearnerAdapter(sgd),
                            partitioner,
                            errorfx=mv.mean_match_accuracy,
                            enable_ca=['stats'])
    results = cv(ds)

    # Append the classification statistics to the text log.
    stats = cv.ca.stats
    with open(results_dir + 'SGD_clf.txt', 'a') as f:
        f.write(stats.as_string(description=True))

    # Label order for the confusion matrix.
    rois = ('LOC', 'OFA', 'FFA', 'EBA', 'PPA')
    desired_order = ['brain', 'VIS']
    if bilateral:
        desired_order.extend(rois)
    else:
        desired_order.extend('%s %s' % (side, roi)
                             for roi in rois
                             for side in ('left', 'right'))
    labels = get_known_labels(desired_order, stats.labels)

    # Confusion matrix via PyMVPA's built-in plot.
    stats.plot(labels=labels, numbers=True, cmap='gist_heat_r')
    plt.savefig(results_dir + 'confusion_matrix.png')

    # Confusion matrix via the custom matplotlib plot.
    if niceplot:
        ACC = stats.stats['mean(ACC)']
        plot_confusion(cv,
                       labels,
                       fn=results_dir + 'confusion_matrix_SGD.svg',
                       figsize=(9, 9),
                       vmax=100,
                       cmap='Blues',
                       ACC='%.2f' % ACC)

    mv.h5save(results_dir + 'SGD_cv_classification_results.hdf5', results)
    print('Saved the crossvalidation results.')
    return cv