def test_mapper_vs_zscore():
    """Test by comparing to results of elderly z-score function
    """
    # data: 40 sample feature line in 20d space (40x20; samples x features)
    dss = [
        dataset_wizard(np.concatenate(
            [np.arange(40) for i in range(20)]).reshape(20, -1).T,
            targets=1, chunks=1),
        ] + datasets.values()

    for ds in dss:
        ds1 = deepcopy(ds)
        ds2 = deepcopy(ds)

        zsm = ZScoreMapper(chunks_attr=None)
        assert_raises(RuntimeError, zsm.forward, ds1.samples)
        idhashes = (idhash(ds1), idhash(ds1.samples))
        zsm.train(ds1)
        idhashes_train = (idhash(ds1), idhash(ds1.samples))
        assert_equal(idhashes, idhashes_train)

        # forward dataset
        ds1z_ds = zsm.forward(ds1)
        idhashes_forwardds = (idhash(ds1), idhash(ds1.samples))
        # must not modify samples in place!
        assert_equal(idhashes, idhashes_forwardds)

        # forward samples explicitly
        ds1z = zsm.forward(ds1.samples)
        idhashes_forward = (idhash(ds1), idhash(ds1.samples))
        assert_equal(idhashes, idhashes_forward)

        zscore(ds2, chunks_attr=None)
        assert_array_almost_equal(ds1z, ds2.samples)
        assert_array_equal(ds1.samples, ds.samples)
def main(subject, study_dir, mask, suffix='_stim2'):
    from mvpa2.mappers.zscore import zscore
    from mvpa2.mappers.fx import mean_group_sample
    from wikisim import mvpa

    # load subject data
    sp = su.SubjPath(subject, study_dir)
    vols = task.prex_vols(sp.path('behav', 'log'))

    # load fmri data
    ds = mvpa.load_prex_beta(sp, suffix, mask, verbose=1)

    # zscore
    ds.sa['run'] = vols.run.values
    zscore(ds, chunks_attr='run')

    # average over item presentations
    ds.sa['itemno'] = vols.itemno.to_numpy()
    m = mean_group_sample(['itemno'])
    dsm = ds.get_mapped(m)
    m_vols = vols.groupby('itemno', as_index=False).mean()

    # save data samples and corresponding volume information
    res_dir = os.path.join(sp.study_dir, 'batch', 'glm', 'prex' + suffix,
                           'roi', mask)
    if not os.path.exists(res_dir):
        os.makedirs(res_dir)
    mat_file = os.path.join(res_dir, f'pattern_{subject}.txt')
    tab_file = os.path.join(res_dir, f'pattern_{subject}.csv')
    np.savetxt(mat_file, dsm.samples)
    m_vols.to_csv(tab_file)
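# Small illustration of the mean_group_sample step used above: samples that
# share an 'itemno' value are averaged into one sample each. Toy data only;
# assumes nothing beyond PyMVPA and NumPy.
import numpy as np
from mvpa2.datasets import Dataset
from mvpa2.mappers.fx import mean_group_sample

toy = Dataset(np.arange(8.).reshape(4, 2), sa=dict(itemno=[1, 1, 2, 2]))
toy_m = toy.get_mapped(mean_group_sample(['itemno']))
assert toy_m.nsamples == 2
assert np.allclose(toy_m.samples[0], toy.samples[:2].mean(axis=0))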
def test_connectivity_hyperalignment(self):
    skip_if_no_external('scipy')
    skip_if_no_external('hdf5')  # needed for default results backend hdf5

    dss_train, dss_test, surface = self.get_testdata()
    qe = SurfaceQueryEngine(surface, 10, fa_node_key='node_indices')
    cha = ConnectivityHyperalignment(
        mask_ids=[0, 3, 6, 9],
        seed_indices=[0, 3, 6, 9],
        seed_queryengines=qe,
        queryengine=qe)
    mappers = cha(dss_train)
    aligned_train = [mapper.forward(ds)
                     for ds, mapper in zip(dss_train, mappers)]
    aligned_test = [mapper.forward(ds)
                    for ds, mapper in zip(dss_test, mappers)]
    for ds in aligned_train + aligned_test:
        zscore(ds, chunks_attr=None)
    sim_train_before = self.compute_connectivity_profile_similarity(dss_train)
    sim_train_after = self.compute_connectivity_profile_similarity(aligned_train)
    sim_test_before = self.compute_connectivity_profile_similarity(dss_test)
    sim_test_after = self.compute_connectivity_profile_similarity(aligned_test)
    # ISC should be higher after CHA for both training and testing data
    self.assertTrue(sim_train_after.mean() > sim_train_before.mean())
    self.assertTrue(sim_test_after.mean() > sim_test_before.mean())
def prepare_subject_for_hyperalignment(subject_label, bold_fname, mask_fname, out_dir):
    print('Loading data %s with mask %s' % (bold_fname, mask_fname))
    ds = fmri_dataset(samples=bold_fname, mask=mask_fname)
    zscore(ds, chunks_attr=None)
    out_fname = os.path.join(out_dir, 'sub-%s_data.hdf5' % subject_label)
    print('Saving to %s' % out_fname)
    h5save(out_fname, ds)
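# A minimal usage sketch for prepare_subject_for_hyperalignment(); the
# BIDS-style file names and output directory are hypothetical placeholders,
# not paths from the original code.
prepare_subject_for_hyperalignment(
    subject_label='01',
    bold_fname='sub-01_task-movie_bold.nii.gz',
    mask_fname='sub-01_brain_mask.nii.gz',
    out_dir='hyperalignment_input')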
def _get_seed_means(self, measure, queryengine, dataset, seed_indices):
    # Computing seed data as mean timeseries in each SL
    seed_data = Searchlight(measure, queryengine=queryengine,
                            nproc=self.params.nproc,
                            roi_ids=seed_indices)
    seed_data = seed_data(dataset)
    zscore(seed_data, chunks_attr=None)
    return seed_data
def compute_connectomes(datasets, queryengine, target_indices):
    """
    Parameters
    ----------
    datasets: a list of PyMVPA datasets, one per subject.
    queryengine: the trained PyMVPA Surface queryengine, trained on the
        surface defining this data and the searchlight radius matching
        your analysis.
    target_indices: the indices of the vertices where each searchlight for
        a connectivity target will be centered.

    Returns
    -------
    connectomes: a list of connectivity matrices, where each entry is the
        correlation between the timeseries of corresponding searchlights
        centered on a connectivity seed and a connectivity target.
    """
    conn_metric = lambda x, y: np.dot(x.samples, y.samples) / x.nsamples
    connectivity_mapper = FxyMapper(conn_metric)

    # compute means for aligning seed features
    conn_means = [compute_seed_means(MeanFeatureMeasure(), queryengine, ds,
                                     target_indices)
                  for ds in datasets]
    conn_targets = []
    for csm in conn_means:
        zscore(csm, chunks_attr=None)
        conn_targets.append(csm)

    connectomes = []
    for target, ds in zip(conn_targets, datasets):
        connectivity_mapper.train(target)
        connectome = connectivity_mapper.forward(ds)
        connectome.fa = ds.fa
        zscore(connectome, chunks_attr=None)
        connectomes.append(connectome)
    return connectomes
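# Hypothetical wiring of compute_connectomes() above, kept as comments since
# it depends on data loading not shown here; `dss`, `qe`, and `target_ids`
# stand in for the per-subject datasets, a trained SurfaceQueryEngine, and
# the connectivity-target vertex indices from the docstring.
# connectomes = compute_connectomes(dss, qe, target_ids)
# each returned connectome is z-scored and carries the feature attributes of
# its source dataset, ready for hyperalignment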
def _seed_means(self, measure, queryengine, ds, seed_indices):
    # Seed data is the mean timeseries for each searchlight
    seed_data = Searchlight(measure, queryengine=queryengine,
                            nproc=self.nproc,
                            roi_ids=np.concatenate(seed_indices).copy())
    if isinstance(ds, np.ndarray):
        ds = Dataset(ds)
    seed_data = seed_data(ds)
    zscore(seed_data.samples, chunks_attr=None)
    return seed_data
def test_zscore_withoutchunks():
    # just a smoke test to see if all issues of
    # https://github.com/PyMVPA/PyMVPA/issues/26
    # are fixed
    from mvpa2.datasets import Dataset
    ds = Dataset(np.arange(32).reshape((8, -1)), sa=dict(targets=range(8)))
    zscore(ds, chunks_attr=None)
    assert np.any(ds.samples != np.arange(32).reshape((8, -1)))
    ds_summary = ds.summary()
    assert ds_summary is not None
def compute_connectivity_profile_similarity(self, dss):
    # from scipy.spatial.distance import pdist, squareform
    # conns = [1 - squareform(pdist(ds.samples.T, 'correlation')) for ds in dss]
    conns = [np.corrcoef(ds.samples.T) for ds in dss]
    conn_sum = np.sum(conns, axis=0)
    sim = np.zeros((len(dss), dss[0].shape[1]))
    for i, conn in enumerate(conns):
        # leave-one-out: correlate this subject's connectivity profiles
        # with the sum of everyone else's
        conn_diff = conn_sum - conn
        zscore(conn_diff, chunks_attr=None)
        zscore(conn, chunks_attr=None)
        sim[i] = np.mean(conn_diff * conn, axis=0)
    return sim
def _prep_h2a_data(self, response_data, node_indices):
    # convert plain arrays to datasets, storing the conversion back into
    # the list so the fa assignment below is not lost
    for i, d in enumerate(response_data):
        if isinstance(d, np.ndarray):
            d = Dataset(d)
            response_data[i] = d
        d.fa['node_indices'] = node_indices.copy()
    connectivity_data = self._get_connectomes(response_data)
    h2a_input_data = self._frobenius_norm_and_merge(connectivity_data,
                                                    response_data,
                                                    node_indices)
    for d in h2a_input_data:
        d.fa['node_indices'] = node_indices.copy()
        zscore(d, chunks_attr=None)
    return h2a_input_data
def preprocess_betas(paths, sub, btype="LSS", c="trial_type",
                     roi="grayMatter", z=True):
    from project_code import projectutils as pu
    rds = pu.loadsubbetas(paths, sub, btype=btype, m=roi)
    rds.sa['targets'] = rds.sa[c]
    if z:
        from mvpa2.mappers.zscore import zscore
        zscore(rds, chunks_attr='chunks')
    return rds
def main(subject, study_dir, mask, stat, res_name, items='ac',
         suffix='_stim_fix2', feature_mask=None, radius=3, n_perm=1000,
         n_proc=None):
    from mvpa2.datasets.mri import map2nifti
    from mvpa2.mappers.zscore import zscore
    from mvpa2.measures.searchlight import sphere_searchlight
    from nireact import mvpa

    # lookup subject directory
    sp = su.SubjPath(subject, study_dir)

    # load task information
    vols = task.disp_vols(sp.path('behav', 'log'))

    # unpack the items option to get item type code
    item_names = 'abc'
    item_comp = [item_names.index(name) + 1 for name in items]

    # get post runs, A and C items only
    include = ((vols.run >= 5) &
               np.isin(vols.item_type, item_comp) &
               (vols.correct == 1))
    post = vols.loc[include, :]

    # load beta series
    ds = mvpa.load_disp_beta(sp, suffix, mask, feature_mask, verbose=1)

    # define measure and contrasts to write out
    contrasts = ['pos', 'block_inter', 'inter_block']
    m = mvpa.TriadVector(post, item_comp, stat, contrasts, n_perm)

    # zscore
    ds.sa['run'] = vols.run.values
    zscore(ds, chunks_attr='run')

    # searchlight
    print('Running searchlight...')
    sl = sphere_searchlight(m, radius=radius, nproc=n_proc)
    sl_map = sl(ds[include])

    # save results
    nifti_include = map2nifti(ds, sl_map[-1])
    for i, contrast in enumerate(contrasts):
        res_dir = sp.path('rsa', f'{res_name}_{contrast}')
        if not os.path.exists(res_dir):
            os.makedirs(res_dir)
        nifti = map2nifti(ds, sl_map[i])
        nifti.to_filename(su.impath(res_dir, 'zstat'))
        nifti_include.to_filename(su.impath(res_dir, 'included'))
def create_betas_per_trial_with_pymvpa_roni(study_path, subj, conf, mask_name, flavor, TR):
    dhandle = OpenFMRIDataset(study_path)
    model = 1
    task = 1  # Do this for other tasks as well, not only the first
    mask_fname = _opj(study_path, "sub{:0>3d}".format(subj), "masks",
                      conf.mvpa_tasks[0], "{}.nii.gz".format(mask_name))
    print(mask_fname)
    run_datasets = []
    for run_id in dhandle.get_task_bold_run_ids(task)[subj]:
        if type(run_id) == str:
            continue
        # all_events = dhandle.get_bold_run_model(model, subj, run_id)
        all_events = get_bold_run_model(dhandle, 2, subj, run_id)
        run_events = []
        i = 0
        for event in all_events:
            if event["task"] == task:
                event["condition"] = "{}-{}".format(event["condition"], event["id"])
                run_events.append(event)
                i += 1

        # load BOLD data for this run (with masking); add 0-based chunk ID
        run_ds = dhandle.get_bold_run_dataset(subj, task, run_id,
                                              flavor=flavor,
                                              chunks=run_id - 1,
                                              mask=mask_fname)
        # convert event info into a sample attribute and assign as 'targets'
        run_ds.sa.time_coords = run_ds.sa.time_indices * TR
        run_ds.sa["targets"] = events2sample_attr(run_events,
                                                  run_ds.sa.time_coords,
                                                  noinfolabel="rest")
        # additional time series preprocessing can go here
        poly_detrend(run_ds, polyord=1, chunks_attr="chunks")
        zscore(run_ds, chunks_attr="chunks",
               param_est=("targets", ["rest"]), dtype="float32")
        glm_dataset = fit_event_hrf_model(run_ds, run_events,
                                          time_attr="time_coords",
                                          condition_attr="condition")
        glm_dataset.sa["targets"] = [x[:x.find("-")]
                                     for x in glm_dataset.sa.condition]
        glm_dataset.sa["id"] = [x[x.find("-") + 1:]
                                for x in glm_dataset.sa.condition]
        glm_dataset.sa.condition = glm_dataset.sa["targets"]
        glm_dataset.sa["chunks"] = [run_id - 1] * len(glm_dataset.samples)

        # If a trial was dropped (the subject pressed a button), then the
        # counterpart trial from the other condition should also be dropped
        for pair in conf.conditions_to_compare:
            cond_bool = np.array([c in pair for c in glm_dataset.sa["condition"]])
            sub_dataset = glm_dataset[cond_bool]
            c = Counter(sub_dataset.sa.id)
            for value in c:
                if c[value] < 2:
                    id_bool = np.array([value in cond_id
                                        for cond_id in glm_dataset.sa["id"]])
                    glm_dataset = glm_dataset[np.bitwise_not(
                        np.logical_and(id_bool, cond_bool))]

        run_datasets.append(glm_dataset)
    return vstack(run_datasets, 0)
def get_dissim_roi(subnr):
    ds = h5load(fns.betafn(subnr))
    ds = ds[:, mask_]
    ds = ds[ds.sa.condition != 'self']
    zscore(ds, chunks_attr='chunks')
    ds = mean_group_sample(['condition'])(ds)

    names = []
    dissims = []
    for roi, (center, ids) in rois.iteritems():
        names.append(roi)
        sample_roi = ds.samples[:, ids]
        dissim_roi = pdist(sample_roi, 'correlation')
        dissims.append(dissim_roi)
    dss = dataset_wizard(dissims, targets=names)
    return dss
def preprocess_data(paths, sublist, sub, filter_params=[49, 2],
                    roi="grayMatter", z=True):
    dsdict = loadsubdata(paths, sublist, m=roi)
    tds = dsdict[sub]
    beta_events = loadevents(paths, sublist)
    # Savitzky-Golay filtering
    from pythonutils import savgolfilter as SGF
    SGF.sg_filter(tds, filter_params[0], filter_params[1])
    # zscore entire set. if done chunk-wise, there is no double-dipping
    # (since we leave a chunk out at a time).
    if z:
        from mvpa2.mappers.zscore import zscore
        zscore(tds, chunks_attr='chunks')
    rds, events = amendtimings(tds, beta_events[sub])
    return rds, events
def normalize(dss, norm_type):
    if norm_type == 'zscore':
        if isinstance(dss, (list, tuple, np.ndarray)):
            _ = [zscore(sd, chunks_attr=None) for sd in dss]
        else:
            zscore(dss, chunks_attr=None)
    elif norm_type == 'percent_signal_change':
        if isinstance(dss, (list, tuple, np.ndarray)):
            _ = [psc(sd, chunks_attr=None) for sd in dss]
        else:
            psc(dss, chunks_attr=None)
    elif norm_type == 'demean':
        if isinstance(dss, (list, tuple, np.ndarray)):
            _ = [psc(sd, scale=False, chunks_attr=None) for sd in dss]
        else:
            psc(dss, scale=False, chunks_attr=None)
    return dss
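# A minimal sketch of normalize() on a toy dataset, assuming zscore is the
# PyMVPA mapper imported in the surrounding module; after in-place z-scoring
# each feature has near-zero mean and unit variance across samples.
import numpy as np
from mvpa2.datasets import Dataset

toy = Dataset(np.random.randn(10, 5) * 3.0 + 7.0)
normalize(toy, 'zscore')  # mutates in place and returns the same dataset
assert np.allclose(toy.samples.mean(axis=0), 0, atol=1e-10)
assert np.allclose(toy.samples.std(axis=0), 1, atol=1e-10)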
def _level1(self, datasets, commonspace, ref_ds, mappers, residuals):
    params = self.params            # for quicker access ;)
    data_mapped = [ds.samples for ds in datasets]
    counts = 1  # number of datasets used so far for generating commonspace
    for i, (m, ds_new) in enumerate(zip(mappers, datasets)):
        if __debug__:
            debug('HPAL_', "Level 1: ds #%i" % i)
        if i == ref_ds:
            continue
        # assign common space to ``space`` of the mapper, because this is
        # where it will be looking for it
        ds_new.sa[m.get_space()] = commonspace
        # find transformation of this dataset into the current common space
        m.train(ds_new)
        # remove common space attribute again to save on memory when the
        # common space is updated for the next iteration
        del ds_new.sa[m.get_space()]
        # project this dataset into the current common space
        ds_ = m.forward(ds_new.samples)
        if params.zscore_common:
            zscore(ds_, chunks_attr=None)
        # replace original dataset with mapped one -- only the reference
        # dataset will remain unchanged
        data_mapped[i] = ds_
        # compute first-level residuals wrt to the initial common space
        if residuals is not None:
            residuals[0, i] = np.linalg.norm(ds_ - commonspace)
        # Update the common space. This is an incremental update after
        # processing each 1st-level dataset. Maybe there should be a flag
        # to make a batch update after processing all 1st-level datasets
        # to an identical 1st-level common space
        # TODO: make just a function so we don't waste space
        if params.level1_equal_weight:
            commonspace = params.combiner1(ds_, commonspace,
                                           weights=(float(counts), 1.0))
        else:
            commonspace = params.combiner1(ds_, commonspace)
        counts += 1
        if params.zscore_common:
            zscore(commonspace, chunks_attr=None)
    return data_mapped
def hypalign(source, target, node, surfsel, projmat, mask):
    slneighbors = surfsel[node]
    if len(slneighbors) == 0:
        return projmat
    sl = [source[:, slneighbors], target[:, slneighbors]]
    hmapper = ProcrusteanMapper(space='commonspace')
    refds = sl[1].copy()
    commonspace = refds.samples
    zscore(commonspace, chunks_attr=None)
    ds_new = sl[0].copy()
    ds_new.sa[hmapper.get_space()] = commonspace.astype(float)
    hmapper.train(ds_new)
    conproj = hmapper.proj
    m, n = conproj.shape
    index = np.array(slneighbors)
    projmat[np.ix_(index, index)] += np.asarray(conproj)
    return projmat
def test_hyper_input_dataset_check(self):
    # If supplied with only one dataset during training,
    # make sure it doesn't run multiple levels and crap out
    ha = Hyperalignment()
    ds_all = [datasets['uni4small'] for i in range(3)]
    # Make sure it raises TypeError if a list is not passed
    self.assertRaises(TypeError, ha, ds_all[0])
    self.assertRaises(TypeError, ha.train, ds_all[0])
    # And it doesn't crap out with a single dataset for training
    ha.train([ds_all[0]])
    zscore(ds_all[0], chunks_attr=None)
    assert_array_equal(ha.commonspace, ds_all[0].samples)
    # make sure it accepts tuple of ndarray
    ha = Hyperalignment()
    m = ha(tuple(ds_all))
    ha = Hyperalignment()
    dss_arr = np.empty(len(ds_all), dtype=object)
    for i in range(len(ds_all)):
        dss_arr[i] = ds_all[i]
    m = ha(dss_arr)
def get_testdata(self):
    # get a dataset with some prominent trends in it
    ds4l = datasets['uni4large']
    # lets select for now only meaningful features
    ds_orig = ds4l[:, ds4l.a.nonbogus_features]
    zscore(ds_orig, chunks_attr=None)
    n = 4  # # of datasets to generate
    Rs, dss_rotated, dss_rotated_clean = [], [], []
    # now lets compose derived datasets by using some random
    # rotation(s)
    while len(dss_rotated_clean) < n:
        ds_ = random_affine_transformation(ds_orig, scale_fac=1.0,
                                           shift_fac=0.)
        if ds_.a.random_scale <= 0:
            continue
        Rs.append(ds_.a.random_rotation)
        zscore(ds_, chunks_attr=None)
        dss_rotated_clean.append(ds_)
        i = len(dss_rotated_clean) - 1
        ds_2 = hstack([ds_, ds4l[:, ds4l.a.bogus_features[i * 4:i * 4 + 4]]])
        zscore(ds_2, chunks_attr=None)
        dss_rotated.append(ds_2)
    return ds_orig, dss_rotated, dss_rotated_clean, Rs
def detrend(ds):
    #print ds.summary()
    ds.samples = ds.samples.astype('float')

    pl.figure()
    pl.subplot(221)
    plot_samples_distance(ds, sortbyattr='chunks')
    #plot_samples_distance(ds)
    pl.title('Sample distances (sorted by chunks)')
    poly_detrend(ds, polyord=2, chunks_attr='chunks')
    pl.subplot(222)
    plot_samples_distance(ds, sortbyattr='chunks')
    pl.show()

    zscore(ds, chunks_attr='chunks', dtype='float32')
    pl.subplot(223)
    plot_samples_distance(ds, sortbyattr='chunks')
    pl.subplot(224)
    # plot_samples_distance(ds, sortbyattr='targets')
    pl.title('Sample distances (sorted by condition)')
    pl.show()
    #poly_detrend(ds, polyord=1, chunks_attr='chunks')
    #zscore(ds, chunks_attr='chunks', dtype='float32')
    return ds
def _get_connectomes(self, datasets):
    conn_mapper = FxyMapper(self.conn_metric)
    qe = self.queryengine
    roi_ids = self.target_indices
    # compute means for aligning seed features
    conn_means = [self._seed_means(MeanFeatureMeasure(), qe, ds, roi_ids)
                  for ds in datasets]
    conn_targets = []
    for csm in conn_means:
        zscore(csm, chunks_attr=None)
        conn_targets.append(csm)
    connectomes = []
    for target, ds in zip(conn_targets, datasets):
        conn_mapper.train(target)
        connectome = conn_mapper.forward(ds)
        connectome.fa = ds.fa
        zscore(connectome, chunks_attr=None)
        connectomes.append(connectome)
    return connectomes
def get_testdata(self):
    # rs = np.random.RandomState(0)
    rs = np.random.RandomState()
    nt = 200
    n_triangles = 4
    ns = 10
    nv = n_triangles * 3
    vertices = np.zeros((nv, 3))
    # 4 separated triangles
    faces = []
    for i in range(n_triangles):
        vertices[i * 3] = [i * 2, 0, 0]
        vertices[i * 3 + 1] = [i * 2 + 1, 1 / np.sqrt(3), 0]
        vertices[i * 3 + 2] = [i * 2 + 1, -1 / np.sqrt(3), 0]
        faces.append([i * 3, i * 3 + 1, i * 3 + 2])
    faces = np.array(faces)
    surface = Surface(vertices, faces)

    ds_orig = np.zeros((nt, nv))
    # add coarse-scale information
    for i in range(n_triangles):
        ds_orig[:, i * 3:(i + 1) * 3] += rs.normal(size=(nt, 1))
    # add fine-scale information
    ds_orig += rs.normal(size=(nt, nv))

    dss_train, dss_test = [], []
    for i in range(ns):
        ds = np.zeros_like(ds_orig)
        for j in range(n_triangles):
            ds[:, j * 3:(j + 1) * 3] = np.dot(
                ds_orig[:, j * 3:(j + 1) * 3],
                get_random_rotation(3))
                # special_ortho_group.rvs(3, random_state=rs))
        ds = Dataset(ds)
        ds.fa['node_indices'] = np.arange(nv)
        ds_train, ds_test = ds[:nt // 2, :], ds[nt // 2:, :]
        zscore(ds_train, chunks_attr=None)
        zscore(ds_test, chunks_attr=None)
        dss_train.append(ds_train)
        dss_test.append(ds_test)
    return dss_train, dss_test, surface
def prep_parcelwise_data(subject, parcel, datatype):
    from mvpa2.datasets import Dataset
    from mvpa2.mappers.zscore import zscore
    if datatype == 'sponpain':
        ds = Dataset(np.load(os.path.join(
            sponpain_by_parcel,
            subject + '_sponpain_connectome_parcel-' + str(parcel) + '.npy')))
        ds.fa['voxel_indices'] = range(ds.shape[1])
        zscore(ds, chunks_attr=None)
    elif datatype == 'bladderpain':
        ds = Dataset(np.load(os.path.join(
            bladderpain_by_parcel,
            subject + '_bladderpain-cleaned-ts_parcel-' + str(parcel) + '.npy')))
        ds.fa['voxel_indices'] = range(ds.shape[1])
        zscore(ds, chunks_attr=None)
    else:
        raise ValueError('Must specify datatype as either sponpain or bladderpain')
    return ds
def test_linear_svm_weights_per_class(self, svm):
    # assuming many defaults it is as simple as
    kwargs = dict(enable_ca=["sensitivities"])
    sana_split = svm.get_sensitivity_analyzer(split_weights=True, **kwargs)
    sana_full = svm.get_sensitivity_analyzer(force_train=False, **kwargs)

    # and lets look at all sensitivities
    ds2 = datasets['uni4large'].copy()
    zscore(ds2, param_est=('targets', ['L2', 'L3']))
    ds2 = ds2[np.logical_or(ds2.sa.targets == 'L0',
                            ds2.sa.targets == 'L1')]

    senssplit = sana_split(ds2)
    sensfull = sana_full(ds2)

    self.assertEqual(senssplit.shape, (2, ds2.nfeatures))
    self.assertEqual(sensfull.shape, (1, ds2.nfeatures))

    # just to verify that we split properly and if we reconstruct
    # manually we obtain the same
    dmap = (-1 * senssplit.samples[1] + senssplit.samples[0]) \
           - sensfull.samples
    self.assertTrue((np.abs(dmap) <= 1e-10).all())
    #print "____"
    #print senssplit
    #print SMLR().get_sensitivity_analyzer(combiner=None)(ds2)

    # for now we can do split weights for binary tasks only, so
    # lets check if we raise a concern
    # we temporarily shutdown warning, since it is going to complain
    # otherwise, but we do it on purpose here
    handlers = warning.handlers
    warning.handlers = []
    self.assertRaises(NotImplementedError, sana_split,
                      datasets['uni3medium'])
    # reenable the warnings
    warning.handlers = handlers
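# Minimal sketch of zscore's param_est argument as used above: mean and std
# are estimated only from samples whose 'targets' value is in the given
# list, then applied to all samples. Toy data; attribute values are
# assumptions mirroring the test.
import numpy as np
from mvpa2.datasets import Dataset
from mvpa2.mappers.zscore import zscore

toy = Dataset(np.arange(12.).reshape(6, 2),
              sa=dict(targets=['rest', 'rest', 'a', 'a', 'b', 'b'],
                      chunks=[0] * 6))
zscore(toy, chunks_attr='chunks', param_est=('targets', ['rest']))
# the 'rest' samples now have zero mean in each feature
assert np.allclose(toy.samples[:2].mean(axis=0), 0)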
def compute_seed_means(measure, queryengine, ds, roi_ids):
    """
    Parameters
    ----------
    measure: a PyMVPA measure passed to the Searchlight.
    queryengine: a trained PyMVPA SurfaceQueryEngine.
    ds: a single PyMVPA dataset (samples: timeseries, features: vertices)
    roi_ids: the vertex indices where each searchlight will be centered.

    Returns
    -------
    seed_data: dataset with the mean timeseries for a searchlight centered
        on each roi_id.
    """
    # Seed data is the mean timeseries for each searchlight
    seed_data = Searchlight(measure, queryengine=queryengine, nproc=1,
                            roi_ids=roi_ids.copy())
    if isinstance(ds, np.ndarray):
        ds = Dataset(ds)
    seed_data = seed_data(ds)
    zscore(seed_data.samples, chunks_attr=None)
    return seed_data
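# Hypothetical invocation of compute_seed_means() above, kept as comments
# since it needs a surface-based dataset and query engine as built in
# get_testdata(); all names mirror the surrounding snippets.
# qe = SurfaceQueryEngine(surface, 10, fa_node_key='node_indices')
# seed_ts = compute_seed_means(MeanFeatureMeasure(), qe, dss_train[0],
#                              roi_ids=np.array([0, 3, 6, 9]))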
def create_betas_per_trial_with_pymvpa(study_path, subj, conf, mask_name, flavor, TR):
    dhandle = OpenFMRIDataset(study_path)
    model = 1
    task = 1  # Do this for other tasks as well, not only the first
    mask_fname = _opj(study_path, "sub{:0>3d}".format(subj), "masks",
                      conf.mvpa_tasks[0], "{}.nii.gz".format(mask_name))
    print(mask_fname)
    run_datasets = []
    for run_id in dhandle.get_task_bold_run_ids(task)[subj]:
        if type(run_id) == str:
            continue
        all_events = dhandle.get_bold_run_model(model, subj, run_id)
        run_events = []
        i = 0
        for event in all_events:
            if event["task"] == task:
                event["condition"] = "{}-{}".format(event["condition"], i)
                run_events.append(event)
                i += 1

        # load BOLD data for this run (with masking); add 0-based chunk ID
        run_ds = dhandle.get_bold_run_dataset(subj, task, run_id,
                                              flavor=flavor,
                                              chunks=run_id - 1,
                                              mask=mask_fname)
        # convert event info into a sample attribute and assign as 'targets'
        run_ds.sa.time_coords = run_ds.sa.time_indices * TR
        print(run_id)
        run_ds.sa["targets"] = events2sample_attr(run_events,
                                                  run_ds.sa.time_coords,
                                                  noinfolabel="rest")
        # additional time series preprocessing can go here
        poly_detrend(run_ds, polyord=1, chunks_attr="chunks")
        zscore(run_ds, chunks_attr="chunks",
               param_est=("targets", ["rest"]), dtype="float32")
        glm_dataset = fit_event_hrf_model(run_ds, run_events,
                                          time_attr="time_coords",
                                          condition_attr="condition")
        glm_dataset.sa["targets"] = [x[:x.find("-")]
                                     for x in glm_dataset.sa.condition]
        glm_dataset.sa.condition = glm_dataset.sa["targets"]
        glm_dataset.sa["chunks"] = [run_id - 1] * len(glm_dataset.samples)
        run_datasets.append(glm_dataset)
    return vstack(run_datasets, 0)
def main(subject, study_dir, mask, suffix='_stim_fix2'):
    from mvpa2.mappers.zscore import zscore

    # load subject data
    sp = su.SubjPath(subject, study_dir)
    vols = task.disp_vols(sp.path('behav', 'log'))

    # load fmri data
    ds = mvpa.load_disp_beta(sp, suffix, mask, verbose=1)

    # zscore
    ds.sa['run'] = vols.run.values
    zscore(ds, chunks_attr='run')

    # save data samples and corresponding volume information
    res_dir = os.path.join(sp.study_dir, 'batch', 'glm', 'disp' + suffix,
                           'roi', mask)
    if not os.path.exists(res_dir):
        os.makedirs(res_dir)
    mat_file = os.path.join(res_dir, f'pattern_{subject}.txt')
    tab_file = os.path.join(res_dir, f'pattern_{subject}.csv')
    np.savetxt(mat_file, ds.samples)
    vols.to_csv(tab_file)
def _level2(self, datasets, lvl1_data, mappers, residuals):
    params = self.params            # for quicker access ;)
    data_mapped = lvl1_data
    # aggregate all processed 1st-level datasets into a new 2nd-level
    # common space
    commonspace = params.combiner2(data_mapped)
    # XXX Why is this commented out? Who knows what combiner2 is doing and
    # whether it changes the distribution of the data
    #if params.zscore_common:
    #    zscore(commonspace, chunks_attr=None)
    ndatasets = len(datasets)
    for loop in xrange(params.level2_niter):
        # 2nd-level alignment starts from the original/unprojected datasets
        # again
        for i, (m, ds_new) in enumerate(zip(mappers, datasets)):
            if __debug__:
                debug('HPAL_', "Level 2 (%i-th iteration): ds #%i"
                               % (loop, i))

            # Optimization speed up heuristic
            # Slightly modify the common space towards other feature
            # spaces and reduce influence of this feature space for the
            # to-be-computed projection
            temp_commonspace = (commonspace * ndatasets - data_mapped[i]) \
                               / (ndatasets - 1)
            if params.zscore_common:
                zscore(temp_commonspace, chunks_attr=None)
            # assign current common space
            ds_new.sa[m.get_space()] = temp_commonspace
            # retrain the mapper for this dataset
            m.train(ds_new)
            # remove common space attribute again to save on memory when the
            # common space is updated for the next iteration
            del ds_new.sa[m.get_space()]
            # obtain the 2nd-level projection
            ds_ = m.forward(ds_new.samples)
            if params.zscore_common:
                zscore(ds_, chunks_attr=None)
            # store for 2nd-level combiner
            data_mapped[i] = ds_
            # compute residuals
            if residuals is not None:
                residuals[1 + loop, i] = np.linalg.norm(ds_ - commonspace)
        commonspace = params.combiner2(data_mapped)
    # and again
    if params.zscore_common:
        zscore(commonspace, chunks_attr=None)
    # return the final common space
    return commonspace
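# Numeric sanity check of the leave-one-out heuristic used above: scaling
# the mean of n feature spaces by n, subtracting one dataset's contribution,
# and dividing by n - 1 yields the mean of the remaining n - 1 spaces.
# Toy arrays only; not part of the original module.
import numpy as np

data_mapped = [np.full((2, 2), k) for k in (1.0, 2.0, 3.0)]
commonspace = np.mean(data_mapped, axis=0)
ndatasets = len(data_mapped)
temp_commonspace = (commonspace * ndatasets - data_mapped[0]) / (ndatasets - 1)
assert np.allclose(temp_commonspace, np.mean(data_mapped[1:], axis=0))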
def test_hypal_michael_caused_problem(self):
    from mvpa2.misc import data_generators
    from mvpa2.mappers.zscore import zscore
    # Fake data
    ds = data_generators.normal_feature_dataset(nfeatures=20)
    ds_all = [data_generators.random_affine_transformation(ds)
              for i in range(3)]
    _ = [zscore(sd, chunks_attr=None) for sd in ds_all]
    # Make random data per subject for testing, with a bias added to the
    # first subject
    ds_test = [np.random.rand(1, ds.nfeatures) for i in range(len(ds_all))]
    ds_test[0] += np.arange(1, ds.nfeatures + 1) * 100
    # that would have been ridiculous if it was
    assert np.corrcoef(ds_test[2], ds_test[1])[0, 1] < 0.99
    # Test with varying alpha to make sure we no longer have that issue
    for alpha in (0, 0.01, 0.5, 0.99, 1.0):
        hyper09 = Hyperalignment(alpha=alpha)
        mappers = hyper09([sd for sd in ds_all])
        ds_test_a = [m.forward(sd) for m, sd in zip(mappers, ds_test)]
        ds_test_a = [mappers[0].reverse(sd) for sd in ds_test_a]
        corr = np.corrcoef(ds_test_a[2], ds_test_a[1])[0, 1]
        assert corr < 0.99
def _get_hypesvs(self, sl_connectomes, local_common_model=None):
    '''
    Hyperalign connectomes and return mappers and a trained SVDMapper of
    the common space.

    Parameters
    ----------
    sl_connectomes: a list of connectomes to hyperalign
    local_common_model: a reference common model to be used.

    Returns
    -------
    a tuple (sl_hmappers, svm, local_common_model)
    sl_hmappers: a list of mappers corresponding to the input list, in that
        order.
    svm: an SVDMapper based on the input data. If given a common model,
        this is None.
    local_common_model: if local_common_model is provided as input, this
        will be None. Otherwise, local_common_model is computed here and
        returned.
    '''
    # TODO Should we z-score sl_connectomes?
    return_model = False if self.params.save_model is None else True
    if local_common_model is not None:
        ha = Hyperalignment(level2_niter=0)
        if not is_datasetlike(local_common_model):
            local_common_model = Dataset(samples=local_common_model)
        ha.train([local_common_model])
        sl_hmappers = ha(sl_connectomes)
        return sl_hmappers, None, None
    ha = Hyperalignment()
    sl_hmappers = ha(sl_connectomes)
    sl_connectomes = [slhm.forward(slc)
                      for slhm, slc in zip(sl_hmappers, sl_connectomes)]
    _ = [zscore(slc, chunks_attr=None) for slc in sl_connectomes]
    sl_connectomes = np.dstack(sl_connectomes).mean(axis=-1)
    svm = SVDMapper(force_train=True)
    svm.train(sl_connectomes)
    if return_model:
        local_common_model = svm.forward(sl_connectomes)
    else:
        local_common_model = None
    return sl_hmappers, svm, local_common_model
def train(self, datasets):
    """Derive a common feature space from a series of datasets.

    Parameters
    ----------
    datasets : sequence of datasets

    Returns
    -------
    A list of trained Mappers matching the number of input datasets.
    """
    params = self.params            # for quicker access ;)
    ca = self.ca
    # Check to make sure we get a list of datasets as input.
    if not isinstance(datasets, (list, tuple, np.ndarray)):
        raise TypeError("Input datasets should be a sequence "
                        "(of type list, tuple, or ndarray) of datasets.")

    ndatasets = len(datasets)
    nfeatures = [ds.nfeatures for ds in datasets]
    alpha = params.alpha

    residuals = None
    if ca['training_residual_errors'].enabled:
        residuals = np.zeros((1 + params.level2_niter, ndatasets))
        ca.training_residual_errors = Dataset(
            samples=residuals,
            sa={'levels': ['1'] +
                          ['2:%i' % i for i in xrange(params.level2_niter)]})

    if __debug__:
        debug('HPAL', "Hyperalignment %s for %i datasets" % (self, ndatasets))

    if params.ref_ds is None:
        ref_ds = np.argmax(nfeatures)
    else:
        ref_ds = params.ref_ds
        # Making sure that ref_ds is within range.
        # Parameter() already checks for it being a non-negative integer
        if ref_ds >= ndatasets:
            raise ValueError("Requested reference dataset %i is out of "
                             "bounds. We have only %i datasets provided"
                             % (ref_ds, ndatasets))
    ca.chosen_ref_ds = ref_ds
    # zscore all data sets
    # ds = [ zscore(ds, chunks_attr=None) for ds in datasets]

    # TODO since we are doing in-place zscoring create deep copies
    # of the datasets with pruned targets and shallow copies of
    # the collections (if they would come needed in the transformation)
    # TODO: handle floats and non-floats differently to prevent
    #       waste of memory if there is no need (e.g. no z-scoring)
    #otargets = [ds.sa.targets for ds in datasets]
    datasets = [ds.copy(deep=False) for ds in datasets]
    #datasets = [Dataset(ds.samples.astype(float), sa={'targets': [None] * len(ds)})
    #datasets = [Dataset(ds.samples, sa={'targets': [None] * len(ds)})
    #            for ds in datasets]

    if params.zscore_all:
        if __debug__:
            debug('HPAL', "Z-scoring all datasets")
        for ids in xrange(len(datasets)):
            zmapper = ZScoreMapper(chunks_attr=None)
            zmapper.train(datasets[ids])
            datasets[ids] = zmapper.forward(datasets[ids])

    if alpha < 1:
        datasets, wmappers = self._regularize(datasets, alpha)

    # initial common space is the reference dataset
    commonspace = datasets[ref_ds].samples
    # the reference dataset might have been zscored already, don't do it
    # twice
    if params.zscore_common and not params.zscore_all:
        if __debug__:
            debug('HPAL_',
                  "Creating copy of a commonspace and assuring "
                  "it is of a floating type")
        commonspace = commonspace.astype(float)
        zscore(commonspace, chunks_attr=None)
    # If there is only one dataset in training phase, there is nothing to
    # be done, just use that data as the common space
    if len(datasets) < 2:
        self.commonspace = commonspace
    else:
        # create a mapper per dataset
        # might prefer some other way to initialize... later
        mappers = [deepcopy(params.alignment) for ds in datasets]

        #
        # Level 1 -- initial projection
        #
        lvl1_projdata = self._level1(datasets, commonspace, ref_ds,
                                     mappers, residuals)
        #
        # Level 2 -- might iterate multiple times
        #
        # this is the final common space
        self.commonspace = self._level2(datasets, lvl1_projdata,
                                        mappers, residuals)
    if params.output_dim is not None:
        mappers = self._level3(datasets)
        self._svd_mapper = SVDMapper()
        self._svd_mapper.train(self._map_and_mean(datasets, mappers))
        self._svd_mapper = StaticProjectionMapper(
            proj=self._svd_mapper.proj[:, :params.output_dim])
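# Small illustration of the ZScoreMapper train/forward pattern from the
# zscore_all branch above: training estimates per-feature mean and std, and
# forward() returns a standardized copy without modifying the input in
# place. Toy data; assumes only PyMVPA and NumPy.
import numpy as np
from mvpa2.datasets import Dataset
from mvpa2.mappers.zscore import ZScoreMapper

ds = Dataset(np.arange(20, dtype=float).reshape(5, 4))
zm = ZScoreMapper(chunks_attr=None)
zm.train(ds)
dsz = zm.forward(ds)
assert np.allclose(dsz.samples.mean(axis=0), 0)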
def runsub(sub, thisContrast, r, dstype='raw', roi='grayMatter',
           filterLen=49, filterOrd=3, write=False):

    if dstype == 'raw':
        outdir = 'PyMVPA'
        print("working with raw data")
        thisSub = {sub: subList[sub]}
        dsdict = lmvpa.loadsubdata(paths, thisSub, m=roi)
        thisDS = dsdict[sub]
        mc_params = lmvpa.loadmotionparams(paths, thisSub)
        beta_events = lmvpa.loadevents(paths, thisSub)
        # Savitzky-Golay filtering
        sg.sg_filter(thisDS, filterLen, filterOrd)
        # gallant group zscores before regression.
        # zscore w.r.t. rest trials
        # zscore(thisDS, param_est=('targets', ['rest']), chunks_attr='chunks')
        # zscore entire set. if done chunk-wise, there is no double-dipping
        # (since we leave a chunk out at a time).
        zscore(thisDS, chunks_attr='chunks')
        print("beta extraction")
        ## BETA EXTRACTION ##
        rds, events = lmvpa.amendtimings(thisDS.copy(), beta_events[sub])
        evds = er.fit_event_hrf_model(rds, events, time_attr='time_coords',
                                      condition_attr=('trial_type', 'chunks'),
                                      design_kwargs={'add_regs': mc_params[sub],
                                                     'hrf_model': 'canonical'},
                                      return_model=True)

        fds = lmvpa.replacetargets(evds, contrasts, thisContrast)
        fds = fds[fds.targets != '0']
    else:
        outdir = os.path.join('LSS', dstype)
        print("loading betas")
        fds = lmvpa.loadsubbetas(paths, sub, btype=dstype, m=roi)
        fds.sa['targets'] = fds.sa[thisContrast]
        zscore(fds, chunks_attr='chunks')

    fds = lmvpa.sortds(fds)
    print("searchlights")
    ## initialize classifier
    clf = svm.LinearNuSVMC()
    cv = CrossValidation(clf, NFoldPartitioner())
    from mvpa2.measures.searchlight import sphere_searchlight
    cvSL = sphere_searchlight(cv, radius=r)

    # now I have betas per chunk. could just correlate the betas, or
    # correlate the predictions for corresponding runs
    lidx = fds.chunks < fds.sa['chunks'].unique[len(fds.sa['chunks'].unique) / 2]
    pidx = fds.chunks >= fds.sa['chunks'].unique[len(fds.sa['chunks'].unique) / 2]

    lres = sl.run_cv_sl(cvSL, fds[lidx].copy(deep=False))
    pres = sl.run_cv_sl(cvSL, fds[pidx].copy(deep=False))

    if write:
        from mvpa2.base import dataset
        map2nifti(fds, dataset.vstack([lres, pres])).\
            to_filename(os.path.join(
                paths[0], 'Maps', outdir,
                sub + '_' + roi + '_' + thisContrast + '_cvsl.nii.gz'))

    del lres, pres, cvSL
    cvSL = sphere_searchlight(cv, radius=r)
    crossSet = fds.copy()
    crossSet.chunks[lidx] = 1
    crossSet.chunks[pidx] = 2
    cres = sl.run_cv_sl(cvSL, crossSet.copy(deep=False))
    if write:
        map2nifti(fds, cres[0]).to_filename(
            os.path.join(paths[0], 'Maps', outdir,
                         sub + '_' + roi + '_' + thisContrast + '_P2L.nii.gz'))
        map2nifti(fds, cres[1]).to_filename(
            os.path.join(paths[0], 'Maps', outdir,
                         sub + '_' + roi + '_' + thisContrast + '_L2P.nii.gz'))
def test_hyperalignment_measure(self):
    ref_ds = 0
    fsha = FeatureSelectionHyperalignment()
    ds_orig, dss_rotated, dss_rotated_clean, Rs = self.get_testdata()
    # Lets test two scenarios -- in one with no noise -- we should get
    # close to perfect reconstruction. If noisy features were added --
    # not so good
    for noisy, dss in ((False, dss_rotated_clean),
                       (True, dss_rotated)):
        # to verify that original datasets didn't get changed by
        # Hyperalignment store their idhashes of samples
        idhashes = [idhash(ds.samples) for ds in dss]
        idhashes_targets = [idhash(ds.targets) for ds in dss]
        mappers = fsha(dss)
        mappers = [StaticProjectionMapper(proj=m, recon=m.T)
                   for m in mappers]
        idhashes_ = [idhash(ds.samples) for ds in dss]
        idhashes_targets_ = [idhash(ds.targets) for ds in dss]
        self.assertEqual(
            idhashes, idhashes_,
            msg="Hyperalignment must not change original data.")
        self.assertEqual(
            idhashes_targets, idhashes_targets_,
            msg="Hyperalignment must not change original data targets.")

        # Map data back
        dss_clean_back = [m.forward(ds_)
                          for m, ds_ in zip(mappers, dss)]
        _ = [zscore(sd, chunks_attr=None) for sd in dss_clean_back]

        nddss = []
        ndcss = []
        nf = ds_orig.nfeatures
        ds_norm = np.linalg.norm(dss[ref_ds].samples[:, :nf])
        ds_orig_Rref = np.dot(ds_orig.samples, Rs[ref_ds]) \
                       * np.sign(dss_rotated_clean[ref_ds].a.random_scale)
        zscore(ds_orig_Rref, chunks_attr=None)
        for ds_back in dss_clean_back:
            ndcs = np.diag(np.corrcoef(ds_back.samples.T[:nf, ],
                                       ds_orig_Rref.T)[nf:, :nf], k=0)
            ndcss += [ndcs]
            dds = ds_back.samples[:, :nf] - ds_orig_Rref
            ndds = np.linalg.norm(dds) / ds_norm
            nddss += [ndds]
        # First compare correlations
        snoisy = ('clean', 'noisy')[int(noisy)]
        self.assertTrue(
            np.all(np.array(ndcss) >= (0.9, 0.85)[int(noisy)]),
            msg="Should have reconstructed original dataset more or"
                " less. Got correlations %s in %s case." % (ndcss, snoisy))
        # normed differences
        self.assertTrue(
            np.all(np.array(nddss) <= (.2, 3)[int(noisy)]),
            msg="Should have reconstructed original dataset more or"
                " less for all. Got normed differences %s in %s case."
                % (nddss, snoisy))
        self.assertTrue(
            nddss[ref_ds] <= (.1, 0.3)[int(noisy)],
            msg="Should have reconstructed original dataset quite "
                "well even with zscoring. Got normed differences %s "
                "in %s case." % (nddss, snoisy))
        self.assertTrue(
            np.all(np.array(nddss) / nddss[ref_ds]
                   >= (0.95, 0.8)[int(noisy)]),
            msg="Should have reconstructed orig_ds best of all. "
                "Got normed differences %s in %s case with ref_ds=%d."
                % (nddss, snoisy, ref_ds))

    # Testing feature selection within the measure using fraction and count
    # same features
    fsha_fsf = FeatureSelectionHyperalignment(featsel=0.5)
    fsha_fsn = FeatureSelectionHyperalignment(featsel=4)
    fsha_fsf_same = FeatureSelectionHyperalignment(featsel=0.5,
                                                   use_same_features=True)
    fsha = FeatureSelectionHyperalignment(full_matrix=False)
    # check for valueerror if full_matrix=False and no roi_seed fa
    self.assertRaises(ValueError, fsha, dss_rotated)
    fsha = FeatureSelectionHyperalignment()
    dss_rotated[ref_ds].fa['roi_seed'] = [1, 0, 0, 0, 0, 0, 0, 0]
    mappers_fsf = fsha_fsf(dss_rotated)
    mappers_fsf_same = fsha_fsf_same(dss_rotated)
    mappers_fsn = fsha_fsn(dss_rotated)
    mappers = fsha(dss_rotated_clean)
    mappers_diffsizedss = fsha_fsf([
        sd[:, nfs]
        for nfs, sd in zip([np.arange(5),
                            np.random.permutation(np.arange(8)),
                            np.arange(8)[::-1],
                            np.arange(8)], dss_rotated)])
    # Testing that most of noisy features are eliminated from reference data
    assert_true(np.alltrue([np.sum(m[:4, :4].std(0) > 0) > 2
                            for m in mappers_fsf]))
    # using same features make it most likely to eliminate all noisy features
    assert_true(np.alltrue([np.sum(m[:4, :4].std(0) > 0) == 4
                            for m in mappers_fsf_same]))
    assert_true(np.alltrue([np.sum(m[:4, :4].std(0) > 0) > 2
                            for m in mappers_fsn]))
    # And it correctly maps the selected features if they are selected
    if np.alltrue([np.all(m[4:, :4] == 0) for m in mappers_fsf]):
        for m, mfs in zip(mappers, mappers_fsf):
            assert_array_equal(m, mfs[:4, :4])
    if np.alltrue([np.all(m[4:, :4] == 0) for m in mappers_fsf_same]):
        for m, mfs in zip(mappers, mappers_fsf_same):
            assert_array_equal(m, mfs[:4, :4])
    # testing roi_seed forces feature selection
    dss_rotated[ref_ds].fa['roi_seed'] = [0, 0, 0, 0, 0, 0, 0, 1]
    fsha_fsf = FeatureSelectionHyperalignment(featsel=0.5)
    mappers_fsf = fsha_fsf(dss_rotated)
    assert np.alltrue([np.sum(m[7, :] == 0) == 4 for m in mappers_fsf])
def test_custom_qas(self):
    # Test if we could provide custom QEs per each of the datasets
    skip_if_no_external('scipy')
    skip_if_no_external('hdf5')  # needed for default results backend hdf5

    ns, nf = 10, 4  # # of samples/features -- a very BIG dataset ;)
    ds0 = Dataset(np.random.normal(size=(ns, nf)))
    zscore(ds0, chunks_attr=None)
    ds1 = ds0[:, [3, 0, 1, 2]]  # features circular shifted to the right

    qe0 = FancyQE([[0], [1], [2], [3]])  # does nothing
    qe1 = FancyQE([[1], [2], [3], [0]])  # knows to look into the right

    def apply_slhyper(queryengine, dss=[ds0, ds1], return_mappers=False, **kw):
        """Helper for a common code to create/call slhyper"""
        slhyper = SearchlightHyperalignment(queryengine=queryengine, **kw)
        mappers = slhyper(dss)
        proj = [m.proj.todense() for m in mappers]
        return (proj, mappers) if return_mappers else proj

    # since this single qe resulted in trying to match non-matching time
    # series, projections should be non-identity, but no offdiagonal elements
    assert_no_offdiag(apply_slhyper(qe0))

    # both are provided
    projs, mappers = apply_slhyper([qe0, qe1], return_mappers=True)
    tprojs_shifted = [np.eye(nf), np.roll(np.eye(nf), 1, axis=0)]
    # must be identity since we made them so
    assert_array_equal(projs[0], tprojs_shifted[0])
    # pretty much incorporating that shift
    assert_array_equal(projs[1], tprojs_shifted[1])
    # TODO -- not identity
    # assert_array_equal(projs[0], np.eye(len(p)))  # must be identity since we made them so
    # and must restore data properly
    assert_array_almost_equal(mappers[0].forward(ds0),
                              mappers[1].forward(ds1))

    # give more then # of qes
    assert_raises(ValueError,
                  SearchlightHyperalignment(queryengine=[qe0, qe1]),
                  [ds0, ds1, ds0])

    # The one having no voxels for the "1st" id in "subj1"
    qe1_ = FancyQE([[1], [], [3], [0]])  # knows to look into the right
    projs = apply_slhyper(qe1_)
    assert_no_offdiag(projs)
    for proj in projs:
        # assess that both have '2nd' one 0
        # but not the others!
        assert_array_equal(np.diagonal(proj) != 0, [True, True, False, True])

    # smoke test whenever combine is False
    # In this case should work ok
    apply_slhyper(qe0, combine_neighbormappers=False)
    # this one ok as well since needs only matching ones in ref_ds
    apply_slhyper([qe0, qe1], combine_neighbormappers=False)
    # here since features do not match node_ids -- should raise ValueError
    assert_raises(ValueError, apply_slhyper, qe1,
                  combine_neighbormappers=False)
    assert_raises(ValueError, apply_slhyper, [qe0, qe1], ref_ds=1,
                  combine_neighbormappers=False)

    # and now only one qe lacking for that id
    projs = apply_slhyper([qe0, qe1_])
    tproj0 = np.eye(nf)
    tproj0[1, 1] = 0
    tprojs_shifted_1st0 = [tproj0, np.roll(tproj0, 1, axis=0)]
    for proj, tproj in zip(projs, tprojs_shifted_1st0):
        # assess that both have '2nd' one 0
        # but not the others!
        assert_array_equal(proj, tproj)

    # And now a test with varying number of selected fids, no shift
    qe0 = FancyQE([[0], [1, 2], [1, 2, 3], [0, 1, 2, 3]])
    projs = apply_slhyper(qe0)
    # Test that in general we get larger coefficients for "correct"
    # transformation
    for p, tproj in zip(projs, tprojs_shifted):
        assert np.all(np.asarray(p)[tproj > 0] >= 1.0)
        assert_array_lequal(np.mean(np.asarray(p)[tproj == 0]), 0.3)

    qe1 = FancyQE([[0, 1, 2, 3], [1, 2, 3], [2, 3], [3]])
    # Just a smoke test, for now TODO
    projs = apply_slhyper([qe0, qe1])
def test_searchlight_hyperalignment(self):
    skip_if_no_external('scipy')
    skip_if_no_external('h5py')
    ds_orig = datasets['3dsmall'].copy()[:, :15]
    ds_orig.fa['voxel_indices'] = ds_orig.fa.myspace
    space = 'voxel_indices'
    # total number of datasets for the analysis
    nds = 5
    zscore(ds_orig, chunks_attr=None)
    dss = [ds_orig]
    # create a few distorted datasets to match the desired number of datasets
    # not sure if this truly mimics the real data, but at least we can test
    # implementation
    while len(dss) < nds - 1:
        sd = local_random_affine_transformations(
            ds_orig,
            scatter_neighborhoods(
                Sphere(1),
                ds_orig.fa[space].value, deterministic=True)[1],
            Sphere(2),
            space=space,
            scale_fac=1.0, shift_fac=0.0)
        # sometimes above function returns dataset with nans, infs, we
        # don't want that.
        if np.sum(np.isnan(sd.samples) + np.isinf(sd.samples)) == 0 \
                and np.all(sd.samples.std(0)):
            dss.append(sd)
    ds_orig_noisy = ds_orig.copy()
    ds_orig_noisy.samples += 0.1 * np.random.random(size=ds_orig_noisy.shape)
    dss.append(ds_orig_noisy)
    _ = [zscore(sd, chunks_attr=None) for sd in dss[1:]]
    # we should have some distortion
    for ds in dss[1:]:
        assert_false(np.all(ds_orig.samples == ds.samples))
    # testing checks
    slhyp = SearchlightHyperalignment(ref_ds=1, exclude_from_model=[1])
    self.assertRaises(ValueError, slhyp, dss[:3])
    slhyp = SearchlightHyperalignment(ref_ds=3)
    self.assertRaises(ValueError, slhyp, dss[:3])
    # store projections for each mapper separately
    projs = list()
    # run the algorithm with all combinations of the two major parameters
    # for projection calculation.
    for kwargs in [{'combine_neighbormappers': True, 'nproc': 2},
                   {'combine_neighbormappers': True, 'dtype': 'float64',
                    'compute_recon': True},
                   {'combine_neighbormappers': True,
                    'exclude_from_model': [2, 4]},
                   {'combine_neighbormappers': False},
                   {'combine_neighbormappers': False,
                    'mask_node_ids': np.arange(dss[0].nfeatures).tolist()},
                   {'combine_neighbormappers': True, 'sparse_radius': 1},
                   {'combine_neighbormappers': True, 'nblocks': 2}]:
        slhyp = SearchlightHyperalignment(radius=2, **kwargs)
        mappers = slhyp(dss)
        # one mapper per input ds
        assert_equal(len(mappers), nds)
        projs.append(mappers)
    # some checks
    for midx in range(nds):
        # making sure mask_node_ids options works as expected
        assert_array_almost_equal(projs[3][midx].proj.todense(),
                                  projs[4][midx].proj.todense())
        # recon check
        assert_array_almost_equal(projs[0][midx].proj.todense(),
                                  projs[1][midx].recon.T.todense(),
                                  decimal=5)
        assert_equal(projs[1][midx].proj.dtype, 'float64')
        assert_equal(projs[0][midx].proj.dtype, 'float32')
    # making sure the projections make sense
    for proj in projs:
        # no .max on sparse matrices on older scipy (e.g. on precise)
        # so convert to array first
        max_weight = proj[0].proj.toarray().max(0).squeeze()
        diag_weight = proj[0].proj.diagonal()
        # Check to make sure diagonal is the max weight, in almost all
        # rows for reference subject
        assert np.sum(max_weight == diag_weight) \
               / float(len(diag_weight)) > 0.90
        # and not true for other subjects
        for i in range(1, nds - 1):
            assert np.sum(proj[i].proj.toarray().max(0).squeeze() ==
                          proj[i].proj.diagonal()) \
                   / float(proj[i].proj.shape[0]) < 0.80
        # Check to make sure projection weights match across duplicate
        # datasets
        max_weight = proj[-1].proj.toarray().max(0).squeeze()
        diag_weight = proj[-1].proj.diagonal()
        # Check to make sure diagonal is the max weight, in almost all
        # rows for reference subject
        assert np.sum(max_weight == diag_weight) \
               / float(len(diag_weight)) > 0.90
    # project data
    dss_hyper = [hm.forward(sd) for hm, sd in zip(projs[0], dss)]
    _ = [zscore(sd, chunks_attr=None) for sd in dss_hyper]
    ndcss = []
    nf = ds_orig.nfeatures
    for ds_hyper in dss_hyper:
        ndcs = np.diag(np.corrcoef(ds_hyper.samples.T,
                                   ds_orig.samples.T)[nf:, :nf], k=0)
        ndcss += [ndcs]
    assert_true(np.median(ndcss[0]) > 0.9)
    # noisy copy of original dataset should be similar to original after
    # hyperalignment
    assert_true(np.median(ndcss[-1]) > 0.9)
    assert_true(np.all([np.median(ndcs) > 0.2 for ndcs in ndcss[1:-2]]))
def test_voxel_selection(self):
    """Compare surface and volume based searchlight"""
    """
    Tests to see whether results are identical for surface-based
    searchlight (just one plane; Euclidean distance) and volume-based
    searchlight.

    Note that the current value is a float; if it were int, it would
    specify the number of voxels in each searchlight
    """
    radius = 10.0

    """Define input filenames"""
    epi_fn = pathjoin(pymvpa_dataroot, "bold.nii.gz")
    maskfn = pathjoin(pymvpa_dataroot, "mask.nii.gz")

    """
    Use the EPI datafile to define a surface.
    The surface has as many nodes as there are voxels
    and is parallel to the volume 'slice'
    """
    vg = volgeom.from_any(maskfn, mask_volume=True)

    aff = vg.affine
    nx, ny, nz = vg.shape[:3]

    """Plane goes in x and y direction, so we take these vectors
    from the affine transformation matrix of the volume"""
    plane = surf.generate_plane(aff[:3, 3], aff[:3, 0], aff[:3, 1], nx, ny)

    """
    Simulate pial and white matter as just above and below
    the central plane
    """
    normal_vec = aff[:3, 2]
    outer = plane + normal_vec
    inner = plane + -normal_vec

    """
    Combine volume and surface information
    """
    vsm = volsurf.VolSurfMaximalMapping(vg, outer, inner)

    """
    Run voxel selection with specified radius (in mm), using
    Euclidean distance measure
    """
    surf_voxsel = surf_voxel_selection.voxel_selection(vsm, radius,
                                                       distance_metric="e")

    """Define the measure"""
    # run_slow=True would give an actual cross-validation with meaningful
    # accuracies. Because this is a unit-test only the number of voxels
    # in each searchlight is tested.
    run_slow = False
    if run_slow:
        meas = CrossValidation(GNB(), OddEvenPartitioner(),
                               errorfx=lambda p, t: np.mean(p == t))
        postproc = mean_sample
    else:
        meas = _Voxel_Count_Measure()
        postproc = lambda x: x

    """
    Surface analysis: define the query engine, cross validation,
    and searchlight
    """
    surf_qe = SurfaceVerticesQueryEngine(surf_voxsel)
    surf_sl = Searchlight(meas, queryengine=surf_qe, postproc=postproc)

    """
    new (Sep 2012): also test 'simple' queryengine wrapper function
    """
    surf_qe2 = disc_surface_queryengine(radius, maskfn, inner, outer,
                                        plane, volume_mask=True,
                                        distance_metric="euclidean")
    surf_sl2 = Searchlight(meas, queryengine=surf_qe2, postproc=postproc)

    """
    Same for the volume analysis
    """
    element_sizes = tuple(map(abs, (aff[0, 0], aff[1, 1], aff[2, 2])))
    sph = Sphere(radius, element_sizes=element_sizes)
    kwa = {"voxel_indices": sph}

    vol_qe = IndexQueryEngine(**kwa)
    vol_sl = Searchlight(meas, queryengine=vol_qe, postproc=postproc)

    """The following steps are similar to start_easy.py"""
    attr = SampleAttributes(pathjoin(pymvpa_dataroot,
                                     "attributes_literal.txt"))

    mask = surf_voxsel.get_mask()

    dataset = fmri_dataset(samples=pathjoin(pymvpa_dataroot, "bold.nii.gz"),
                           targets=attr.targets, chunks=attr.chunks,
                           mask=mask)

    if run_slow:
        # do chunkswise linear detrending on dataset
        poly_detrend(dataset, polyord=1, chunks_attr="chunks")

        # zscore dataset relative to baseline ('rest') mean
        zscore(dataset, chunks_attr="chunks",
               param_est=("targets", ["rest"]))

    # select class face and house for this demo analysis
    # would work with full datasets (just a little slower)
    dataset = dataset[np.array([l in ["face", "house"]
                                for l in dataset.sa.targets],
                               dtype="bool")]

    """Apply searchlight to datasets"""
    surf_dset = surf_sl(dataset)
    surf_dset2 = surf_sl2(dataset)
    vol_dset = vol_sl(dataset)

    surf_data = surf_dset.samples
    surf_data2 = surf_dset2.samples
    vol_data = vol_dset.samples

    assert_array_equal(surf_data, surf_data2)
    assert_array_equal(surf_data, vol_data)
def train(self, datasets):
    """Derive a common feature space from a series of datasets.

    Parameters
    ----------
    datasets : sequence of datasets

    Returns
    -------
    A list of trained Mappers matching the number of input datasets.
    """
    params = self.params            # for quicker access ;)
    ca = self.ca
    ndatasets = len(datasets)
    nfeatures = [ds.nfeatures for ds in datasets]
    alpha = params.alpha

    residuals = None
    if ca['training_residual_errors'].enabled:
        residuals = np.zeros((1 + params.level2_niter, ndatasets))
        ca.training_residual_errors = Dataset(
            samples=residuals,
            sa={'levels': ['1'] +
                          ['2:%i' % i for i in xrange(params.level2_niter)]})

    if __debug__:
        debug('HPAL', "Hyperalignment %s for %i datasets"
              % (self, ndatasets))

    if params.ref_ds is None:
        ref_ds = np.argmax(nfeatures)
    else:
        ref_ds = params.ref_ds
    # sanity check that the requested reference dataset exists
    if ref_ds < 0 or ref_ds >= ndatasets:
        raise ValueError("Requested reference dataset %i is out of "
                         "bounds. We have only %i datasets provided"
                         % (ref_ds, ndatasets))
    ca.choosen_ref_ds = ref_ds

    # zscore all data sets
    # ds = [ zscore(ds, chunks_attr=None) for ds in datasets]

    # TODO since we are doing in-place zscoring create deep copies
    # of the datasets with pruned targets and shallow copies of
    # the collections (if they would come needed in the transformation)
    # TODO: handle floats and non-floats differently to prevent
    #       waste of memory if there is no need (e.g. no z-scoring)
    #otargets = [ds.sa.targets for ds in datasets]
    datasets = [ds.copy(deep=False) for ds in datasets]
    #datasets = [Dataset(ds.samples.astype(float), sa={'targets': [None] * len(ds)})
    #datasets = [Dataset(ds.samples, sa={'targets': [None] * len(ds)})
    #            for ds in datasets]

    if params.zscore_all:
        if __debug__:
            debug('HPAL', "Z-scoring all datasets")
        for ids in xrange(len(datasets)):
            zmapper = ZScoreMapper(chunks_attr=None)
            zmapper.train(datasets[ids])
            datasets[ids] = zmapper.forward(datasets[ids])

    if alpha < 1:
        datasets, wmappers = self._regularize(datasets, alpha)

    # initial common space is the reference dataset
    commonspace = datasets[ref_ds].samples
    # the reference dataset might have been zscored already, don't do it
    # twice
    if params.zscore_common and not params.zscore_all:
        if __debug__:
            debug('HPAL_',
                  "Creating copy of a commonspace and assuring "
                  "it is of a floating type")
        commonspace = commonspace.astype(float)
        zscore(commonspace, chunks_attr=None)

    # create a mapper per dataset
    # might prefer some other way to initialize... later
    mappers = [deepcopy(params.alignment) for ds in datasets]

    #
    # Level 1 -- initial projection
    #
    lvl1_projdata = self._level1(datasets, commonspace, ref_ds, mappers,
                                 residuals)
    #
    # Level 2 -- might iterate multiple times
    #
    # this is the final common space
    self.commonspace = self._level2(datasets, lvl1_projdata, mappers,
                                    residuals)
def fx(ds):
    zscore(ds, chunks_attr=None)
    perm = AttributePermutator('condition', limit='chunks')
    ds = perm(ds)
    return ds
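# Hypothetical usage of fx() above (the dataset construction and attribute
# names are assumptions for illustration): z-score the samples globally and
# shuffle the 'condition' attribute within each chunk, e.g. to generate one
# permutation for a null distribution.
import numpy as np
from mvpa2.datasets.base import dataset_wizard

rng = np.random.RandomState(0)
ds = dataset_wizard(rng.randn(8, 4),
                    targets=[0, 1] * 4, chunks=[0] * 4 + [1] * 4)
ds.sa['condition'] = ['a', 'b'] * 4
ds_perm = fx(ds)  # samples zscored; 'condition' permuted per chunk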
def test_hrf_modeling():
    skip_if_no_external('nibabel')
    skip_if_no_external('nipy')  # ATM relies on NiPy's GLM implementation
    ds = load_example_fmri_dataset('25mm')  #literal=True)
    # TODO: simulate short dataset with known properties and use it
    # for testing
    events = find_events(targets=ds.sa.targets, chunks=ds.sa.chunks)
    tr = ds.a.imghdr['pixdim'][4]
    for ev in events:
        for a in ('onset', 'duration'):
            ev[a] = ev[a] * tr
    evds = eventrelated_dataset(ds, events, time_attr='time_coords',
                                condition_attr='targets',
                                design_kwargs=dict(drift_model='blank'),
                                glmfit_kwargs=dict(model='ols'),
                                model='hrf')
    # same voxels
    assert_equal(ds.nfeatures, evds.nfeatures)
    assert_array_equal(ds.fa.voxel_indices, evds.fa.voxel_indices)
    # one sample for each condition, plus constant
    assert_equal(sorted(ds.sa['targets'].unique), sorted(evds.sa.targets))
    assert_equal(evds.a.add_regs.sa.regressor_names[0], 'constant')
    # with centered data
    zscore(ds)
    evds_demean = eventrelated_dataset(ds, events, time_attr='time_coords',
                                       condition_attr='targets',
                                       design_kwargs=dict(drift_model='blank'),
                                       glmfit_kwargs=dict(model='ols'),
                                       model='hrf')
    # after demeaning the constant should consume a lot less
    assert(evds.a.add_regs[0].samples.mean()
           > evds_demean.a.add_regs[0].samples.mean())
    # from eyeballing the sensitivity example -- would be better to test
    # this on the tutorial data
    assert(evds_demean[evds.sa.targets == 'shoe'].samples.max()
           > evds_demean[evds.sa.targets == 'bottle'].samples.max())
    # HRF models
    assert('regressors' in evds.sa)
    assert('regressors' in evds.a.add_regs.sa)
    assert_equal(evds.sa.regressors.shape[1], len(ds))

    # custom regressors
    evds_regrs = eventrelated_dataset(ds, events, time_attr='time_coords',
                                      condition_attr='targets',
                                      regr_attrs=['time_indices'],
                                      design_kwargs=dict(drift_model='blank'),
                                      glmfit_kwargs=dict(model='ols'),
                                      model='hrf')
    # verify that nothing screwed up time_coords
    assert_equal(ds.sa.time_coords[0], 0)
    assert_equal(len(evds_regrs), len(evds))
    # one more output sample in .a.add_regs
    assert_equal(len(evds_regrs.a.add_regs) - 1, len(evds.a.add_regs))
    # comes last before constant
    assert_equal('time_indices',
                 evds_regrs.a.add_regs.sa.regressor_names[-2])
    # order of main regressors is unchanged
    assert_array_equal(evds.sa.targets, evds_regrs.sa.targets)

    # custom regressors from external sources
    evds_regrs = eventrelated_dataset(
        ds, events, time_attr='time_coords',
        condition_attr='targets',
        regr_attrs=['time_coords'],
        design_kwargs=dict(drift_model='blank',
                           add_regs=np.linspace(1, -1, len(ds))[None].T,
                           add_reg_names=['negative_trend']),
        glmfit_kwargs=dict(model='ols'),
        model='hrf')
    assert_equal(len(evds_regrs), len(evds))
    # But we got one more in additional regressors
    assert_equal(len(evds_regrs.a.add_regs) - 2, len(evds.a.add_regs))
    # comes last before constant
    assert_array_equal(['negative_trend', 'time_coords', 'constant'],
                       evds_regrs.a.add_regs.sa.regressor_names)
    # order is otherwise unchanged
    assert_array_equal(evds.sa.targets, evds_regrs.sa.targets)

    # HRF models with estimating per each chunk
    assert_equal(ds.sa.time_coords[0], 0)
    evds_regrs = eventrelated_dataset(ds, events, time_attr='time_coords',
                                      condition_attr=['targets', 'chunks'],
                                      regr_attrs=['time_indices'],
                                      design_kwargs=dict(drift_model='blank'),
                                      glmfit_kwargs=dict(model='ols'),
                                      model='hrf')
    assert_true('add_regs' in evds_regrs.a)
    assert_true('time_indices' in evds_regrs.a.add_regs.sa.regressor_names)
    assert_equal(len(ds.UC) * len(ds.UT), len(evds_regrs))
    assert_equal(len(evds_regrs.UC) * len(evds_regrs.UT), len(evds_regrs))

    from mvpa2.mappers.fx import mean_group_sample
    evds_regrs_meaned = mean_group_sample(['targets'])(evds_regrs)
    assert_array_equal(evds_regrs_meaned.T, evds.T)  # targets should be the same
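# A minimal sketch of the idea behind model='hrf' above: each condition's
# event train is convolved with a canonical hemodynamic response function to
# form one design-matrix column, and a GLM then yields one beta (one sample)
# per condition. The gamma shape and its parameters below are common
# textbook stand-ins, not what NiPy's GLM actually uses.
import numpy as np

tr, n_vols = 2.0, 100
t = np.arange(0, 24, tr)
hrf = (t / 5.4) ** 6 * np.exp(-(t - 5.4) / 0.9)  # crude gamma-like shape
hrf /= hrf.max()

events = np.zeros(n_vols)
events[[10, 40, 70]] = 1                         # onsets (in volumes)
regressor = np.convolve(events, hrf)[:n_vols]    # one column of the design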
def test_zscore():
    """Test z-scoring transformation
    """
    # dataset: mean=2, std=1
    samples = np.array((0, 1, 3, 4, 2, 2, 3, 1, 1, 3, 3, 1, 2, 2, 2, 2)).\
        reshape((16, 1))
    data = dataset_wizard(samples.copy(), targets=range(16), chunks=[0] * 16)
    assert_equal(data.samples.mean(), 2.0)
    assert_equal(data.samples.std(), 1.0)
    data_samples = data.samples.copy()
    zscore(data, chunks_attr='chunks')

    # copy should stay intact
    assert_equal(data_samples.mean(), 2.0)
    assert_equal(data_samples.std(), 1.0)

    # we should be able to operate on ndarrays
    # But we can't change type in-place for an array, can we?
    assert_raises(TypeError, zscore, data_samples, chunks_attr=None)
    # so lets do manually
    data_samples = data_samples.astype(float)
    zscore(data_samples, chunks_attr=None)
    assert_array_equal(data.samples, data_samples)

    # check z-scoring
    check = np.array([-2, -1, 1, 2, 0, 0, 1, -1, -1, 1, 1, -1, 0, 0, 0, 0],
                     dtype='float64').reshape(16, 1)
    assert_array_equal(data.samples, check)

    data = dataset_wizard(samples.copy(), targets=range(16), chunks=[0] * 16)
    zscore(data, chunks_attr=None)
    assert_array_equal(data.samples, check)

    # check z-scoring taking set of labels as a baseline
    data = dataset_wizard(samples.copy(),
                          targets=[0, 2, 2, 2, 1] + [2] * 11,
                          chunks=[0] * 16)
    zscore(data, param_est=('targets', [0, 1]))
    assert_array_equal(samples, data.samples + 1.0)

    # check that zscore modifies in-place; only guaranteed if no upcasting is
    # necessary
    samples = samples.astype('float')
    data = dataset_wizard(samples,
                          targets=[0, 2, 2, 2, 1] + [2] * 11,
                          chunks=[0] * 16)
    zscore(data, param_est=('targets', [0, 1]))
    assert_array_equal(samples, data.samples)

    # these might be duplicating code above -- but twice is better than nothing
    # dataset: mean=2, std=1
    raw = np.array((0, 1, 3, 4, 2, 2, 3, 1, 1, 3, 3, 1, 2, 2, 2, 2))
    # dataset: mean=12, std=1
    raw2 = np.array((0, 1, 3, 4, 2, 2, 3, 1, 1, 3, 3, 1, 2, 2, 2, 2)) + 10
    # zscore target
    check = [-2, -1, 1, 2, 0, 0, 1, -1, -1, 1, 1, -1, 0, 0, 0, 0]

    ds = dataset_wizard(raw.copy(), targets=range(16), chunks=[0] * 16)
    pristine = dataset_wizard(raw.copy(), targets=range(16), chunks=[0] * 16)

    zm = ZScoreMapper()
    # should do global zscore by default
    zm.train(ds)                        # train
    assert_array_almost_equal(zm.forward(ds), np.transpose([check]))
    # should not modify the source
    assert_array_equal(pristine, ds)

    # if we tell it a different mean it should obey the order
    zm = ZScoreMapper(params=(3, 1))
    zm.train(ds)
    assert_array_almost_equal(zm.forward(ds), np.transpose([check]) - 1)
    assert_array_equal(pristine, ds)

    # let's look at chunk-wise z-scoring
    ds = dataset_wizard(np.hstack((raw.copy(), raw2.copy())),
                        targets=range(32),
                        chunks=[0] * 16 + [1] * 16)
    # by default chunk-wise
    zm = ZScoreMapper()
    zm.train(ds)                        # train
    assert_array_almost_equal(zm.forward(ds), np.transpose([check + check]))
    # we should be able to do that same manually
    zm = ZScoreMapper(params={0: (2, 1), 1: (12, 1)})
    zm.train(ds)                        # train
    assert_array_almost_equal(zm.forward(ds), np.transpose([check + check]))
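# The `check` vector in the test above is just the z-scoring arithmetic done
# by hand: (x - mean) / std with mean=2 and (population) std=1 for these
# 16 samples.
import numpy as np

x = np.array((0, 1, 3, 4, 2, 2, 3, 1, 1, 3, 3, 1, 2, 2, 2, 2), dtype=float)
print((x - x.mean()) / x.std())
# -> [-2. -1.  1.  2.  0.  0.  1. -1. -1.  1.  1. -1.  0.  0.  0.  0.]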
def __call__(self, datasets):
    """Estimate mappers for each dataset

    Parameters
    ----------
    datasets : list or tuple of datasets

    Returns
    -------
    A list of trained Mappers of the same length as datasets
    """
    params = self.params            # for quicker access ;)
    ca = self.ca
    ndatasets = len(datasets)
    nfeatures = [ds.nfeatures for ds in datasets]

    residuals = None
    if ca['residual_errors'].enabled:
        residuals = np.zeros((2 + params.level2_niter, ndatasets))
        ca.residual_errors = Dataset(
            samples=residuals,
            sa={'levels': ['1'] +
                          ['2:%i' % i for i in xrange(params.level2_niter)] +
                          ['3']})

    if __debug__:
        debug('HPAL', "Hyperalignment %s for %i datasets"
              % (self, ndatasets))

    if params.ref_ds is None:
        ref_ds = np.argmax(nfeatures)
    else:
        ref_ds = params.ref_ds
    # sanity check that the requested reference dataset exists
    if ref_ds < 0 or ref_ds >= ndatasets:
        raise ValueError("Requested reference dataset %i is out of "
                         "bounds. We have only %i datasets provided"
                         % (ref_ds, ndatasets))
    ca.choosen_ref_ds = ref_ds
    # might prefer some other way to initialize... later
    mappers = [deepcopy(params.alignment) for ds in datasets]
    # zscore all data sets
    # ds = [ zscore(ds, chunks_attr=None) for ds in datasets]

    # Level 1 (first)
    commonspace = np.asanyarray(datasets[ref_ds])
    if params.zscore_common:
        zscore(commonspace, chunks_attr=None)
    data_mapped = [np.asanyarray(ds) for ds in datasets]
    for i, (m, data) in enumerate(zip(mappers, data_mapped)):
        if __debug__:
            debug('HPAL_', "Level 1: ds #%i" % i)
        if i == ref_ds:
            continue
        #ZSC zscore(data, chunks_attr=None)
        ds = dataset_wizard(samples=data, targets=commonspace)
        #ZSC zscore(ds, chunks_attr=None)
        m.train(ds)
        data_temp = m.forward(data)
        #ZSC zscore(data_temp, chunks_attr=None)
        data_mapped[i] = data_temp
        if residuals is not None:
            residuals[0, i] = np.linalg.norm(data_temp - commonspace)

        ## if ds_mapped == []:
        ##     ds_mapped = [zscore(m.forward(d), chunks_attr=None)]
        ## else:
        ##     ds_mapped += [zscore(m.forward(d), chunks_attr=None)]

        # zscore before adding
        # TODO: make just a function so we dont' waste space
        commonspace = params.combiner1(data_mapped[i], commonspace)
        if params.zscore_common:
            zscore(commonspace, chunks_attr=None)

    # update commonspace to mean of ds_mapped
    commonspace = params.combiner2(data_mapped)
    if params.zscore_common:
        zscore(commonspace, chunks_attr=None)

    # Level 2 -- might iterate multiple times
    for loop in xrange(params.level2_niter):
        for i, (m, ds) in enumerate(zip(mappers, datasets)):
            if __debug__:
                debug('HPAL_', "Level 2 (%i-th iteration): ds #%i"
                      % (loop, i))

            ## ds_temp = zscore( (commonspace*ndatasets - ds_mapped[i])
            ##                   /(ndatasets-1), chunks_attr=None )
            ds_new = ds.copy()
            #ZSC zscore(ds_new, chunks_attr=None)
            #PRJ ds_temp = (commonspace*ndatasets - ds_mapped[i])/(ndatasets-1)
            #ZSC zscore(ds_temp, chunks_attr=None)
            ds_new.targets = commonspace  #PRJ ds_temp
            m.train(ds_new)  # ds_temp)
            data_mapped[i] = m.forward(np.asanyarray(ds))
            if residuals is not None:
                residuals[1 + loop, i] = np.linalg.norm(data_mapped[i]
                                                        - commonspace)

            #ds_mapped[i] = zscore( m.forward(ds_temp), chunks_attr=None)

        commonspace = params.combiner2(data_mapped)
        if params.zscore_common:
            zscore(commonspace, chunks_attr=None)

    # Level 3 (last) to params.levels
    for i, (m, ds) in enumerate(zip(mappers, datasets)):
        if __debug__:
            debug('HPAL_', "Level 3: ds #%i" % i)

        ## ds_temp = zscore( (commonspace*ndatasets - ds_mapped[i])
        ##                   /(ndatasets-1), chunks_attr=None )
        ds_new = ds.copy()      # shallow copy so we could assign new labels
        #ZSC zscore(ds_new, chunks_attr=None)
        #PRJ ds_temp = (commonspace*ndatasets - ds_mapped[i])/(ndatasets-1)
        #ZSC zscore(ds_temp, chunks_attr=None)
        ds_new.targets = commonspace  #PRJ ds_temp
        m.train(ds_new)  # ds_temp)
        if residuals is not None:
            data_mapped = m.forward(ds_new)
            residuals[-1, i] = np.linalg.norm(data_mapped - commonspace)

    return mappers
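# Standalone numeric check of the leave-one-out target that appears in the
# commented-out #PRJ lines above (and explicitly in later revisions): if
# commonspace is the mean of all mapped datasets, then
# (commonspace * n - mapped[i]) / (n - 1) is exactly the mean of the other
# n-1 datasets.
import numpy as np

rng = np.random.RandomState(0)
mapped = [rng.randn(4, 3) for _ in range(5)]
n, i = len(mapped), 2
commonspace = np.mean(mapped, axis=0)
others_mean = np.mean([m for j, m in enumerate(mapped) if j != i], axis=0)
assert np.allclose((commonspace * n - mapped[i]) / (n - 1), others_mean)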
def test_zscore():
    """Test z-scoring transformation
    """
    # dataset: mean=2, std=1
    samples = np.array((0, 1, 3, 4, 2, 2, 3, 1, 1, 3, 3, 1, 2, 2, 2, 2)).\
        reshape((16, 1))
    data = dataset_wizard(samples.copy(), targets=range(16), chunks=[0] * 16)
    assert_equal(data.samples.mean(), 2.0)
    assert_equal(data.samples.std(), 1.0)
    data_samples = data.samples.copy()
    zscore(data, chunks_attr='chunks')

    # copy should stay intact
    assert_equal(data_samples.mean(), 2.0)
    assert_equal(data_samples.std(), 1.0)

    # we should be able to operate on ndarrays
    # But we can't change type in-place for an array, can we?
    assert_raises(TypeError, zscore, data_samples, chunks_attr=None)
    # so lets do manually
    data_samples = data_samples.astype(float)
    zscore(data_samples, chunks_attr=None)
    assert_array_equal(data.samples, data_samples)

    # check z-scoring
    check = np.array([-2, -1, 1, 2, 0, 0, 1, -1, -1, 1, 1, -1, 0, 0, 0, 0],
                     dtype='float64').reshape(16, 1)
    assert_array_equal(data.samples, check)

    data = dataset_wizard(samples.copy(), targets=range(16), chunks=[0] * 16)
    zscore(data, chunks_attr=None)
    assert_array_equal(data.samples, check)

    # check z-scoring taking set of labels as a baseline
    data = dataset_wizard(samples.copy(),
                          targets=[0, 2, 2, 2, 1] + [2] * 11,
                          chunks=[0] * 16)
    zscore(data, param_est=('targets', [0, 1]))
    assert_array_equal(samples, data.samples + 1.0)

    # check that zscore modifies in-place; only guaranteed if no upcasting is
    # necessary
    samples = samples.astype('float')
    data = dataset_wizard(samples,
                          targets=[0, 2, 2, 2, 1] + [2] * 11,
                          chunks=[0] * 16)
    zscore(data, param_est=('targets', [0, 1]))
    assert_array_equal(samples, data.samples)

    # verify that if param_est is set but chunks_attr is None
    # performs zscoring across entire dataset correctly
    data = data.copy()
    data_01 = data.select({'targets': [0, 1]})
    zscore(data_01, chunks_attr=None)
    zscore(data, chunks_attr=None, param_est=('targets', [0, 1]))
    assert_array_equal(data_01.samples, data.select({'targets': [0, 1]}))

    # these might be duplicating code above -- but twice is better than nothing
    # dataset: mean=2, std=1
    raw = np.array((0, 1, 3, 4, 2, 2, 3, 1, 1, 3, 3, 1, 2, 2, 2, 2))
    # dataset: mean=12, std=1
    raw2 = np.array((0, 1, 3, 4, 2, 2, 3, 1, 1, 3, 3, 1, 2, 2, 2, 2)) + 10
    # zscore target
    check = [-2, -1, 1, 2, 0, 0, 1, -1, -1, 1, 1, -1, 0, 0, 0, 0]

    ds = dataset_wizard(raw.copy(), targets=range(16), chunks=[0] * 16)
    pristine = dataset_wizard(raw.copy(), targets=range(16), chunks=[0] * 16)

    zm = ZScoreMapper()
    # should do global zscore by default
    zm.train(ds)                        # train
    assert_array_almost_equal(zm.forward(ds), np.transpose([check]))
    # should not modify the source
    assert_array_equal(pristine, ds)

    # if we tell it a different mean it should obey the order
    zm = ZScoreMapper(params=(3, 1))
    zm.train(ds)
    assert_array_almost_equal(zm.forward(ds), np.transpose([check]) - 1)
    assert_array_equal(pristine, ds)

    # let's look at chunk-wise z-scoring
    ds = dataset_wizard(np.hstack((raw.copy(), raw2.copy())),
                        targets=range(32),
                        chunks=[0] * 16 + [1] * 16)
    # by default chunk-wise
    zm = ZScoreMapper()
    zm.train(ds)                        # train
    assert_array_almost_equal(zm.forward(ds), np.transpose([check + check]))
    # we should be able to do that same manually
    zm = ZScoreMapper(params={0: (2, 1), 1: (12, 1)})
    zm.train(ds)                        # train
    assert_array_almost_equal(zm.forward(ds), np.transpose([check + check]))

    # And just a smoke test for warnings reporting whenever # of
    # samples per chunk is low.
    # on 1 sample per chunk
    zds1 = ZScoreMapper(chunks_attr='chunks', auto_train=True)(
        ds[[0, -1]])
    ok_(np.all(zds1.samples == 0))      # they all should be 0
    # on 2 samples per chunk
    zds2 = ZScoreMapper(chunks_attr='chunks', auto_train=True)(
        ds[[0, 1, -10, -1]])
    assert_array_equal(np.unique(zds2.samples), [-1., 1])  # they all should be -1 or 1
    # on 3 samples per chunk -- different warning
    ZScoreMapper(chunks_attr='chunks', auto_train=True)(
        ds[[0, 1, 2, -3, -2, -1]])

    # test if std provided as a list not as an array is handled
    # properly -- should zscore all features (not just first/none
    # as it was before)
    ds = dataset_wizard(np.arange(32).reshape((8, -1)),
                        targets=range(8), chunks=[0] * 8)
    means = [0, 1, -10, 10]
    std0 = np.std(ds[:, 0])             # std deviation of first one
    stds = [std0, 10, .1, 1]
    zm = ZScoreMapper(params=(means, stds), auto_train=True)
    dsz = zm(ds)

    assert_array_almost_equal(
        (np.mean(ds, axis=0) - np.asanyarray(means)) / np.array(stds),
        np.mean(dsz, axis=0))
    assert_array_almost_equal(
        np.std(ds, axis=0) / np.array(stds),
        np.std(dsz, axis=0))
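# Why two samples per chunk always z-score to exactly -1 and +1 (the zds2
# assertion above): the mean is the midpoint of the two values, and the
# population std is half the distance between them.
import numpy as np

x = np.array([3.0, 11.0])
print((x - x.mean()) / x.std())  # -> [-1.  1.]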
def test_basic_functioning(self, ref_ds, zscore_common, zscore_all):
    ha = Hyperalignment(ref_ds=ref_ds,
                        zscore_all=zscore_all,
                        zscore_common=zscore_common)
    if ref_ds is None:
        ref_ds = 0                      # by default should be this one

    # get a dataset with some prominent trends in it
    ds4l = datasets['uni4large']
    # lets select for now only meaningful features
    ds_orig = ds4l[:, ds4l.a.nonbogus_features]
    nf = ds_orig.nfeatures
    n = 4                               # number of datasets to generate
    Rs, dss_rotated, dss_rotated_clean, random_shifts, random_scales \
        = [], [], [], [], []

    # now lets compose derived datasets by using some random
    # rotation(s)
    for i in xrange(n):
        ## if False: # i == ref_ds:
        ##     # Do not rotate the target space so we could check later on
        ##     # if we transform back nicely
        ##     R = np.eye(ds_orig.nfeatures)
        ds_ = random_affine_transformation(ds_orig, scale_fac=100,
                                           shift_fac=10)
        Rs.append(ds_.a.random_rotation)
        # reusing random data from dataset itself
        random_scales += [ds_.a.random_scale]
        random_shifts += [ds_.a.random_shift]
        random_noise = ds4l.samples[:, ds4l.a.bogus_features[:4]]

        ## if (zscore_common or zscore_all):
        ##     # for later on testing of "precise" reconstruction
        ##     zscore(ds_, chunks_attr=None)

        dss_rotated_clean.append(ds_)

        ds_ = ds_.copy()
        ds_.samples = ds_.samples + 0.1 * random_noise
        dss_rotated.append(ds_)

    # Lets test two scenarios -- in one with no noise -- we should get
    # close to perfect reconstruction. If noise was added -- not so good
    for noisy, dss in ((False, dss_rotated_clean),
                       (True, dss_rotated)):
        # to verify that original datasets didn't get changed by
        # Hyperalignment store their idhashes of samples
        idhashes = [idhash(ds.samples) for ds in dss]
        idhashes_targets = [idhash(ds.targets) for ds in dss]

        mappers = ha(dss)

        idhashes_ = [idhash(ds.samples) for ds in dss]
        idhashes_targets_ = [idhash(ds.targets) for ds in dss]
        self.assertEqual(
            idhashes, idhashes_,
            msg="Hyperalignment must not change original data.")
        self.assertEqual(
            idhashes_targets, idhashes_targets_,
            msg="Hyperalignment must not change original data targets.")
        self.assertEqual(ref_ds, ha.ca.chosen_ref_ds)

        # Map data back
        dss_clean_back = [m.forward(ds_)
                          for m, ds_ in zip(mappers, dss_rotated_clean)]

        ds_norm = np.linalg.norm(dss[ref_ds].samples)
        nddss = []
        ndcss = []
        ds_orig_Rref = np.dot(ds_orig.samples, Rs[ref_ds]) \
                       * random_scales[ref_ds] \
                       + random_shifts[ref_ds]
        if zscore_common or zscore_all:
            zscore(Dataset(ds_orig_Rref), chunks_attr=None)
        for ds_back in dss_clean_back:
            # if we used zscoring of common, we cannot rely
            # that range/offset could be matched, so lets use
            # corrcoef
            ndcs = np.diag(np.corrcoef(ds_back.samples.T,
                                       ds_orig_Rref.T)[nf:, :nf], k=0)
            ndcss += [ndcs]
            dds = ds_back.samples - ds_orig_Rref
            ndds = np.linalg.norm(dds) / ds_norm
            nddss += [ndds]

        snoisy = ('clean', 'noisy')[int(noisy)]
        do_labile = cfg.getboolean('tests', 'labile', default='yes')
        if not noisy or do_labile:
            # First compare correlations
            self.assertTrue(
                np.all(np.array(ndcss) >= (0.9, 0.85)[int(noisy)]),
                msg="Should have reconstructed original dataset more or"
                    " less. Got correlations %s in %s case."
                    % (ndcss, snoisy))
            if not (zscore_all or zscore_common):
                # if we didn't zscore -- all of them should be really close
                self.assertTrue(
                    np.all(np.array(nddss) <= (1e-10, 1e-1)[int(noisy)]),
                    msg="Should have reconstructed original dataset well "
                        "without zscoring. Got normed differences %s "
                        "in %s case." % (nddss, snoisy))
            elif do_labile:
                # otherwise they all should be somewhat close
                self.assertTrue(
                    np.all(np.array(nddss) <= (.2, 3)[int(noisy)]),
                    msg="Should have reconstructed original dataset more or"
                        " less for all. Got normed differences %s in %s case."
                        % (nddss, snoisy))
                self.assertTrue(
                    np.all(nddss[ref_ds] <= .09),
                    msg="Should have reconstructed original dataset quite "
                        "well even with zscoring. Got normed differences %s "
                        "in %s case." % (nddss, snoisy))
                # yoh: and leave 5% of difference for a chance and numerical
                #      fluctuations ;)
                self.assertTrue(
                    np.all(np.array(nddss) >= 0.95 * nddss[ref_ds]),
                    msg="Should have reconstructed orig_ds best of all. "
                        "Got normed differences %s in %s case with ref_ds=%d."
                        % (nddss, snoisy, ref_ds))

    # Lets see how well we do if asked to compute residuals
    ha = Hyperalignment(ref_ds=ref_ds, level2_niter=2,
                        enable_ca=['training_residual_errors',
                                   'residual_errors'])
    mappers = ha(dss_rotated_clean)
    self.assertTrue(np.all(ha.ca.training_residual_errors.sa.levels ==
                           ['1', '2:0', '2:1']))
    rterrors = ha.ca.training_residual_errors.samples
    # just basic tests:
    self.assertEqual(rterrors[0, ref_ds], 0)
    self.assertEqual(rterrors.shape, (3, n))
    rerrors = ha.ca.residual_errors.samples
    self.assertEqual(rerrors.shape, (1, n))
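# The per-feature recovery metric used above, in isolation: np.corrcoef on
# the two transposed datasets returns a 2*nf x 2*nf matrix, and the diagonal
# of its off-diagonal nf x nf block is corr(feature_i of A, feature_i of B).
import numpy as np

rng = np.random.RandomState(0)
A = rng.randn(50, 4)
B = A + 0.1 * rng.randn(50, 4)   # a noisy "reconstruction" of A
nf = A.shape[1]
print(np.diag(np.corrcoef(A.T, B.T)[nf:, :nf]))  # all close to 1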
from mvpa2.datasets.sources.native import load_tutorial_data
datapath = pjoin(cfg.get('location', 'tutorial data'), 'haxby2001')
ds = load_tutorial_data(roi=(15, 16, 23, 24, 36, 38, 39, 40, 48))

"""
We only do minimal pre-processing: linear trend removal and z-scoring all
voxel time-series with respect to the mean and standard deviation of the
"rest" condition.
"""

# only minimal detrending
from mvpa2.mappers.detrend import poly_detrend
poly_detrend(ds, polyord=1, chunks_attr='chunks')
# z-scoring with respect to the 'rest' condition
from mvpa2.mappers.zscore import zscore
zscore(ds, chunks_attr='chunks', param_est=('targets', 'rest'))
# now remove 'rest' samples
ds = ds[ds.sa.targets != 'rest']

"""
RSA is all about so-called dissimilarity matrices: square, symmetric
matrices with a zero diagonal that encode the (dis)similarity between all
pairs of data samples or conditions in a dataset. We compose a little helper
function to plot such matrices, including a color-scale and proper labeling
of matrix rows and columns.
"""

# little helper function to plot dissimilarity matrices
def plot_mtx(mtx, labels, title):
    pl.figure()
    pl.imshow(mtx, interpolation='nearest')
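"""
To make the notion of a dissimilarity matrix concrete, here is a standalone
sketch (plain NumPy/SciPy on random data, independent of the tutorial
dataset and of the plotting helper above) that builds one using correlation
distance.
"""

import numpy as np
from scipy.spatial.distance import pdist, squareform

rng = np.random.RandomState(0)
patterns = rng.randn(6, 20)               # e.g. 6 conditions x 20 voxels
# correlation distance (1 - Pearson r) between all condition pairs
dsm = squareform(pdist(patterns, metric='correlation'))
assert np.allclose(np.diag(dsm), 0)       # zero diagonal
assert np.allclose(dsm, dsm.T)            # symmetric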
def __call__(self, datasets):
    """Estimate mappers for each dataset

    Parameters
    ----------
    datasets : list or tuple of datasets

    Returns
    -------
    A list of trained Mappers of the same length as datasets
    """
    params = self.params            # for quicker access ;)
    ca = self.ca
    ndatasets = len(datasets)
    nfeatures = [ds.nfeatures for ds in datasets]

    residuals = None
    if ca['residual_errors'].enabled:
        residuals = np.zeros((2 + params.level2_niter, ndatasets))
        ca.residual_errors = Dataset(
            samples=residuals,
            sa={'levels': ['1'] +
                          ['2:%i' % i for i in xrange(params.level2_niter)] +
                          ['3']})

    if __debug__:
        debug('HPAL', "Hyperalignment %s for %i datasets"
              % (self, ndatasets))

    if params.ref_ds is None:
        ref_ds = np.argmax(nfeatures)
    else:
        ref_ds = params.ref_ds
    # sanity check that the requested reference dataset exists
    if ref_ds < 0 or ref_ds >= ndatasets:
        raise ValueError("Requested reference dataset %i is out of "
                         "bounds. We have only %i datasets provided"
                         % (ref_ds, ndatasets))
    ca.choosen_ref_ds = ref_ds
    # might prefer some other way to initialize... later
    mappers = [deepcopy(params.alignment) for ds in datasets]
    # zscore all data sets
    # ds = [ zscore(ds, chunks_attr=None) for ds in datasets]

    # Level 1 (first)

    # TODO since we are doing in-place zscoring create deep copies
    # of the datasets with pruned targets and shallow copies of
    # the collections (if they would come needed in the transformation)
    # TODO: handle floats and non-floats differently to prevent
    #       waste of memory if there is no need (e.g. no z-scoring)
    #otargets = [ds.sa.targets for ds in datasets]
    datasets = [ds.copy(deep=False) for ds in datasets]
    #datasets = [Dataset(ds.samples.astype(float), sa={'targets': [None] * len(ds)})
    #datasets = [Dataset(ds.samples, sa={'targets': [None] * len(ds)})
    #            for ds in datasets]

    if params.zscore_all:
        if __debug__:
            debug('HPAL', "Z-scoring all datasets")
        # zscore them once while storing corresponding ZScoreMapper's
        zmappers = []
        for ids in xrange(len(datasets)):
            zmapper = ZScoreMapper(chunks_attr=None)
            zmappers.append(zmapper)
            zmapper.train(datasets[ids])
            datasets[ids] = zmapper.forward(datasets[ids])

    commonspace = np.asanyarray(datasets[ref_ds])
    if params.zscore_common and not params.zscore_all:
        if __debug__:
            debug('HPAL_',
                  "Creating copy of a commonspace and assuring "
                  "it is of a floating type")
        commonspace = commonspace.astype(float)
        zscore(commonspace, chunks_attr=None)

    data_mapped = [np.asanyarray(ds) for ds in datasets]
    #zscore(data_mapped[ref_ds], chunks_attr=None)
    for i, (m, ds_new) in enumerate(zip(mappers, datasets)):
        if __debug__:
            debug('HPAL_', "Level 1: ds #%i" % i)
        if i == ref_ds:
            continue
        #ds_new = ds.copy()
        #zscore(ds_new, chunks_attr=None)
        ds_new.targets = commonspace
        m.train(ds_new)
        ds_ = m.forward(np.asanyarray(ds_new))
        if params.zscore_common:
            zscore(ds_, chunks_attr=None)
        data_mapped[i] = ds_
        if residuals is not None:
            residuals[0, i] = np.linalg.norm(ds_ - commonspace)

        ## if ds_mapped == []:
        ##     ds_mapped = [zscore(m.forward(d), chunks_attr=None)]
        ## else:
        ##     ds_mapped += [zscore(m.forward(d), chunks_attr=None)]

        # zscore before adding
        # TODO: make just a function so we dont' waste space
        commonspace = params.combiner1(data_mapped[i], commonspace)
        if params.zscore_common:
            zscore(commonspace, chunks_attr=None)

    # update commonspace to mean of ds_mapped
    commonspace = params.combiner2(data_mapped)
    #if params.zscore_common:
    #    zscore(commonspace, chunks_attr=None)

    # Level 2 -- might iterate multiple times
    for loop in xrange(params.level2_niter):
        for i, (m, ds_new) in enumerate(zip(mappers, datasets)):
            if __debug__:
                debug('HPAL_', "Level 2 (%i-th iteration): ds #%i"
                      % (loop, i))
            ds_temp = (commonspace * ndatasets - data_mapped[i]) \
                      / (ndatasets - 1)
            if params.zscore_common:
                zscore(ds_temp, chunks_attr=None)
            #ds_new = ds.copy()
            #zscore(ds_new, chunks_attr=None)
            ds_new.targets = ds_temp  #commonspace #PRJ ds_temp
            m.train(ds_new)  # ds_temp)
            ds_ = m.forward(np.asanyarray(ds_new))
            if params.zscore_common:
                zscore(ds_, chunks_attr=None)
            data_mapped[i] = ds_
            if residuals is not None:
                residuals[1 + loop, i] = np.linalg.norm(ds_ - commonspace)

            #ds_mapped[i] = zscore( m.forward(ds_temp), chunks_attr=None)

        commonspace = params.combiner2(data_mapped)
        #if params.zscore_common:
        #    zscore(commonspace, chunks_attr=None)

    # Level 3 (last) to params.levels
    for i, (m, ds_new) in enumerate(zip(mappers, datasets)):
        if __debug__:
            debug('HPAL_', "Level 3: ds #%i" % i)
        #ds_new = ds.copy()     # shallow copy so we could assign new labels
        #zscore(ds_new, chunks_attr=None)
        ds_temp = (commonspace * ndatasets - data_mapped[i]) \
                  / (ndatasets - 1)
        if params.zscore_common:
            zscore(ds_temp, chunks_attr=None)
        ds_new.targets = ds_temp  #commonspace #PRJ ds_temp#
        m.train(ds_new)  # ds_temp)
        data_mapped[i] = m.forward(np.asanyarray(ds_new))
        if residuals is not None:
            residuals[-1, i] = np.linalg.norm(data_mapped[i] - commonspace)

    if params.zscore_all:
        # We need to construct new mappers which would chain
        # zscore and then final transformation
        return [ChainMapper([zm, m]) for zm, m in zip(zmappers, mappers)]
    else:
        return mappers
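# Composition sketch (plain NumPy stand-in, not the mvpa2 API): the
# ChainMapper([zm, m]) returned above simply runs the two forward steps in
# sequence -- z-scoring first, then the learned linear alignment.
import numpy as np

rng = np.random.RandomState(0)
X = rng.randn(10, 4)
R = np.linalg.qr(rng.randn(4, 4))[0]        # stand-in for a trained alignment
Xz = (X - X.mean(axis=0)) / X.std(axis=0)   # what the ZScoreMapper step does
aligned = Xz.dot(R)                         # what the alignment step does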