def apply_falign(ds, ha):
    subjects = ds.sa['subject_id'].unique
    rois = ds.fa['annotation'].unique  # FIXME: roi cannot be a fa
    hemis = ds.fa['hemi'].unique
    sds = []
    for subject in subjects:
        rds = []
        for roi in rois:
            hds = []
            for hemi in hemis:
                select = ({'subject_id': [subject]},
                          {'annotation': [roi], 'hemi': [hemi]})
                mds = ha[hemi][roi][subject].forward(ds[select])
                mds.fa['annotation'] = ds[select].fa['annotation']
                mds.fa['hemi'] = ds[select].fa['hemi']
                hds.append(mds)
            rds.append(hstack(hds))
        sds.append(hstack(rds))
    return vstack(sds)

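# Note on apply_falign(): judging from the lookup above, the `ha` argument is
# assumed to be a nested mapping of trained alignment mappers, indexed as
# ha[hemi][roi][subject], each exposing a .forward() method. A hypothetical
# layout (names made up for illustration):
# ha = {'lh': {'roi1': {'sub01': mapper, ...}, ...}, 'rh': {...}}
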
def run(args):
    dss = hdf2ds(args.data)
    verbose(3, 'Loaded %i dataset(s)' % len(dss))
    ds = vstack(dss)
    verbose(3, 'Concatenation yielded %i samples with %i features' % ds.shape)
    # slicing
    sliceme = {'samples': slice(None), 'features': slice(None)}
    # indices
    for opt, col, which in ((args.samples_by_index, ds.sa, 'samples'),
                            (args.features_by_index, ds.fa, 'features')):
        if opt is None:
            continue
        if len(opt) == 1 and opt[0].count(':'):
            # slice spec
            arg = opt[0].split(':')
            spec = []
            for a in arg:
                if not len(a):
                    spec.append(None)
                else:
                    spec.append(int(a))
            sliceme[which] = slice(*spec)
        else:
            # actual indices
            sliceme[which] = [int(o) for o in opt]
    # attribute evaluation
    for opt, col, which in ((args.samples_by_attr, ds.sa, 'samples'),
                            (args.features_by_attr, ds.fa, 'features')):
        if opt is None:
            continue
        sliceme[which] = _eval_attr_expr(opt, col)
    # apply selection
    ds = ds[sliceme['samples'], sliceme['features']]
    verbose(1, 'Selected %i samples with %i features' % ds.shape)
    # strip attributes
    for attrarg, col, descr in ((args.strip_sa, ds.sa, 'sample '),
                                (args.strip_fa, ds.fa, 'feature '),
                                (args.strip_da, ds.a, '')):
        if attrarg is not None:
            for attr in attrarg:
                try:
                    del col[attr]
                except KeyError:
                    warning("dataset has no %sattribute '%s' to remove"
                            % (descr, attr))
    # and store
    ds2hdf5(ds, args.output, compression=args.hdf5_compression)
    return ds

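# Standalone sketch of the ':'-separated slice-spec parsing in run() above;
# `parse_slice_spec` is a hypothetical helper name, not part of the original
# module, and mirrors the loop that builds `spec`.
def parse_slice_spec(spec):
    # '' -> None, '7' -> 7, so '10:20' -> slice(10, 20), '::2' -> slice(None, None, 2)
    return slice(*[int(a) if len(a) else None for a in spec.split(':')])

assert parse_slice_spec('10:20') == slice(10, 20)
assert parse_slice_spec('::2') == slice(None, None, 2)
assert parse_slice_spec(':50') == slice(None, 50)
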
def test_resample():
    time = np.linspace(0, 2 * np.pi, 100)
    ds = Dataset(np.vstack((np.sin(time), np.cos(time))).T,
                 sa={'time': time,
                     'section': np.repeat(range(10), 10)})
    assert_equal(ds.shape, (100, 2))

    # downsample
    num = 10
    rm = FFTResampleMapper(num, window=('gauss', 50),
                           position_attr='time',
                           attr_strategy='sample')
    mds = rm.forward(ds)
    assert_equal(mds.shape, (num, ds.nfeatures))
    # didn't change the orig
    assert_equal(len(ds), 100)

    # check position-based resampling
    ds_partial = ds[0::10]
    mds_partial = rm.forward(ds_partial)
    # despite different input sampling should yield the same output timepoints
    assert_array_almost_equal(mds.sa.time, mds_partial.sa.time)
    # exclude the first points to prevent edge effects, but the data should be
    # very similar too
    assert_array_almost_equal(mds.samples[2:], mds_partial.samples[2:],
                              decimal=2)
    # simple sample of sa's should give meaningful stuff
    assert_array_equal(mds.sa.section, range(10))

    # and now for a dataset with chunks
    cds = vstack([ds.copy(), ds.copy()])
    cds.sa['chunks'] = np.repeat([0, 1], len(ds))
    rm = FFTResampleMapper(num, attr_strategy='sample', chunks_attr='chunks',
                           window=('gauss', 50))
    mcds = rm.forward(cds)
    assert_equal(mcds.shape, (20, 2))
    assert_array_equal(mcds.sa.section, np.tile(range(10), 2))
    # each individual chunk should be identical to the previous dataset
    assert_array_almost_equal(mds.samples, mcds.samples[:10])
    assert_array_almost_equal(mds.samples, mcds.samples[10:])

def join_datasets(datasets, a=None):
    # if a is None:
    #     a = list(set(reduce(operator.add,
    #                         [ds.a.keys() for ds in datasets])))
    # vds = []
    # for ds in datasets:
    #     for k in a:
    #         if k not in ds.sa:
    #             if k in ds.a:
    #                 ds.sa[k] = [ds.a[k]] * ds.nsamples
    #             else:
    #                 ds.sa[k] = None
    #     vds.append(ds)
    vds = datasets
    return vstack(vds)

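# A minimal sketch of what the disabled block in join_datasets() would do if
# re-enabled: replicate a dataset-level attribute into a per-sample attribute
# so it survives vstack(). The attribute name 'scan' is made up for the
# illustration; import paths assume PyMVPA 2.x.
import numpy as np
from mvpa2.datasets import Dataset, vstack

ds1 = Dataset(np.zeros((2, 3)))
ds1.a['scan'] = 'first'
# promote the dataset attribute to one value per sample before stacking
ds1.sa['scan'] = [ds1.a['scan'].value] * ds1.nsamples
ds2 = Dataset(np.ones((2, 3)))
ds2.a['scan'] = 'second'
ds2.sa['scan'] = [ds2.a['scan'].value] * ds2.nsamples
merged = vstack([ds1, ds2])
assert list(merged.sa['scan'].value) == ['first', 'first', 'second', 'second']
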
def run(args):
    dss = hdf2ds(args.data)
    verbose(3, 'Loaded %i dataset(s)' % len(dss))
    ds = vstack(dss)
    verbose(3, 'Concatenation yielded %i samples with %i features' % ds.shape)
    # get CV instance
    cv = get_crossvalidation_instance(
        args.learner, args.partitioner, args.errorfx,
        args.sampling_repetitions, args.learner_space,
        args.balance_training, args.permutations,
        args.avg_datafold_results, args.prob_tail)
    res = cv(ds)
    # some meaningful output
    # XXX make condition on classification analysis only?
    print cv.ca.stats
    print 'Results\n-------'
    if args.permutations > 0:
        nprob = cv.ca.null_prob.samples
    if res.shape[1] == 1:
        # simple result structure
        if args.permutations > 0:
            p = ', p-value (%s tail)' % args.prob_tail
        else:
            p = ''
        print 'Fold, Result%s' % p
        for i in xrange(len(res)):
            if args.permutations > 0:
                p = ', %f' % nprob[i, 0]
            else:
                p = ''
            print '%s, %f%s' % (res.sa.cvfolds[i], res.samples[i, 0], p)
    # and store
    ds2hdf5(res, args.output, compression=args.hdf5_compression)
    if args.permutations > 0:
        if args.output.endswith('.hdf5'):
            args.output = args.output[:-5]
        ds2hdf5(cv.ca.null_prob, '%s_nullprob' % args.output,
                compression=args.hdf5_compression)
    return res

def apply_falign(ds, ha): subjects = ds.sa["subject_id"].unique rois = ds.fa["annotation"].unique # FIXME: roi cannot be a fa hemis = ds.fa["hemi"].unique rds = ds.copy() sds = [] for subject in subjects: rds = [] for roi in rois: hds = [] for hemi in hemis: select = ({"subject_id": [subject]}, {"annotation": [roi], "hemi": [hemi]}) mds = ha[hemi][roi][subject].forward(ds[select]) mds.fa["annotation"] = ds[select].fa["annotation"] mds.fa["hemi"] = ds[select].fa["hemi"] hds.append(mds) rds.append(hstack(hds)) sds.append(hstack(rds)) return vstack(sds)
def _call(self, ds):
    # local binding
    generator = self._generator
    node = self._node
    ca = self.ca
    space = self.get_space()
    concat_as = self._concat_as

    if self.ca.is_enabled("stats") and (not node.ca.has_key("stats")
                                        or not node.ca.is_enabled("stats")):
        warning("'stats' conditional attribute was enabled, but "
                "the assigned node '%s' either doesn't support it, "
                "or it is disabled" % node)

    # precharge conditional attributes
    ca.datasets = []

    # run the node on all generated datasets
    results = []
    for i, sds in enumerate(generator.generate(ds)):
        if __debug__:
            debug('REPM', "%d-th iteration of %s on %s", (i, self, sds))
        if ca.is_enabled("datasets"):
            # store dataset in ca
            ca.datasets.append(sds)
        # run the beast
        result = node(sds)
        # callback
        if self._callback is not None:
            self._callback(data=sds, node=node, result=result)
        # subclass postprocessing
        result = self._repetition_postcall(sds, node, result)
        if space:
            # XXX maybe try to get something more informative from the
            # processing node (e.g. in 0.5 it used to be 'chunks'->'chunks'
            # to indicate what was trained and what was tested. Now it is
            # more tricky, because `node` could be anything
            result.set_attr(space, (i,))
        # store
        results.append(result)

        if ca.is_enabled("stats") and node.ca.has_key("stats") \
           and node.ca.is_enabled("stats"):
            if not ca.is_set('stats'):
                # create empty stats container of matching type
                ca.stats = node.ca['stats'].value.__class__()
            # harvest summary stats
            ca['stats'].value.__iadd__(node.ca['stats'].value)

    # charge condition attribute
    self.ca.repetition_results = results

    # stack all results into a single Dataset
    if concat_as == 'samples':
        results = vstack(results)
    elif concat_as == 'features':
        results = hstack(results)
    else:
        raise ValueError("Unknown concatenation mode '%s'" % concat_as)
    # no need to store the raw results, since the Measure class will
    # automatically store them in a CA
    return results

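# Tiny illustration of the two concatenation modes at the end of _call():
# 'samples' stacks the per-repetition results along the sample axis,
# 'features' along the feature axis (PyMVPA's vstack/hstack, as imported
# elsewhere in this code base).
import numpy as np
from mvpa2.datasets import Dataset, hstack, vstack

parts = [Dataset(np.full((1, 4), i)) for i in range(3)]
assert vstack(parts).shape == (3, 4)   # concat_as='samples'
assert hstack(parts).shape == (1, 12)  # concat_as='features'
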
def movie_dataset(
        subj, preproc=None, base_path=os.curdir,
        fname_tmpl='sub-{subj:02d}/ses-movie/func/'
                   'sub-{subj:02d}_ses-movie_task-movie_run-{run}'
                   '_recording-eyegaze_physio.tsv.gz'):
    """Load eyegaze recordings from all runs and merge them into one
    consecutive timeseries

    The inter-segment overlap is removed during merging.

    Parameters
    ----------
    subj : int
      Subject code.
    preproc : callable or None
      Callable to preprocess a record array of the raw timeseries. The record
      array has the fields 'x', 'y', 'pupil', and 'movie_frame'. It needs to
      return a record array with the same fields and must not change the
      sampling rate or number of samples.
    base_path : path
      Base directory for input file discovery.
    fname_tmpl : str
      Template expression to match input files. Supports dict expansion with
      'subj' and 'run' keys.

    Returns
    -------
    Dataset
      The dataset contains a number of attributes, most of which should be
      self-explanatory. The `ds.a.run_duration_deviation` attribute quantifies
      the eyegaze recording duration difference from the expected value (in
      seconds).
    """
    # in frames (hand-verified by re-assembling in kdenlive -- using MELT
    # underneath)
    # and in line with phase2/code/stimulus/movie/segment_timing.csv
    seg_offsets = (0, 22150, 43802, 65304, 89305, 112007, 133559, 160261)
    movie_fps = 25.0
    eyegaze_sr = 1000.0  # Hz
    intersegment_overlap = 150  # frames

    segments = []
    for seg, offset in enumerate(seg_offsets):
        raw = np.recfromcsv(
            os.path.join(base_path, fname_tmpl.format(subj=subj, run=seg + 1)),
            delimiter='\t',
            names=('x', 'y', 'pupil', 'movie_frame'))
        if preproc is not None:
            raw = preproc(raw)
        # glue together to form a dataset
        ds = Dataset(np.array((raw.x, raw.y, raw.pupil)).T,
                     # movie frame idx is not zero-based in the files
                     sa=dict(movie_frame=raw.movie_frame - 1))
        ds.sa['movie_run_frame'] = ds.sa.movie_frame.copy()
        # turn into movie frame ID for the entire unsegmented movie
        ds.sa.movie_frame += offset
        ## truncate segment time series to remove overlap
        if seg < 7:
            # cut the end at a safe distance from the actual end, but inside
            # the overlap
            ds = ds[:-int(intersegment_overlap / movie_fps * eyegaze_sr)]
            # introduce an artificial blink at the end (see below)
            ds.samples[-100:] = np.nan
        if seg > 0:
            # cut the beginning to have a seamless start after the previous
            # segment
            ds = ds[ds.sa.movie_frame > segments[-1].sa.movie_frame.max()]
            # second half of the artificial blink. this is to avoid the
            # impression of a saccade at the point where two segments are
            # sewn together
            ds.samples[:100] = np.nan
        ds.sa['movie_run'] = [seg + 1] * len(ds)
        segments.append(ds)
    ds = vstack(segments)
    # column names
    ds.fa['name'] = ('x', 'y', 'pupil')
    ds.a['sampling_rate'] = eyegaze_sr
    ds.a['movie_fps'] = movie_fps
    return ds

def load_openfmri_ds(root, subject, mask=None, filterfun=None, TR=3.0):
    #### Helper functions for loading different parts of the data ####
    def read_condition_keys(DSROOT):
        def pick_fields(keys, line):
            fields = line.split()
            try:
                keys[fields[0]][fields[1]] = ' '.join(fields[2:])
            except KeyError:
                keys[fields[0]] = {fields[1]: ' '.join(fields[2:])}
            return keys

        with open(os.path.join(DSROOT,
                               'models/model001/condition_key.txt'),
                  'r') as keyfile:
            return reduce(pick_fields, keyfile, {})

    def parse_condition_onsets(path):
        condfiles = glob.glob(os.path.join(path, 'cond*'))
        timeline = []
        for cfile in condfiles:
            # strip the '.txt' extension from the condition file name
            cond_name = os.path.splitext(os.path.basename(cfile))[0]
            with open(cfile, 'r') as cfh:
                for line in cfh:
                    start, duration, weight = line.split()
                    timeline.append((float(start), float(duration), cond_name))
        timeline.sort()
        return timeline

    def extract_task_and_run(string):
        m = re.search('task([0-9]+)_run([0-9]+)', string)
        return int(m.group(1)), int(m.group(2))

    def load_run(runstring):
        ds = fmri_dataset(
            samples=os.path.join(root, subject, 'BOLD', runstring,
                                 'bold.nii.gz'),
            mask=mask)
        task, run = extract_task_and_run(runstring)
        ds.sa['chunks'] = np.empty(len(ds))
        ds.sa.chunks.fill(run)
        ds.sa['task'] = np.empty(len(ds))
        ds.sa.task.fill(task)
        return ds

    def merge_conditions_onto_ds(ds, onsets):
        targets = np.chararray(ds.shape[0], itemsize=17)
        targets.fill('rest')
        for cond in onsets:
            start, duration, condition = cond
            startidx = int(start / TR)
            endidx = int((start + duration) / TR)
            targets[startidx:endidx + 1] = condition_keys['task001'][condition]
        ds.sa['targets'] = targets

    ## Actual data loading begins here
    condition_keys = read_condition_keys(root)
    allruns = [os.path.basename(x)
               for x in glob.glob(os.path.join(root, subject, 'BOLD/task*'))]
    if filterfun:
        allruns = filter(filterfun, allruns)
    alldata = []
    for run in allruns:
        ds = load_run(run)
        onsets = parse_condition_onsets(
            os.path.join(root, subject, 'model/model001/onsets/', run))
        merge_conditions_onto_ds(ds, onsets)
        alldata.append(ds)
    merged = vstack(alldata)
    merged.a.update(alldata[0].a)
    return merged

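# Standalone check of the onset-to-volume mapping in merge_conditions_onto_ds()
# above: an event starting at 6.0 s and lasting 9.0 s with TR=3.0 labels
# volumes 2 through 5 (endidx is inclusive via the slice startidx:endidx+1).
TR = 3.0
start, duration = 6.0, 9.0
startidx = int(start / TR)
endidx = int((start + duration) / TR)
assert (startidx, endidx) == (2, 5)
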
def get_model_bold_dataset(self, model_id, subj_id, run_ids=None,
                           preproc_img=None, preproc_ds=None, modelfx=None,
                           stack=True, flavor=None, mask=None, add_fa=None,
                           add_sa=None, **kwargs):
    """Build a PyMVPA dataset for a model defined in the OpenFMRI dataset

    Parameters
    ----------
    model_id : int
      Model ID.
    subj_id : int or str or list
      Integer, or string ID of the subject whose data shall be considered.
      Alternatively, a list of IDs can be given and data from all matching
      subjects will be loaded at once.
    run_ids : list, optional
      Run IDs to be loaded. If None, all runs get loaded.
    preproc_img : callable or None
      See get_bold_run_dataset() documentation.
    preproc_ds : callable or None
      If not None, this callable will be called with each run bold dataset
      as an argument before ``modelfx`` is executed. The callable must
      return a dataset.
    modelfx : callable or None
      This callable will be called with each run dataset and the respective
      event list for each run as arguments. In addition, all other **kwargs
      of this method will be passed on to this callable. The callable must
      return a dataset. If None, ``assign_conditionlabels`` will be used as
      a default callable.
    stack : boolean
      Flag whether to stack all run datasets into a single dataset, or
      whether to return a list of datasets.
    flavor
      See get_bold_run_dataset() documentation.
    mask
      See fmri_dataset() documentation.
    add_fa
      See fmri_dataset() documentation.
    add_sa
      See get_bold_run_dataset() documentation.

    Returns
    -------
    Dataset or list
      Depending on the ``stack`` argument either a single dataset or a list
      of datasets for all subject/task/run combinations relevant to the
      model will be returned. In the stacked case the dataset attributes of
      the returned dataset are taken from the first run dataset, and are
      assumed to be identical for all of them.
    """
    if modelfx is None:
        # loading a model dataset without actually considering the model
        # probably makes little sense, so at least create an attribute
        from mvpa2.datasets.eventrelated import assign_conditionlabels
        modelfx = assign_conditionlabels
    conds = self.get_model_conditions(model_id)
    # what tasks do we need to consider for this model
    tasks = np.unique([c['task'] for c in conds])
    if isinstance(subj_id, (int, str)):
        subj_id = [subj_id]
    dss = []
    for sub in subj_id:
        # we need to loop over tasks first in order to be able to determine
        # what runs exist: that means we have to load the model info
        # repeatedly
        for task in tasks:
            run_ids_ = run_ids \
                if run_ids is not None \
                else self.get_bold_run_ids(sub, task)
            for i, run in enumerate(run_ids_):
                events = self.get_bold_run_model(model_id, sub, run)
                # at this point our events should only contain those
                # matching the current task. If not, this model violates
                # the implicit assumption that one condition (label) can
                # only be present in a single task. The current OpenFMRI
                # spec does not allow for a more complex setup. I think
                # this is worth a runtime check
                check_events = [ev for ev in events if ev['task'] == task]
                if not len(check_events) == len(events):
                    warning(
                        "not all event specifications match the expected "
                        "task ID -- something is wrong -- check that each "
                        "model condition label is only associated with a "
                        "single task")
                if not len(events):
                    # nothing in this run for the given model;
                    # it could be argued whether we'd still want this data
                    # loaded
                    # XXX maybe a flag?
                    continue
                d = self.get_bold_run_dataset(sub, task, run=run,
                                              flavor=flavor,
                                              preproc_img=preproc_img,
                                              chunks=i, mask=mask,
                                              add_fa=add_fa, add_sa=add_sa)
                if preproc_ds is not None:
                    d = preproc_ds(d)
                d = modelfx(
                    d, events,
                    **dict([(k, v) for k, v in kwargs.items()
                            if not k in ('preproc_img', 'preproc_ds',
                                         'modelfx', 'stack', 'flavor',
                                         'mask', 'add_fa', 'add_sa')]))
                # if the modelfx doesn't leave 'chunks' information, we put
                # something minimal in
                for attr, info in (('chunks', i), ('run', run),
                                   ('subj', sub)):
                    if not attr in d.sa:
                        d.sa[attr] = [info] * len(d)
                dss.append(d)
    if stack:
        dss = vstack(dss, a=0)
    return dss

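# Hedged usage sketch for get_model_bold_dataset(); the dataset path, mask
# filename, and subject IDs are hypothetical:
#
#   from mvpa2.datasets.sources import OpenFMRIDataset
#   of = OpenFMRIDataset('/path/to/openfmri_dataset')
#   ds = of.get_model_bold_dataset(model_id=1, subj_id=[1, 2],
#                                  mask='brain_mask.nii.gz')
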
def movie_dataset(
        subj, preproc=None, base_path=os.curdir,
        fname_tmpl='sub-%(subj)s/ses-movie/func/'
                   'sub-%(subj)s_ses-movie_task-movie_run-%(run)i'
                   '_recording-eyegaze_physio.tsv.gz'):
    """Load eyegaze recordings from all runs and merge them into one
    consecutive timeseries

    The inter-segment overlap is removed during merging.

    Parameters
    ----------
    subj : str
      Subject code.
    preproc : callable or None
      Callable to preprocess a record array of the raw timeseries. The record
      array has the fields 'x', 'y', 'pupil', and 'movie_frame'. It needs to
      return a record array with the same fields and must not change the
      sampling rate or number of samples.
    base_path : path
      Base directory for input file discovery.
    fname_tmpl : str
      Template expression to match input files. Supports dict expansion with
      'subj' and 'run' keys.

    Returns
    -------
    Dataset
      The dataset contains a number of attributes, most of which should be
      self-explanatory. The `ds.a.run_duration_deviation` attribute quantifies
      the eyegaze recording duration difference from the expected value (in
      seconds).
    """
    # in frames (hand-verified by re-assembling in kdenlive -- using MELT
    # underneath)
    seg_offsets = (0, 22150, 43802, 65304, 89305, 112007, 133559, 160261)
    movie_fps = 25.0
    eyegaze_sr = 1000.0  # Hz
    intersegment_overlap = 400  # frames

    segments = []
    for seg, offset in enumerate(seg_offsets):
        raw = np.recfromcsv(
            os.path.join(base_path, fname_tmpl % dict(subj=subj, run=seg + 1)),
            delimiter='\t',
            names=('x', 'y', 'pupil', 'movie_frame'))
        if preproc is not None:
            raw = preproc(raw)
        # glue together to form a dataset
        ds = Dataset(np.array((raw.x, raw.y, raw.pupil)).T,
                     sa=dict(movie_frame=raw.movie_frame))
        ds.sa['movie_run_frame'] = ds.sa.movie_frame.copy()
        # turn into movie frame ID for the entire unsegmented movie
        ds.sa.movie_frame += offset
        ## truncate segment time series to remove overlap
        if seg < 7:
            # cut the end at a safe distance from the actual end, but inside
            # the overlap
            ds = ds[:-int(intersegment_overlap / movie_fps * eyegaze_sr)]
        if seg > 0:
            # cut the beginning to have a seamless start after the previous
            # segment
            ds = ds[ds.sa.movie_frame > segments[-1].sa.movie_frame.max()]
        ds.sa['movie_run'] = [seg + 1] * len(ds)
        segments.append(ds)
    ds = vstack(segments)
    # column names
    ds.fa['name'] = ('x', 'y', 'pupil')
    ds.a['sampling_rate'] = eyegaze_sr
    ds.a['movie_fps'] = movie_fps
    return ds

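# The number of eyegaze samples trimmed from a segment's tail above is the
# overlap converted from movie frames to recording samples:
# frames / fps * sampling_rate. For the two variants of this function:
assert int(400 / 25.0 * 1000.0) == 16000  # intersegment_overlap = 400
assert int(150 / 25.0 * 1000.0) == 6000   # intersegment_overlap = 150
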
def test_rfe_sensmap():
    # http://lists.alioth.debian.org/pipermail/pkg-exppsy-pymvpa/2013q3/002538.html
    # just a smoke test. fails with
    from mvpa2.clfs.svm import LinearCSVMC
    from mvpa2.clfs.meta import FeatureSelectionClassifier
    from mvpa2.measures.base import CrossValidation, RepeatedMeasure
    from mvpa2.generators.splitters import Splitter
    from mvpa2.generators.partition import NFoldPartitioner
    from mvpa2.misc.errorfx import mean_mismatch_error
    from mvpa2.mappers.fx import mean_sample
    from mvpa2.mappers.fx import maxofabs_sample
    from mvpa2.generators.base import Repeater
    from mvpa2.featsel.rfe import RFE
    from mvpa2.featsel.helpers import FractionTailSelector, BestDetector
    from mvpa2.featsel.helpers import NBackHistoryStopCrit
    from mvpa2.datasets import vstack
    from mvpa2.misc.data_generators import normal_feature_dataset

    # Let's simulate the beast -- 6 categories total grouped into 3
    # super-ordinate, and actually without any 'superordinate' effect
    # since the subordinate categories are independent
    fds = normal_feature_dataset(nlabels=3,
                                 snr=1,  # 100, # pure signal! ;)
                                 perlabel=9,
                                 nfeatures=6,
                                 nonbogus_features=range(3),
                                 nchunks=3)
    clfsvm = LinearCSVMC()

    rfesvm = RFE(clfsvm.get_sensitivity_analyzer(postproc=maxofabs_sample()),
                 CrossValidation(
                     clfsvm,
                     NFoldPartitioner(),
                     errorfx=mean_mismatch_error, postproc=mean_sample()),
                 Repeater(2),
                 fselector=FractionTailSelector(0.70, mode='select',
                                                tail='upper'),
                 stopping_criterion=NBackHistoryStopCrit(BestDetector(), 10),
                 update_sensitivity=True)

    fclfsvm = FeatureSelectionClassifier(clfsvm, rfesvm)
    sensanasvm = fclfsvm.get_sensitivity_analyzer(postproc=maxofabs_sample())

    # manually repeating/splitting so we do both RFE sensitivity and
    # classification
    senses, errors = [], []
    for i, pset in enumerate(NFoldPartitioner().generate(fds)):
        # split partitioned dataset
        split = [d for d in Splitter('partitions').generate(pset)]
        senses.append(sensanasvm(split[0]))
        # and it also should train the classifier so we would ask it about
        # error
        errors.append(mean_mismatch_error(fclfsvm.predict(split[1]),
                                          split[1].targets))
    senses = vstack(senses)
    errors = vstack(errors)

    # Let's compare against rerunning the beast simply for classification
    # with CV
    errors_cv = CrossValidation(fclfsvm, NFoldPartitioner(),
                                errorfx=mean_mismatch_error)(fds)
    # and they should match
    assert_array_equal(errors, errors_cv)

    # buggy!
    cv_sensana_svm = RepeatedMeasure(sensanasvm, NFoldPartitioner())
    senses_rm = cv_sensana_svm(fds)

    #print senses.samples, senses_rm.samples
    #print errors, errors_cv.samples
    assert_raises(AssertionError,
                  assert_array_almost_equal,
                  senses.samples, senses_rm.samples)
    raise SkipTest("Known failure for repeated measures: "
                   "https://github.com/PyMVPA/PyMVPA/issues/117")

def run(args):
    if os.path.isfile(args.payload) and args.payload.endswith('.py'):
        measure = script2obj(args.payload)
    elif args.payload == 'cv':
        if args.cv_learner is None or args.cv_partitioner is None:
            raise ValueError(
                'cross-validation payload requires --learner and --partitioner')
        # get CV instance
        measure = get_crossvalidation_instance(
            args.cv_learner, args.cv_partitioner, args.cv_errorfx,
            args.cv_sampling_repetitions, args.cv_learner_space,
            args.cv_balance_training, args.cv_permutations,
            args.cv_avg_datafold_results, args.cv_prob_tail)
    else:
        raise RuntimeError("this should not happen")
    dss = hdf2ds(args.data)
    verbose(3, 'Loaded %i dataset(s)' % len(dss))
    ds = vstack(dss)
    verbose(3, 'Concatenation yielded %i samples with %i features' % ds.shape)
    # setup neighborhood
    # XXX add big switch to allow for setting up surface-based neighborhoods
    from mvpa2.misc.neighborhood import IndexQueryEngine
    qe = IndexQueryEngine(**dict(args.neighbors))
    # determine ROIs
    rids = None  # all by default
    aggregate_fx = args.aggregate_fx
    if args.roi_attr is not None:
        # first figure out which ROI features should be processed
        if len(args.roi_attr) == 1 and args.roi_attr[0] in ds.fa.keys():
            # name of an attribute -> pull non-zeroes
            rids = ds.fa[args.roi_attr[0]].value.nonzero()[0]
        else:
            # an expression?
            from .cmd_select import _eval_attr_expr
            rids = _eval_attr_expr(args.roi_attr, ds.fa).nonzero()[0]
    seed_ids = None
    if args.scatter_rois is not None:
        # scatter neighborhoods among the available ids if requested
        from mvpa2.misc.neighborhood import scatter_neighborhoods
        attr, nb = args.scatter_rois
        coords = ds.fa[attr].value
        if rids is not None:
            # select only those which were chosen by ROI
            coords = coords[rids]
        _, seed_ids = scatter_neighborhoods(nb, coords)
        if aggregate_fx is None:
            # no custom one given -> use default "fill in" function
            aggregate_fx = _fill_in_scattered_results
            if args.enable_ca is None:
                args.enable_ca = ['roi_feature_ids']
            elif 'roi_feature_ids' not in args.enable_ca:
                args.enable_ca += ['roi_feature_ids']
    if seed_ids is None:
        roi_ids = rids
    else:
        if rids is not None:
            # we had to sub-select by scattering among the available rids,
            # so we need to map back to the original ids
            roi_ids = rids[seed_ids]
        else:
            # scattering happened on the entire feature set
            roi_ids = seed_ids
    verbose(3, 'Attempting %i ROI analyses'
               % ((roi_ids is None) and ds.nfeatures or len(roi_ids)))
    from mvpa2.measures.searchlight import Searchlight
    sl = Searchlight(measure,
                     queryengine=qe,
                     roi_ids=roi_ids,
                     nproc=args.nproc,
                     results_backend=args.multiproc_backend,
                     results_fx=aggregate_fx,
                     enable_ca=args.enable_ca,
                     disable_ca=args.disable_ca)
    # XXX support me too!
    # add_center_fa
    # tmp_prefix
    # nblocks
    # null_dist
    # run
    res = sl(ds)
    if (seed_ids is not None) and ('mapper' in res.a):
        # strip the last mapper link in the chain, which would be the seed
        # ID selection
        res.a['mapper'] = res.a.mapper[:-1]
    # XXX create more output
    # and store
    ds2hdf5(res, args.output, compression=args.hdf5_compression)
    return res

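# For reference, a concrete instance of the neighborhood setup above: pairing
# the 'voxel_indices' feature attribute with a sphere of radius 3 (the
# attribute name is assumed to exist on the dataset; Sphere is PyMVPA's
# standard volumetric neighborhood).
from mvpa2.misc.neighborhood import IndexQueryEngine, Sphere

qe = IndexQueryEngine(voxel_indices=Sphere(3))
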
def get_model_bold_dataset(self, model_id, subj_id, preprocfx=None,
                           modelfx=None, stack=True, flavor=None, mask=None,
                           add_fa=None, add_sa=None, **kwargs):
    """Build a PyMVPA dataset for a model defined in the OpenFMRI dataset

    Parameters
    ----------
    model_id : int
      Model ID.
    subj_id : int or str or list
      Integer, or string ID of the subject whose data shall be considered.
      Alternatively, a list of IDs can be given and data from all matching
      subjects will be loaded at once.
    preprocfx : callable or None
      If not None, this callable will be called with each run bold dataset
      as an argument before ``modelfx`` is executed. The callable must
      return a dataset.
    modelfx : callable or None
      This callable will be called with each run dataset and the respective
      event list for each run as arguments. In addition, all other **kwargs
      of this method will be passed on to this callable. The callable must
      return a dataset. If None, ``assign_conditionlabels`` will be used as
      a default callable.
    stack : boolean
      Flag whether to stack all run datasets into a single dataset, or
      whether to return a list of datasets.
    flavor
      See get_bold_run_dataset() documentation.
    mask
      See fmri_dataset() documentation.
    add_fa
      See fmri_dataset() documentation.
    add_sa
      See get_bold_run_dataset() documentation.

    Returns
    -------
    Dataset or list
      Depending on the ``stack`` argument either a single dataset or a list
      of datasets for all subject/task/run combinations relevant to the
      model will be returned. In the stacked case the dataset attributes of
      the returned dataset are taken from the first run dataset, and are
      assumed to be identical for all of them.
    """
    if modelfx is None:
        # loading a model dataset without actually considering the model
        # probably makes little sense, so at least create an attribute
        from mvpa2.datasets.eventrelated import assign_conditionlabels
        modelfx = assign_conditionlabels
    conds = self.get_model_conditions(model_id)
    # what tasks do we need to consider for this model
    tasks = np.unique([c['task'] for c in conds])
    if isinstance(subj_id, (int, basestring)):
        subj_id = [subj_id]
    dss = []
    for sub in subj_id:
        # we need to loop over tasks first in order to be able to determine
        # what runs exist: that means we have to load the model info
        # repeatedly
        for task in tasks:
            for run in self.get_bold_run_ids(sub, task):
                events = self.get_bold_run_model(model_id, sub, run)
                # at this point our events should only contain those
                # matching the current task. If not, this model violates
                # the implicit assumption that one condition (label) can
                # only be present in a single task. The current OpenFMRI
                # spec does not allow for a more complex setup. I think
                # this is worth a runtime check
                check_events = [ev for ev in events if ev['task'] == task]
                if not len(check_events) == len(events):
                    warning(
                        "not all event specifications match the expected "
                        "task ID -- something is wrong -- check that each "
                        "model condition label is only associated with a "
                        "single task")
                if not len(events):
                    # nothing in this run for the given model;
                    # it could be argued whether we'd still want this data
                    # loaded
                    # XXX maybe a flag?
                    continue
                d = self.get_bold_run_dataset(sub, task, run=run,
                                              flavor=flavor, chunks=run,
                                              mask=mask, add_fa=add_fa,
                                              add_sa=add_sa)
                if preprocfx is not None:
                    d = preprocfx(d)
                d = modelfx(d, events, **kwargs)
                # if the modelfx doesn't leave 'chunks' information, we put
                # something minimal in
                for attr, info in (('chunks', run), ('subj', sub)):
                    if not attr in d.sa:
                        d.sa[attr] = [info] * len(d)
                dss.append(d)
    if stack:
        dss = vstack(dss, a=0)
    return dss

def get_model_bold_dataset(self, model_id, subj_id, preprocfx=None,
                           modelfx=None, stack=True, flavor=None, mask=None,
                           add_fa=None, add_sa=None, **kwargs):
    """Build a PyMVPA dataset for a model defined in the OpenFMRI dataset

    Parameters
    ----------
    model_id : int
      Model ID.
    subj_id : int or str or list
      Integer, or string ID of the subject whose data shall be considered.
      Alternatively, a list of IDs can be given and data from all matching
      subjects will be loaded at once.
    preprocfx : callable or None
      If not None, this callable will be called with each run bold dataset
      as an argument before ``modelfx`` is executed. The callable must
      return a dataset.
    modelfx : callable or None
      This callable will be called with each run dataset and the respective
      event list for each run as arguments. In addition, all other **kwargs
      of this method will be passed on to this callable. The callable must
      return a dataset. If None, ``assign_conditionlabels`` will be used as
      a default callable.
    stack : boolean
      Flag whether to stack all run datasets into a single dataset, or
      whether to return a list of datasets.
    flavor
      See get_bold_run_dataset() documentation.
    mask
      See fmri_dataset() documentation.
    add_fa
      See fmri_dataset() documentation.
    add_sa
      See get_bold_run_dataset() documentation.

    Returns
    -------
    Dataset or list
      Depending on the ``stack`` argument either a single dataset or a list
      of datasets for all subject/task/run combinations relevant to the
      model will be returned. In the stacked case the dataset attributes of
      the returned dataset are taken from the first run dataset, and are
      assumed to be identical for all of them.
    """
    if modelfx is None:
        # loading a model dataset without actually considering the model
        # probably makes little sense, so at least create an attribute
        from mvpa2.datasets.eventrelated import assign_conditionlabels
        modelfx = assign_conditionlabels
    conds = self.get_model_conditions(model_id)
    # what tasks do we need to consider for this model
    tasks = np.unique([c['task'] for c in conds])
    if isinstance(subj_id, (int, basestring)):
        subj_id = [subj_id]
    dss = []
    for sub in subj_id:
        for task in tasks:
            for run in self.get_bold_run_ids(sub, task):
                events = self.get_bold_run_model(model_id, sub, run)
                if not len(events):
                    # nothing in this run for the given model;
                    # it could be argued whether we'd still want this data
                    # loaded
                    # XXX maybe a flag?
                    continue
                d = self.get_bold_run_dataset(sub, task, run=run,
                                              flavor=flavor, chunks=run,
                                              mask=mask, add_fa=add_fa,
                                              add_sa=add_sa)
                if preprocfx is not None:
                    d = preprocfx(d)
                d = modelfx(d, events, **kwargs)
                # if the modelfx doesn't leave 'chunks' information, we put
                # something minimal in
                if not 'chunks' in d.sa:
                    d.sa['chunks'] = [run] * len(d)
                dss.append(d)
    if stack:
        dss = vstack(dss, a=0)
    return dss