Example #1
def apply_falign(ds, ha):

    subjects = ds.sa['subject_id'].unique
    rois = ds.fa['annotation'].unique  #FIXME: roi cannot be a fa
    hemis = ds.fa['hemi'].unique

    sds = []
    for subject in subjects:
        rds = []
        for roi in rois:
            hds = []
            for hemi in hemis:
                select = ({
                    'subject_id': [subject]
                }, {
                    'annotation': [roi],
                    'hemi': [hemi]
                })
                mds = ha[hemi][roi][subject].forward(ds[select])
                mds.fa['annotation'] = ds[select].fa['annotation']
                mds.fa['hemi'] = ds[select].fa['hemi']
                hds.append(mds)
            rds.append(hstack(hds))
        sds.append(hstack(rds))
    return vstack(sds)
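The `ds[select]` calls above use attribute-value based selection, where a tuple of two dicts picks samples (by sa values) and features (by fa values) in one step. A tiny, hypothetical illustration with toy attribute values:

import numpy as np
from mvpa2.datasets import Dataset

toy = Dataset(np.arange(12).reshape(3, 4),
              sa={'subject_id': ['s1', 's1', 's2']},
              fa={'hemi': ['lh', 'lh', 'rh', 'rh'],
                  'annotation': ['V1', 'V2', 'V1', 'V2']})
select = ({'subject_id': ['s1']}, {'annotation': ['V1'], 'hemi': ['lh']})
sub = toy[select]   # 2 samples (subject 's1') x 1 feature (the 'lh'/'V1' one)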
Example #2
def run(args):
    dss = hdf2ds(args.data)
    verbose(3, 'Loaded %i dataset(s)' % len(dss))
    ds = vstack(dss)
    verbose(3, 'Concatenation yielded %i samples with %i features' % ds.shape)
    # slicing
    sliceme = {'samples': slice(None), 'features': slice(None)}
    # indices
    for opt, col, which in ((args.samples_by_index, ds.sa, 'samples'),
                     (args.features_by_index, ds.fa, 'features')):
        if opt is None:
            continue
        if len(opt) == 1 and opt[0].count(':'):
            # slice spec
            arg = opt[0].split(':')
            spec = []
            for a in arg:
                if not len(a):
                    spec.append(None)
                else:
                    spec.append(int(a))
            sliceme[which] = slice(*spec)
        else:
            # actual indices
            sliceme[which] = [int(o) for o in opt]
    # attribute evaluation
    for opt, col, which in ((args.samples_by_attr, ds.sa, 'samples'),
                     (args.features_by_attr, ds.fa, 'features')):
        if opt is None:
            continue
        sliceme[which] = _eval_attr_expr(opt, col)

    # apply selection
    ds = ds.__getitem__((sliceme['samples'], sliceme['features']))
    verbose(1, 'Selected %i samples with %i features' % ds.shape)

    # strip attributes
    for attrarg, col, descr in ((args.strip_sa, ds.sa, 'sample '),
                                (args.strip_fa, ds.fa, 'feature '),
                                (args.strip_da, ds.a, '')):
        if not attrarg is None:
            for attr in attrarg:
                try:
                    del col[attr]
                except KeyError:
                    warning("dataset has no %sattribute '%s' to remove"
                            % (descr, attr))
    # and store
    ds2hdf5(ds, args.output, compression=args.hdf5_compression)
    return ds
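For reference, the index-based selection above accepts either a single 'start:stop[:step]' token or a list of explicit indices; a small standalone sketch of that parsing with made-up inputs:

opt = ['10:100:2']                     # one colon-containing token -> slice spec
spec = [int(a) if len(a) else None for a in opt[0].split(':')]
assert slice(*spec) == slice(10, 100, 2)

opt = ['3', '5', '8']                  # otherwise: explicit indices
assert [int(o) for o in opt] == [3, 5, 8]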
Example #3
def test_resample():
    time = np.linspace(0, 2 * np.pi, 100)
    ds = Dataset(np.vstack((np.sin(time), np.cos(time))).T,
                 sa={
                     'time': time,
                     'section': np.repeat(range(10), 10)
                 })
    assert_equal(ds.shape, (100, 2))

    # downsample
    num = 10
    rm = FFTResampleMapper(num,
                           window=('gauss', 50),
                           position_attr='time',
                           attr_strategy='sample')
    mds = rm.forward(ds)
    assert_equal(mds.shape, (num, ds.nfeatures))
    # didn't change the orig
    assert_equal(len(ds), 100)

    # check position-based resampling
    ds_partial = ds[0::10]
    mds_partial = rm.forward(ds_partial)
    # despite different input sampling should yield the same output timepoints
    assert_array_almost_equal(mds.sa.time, mds_partial.sa.time)
    # exclude the first points to prevent edge effects, but the data should be
    # very similar too
    assert_array_almost_equal(mds.samples[2:],
                              mds_partial.samples[2:],
                              decimal=2)
    # simple sample of sa's should give meaningful stuff
    assert_array_equal(mds.sa.section, range(10))

    # and now for a dataset with chunks
    cds = vstack([ds.copy(), ds.copy()])
    cds.sa['chunks'] = np.repeat([0, 1], len(ds))
    rm = FFTResampleMapper(num,
                           attr_strategy='sample',
                           chunks_attr='chunks',
                           window=('gauss', 50))
    mcds = rm.forward(cds)
    assert_equal(mcds.shape, (20, 2))
    assert_array_equal(mcds.sa.section, np.tile(range(10), 2))
    # each individual chunks should be identical to previous dataset
    assert_array_almost_equal(mds.samples, mcds.samples[:10])
    assert_array_almost_equal(mds.samples, mcds.samples[10:])
Example #4
def join_datasets(datasets, a=None):

    # if a is None:
    #     a = list(set(reduce(operator.add,
    #                         [ds.a.keys() for ds in datasets])))

    # vds = []
    # for ds in datasets:
    #     for k in a:
    #         if k not in ds.sa:
    #             if k in ds.a:
    #                 ds.sa[k] = [ds.a[k]]*ds.nsamples
    #             else:
    #                 ds.sa[k] = None
    #     vds.append(ds)
    vds = datasets

    return vstack(vds)
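A minimal sketch of what the commented-out block appears to be aiming for: promoting per-dataset attributes (ds.a) to per-sample attributes (ds.sa) so that the information survives vstack(). This is an assumption about the intent, and the attribute names below are made up:

import numpy as np
from mvpa2.datasets import Dataset, vstack

def join_datasets_promoting_a(datasets, a=None):
    if a is None:
        # union of all dataset-attribute names across the inputs
        a = set()
        for ds in datasets:
            a.update(ds.a.keys())
    for ds in datasets:
        for k in a:
            if k not in ds.sa:
                # replicate the dataset attribute for every sample (None if absent)
                ds.sa[k] = [ds.a[k].value if k in ds.a else None] * ds.nsamples
            if k in ds.a:
                # drop it so vstack() does not have to reconcile differing .a
                del ds.a[k]
    return vstack(datasets)

d1 = Dataset(np.zeros((2, 3)))
d1.a['site'] = 'A'                     # made-up dataset attribute
d2 = Dataset(np.ones((2, 3)))
joined = join_datasets_promoting_a([d1, d2])
# joined.sa.site -> ['A', 'A', None, None]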
Example #5
def join_datasets(datasets, a=None):

    # if a is None:
    #     a = list(set(reduce(operator.add,
    #                         [ds.a.keys() for ds in datasets])))

    # vds = []
    # for ds in datasets:
    #     for k in a:
    #         if k not in ds.sa:
    #             if k in ds.a:
    #                 ds.sa[k] = [ds.a[k]]*ds.nsamples
    #             else:
    #                 ds.sa[k] = None
    #     vds.append(ds)
    vds = datasets

    return vstack(vds)
Example #6
def run(args):
    dss = hdf2ds(args.data)
    verbose(3, 'Loaded %i dataset(s)' % len(dss))
    ds = vstack(dss)
    verbose(3, 'Concatenation yielded %i samples with %i features' % ds.shape)
    # get CV instance
    cv = get_crossvalidation_instance(args.learner, args.partitioner,
                                      args.errorfx, args.sampling_repetitions,
                                      args.learner_space,
                                      args.balance_training, args.permutations,
                                      args.avg_datafold_results,
                                      args.prob_tail)
    res = cv(ds)
    # some meaningful output
    # XXX make condition on classification analysis only?
    print cv.ca.stats
    print 'Results\n-------'
    if args.permutations > 0:
        nprob = cv.ca.null_prob.samples
    if res.shape[1] == 1:
        # simple result structure
        if args.permutations > 0:
            p = ', p-value (%s tail)' % args.prob_tail
        else:
            p = ''
        print 'Fold, Result%s' % p
        for i in xrange(len(res)):
            if args.permutations > 0:
                p = ', %f' % nprob[i, 0]
            else:
                p = ''
            print '%s, %f%s' % (res.sa.cvfolds[i], res.samples[i, 0], p)
    # and store
    ds2hdf5(res, args.output, compression=args.hdf5_compression)
    if args.permutations > 0:
        if args.output.endswith('.hdf5'):
            args.output = args.output[:-5]
        ds2hdf5(cv.ca.null_prob,
                '%s_nullprob' % args.output,
                compression=args.hdf5_compression)
    return res
Example #7
def test_resample():
    time = np.linspace(0, 2*np.pi, 100)
    ds = Dataset(np.vstack((np.sin(time), np.cos(time))).T,
                 sa = {'time': time,
                       'section': np.repeat(range(10), 10)})
    assert_equal(ds.shape, (100, 2))

    # downsample
    num = 10
    rm = FFTResampleMapper(num, window=('gauss', 50),
                           position_attr='time',
                           attr_strategy='sample')
    mds = rm.forward(ds)
    assert_equal(mds.shape, (num, ds.nfeatures))
    # didn't change the orig
    assert_equal(len(ds), 100)

    # check position-based resampling
    ds_partial = ds[0::10]
    mds_partial = rm.forward(ds_partial)
    # despite different input sampling should yield the same output timepoints
    assert_array_almost_equal(mds.sa.time, mds_partial.sa.time)
    # exclude the first points to prevent edge effects, but the data should be
    # very similar too
    assert_array_almost_equal(mds.samples[2:], mds_partial.samples[2:], decimal=2)
    # simple sample of sa's should give meaningful stuff
    assert_array_equal(mds.sa.section, range(10))

    # and now for a dataset with chunks
    cds = vstack([ds.copy(), ds.copy()])
    cds.sa['chunks'] = np.repeat([0,1], len(ds))
    rm = FFTResampleMapper(num, attr_strategy='sample', chunks_attr='chunks',
                           window=('gauss', 50))
    mcds = rm.forward(cds)
    assert_equal(mcds.shape, (20, 2))
    assert_array_equal(mcds.sa.section, np.tile(range(10),2))
    # each individual chunks should be identical to previous dataset
    assert_array_almost_equal(mds.samples, mcds.samples[:10])
    assert_array_almost_equal(mds.samples, mcds.samples[10:])
Example #8
def apply_falign(ds, ha):

    subjects = ds.sa["subject_id"].unique
    rois = ds.fa["annotation"].unique  # FIXME: roi cannot be a fa
    hemis = ds.fa["hemi"].unique

    sds = []
    for subject in subjects:
        rds = []
        for roi in rois:
            hds = []
            for hemi in hemis:
                select = ({"subject_id": [subject]}, {"annotation": [roi], "hemi": [hemi]})
                mds = ha[hemi][roi][subject].forward(ds[select])
                mds.fa["annotation"] = ds[select].fa["annotation"]
                mds.fa["hemi"] = ds[select].fa["hemi"]
                hds.append(mds)
            rds.append(hstack(hds))
        sds.append(hstack(rds))
    return vstack(sds)
Example #9
def run(args):
    dss = hdf2ds(args.data)
    verbose(3, 'Loaded %i dataset(s)' % len(dss))
    ds = vstack(dss)
    verbose(3, 'Concatenation yielded %i samples with %i features' % ds.shape)
    # get CV instance
    cv = get_crossvalidation_instance(
            args.learner, args.partitioner, args.errorfx, args.sampling_repetitions,
            args.learner_space, args.balance_training, args.permutations,
            args.avg_datafold_results, args.prob_tail)
    res = cv(ds)
    # some meaningful output
    # XXX make condition on classification analysis only?
    print cv.ca.stats
    print 'Results\n-------'
    if args.permutations > 0:
        nprob = cv.ca.null_prob.samples
    if res.shape[1] == 1:
        # simple result structure
        if args.permutations > 0:
            p = ', p-value (%s tail)' % args.prob_tail
        else:
            p = ''
        print 'Fold, Result%s' % p
        for i in xrange(len(res)):
            if args.permutations > 0:
                p = ', %f' % nprob[i, 0]
            else:
                p = ''
            print '%s, %f%s' % (res.sa.cvfolds[i], res.samples[i, 0], p)
    # and store
    ds2hdf5(res, args.output, compression=args.hdf5_compression)
    if args.permutations > 0:
        if args.output.endswith('.hdf5'):
            args.output = args.output[:-5]
        ds2hdf5(cv.ca.null_prob, '%s_nullprob' % args.output,
                compression=args.hdf5_compression)
    return res
Example #10
    def _call(self, ds):
        # local binding
        generator = self._generator
        node = self._node
        ca = self.ca
        space = self.get_space()
        concat_as = self._concat_as

        if self.ca.is_enabled("stats") and (not node.ca.has_key("stats") or
                                            not node.ca.is_enabled("stats")):
            warning("'stats' conditional attribute was enabled, but "
                    "the assigned node '%s' either doesn't support it, "
                    "or it is disabled" % node)
        # precharge conditional attributes
        ca.datasets = []

        # run the node on all generated datasets
        results = []
        for i, sds in enumerate(generator.generate(ds)):
            if __debug__:
                debug('REPM', "%d-th iteration of %s on %s", (i, self, sds))
            if ca.is_enabled("datasets"):
                # store dataset in ca
                ca.datasets.append(sds)
            # run the beast
            result = node(sds)
            # callback
            if not self._callback is None:
                self._callback(data=sds, node=node, result=result)
            # subclass postprocessing
            result = self._repetition_postcall(sds, node, result)
            if space:
                # XXX maybe try to get something more informative from the
                # processing node (e.g. in 0.5 it used to be 'chunks'->'chunks'
                # to indicate what was trained and what was tested. Now it is
                # more tricky, because `node` could be anything
                result.set_attr(space, (i, ))
            # store
            results.append(result)

            if ca.is_enabled("stats") and node.ca.has_key("stats") \
               and node.ca.is_enabled("stats"):
                if not ca.is_set('stats'):
                    # create empty stats container of matching type
                    ca.stats = node.ca['stats'].value.__class__()
                # harvest summary stats
                ca['stats'].value.__iadd__(node.ca['stats'].value)

        # charge condition attribute
        self.ca.repetition_results = results

        # stack all results into a single Dataset
        if concat_as == 'samples':
            results = vstack(results)
        elif concat_as == 'features':
            results = hstack(results)
        else:
            raise ValueError("Unknown concatenation mode '%s'" % concat_as)
        # no need to store the raw results, since the Measure class will
        # automatically store them in a CA
        return results
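The method above is the core loop of PyMVPA's RepeatedMeasure (its use of `repetition_results` and the node/generator pair suggest as much); a hedged sketch of how such a measure is typically driven, with synthetic data and `mean_sample()` standing in for a real measure:

from mvpa2.misc.data_generators import normal_feature_dataset
from mvpa2.measures.base import RepeatedMeasure
from mvpa2.mappers.fx import mean_sample
from mvpa2.generators.partition import NFoldPartitioner

ds = normal_feature_dataset(perlabel=10, nlabels=2, nfeatures=4, nchunks=5)
rm = RepeatedMeasure(mean_sample(), NFoldPartitioner())
res = rm(ds)    # results of all repetitions vstack-ed into one dataset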
Example #11
def movie_dataset(
    subj,
    preproc=None,
    base_path=os.curdir,
    fname_tmpl='sub-{subj:02d}/ses-movie/func/sub-{subj:02d}_ses-movie_task-movie_run-{run}_recording-eyegaze_physio.tsv.gz'
):
    """
    Load eyegaze recordings from all runs and merge them into a consecutive timeseries

    When merging, the inter-segment overlap is removed.

    Parameters
    ----------
    subj : str
      Subject code.
    preproc : callable or None
      Callable to preprocess a record array of the raw timeseries. The record
      array has the fields 'x', 'y', 'pupil', and 'movie_frame'. It needs to
      return a record array with the same fields and must not change the
      sampling rate or number of samples.
    base_path : path
      Base directory for input file discovery.
    fname_tmpl : str
      Template expression to match input files. Supports dict expansion with
      'subj' and 'run' keys.

    Returns
    -------
    Dataset
      The dataset contains a number of attributes, most of which should be
      self-explanatory. The `ds.a.run_duration_deviation` attribute quantifies
      the eyegaze recording duration difference from the expected value (in
      seconds).
    """
    # in frames (hand-verified by re-assembling in kdenlive -- using MELT
    # underneath)
    # and in line with phase2/code/stimulus/movie/segment_timing.csv
    seg_offsets = (0, 22150, 43802, 65304, 89305, 112007, 133559, 160261)
    movie_fps = 25.0
    eyegaze_sr = 1000.0  # Hz
    intersegment_overlap = 150  # frames

    segments = []
    for seg, offset in enumerate(seg_offsets):
        raw = np.recfromcsv(os.path.join(
            base_path, fname_tmpl.format(subj=subj, run=seg + 1)),
                            delimiter='\t',
                            names=('x', 'y', 'pupil', 'movie_frame'))
        if not preproc is None:
            raw = preproc(raw)
        # glue together to form a dataset
        ds = Dataset(
            np.array((raw.x, raw.y, raw.pupil)).T,
            # movie frame idx is not zero-based in the files
            sa=dict(movie_frame=raw.movie_frame - 1))
        ds.sa['movie_run_frame'] = ds.sa.movie_frame.copy()
        # turn into movie frame ID for the entire unsegmented movie
        ds.sa.movie_frame += offset
        ## truncate segment time series to remove overlap
        if seg < 7:
            # cut the end in a safe distance to the actual end, but inside the
            # overlap
            ds = ds[:-int(intersegment_overlap / movie_fps * eyegaze_sr)]
            # introduce an artificial blink at the end (see below)
            ds.samples[-100:] = np.nan
        if seg > 0:
            # cut the beginning to have a seamless start after the previous
            # segment
            ds = ds[ds.sa.movie_frame > segments[-1].sa.movie_frame.max()]
            # second half of the artificial blink. this is to avoid the
            # impression of a saccade at the point where two segments are
            # sewn together
            ds.samples[:100] = np.nan
        ds.sa['movie_run'] = [seg + 1] * len(ds)
        segments.append(ds)
    ds = vstack(segments)
    # column names
    ds.fa['name'] = ('x', 'y', 'pupil')
    ds.a['sampling_rate'] = eyegaze_sr
    ds.a['movie_fps'] = movie_fps
    return ds
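A hypothetical usage sketch for the loader above; the base path is made up and assumes the studyforrest-style layout expected by the default fname_tmpl (whose '{subj:02d}' field implies an integer subject code):

import numpy as np

def center_gaze(raw):
    # toy preproc: remove the median gaze offset; keeps length and sampling rate
    raw.x = raw.x - np.median(raw.x)
    raw.y = raw.y - np.median(raw.y)
    return raw

ds = movie_dataset(2, preproc=center_gaze, base_path='/data/studyforrest-phase2')
# ds.fa['name'] holds ('x', 'y', 'pupil'); ds.a.sampling_rate is 1000.0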
Example #12
def load_openfmri_ds(root, subject, mask=None, filterfun=None, TR=3.0):

    #### Define helper functions for loading different parts of the data ####
    def read_condition_keys(DSROOT):
        def pick_fields(keys, line):
            fields = line.split()
            try:
                keys[fields[0]][fields[1]] = ' '.join(fields[2:])
            except KeyError:
                keys[fields[0]] = {fields[1]: ' '.join(fields[2:])}
            return keys

        with open(os.path.join(DSROOT,
                               'models/model001/condition_key.txt'), 'r') as keyfile:
            # build {task: {condition: label}} from the condition key file
            return reduce(pick_fields, keyfile, {})

    def parse_condition_onsets(path):
        condfiles = glob.glob(os.path.join(path, 'cond*'))
        timeline = []
        for cfile in condfiles:
            cond_name = os.path.basename(cfile).rstrip('.txt')
            with open(cfile, 'r') as cfh:
                for line in cfh:
                    start, duration, weight = line.split()
                    timeline.append((float(start), float(duration), cond_name))
        timeline.sort()
        return timeline

    def extract_task_and_run(string):
        m = re.search('task([0-9]+)_run([0-9]+)', string)
        return int(m.group(1)), int(m.group(2))

    def load_run(runstring):
        ds = fmri_dataset(samples=os.path.join(root, subject, 'BOLD', runstring,
                                               'bold.nii.gz'),
                          mask=mask)
        task, run = extract_task_and_run(runstring)

        ds.sa['chunks'] = np.empty(len(ds))
        ds.sa.chunks.fill(run)
        ds.sa['task'] = np.empty(len(ds))
        ds.sa.task.fill(task)
        return ds

    def merge_conditions_onto_ds(ds, onsets):
        # label each volume with the condition active at its acquisition time
        targets = np.chararray(ds.shape[0], itemsize=17)
        targets.fill('rest')
        for cond in onsets:
            start, duration, condition = cond
            startidx = int(start / TR)
            endidx = int((start + duration) / TR)
            targets[startidx:endidx + 1] = condition_keys['task001'][condition]
        ds.sa['targets'] = targets

    ## Actual data loading begins here
    condition_keys = read_condition_keys(root)

    allruns = map(lambda x: os.path.basename(x),
                  glob.glob(os.path.join(root, subject, 'BOLD/task*')))

    if filterfun:
        allruns = filter(filterfun, allruns)

    alldata = []
    for run in allruns:
        ds = load_run(run)
        onsets = parse_condition_onsets(
            os.path.join(root, subject, 'model/model001/onsets/', run))
        merge_conditions_onto_ds(ds, onsets)
        alldata.append(ds)

    merged = vstack(alldata)
    merged.a.update(alldata[0].a)
    return merged
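Hypothetical usage of the loader above, assuming an OpenFMRI-style layout on disk; all paths, the run filter, and the TR are made up:

ds = load_openfmri_ds('/data/ds105', 'sub001',
                      mask='/data/ds105/sub001/masks/vt.nii.gz',
                      filterfun=lambda run: run.startswith('task001'),
                      TR=2.5)
# ds.sa.targets holds the condition labels, ds.sa.chunks the run number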
Example #13
    def get_model_bold_dataset(self,
                               model_id,
                               subj_id,
                               run_ids=None,
                               preproc_img=None,
                               preproc_ds=None,
                               modelfx=None,
                               stack=True,
                               flavor=None,
                               mask=None,
                               add_fa=None,
                               add_sa=None,
                               **kwargs):
        """Build a PyMVPA dataset for a model defined in the OpenFMRI dataset

        Parameters
        ----------
        model_id : int
          Model ID.
        subj_id : int or str or list
          Integer, or string ID of the subject whose data shall be considered.
          Alternatively, a list of IDs can be given and data from all matching
          subjects will be loaded at once.
        run_ids : list, optional
          Run IDs to be loaded. If None, all runs get loaded.
        preproc_img : callable or None
          See get_bold_run_dataset() documentation
        preproc_ds : callable or None
          If not None, this callable will be called with each run bold dataset
          as an argument before ``modelfx`` is executed. The callable must
          return a dataset.
        modelfx : callable or None
          This callable will be called with each run dataset and the respective
          event list for each run as arguments. In addition, all additional
          **kwargs of this method will be passed on to this callable. The
          callable must return a dataset. If None, ``assign_conditionlabels``
          will be used as a default callable.
        stack : boolean
          Flag whether to stack all run datasets into a single dataset, or whether
          to return a list of datasets.
        flavor
          See get_bold_run_dataset() documentation
        mask
          See fmri_dataset() documentation.
        add_fa
          See fmri_dataset() documentation.
        add_sa
          See get_bold_run_dataset() documentation.

        Returns
        -------
        Dataset or list
          Depending on the ``stack`` argument either a single dataset or a list
          of datasets for all subject/task/run combinations relevant to the model
          will be returned. In the stacked case the dataset attributes of the
          returned dataset are taken from the first run dataset, and are assumed
          to be identical for all of them.
        """
        if modelfx is None:
            # loading a model dataset without actually considering the model
            # probably makes little sense, so at least create an attribute
            from mvpa2.datasets.eventrelated import assign_conditionlabels
            modelfx = assign_conditionlabels
        conds = self.get_model_conditions(model_id)
        # what tasks do we need to consider for this model
        tasks = np.unique([c['task'] for c in conds])
        if isinstance(subj_id, (int, str)):
            subj_id = [subj_id]
        dss = []
        for sub in subj_id:
            # we need to loop over tasks first in order to be able to determine
            # what runs exist: that means we have to load the model info
            # repeatedly
            for task in tasks:
                run_ids_ = run_ids \
                    if run_ids is not None \
                    else self.get_bold_run_ids(sub, task)
                for i, run in enumerate(run_ids_):
                    events = self.get_bold_run_model(model_id, sub, run)
                    # at this point our events should only contain those
                    # matching the current task. If not, this model violates
                    # the implicit assumption that one condition (label) can
                    # only be present in a single task. The current OpenFMRI
                    # spec does not allow for a more complex setup. I think
                    # this is worth a runtime check
                    check_events = [ev for ev in events if ev['task'] == task]
                    if not len(check_events) == len(events):
                        warning(
                            "not all event specifications match the expected "
                            "task ID -- something is wrong -- check that each "
                            "model condition label is only associated with a "
                            "single task")

                    if not len(events):
                        # nothing in this run for the given model
                        # it could be argued whether we'd still want this data loaded
                        # XXX maybe a flag?
                        continue
                    d = self.get_bold_run_dataset(sub,
                                                  task,
                                                  run=run,
                                                  flavor=flavor,
                                                  preproc_img=preproc_img,
                                                  chunks=i,
                                                  mask=mask,
                                                  add_fa=add_fa,
                                                  add_sa=add_sa)
                    if preproc_ds is not None:
                        d = preproc_ds(d)
                    d = modelfx(
                        d, events,
                        **dict([(k, v) for k, v in kwargs.items()
                                if not k in ('preproc_img', 'preproc_ds',
                                             'modelfx', 'stack', 'flavor',
                                             'mask', 'add_fa', 'add_sa')]))
                    # if the modelfx doesn't leave 'chunk' information, we put
                    # something minimal in
                    for attr, info in (('chunks', i), ('run', run), ('subj',
                                                                     sub)):
                        if not attr in d.sa:
                            d.sa[attr] = [info] * len(d)
                    dss.append(d)
        if stack:
            dss = vstack(dss, a=0)
        return dss
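A hedged usage sketch for the method above; the dataset path, mask, and subject IDs are hypothetical, and the import assumes this method belongs to PyMVPA's OpenFMRIDataset handler:

from mvpa2.datasets.sources.openfmri import OpenFMRIDataset

of = OpenFMRIDataset('/data/ds105')            # made-up dataset location
ds = of.get_model_bold_dataset(
    model_id=1,
    subj_id=[1, 2],                            # several subjects at once
    mask='/data/ds105/brain_mask.nii.gz',      # made-up mask image
    stack=True)                                # one vstack-ed dataset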
Example #14
    def _call(self, ds):
        # local binding
        generator = self._generator
        node = self._node
        ca = self.ca
        space = self.get_space()
        concat_as = self._concat_as

        if self.ca.is_enabled("stats") and (not node.ca.has_key("stats") or
                                            not node.ca.is_enabled("stats")):
            warning("'stats' conditional attribute was enabled, but "
                    "the assigned node '%s' either doesn't support it, "
                    "or it is disabled" % node)
        # precharge conditional attributes
        ca.datasets = []

        # run the node on all generated datasets
        results = []
        for i, sds in enumerate(generator.generate(ds)):
            if __debug__:
                debug('REPM', "%d-th iteration of %s on %s",
                      (i, self, sds))
            if ca.is_enabled("datasets"):
                # store dataset in ca
                ca.datasets.append(sds)
            # run the beast
            result = node(sds)
            # callback
            if not self._callback is None:
                self._callback(data=sds, node=node, result=result)
            # subclass postprocessing
            result = self._repetition_postcall(sds, node, result)
            if space:
                # XXX maybe try to get something more informative from the
                # processing node (e.g. in 0.5 it used to be 'chunks'->'chunks'
                # to indicate what was trained and what was tested. Now it is
                # more tricky, because `node` could be anything
                result.set_attr(space, (i,))
            # store
            results.append(result)

            if ca.is_enabled("stats") and node.ca.has_key("stats") \
               and node.ca.is_enabled("stats"):
                if not ca.is_set('stats'):
                    # create empty stats container of matching type
                    ca.stats = node.ca['stats'].value.__class__()
                # harvest summary stats
                ca['stats'].value.__iadd__(node.ca['stats'].value)

        # charge condition attribute
        self.ca.repetition_results = results

        # stack all results into a single Dataset
        if concat_as == 'samples':
            results = vstack(results)
        elif concat_as == 'features':
            results = hstack(results)
        else:
            raise ValueError("Unknown concatenation mode '%s'" % concat_as)
        # no need to store the raw results, since the Measure class will
        # automatically store them in a CA
        return results
Example #15
def movie_dataset(
        subj, preproc=None,
        base_path=os.curdir,
        fname_tmpl='sub-%(subj)s/ses-movie/func/sub-%(subj)s_ses-movie_task-movie_run-%(run)i_recording-eyegaze_physio.tsv.gz'):
    """
    Load eyegaze recordings from all runs and merge them into a consecutive timeseries

    When merging, the inter-segment overlap is removed.

    Parameters
    ----------
    subj : str
      Subject code.
    preproc : callable or None
      Callable to preprocess a record array of the raw timeseries. The record
      array has the fields 'x', 'y', 'pupil', and 'movie_frame'. It needs to
      return a record array with the same fields and must not change the
      sampling rate or number of samples.
    base_path : path
      Base directory for input file discovery.
    fname_tmpl : str
      Template expression to match input files. Supports dict expansion with
      'subj' and 'run' keys.

    Returns
    -------
    Dataset
      The dataset contains a number of attributes, most of which should be
      self-explanatory. The `ds.a.run_duration_deviation` attribute quantifies
      the eyegaze recording duration difference from the expected value (in
      seconds).
    """
    # in frames (hand-verified by re-assembling in kdenlive -- using MELT
    # underneath)
    seg_offsets = (0, 22150, 43802, 65304, 89305, 112007, 133559, 160261)
    movie_fps = 25.0
    eyegaze_sr = 1000.0  # Hz
    intersegment_overlap = 400  # frames

    segments = []
    for seg, offset in enumerate(seg_offsets):
        raw = np.recfromcsv(
            os.path.join(base_path, fname_tmpl % dict(subj=subj, run=seg + 1)),
            delimiter='\t',
            names=('x', 'y', 'pupil', 'movie_frame'))
        if not preproc is None:
            raw = preproc(raw)
        # glue together to form a dataset
        ds = Dataset(np.array((raw.x, raw.y, raw.pupil)).T,
                     sa=dict(movie_frame=raw.movie_frame))
        ds.sa['movie_run_frame'] = ds.sa.movie_frame.copy()
        # turn into movie frame ID for the entire unsegmented movie
        ds.sa.movie_frame += offset
        ## truncate segment time series to remove overlap
        if seg < 7:
            # cut the end in a safe distance to the actual end, but inside the
            # overlap
            ds = ds[:-int(intersegment_overlap / movie_fps * eyegaze_sr)]
        if seg > 0:
            # cut the beginning to have a seamless start after the previous
            # segment
            ds = ds[ds.sa.movie_frame > segments[-1].sa.movie_frame.max()]
        ds.sa['movie_run'] = [seg + 1] * len(ds)
        segments.append(ds)
    ds = vstack(segments)
    # column names
    ds.fa['name'] = ('x', 'y', 'pupil')
    ds.a['sampling_rate'] = eyegaze_sr
    ds.a['movie_fps'] = movie_fps
    return ds
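This older variant expands the filename template with %-style formatting rather than str.format(); a quick sketch of a single expansion with made-up subject and run values:

fname_tmpl = ('sub-%(subj)s/ses-movie/func/'
              'sub-%(subj)s_ses-movie_task-movie_run-%(run)i_recording-eyegaze_physio.tsv.gz')
fname = fname_tmpl % dict(subj='01', run=1)
# -> sub-01/ses-movie/func/sub-01_ses-movie_task-movie_run-1_recording-eyegaze_physio.tsv.gz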
Example #16
def test_rfe_sensmap():
    # http://lists.alioth.debian.org/pipermail/pkg-exppsy-pymvpa/2013q3/002538.html
    # just a smoke test. fails with
    from mvpa2.clfs.svm import LinearCSVMC
    from mvpa2.clfs.meta import FeatureSelectionClassifier
    from mvpa2.measures.base import CrossValidation, RepeatedMeasure
    from mvpa2.generators.splitters import Splitter
    from mvpa2.generators.partition import NFoldPartitioner
    from mvpa2.misc.errorfx import mean_mismatch_error
    from mvpa2.mappers.fx import mean_sample
    from mvpa2.mappers.fx import maxofabs_sample
    from mvpa2.generators.base import Repeater
    from mvpa2.featsel.rfe import RFE
    from mvpa2.featsel.helpers import FractionTailSelector, BestDetector
    from mvpa2.featsel.helpers import NBackHistoryStopCrit
    from mvpa2.datasets import vstack

    from mvpa2.misc.data_generators import normal_feature_dataset

    # Let's simulate the beast -- 6 categories total grouped into 3
    # super-ordinate, and actually without any 'superordinate' effect
    # since the subordinate categories are independent
    fds = normal_feature_dataset(nlabels=3,
                                 snr=1, # 100,   # pure signal! ;)
                                 perlabel=9,
                                 nfeatures=6,
                                 nonbogus_features=range(3),
                                 nchunks=3)
    clfsvm = LinearCSVMC()

    rfesvm = RFE(clfsvm.get_sensitivity_analyzer(postproc=maxofabs_sample()),
                 CrossValidation(
                     clfsvm,
                     NFoldPartitioner(),
                     errorfx=mean_mismatch_error, postproc=mean_sample()),
                 Repeater(2),
                 fselector=FractionTailSelector(0.70, mode='select', tail='upper'),
                 stopping_criterion=NBackHistoryStopCrit(BestDetector(), 10),
                 update_sensitivity=True)

    fclfsvm = FeatureSelectionClassifier(clfsvm, rfesvm)

    sensanasvm = fclfsvm.get_sensitivity_analyzer(postproc=maxofabs_sample())


    # manually repeating/splitting so we do both RFE sensitivity and classification
    senses, errors = [], []
    for i, pset in enumerate(NFoldPartitioner().generate(fds)):
        # split partitioned dataset
        split = [d for d in Splitter('partitions').generate(pset)]
        senses.append(sensanasvm(split[0])) # and it also should train the classifier so we would ask it about error
        errors.append(mean_mismatch_error(fclfsvm.predict(split[1]), split[1].targets))

    senses = vstack(senses)
    errors = vstack(errors)

    # Let's compare against rerunning the beast simply for classification with CV
    errors_cv = CrossValidation(fclfsvm, NFoldPartitioner(), errorfx=mean_mismatch_error)(fds)
    # and they should match
    assert_array_equal(errors, errors_cv)

    # buggy!
    cv_sensana_svm = RepeatedMeasure(sensanasvm, NFoldPartitioner())
    senses_rm = cv_sensana_svm(fds)

    #print senses.samples, senses_rm.samples
    #print errors, errors_cv.samples
    assert_raises(AssertionError,
                  assert_array_almost_equal,
                  senses.samples, senses_rm.samples)
    raise SkipTest("Known failure for repeated measures: https://github.com/PyMVPA/PyMVPA/issues/117")
Example #17
def run(args):
    if os.path.isfile(args.payload) and args.payload.endswith('.py'):
        measure = script2obj(args.payload)
    elif args.payload == 'cv':
        if args.cv_learner is None or args.cv_partitioner is None:
            raise ValueError(
                'cross-validation payload requires --learner and --partitioner'
            )
        # get CV instance
        measure = get_crossvalidation_instance(
            args.cv_learner, args.cv_partitioner, args.cv_errorfx,
            args.cv_sampling_repetitions, args.cv_learner_space,
            args.cv_balance_training, args.cv_permutations,
            args.cv_avg_datafold_results, args.cv_prob_tail)
    else:
        raise RuntimeError("this should not happen")
    dss = hdf2ds(args.data)
    verbose(3, 'Loaded %i dataset(s)' % len(dss))
    ds = vstack(dss)
    verbose(3, 'Concatenation yielded %i samples with %i features' % ds.shape)
    # setup neighborhood
    # XXX add big switch to allow for setting up surface-based neighborhoods
    from mvpa2.misc.neighborhood import IndexQueryEngine
    qe = IndexQueryEngine(**dict(args.neighbors))
    # determine ROIs
    rids = None  # all by default
    aggregate_fx = args.aggregate_fx
    if args.roi_attr is not None:
        # first figure out which roi features should be processed
        if len(args.roi_attr) == 1 and args.roi_attr[0] in ds.fa.keys():
            # name of an attribute -> pull non-zeroes
            rids = ds.fa[args.roi_attr[0]].value.nonzero()[0]
        else:
            # an expression?
            from .cmd_select import _eval_attr_expr
            rids = _eval_attr_expr(args.roi_attr, ds.fa).nonzero()[0]

    seed_ids = None
    if args.scatter_rois is not None:
        # scatter_neighborhoods among available ids if it was requested
        from mvpa2.misc.neighborhood import scatter_neighborhoods
        attr, nb = args.scatter_rois
        coords = ds.fa[attr].value
        if rids is not None:
            # select only those which were chosen by ROI
            coords = coords[rids]
        _, seed_ids = scatter_neighborhoods(nb, coords)
        if aggregate_fx is None:
            # no custom one given -> use default "fill in" function
            aggregate_fx = _fill_in_scattered_results
            if args.enable_ca is None:
                args.enable_ca = ['roi_feature_ids']
            elif 'roi_feature_ids' not in args.enable_ca:
                args.enable_ca += ['roi_feature_ids']

    if seed_ids is None:
        roi_ids = rids
    else:
        if rids is not None:
            # we had to sub-select by scattering among available rids
            # so we would need to get original ids
            roi_ids = rids[seed_ids]
        else:
            # scattering happened on entire feature-set
            roi_ids = seed_ids

    verbose(
        3, 'Attempting %i ROI analyses' %
        ((roi_ids is None) and ds.nfeatures or len(roi_ids)))

    from mvpa2.measures.searchlight import Searchlight

    sl = Searchlight(measure,
                     queryengine=qe,
                     roi_ids=roi_ids,
                     nproc=args.nproc,
                     results_backend=args.multiproc_backend,
                     results_fx=aggregate_fx,
                     enable_ca=args.enable_ca,
                     disable_ca=args.disable_ca)
    # XXX support me too!
    #                 add_center_fa
    #                 tmp_prefix
    #                 nblocks
    #                 null_dist
    # run
    res = sl(ds)
    if (seed_ids is not None) and ('mapper' in res.a):
        # strip the last mapper link in the chain, which would be the seed ID selection
        res.a['mapper'] = res.a.mapper[:-1]
    # XXX create more output
    # and store
    ds2hdf5(res, args.output, compression=args.hdf5_compression)
    return res
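For reference, a hedged sketch of the kind of neighborhood setup this command builds from its --neighbors argument; the Sphere radius is made up, and 'voxel_indices' is the feature attribute that fmri_dataset() typically provides:

from mvpa2.misc.neighborhood import IndexQueryEngine, Sphere
from mvpa2.measures.searchlight import Searchlight
from mvpa2.mappers.fx import mean_sample

qe = IndexQueryEngine(voxel_indices=Sphere(3))
sl = Searchlight(mean_sample(), queryengine=qe)   # mean_sample() is just a stand-in measure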
Example #18
def run(args):
    if os.path.isfile(args.payload) and args.payload.endswith(".py"):
        measure = script2obj(args.payload)
    elif args.payload == "cv":
        if args.cv_learner is None or args.cv_partitioner is None:
            raise ValueError("cross-validation payload requires --learner and --partitioner")
        # get CV instance
        measure = get_crossvalidation_instance(
            args.cv_learner,
            args.cv_partitioner,
            args.cv_errorfx,
            args.cv_sampling_repetitions,
            args.cv_learner_space,
            args.cv_balance_training,
            args.cv_permutations,
            args.cv_avg_datafold_results,
            args.cv_prob_tail,
        )
    else:
        raise RuntimeError("this should not happen")
    dss = hdf2ds(args.data)
    verbose(3, "Loaded %i dataset(s)" % len(dss))
    ds = vstack(dss)
    verbose(3, "Concatenation yielded %i samples with %i features" % ds.shape)
    # setup neighborhood
    # XXX add big switch to allow for setting up surface-based neighborhoods
    from mvpa2.misc.neighborhood import IndexQueryEngine

    qe = IndexQueryEngine(**dict(args.neighbors))
    # determine ROIs
    rids = None  # all by default
    aggregate_fx = args.aggregate_fx
    if args.roi_attr is not None:
        # first figure out which roi features should be processed
        if len(args.roi_attr) == 1 and args.roi_attr[0] in ds.fa.keys():
            # name of an attribute -> pull non-zeroes
            rids = ds.fa[args.roi_attr[0]].value.nonzero()[0]
        else:
            # an expression?
            from .cmd_select import _eval_attr_expr

            rids = _eval_attr_expr(args.roi_attr, ds.fa).nonzero()[0]

    seed_ids = None
    if args.scatter_rois is not None:
        # scatter_neighborhoods among available ids if it was requested
        from mvpa2.misc.neighborhood import scatter_neighborhoods

        attr, nb = args.scatter_rois
        coords = ds.fa[attr].value
        if rids is not None:
            # select only those which were chosen by ROI
            coords = coords[rids]
        _, seed_ids = scatter_neighborhoods(nb, coords)
        if aggregate_fx is None:
            # no custom one given -> use default "fill in" function
            aggregate_fx = _fill_in_scattered_results
            if args.enable_ca is None:
                args.enable_ca = ["roi_feature_ids"]
            elif "roi_feature_ids" not in args.enable_ca:
                args.enable_ca += ["roi_feature_ids"]

    if seed_ids is None:
        roi_ids = rids
    else:
        if rids is not None:
            # we had to sub-select by scattering among available rids
            # so we would need to get original ids
            roi_ids = rids[seed_ids]
        else:
            # scattering happened on entire feature-set
            roi_ids = seed_ids

    verbose(3, "Attempting %i ROI analyses" % ((roi_ids is None) and ds.nfeatures or len(roi_ids)))

    from mvpa2.measures.searchlight import Searchlight

    sl = Searchlight(
        measure,
        queryengine=qe,
        roi_ids=roi_ids,
        nproc=args.nproc,
        results_backend=args.multiproc_backend,
        results_fx=aggregate_fx,
        enable_ca=args.enable_ca,
        disable_ca=args.disable_ca,
    )
    # XXX support me too!
    #                 add_center_fa
    #                 tmp_prefix
    #                 nblocks
    #                 null_dist
    # run
    res = sl(ds)
    if (seed_ids is not None) and ("mapper" in res.a):
        # strip the last mapper link in the chain, which would be the seed ID selection
        res.a["mapper"] = res.a.mapper[:-1]
    # XXX create more output
    # and store
    ds2hdf5(res, args.output, compression=args.hdf5_compression)
    return res
Example #19
    def get_model_bold_dataset(self, model_id, subj_id,
                          preprocfx=None, modelfx=None, stack=True,
                          flavor=None, mask=None, add_fa=None,
                          add_sa=None, **kwargs):
        """Build a PyMVPA dataset for a model defined in the OpenFMRI dataset

        Parameters
        ----------
        model_id : int
          Model ID.
        subj_id : int or str or list
          Integer, or string ID of the subject whose data shall be considered.
          Alternatively, a list of IDs can be given and data from all matching
          subjects will be loaded at once.
        preprocfx : callable or None
          If not None, this callable will be called with each run bold dataset
          as an argument before ``modelfx`` is executed. The callable must
          return a dataset.
        modelfx : callable or None
          This callable will be called with each run dataset and the respective
          event list for each run as arguments. In addition, all additional
          **kwargs of this method will be passed on to this callable. The
          callable must return a dataset. If None, ``assign_conditionlabels``
          will be used as a default callable.
        stack : boolean
          Flag whether to stack all run datasets into a single dataset, or whether
          to return a list of datasets.
        flavor
          See get_bold_run_dataset() documentation
        mask
          See fmri_dataset() documentation.
        add_fa
          See fmri_dataset() documentation.
        add_sa
          See get_bold_run_dataset() documentation.

        Returns
        -------
        Dataset or list
          Depending on the ``stack`` argument either a single dataset or a list
          of datasets for all subject/task/run combinations relevant to the model
          will be returned. In the stacked case the dataset attributes of the
          returned dataset are taken from the first run dataset, and are assumed
          to be identical for all of them.
        """
        if modelfx is None:
            # loading a model dataset without actually considering the model
            # probably makes little sense, so at least create an attribute
            from mvpa2.datasets.eventrelated import assign_conditionlabels
            modelfx = assign_conditionlabels
        conds = self.get_model_conditions(model_id)
        # what tasks do we need to consider for this model
        tasks = np.unique([c['task'] for c in conds])
        if isinstance(subj_id, int) or isinstance(subj_id, basestring):
            subj_id = [subj_id]
        dss = []
        for sub in subj_id:
            # we need to loop over tasks first in order to be able to determine
            # what runs exist: that means we have to load the model info
            # repeatedly
            for task in tasks:
                for run in self.get_bold_run_ids(sub, task):
                    events = self.get_bold_run_model(model_id, sub, run)
                    # at this point our events should only contain those
                    # matching the current task. If not, this model violates
                    # the implicit assumption that one condition (label) can
                    # only be present in a single task. The current OpenFMRI
                    # spec does not allow for a more complex setup. I think
                    # this is worth a runtime check
                    check_events = [ev for ev in events if ev['task'] == task]
                    if not len(check_events) == len(events):
                        warning(
                            "not all event specifications match the expected "
                            "task ID -- something is wrong -- check that each "
                            "model condition label is only associated with a "
                            "single task")

                    if not len(events):
                        # nothing in this run for the given model
                        # it could be argued whether we'd still want this data loaded
                        # XXX maybe a flag?
                        continue
                    d = self.get_bold_run_dataset(sub, task, run=run, flavor=flavor,
                            chunks=run, mask=mask, add_fa=add_fa, add_sa=add_sa)
                    if not preprocfx is None:
                        d = preprocfx(d)
                    d = modelfx(d, events, **kwargs)
                    # if the modelfx doesn't leave 'chunk' information, we put
                    # something minimal in
                    for attr, info in (('chunks', run), ('subj', sub)):
                        if not attr in d.sa:
                            d.sa[attr] = [info] * len(d)
                    dss.append(d)
        if stack:
            dss = vstack(dss, a=0)
        return dss
Example #20
    def get_model_bold_dataset(self, model_id, subj_id,
                          preprocfx=None, modelfx=None, stack=True,
                          flavor=None, mask=None, add_fa=None,
                          add_sa=None, **kwargs):
        """Build a PyMVPA dataset for a model defined in the OpenFMRI dataset

        Parameters
        ----------
        model_id : int
          Model ID.
        subj_id : int or str or list
          Integer, or string ID of the subject whose data shall be considered.
          Alternatively, a list of IDs can be given and data from all matching
          subjects will be loaded at once.
        preprocfx : callable or None
          If not None, this callable will be called with each run bold dataset
          as an argument before ``modelfx`` is executed. The callable must
          return a dataset.
        modelfx : callable or None
          This callable will be called with each run dataset and the respective
          event list for each run as arguments. In addition, all additional
          **kwargs of this method will be passed on to this callable. The
          callable must return a dataset. If None, ``assign_conditionlabels``
          will be used as a default callable.
        stack : boolean
          Flag whether to stack all run datasets into a single dataset, or whether
          to return a list of datasets.
        flavor
          See get_bold_run_dataset() documentation
        mask
          See fmri_dataset() documentation.
        add_fa
          See fmri_dataset() documentation.
        add_sa
          See get_bold_run_dataset() documentation.

        Returns
        -------
        Dataset or list
          Depending on the ``stack`` argument either a single dataset or a list
          of datasets for all subject/task/run combinations relevant to the model
          will be returned. In the stacked case the dataset attributes of the
          returned dataset are taken from the first run dataset, and are assumed
          to be identical for all of them.
        """
        if modelfx is None:
            # loading a model dataset without actually considering the model
            # probably makes little sense, so at least create an attribute
            from mvpa2.datasets.eventrelated import assign_conditionlabels
            modelfx = assign_conditionlabels
        conds = self.get_model_conditions(model_id)
        # what tasks do we need to consider for this model
        tasks = np.unique([c['task'] for c in conds])
        if isinstance(subj_id, int) or isinstance(subj_id, basestring):
            subj_id = [subj_id]
        dss = []
        for sub in subj_id:
            for task in tasks:
                for run in self.get_bold_run_ids(sub, task):
                    events = self.get_bold_run_model(model_id, task, run)
                    if not len(events):
                        # nothing in this run for the given model
                        # it could be argued whether we'd still want this data loaded
                        # XXX maybe a flag?
                        continue
                    d = self.get_bold_run_dataset(sub, task, run=run, flavor=flavor,
                            chunks=run, mask=mask, add_fa=add_fa, add_sa=add_sa)
                    if not preprocfx is None:
                        d = preprocfx(d)
                    d = modelfx(d, events, **kwargs)
                    # if the modelfx doesn't leave 'chunk' information, we put
                    # something minimal in
                    if not 'chunks' in d.sa:
                        d.sa['chunks'] = [run] * len(d)
                    dss.append(d)
        if stack:
            dss = vstack(dss, a=0)
        return dss