def intersubject_correlation(dss, reference_ds=0):
    """
    Computes voxelwise inter-subject time series correlation
    in a pairwise fashion for a list of Datasets. Datasets
    must all be the same shape. Resulting dataset of pairwise
    correlations will inherit Dataset attributes from
    reference data set [Default: first data set in list].
    """

    # Check if input list contains Datasets, ndarrays
    dss = [Dataset(ds) if not type(ds) == Dataset else ds for ds in dss]

    ds_shape = dss[reference_ds].shape
    n_features = ds_shape[1]

    for ds in dss:
        assert ds.shape == ds_shape

    # Compute time series correlation per feature per subject pair
    correlations = [
        map(lambda a, b: pearson_correlation(a, b), ds1.samples.T,
            ds2.samples.T) for (ds1, ds2) in combinations(dss, 2)
    ]

    correlations = np.asarray(correlations)
    # Resulting correlation map inherits attributes of referece data set
    correlations_ds = Dataset(correlations,
                              fa=dss[reference_ds].fa,
                              a=dss[reference_ds].a)
    correlations_ds.sa['pairs'] = list(combinations(range(len(dss)), 2))

    assert correlations_ds.shape[0] == len(dss) * (len(dss) - 1) / 2
    assert correlations_ds.shape[1] == n_features

    return correlations_ds
Example #2
0
    def test_surf_ring_queryengine(self):
        s = surf.generate_plane((0, 0, 0), (0, 1, 0), (0, 0, 1), 4, 5)
        # add second layer
        s2 = surf.merge(s, (s + (.01, 0, 0)))
        ds = Dataset(samples=np.arange(20)[np.newaxis],
                     fa=dict(node_indices=np.arange(39, 0, -2)))
        # add more features (with shared node indices)
        ds3 = hstack((ds, ds, ds))
        radius = 2.5
        inner_radius = 1.0
        # Makes sure it raises error if inner_radius is >= radius
        assert_raises(ValueError,
                      lambda: queryengine.SurfaceRingQueryEngine(surface=s2,
                                                         inner_radius=2.5,
                                                         radius=radius))
        distance_metrics = ('euclidean', 'dijkstra', 'euclidean', 'dijkstra')
        for distance_metric, include_center in zip(distance_metrics, [True, False]*2):
            qe = queryengine.SurfaceRingQueryEngine(surface=s2, radius=radius,
                                inner_radius=inner_radius, distance_metric=distance_metric,
                                include_center=include_center)
            # untrained qe should give errors
            assert_raises(ValueError, lambda: qe.ids)
            assert_raises(ValueError, lambda: qe.query_byid(0))

            # node index out of bounds should give error
            ds_ = ds.copy()
            ds_.fa.node_indices[0] = 100
            assert_raises(ValueError, lambda: qe.train(ds_))

            # lack of node indices should give error
            ds_.fa.pop('node_indices')
            assert_raises(ValueError, lambda: qe.train(ds_))
            # train the qe
            qe.train(ds3)

            for node in np.arange(-1, s2.nvertices + 1):
                if node < 0 or node >= s2.nvertices:
                    assert_raises(KeyError, lambda: qe.query_byid(node))
                    continue

                feature_ids = np.asarray(qe.query_byid(node))
                # node indices relative to ds
                base_ids = feature_ids[feature_ids < 20]
                # should have multiples of 20
                assert_equal(set(feature_ids),
                             set((base_ids[np.newaxis].T + \
                                  [0, 20, 40]).ravel()))

                node_indices = s2.circlearound_n2d(node,
                                    radius, distance_metric or 'dijkstra')

                fa_indices = [fa_index for fa_index, inode in
                              enumerate(ds3.fa.node_indices)
                              if inode in node_indices and node_indices[inode] > inner_radius]
                if include_center and node in ds3.fa.node_indices:
                    fa_indices += np.where(ds3.fa.node_indices == node)[0].tolist()
                assert_equal(set(feature_ids), set(fa_indices))
Example #3
0
def test_zscore_withoutchunks():
    # just a smoke test to see if all issues of
    # https://github.com/PyMVPA/PyMVPA/issues/26
    # are fixed
    from mvpa2.datasets import Dataset
    ds = Dataset(np.arange(32).reshape((8,-1)), sa=dict(targets=range(8)))
    zscore(ds, chunks_attr=None)
    assert(np.any(ds.samples != np.arange(32).reshape((8,-1))))
    ds_summary = ds.summary()
    assert(ds_summary is not None)
Example #4
0
def test_zscore_withoutchunks():
    # just a smoke test to see if all issues of
    # https://github.com/PyMVPA/PyMVPA/issues/26
    # are fixed
    from mvpa2.datasets import Dataset
    ds = Dataset(np.arange(32).reshape((8, -1)), sa=dict(targets=range(8)))
    zscore(ds, chunks_attr=None)
    assert (np.any(ds.samples != np.arange(32).reshape((8, -1))))
    ds_summary = ds.summary()
    assert (ds_summary is not None)
Example #5
0
    def _proc_block(self, block, ds, measure):
        """Little helper to capture the parts of the computation that can be
        parallelized
        """
        if __debug__:
            debug_slc_ = 'SLC_' in debug.active
            debug('SLC',
                  "Starting computing block for %i elements" % len(block))
        if self.ca.is_enabled('roi_sizes'):
            roi_sizes = []
        else:
            roi_sizes = None
        results = []
        # put rois around all features in the dataset and compute the
        # measure within them
        for i, f in enumerate(block):
            # retrieve the feature ids of all features in the ROI from the query
            # engine
            roi_fids = self._queryengine[f]

            if __debug__ and  debug_slc_:
                debug('SLC_', 'For %r query returned ids %r' % (f, roi_fids))

            # slice the dataset
            roi = ds[:, roi_fids]

            if self.__add_center_fa:
                # add fa to indicate ROI seed if requested
                roi_seed = np.zeros(roi.nfeatures, dtype='bool')
                roi_seed[roi_fids.index(f)] = True
                roi.fa[self.__add_center_fa] = roi_seed

            # compute the datameasure and store in results
            res = measure(roi)
            if self.ca.is_enabled('roi_feature_ids'):
                if not is_datasetlike(res):
                    res = Dataset(np.atleast_1d(res))
                # add roi feature ids to intermediate result dataset for later
                # aggregation
                res.a['roi_feature_ids'] = roi_fids
            results.append(res)

            # store the size of the roi dataset
            if not roi_sizes is None:
                roi_sizes.append(roi.nfeatures)

            if __debug__:
                debug('SLC', "Doing %i ROIs: %i (%i features) [%i%%]" \
                    % (len(block),
                       f+1,
                       roi.nfeatures,
                       float(i+1)/len(block)*100,), cr=True)

        return results, roi_sizes
Example #6
0
def run(args):
    from mvpa2.base.hdf5 import h5save
    ds = None
    if not args.txt_data is None:
        verbose(1, "Load data from TXT file '%s'" % args.txt_data)
        samples = _load_from_txt(args.txt_data)
        ds = Dataset(samples)
    elif not args.npy_data is None:
        verbose(1, "Load data from NPY file '%s'" % args.npy_data)
        samples = _load_from_npy(args.npy_data)
        ds = Dataset(samples)
    elif not args.mri_data is None:
        verbose(1, "Load data from MRI image(s) %s" % args.mri_data)
        from mvpa2.datasets.mri import fmri_dataset
        vol_attr = dict()
        if not args.add_vol_attr is None:
            # XXX add a way to use the mapper of an existing dataset to
            # add a volume attribute without having to load the entire
            # mri data again
            vol_attr = dict(args.add_vol_attr)
            if not len(args.add_vol_attr) == len(vol_attr):
                warning("--vol-attr option with duplicate attribute name: "
                        "check arguments!")
            verbose(2, "Add volumetric feature attributes: %s" % vol_attr)
        ds = fmri_dataset(args.mri_data, mask=args.mask, add_fa=vol_attr)

    if ds is None:
        if args.data is None:
            raise RuntimeError('no data source specific')
        else:
            ds = hdf2ds(args.data)[0]
    else:
        if args.data is not None:
            verbose(1, 'ignoring dataset input in favor of other data source -- remove either one to disambiguate')

    # act on all attribute options
    ds = process_common_dsattr_opts(ds, args)

    if not args.add_fsl_mcpar is None:
        from mvpa2.misc.fsl.base import McFlirtParams
        mc_par = McFlirtParams(args.add_fsl_mcpar)
        for param in mc_par:
            verbose(2, "Add motion regressor as sample attribute '%s'"
                       % ('mc_' + param))
            ds.sa['mc_' + param] = mc_par[param]

    verbose(3, "Dataset summary %s" % (ds.summary()))
    # and store
    outfilename = args.output
    if not outfilename.endswith('.hdf5'):
        outfilename += '.hdf5'
    verbose(1, "Save dataset to '%s'" % outfilename)
    h5save(outfilename, ds, mkdir=True, compression=args.hdf5_compression)
Example #7
0
 def _forward_dataset(self, ds):
     # apply function
     if self._train_as_1st:
         out = self._fx(self._ds_train, ds)
     else:
         out = self._fx(ds, self._ds_train)
     # wrap output in a dataset if necessary
     if not isinstance(out, Dataset):
         try:
             out = Dataset(out)
         except ValueError:
             # not a sequence?
             out = Dataset([out])
     return out
Example #8
0
    def _level3(self, datasets):
        params = self.params  # for quicker access ;)
        # create a mapper per dataset
        mappers = [deepcopy(params.alignment) for ds in datasets]

        # key different from level-2; the common space is uniform
        #temp_commonspace = commonspace

        residuals = None
        if self.ca['residual_errors'].enabled:
            residuals = np.zeros((1, len(datasets)))
            self.ca.residual_errors = Dataset(samples=residuals)

        # start from original input datasets again
        for i, (m, ds_new) in enumerate(zip(mappers, datasets)):
            if __debug__:
                debug('HPAL_', "Level 3: ds #%i" % i)

            # retrain mapper on final common space
            ds_new.sa[m.get_space()] = self.commonspace
            m.train(ds_new)
            # remove common space attribute again to save on memory
            del ds_new.sa[m.get_space()]

            if residuals is not None:
                # obtain final projection
                data_mapped = m.forward(ds_new.samples)
                residuals[0,
                          i] = np.linalg.norm(data_mapped - self.commonspace)

        return mappers
def test_pval():
    def not_inplace_shuffle(x):
        x = list(x)
        random.shuffle(x)
        return x

    x = range(100000) * 20
    x = np.array(x)
    x = x.reshape(20, 100000)
    x = x.T
    x = np.apply_along_axis(not_inplace_shuffle, axis=0, arr=x)
    expected_result = [100000 - 100000 * 0.001] * 20

    thresholds = gct.get_thresholding_map(x, p=0.001)
    assert_array_equal(thresholds, expected_result)
    # works with datasets too
    dsthresholds = gct.get_thresholding_map(Dataset(x), p=0.001)
    assert_almost_equal(thresholds, dsthresholds)
    assert_raises(ValueError, gct.get_thresholding_map, x, p=0.00000001)

    x = range(0, 100, 5)
    null_dist = np.repeat(1, 100).astype(float)[None]
    pvals = gct._transform_to_pvals(x, null_dist)
    desired_output = np.array([
        1, 0.95, 0.9, 0.85, 0.8, 0.75, 0.7, 0.65, 0.6, 0.55, 0.5, 0.45, 0.4,
        0.35, 0.3, 0.25, 0.2, 0.15, 0.1, 0.05
    ])
    assert_array_almost_equal(desired_output, pvals)
Example #10
0
def test_repeater():
    reps = 4
    r = Repeater(reps, space='OMG')
    dsl = [ds for ds in r.generate(Dataset([0, 1]))]
    assert_equal(len(dsl), reps)
    for i, ds in enumerate(dsl):
        assert_equal(ds.a.OMG, i)
Example #11
0
    def _call(self, dataset):
        sensitivities = []
        for ind, analyzer in enumerate(self.__analyzers):
            if __debug__:
                debug("SA",
                      "Computing sensitivity for SA#%d:%s" % (ind, analyzer))
            sensitivity = analyzer(dataset)
            sensitivities.append(sensitivity)

        if __debug__:
            debug(
                "SA", "Returning %d sensitivities from %s" %
                (len(sensitivities), self.__class__.__name__))

        sa_attr = self._sa_attr
        if isinstance(sensitivities[0], AttrDataset):
            smerged = []
            for i, s in enumerate(sensitivities):
                s.sa[sa_attr] = np.repeat(i, len(s))
                smerged.append(s)
            sensitivities = vstack(smerged)
        else:
            sensitivities = \
                Dataset(sensitivities,
                        sa={sa_attr: np.arange(len(sensitivities))})

        self.ca.sensitivities = sensitivities

        return sensitivities
Example #12
0
def psc(ds, **kwargs):
    """In-place PSC of a `Dataset` or `ndarray`.

    This function behaves identical to `PSCMapper`. The only difference is
    that the actual psc is done in-place -- potentially causing a
    significant reduction of memory demands.

    Parameters
    ----------
    ds : Dataset or ndarray
      The data that will be Z-scored in-place.
    **kwargs
      For all other arguments, please see the documentation of `ZScoreMapper`.
    """
    pscm = PSCMapper(**kwargs)
    pscm._secret_inplace_zscore = True
    # train
    if isinstance(ds, Dataset):
        pscm.train(ds)
    else:
        pscm.train(Dataset(ds))
    # map
    mapped = pscm.forward(ds)
    # and append the mapper to the dataset
    if isinstance(mapped, Dataset):
        mapped._append_mapper(pscm)
Example #13
0
def eep_dataset(samples, targets=None, chunks=None):
    """Create a dataset using an EEP binary file as source.

    EEP files are used by *eeprobe* a software for analysing even-related
    potentials (ERP), which was developed at the Max-Planck Institute for
    Cognitive Neuroscience in Leipzig, Germany.

      http://www.ant-neuro.com/products/eeprobe

    Parameters
    ----------
    samples : str or EEPBin instance
      This is either a filename of an EEP file, or an EEPBin instance, providing
      the samples data in EEP format.
    targets, chunks : sequence or scalar or None
      Values are pass through to `Dataset.from_wizard()`. See its documentation
      for more information.
    """
    if isinstance(samples, str):
        # open the eep file
        eb = EEPBin(samples)
    elif isinstance(samples, EEPBin):
        # nothing special
        eb = samples
    else:
        raise ValueError("eep_dataset takes the filename of an "
              "EEP file or a EEPBin object as 'samples' argument.")

    # init dataset
    ds = Dataset.from_channeltimeseries(
            eb.data, targets=targets, chunks=chunks, t0=eb.t0, dt=eb.dt,
            channelids=eb.channels)
    return ds
Example #14
0
    def _sl_call(self, dataset, roi_ids, nproc):
        """Classical generic searchlight implementation
        """
        assert (self.results_backend in ('native', 'hdf5'))
        # compute
        if nproc is not None and nproc > 1:
            # split all target ROIs centers into `nproc` equally sized blocks
            nproc_needed = min(len(roi_ids), nproc)
            nblocks = nproc_needed \
                      if self.nblocks is None else self.nblocks
            roi_blocks = np.array_split(roi_ids, nblocks)

            # the next block sets up the infrastructure for parallel computing
            # this can easily be changed into a ParallelPython loop, if we
            # decide to have a PP job server in PyMVPA
            import pprocess
            p_results = pprocess.Map(limit=nproc_needed)
            if __debug__:
                debug(
                    'SLC', "Starting off %s child processes for nblocks=%i" %
                    (nproc_needed, nblocks))
            compute = p_results.manage(pprocess.MakeParallel(self._proc_block))
            for iblock, block in enumerate(roi_blocks):
                # should we maybe deepcopy the measure to have a unique and
                # independent one per process?
                seed = mvpa2.get_random_seed()
                compute(block,
                        dataset,
                        copy.copy(self.__datameasure),
                        seed=seed,
                        iblock=iblock)
        else:
            # otherwise collect the results in an 1-item list
            p_results = [
                self._proc_block(roi_ids, dataset, self.__datameasure)
            ]

        # Finally collect and possibly process results
        # p_results here is either a generator from pprocess.Map or a list.
        # In case of a generator it allows to process results as they become
        # available
        result_ds = self.results_fx(
            sl=self,
            dataset=dataset,
            roi_ids=roi_ids,
            results=self.__handle_all_results(p_results))

        # Assure having a dataset (for paranoid ones)
        if not is_datasetlike(result_ds):
            try:
                result_a = np.atleast_1d(result_ds)
            except ValueError, e:
                if 'setting an array element with a sequence' in str(e):
                    # try forcing object array.  Happens with
                    # test_custom_results_fx_logic on numpy 1.4.1 on Debian
                    # squeeze
                    result_a = np.array(result_ds, dtype=object)
                else:
                    raise
            result_ds = Dataset(result_a)
Example #15
0
def test_sifter_with_balancing():
    # extended previous test which was already
    # "... somewhat duplicating the doctest"
    ds = Dataset(samples=np.arange(12).reshape((-1, 2)),
                 sa={
                     'chunks': [0, 1, 2, 3, 4, 5],
                     'targets': ['c', 'c', 'c', 'p', 'p', 'p']
                 })

    # Without sifter -- just to assure that we do get all of them
    # i.e. 6*5*4*3/(4!) = 15
    par = ChainNode([NFoldPartitioner(cvtype=4, attr='chunks')])
    assert_equal(len(list(par.generate(ds))), 15)

    # so we will take 4 chunks out of available 7, but would care only
    # about those partitions where we have balanced number of 'c' and 'p'
    # entries
    assert_raises(
        ValueError,
        lambda x: list(Sifter([('targets', dict(wrong=1))]).generate(x)), ds)

    par = ChainNode([
        NFoldPartitioner(cvtype=4, attr='chunks'),
        Sifter([('partitions', 2),
                ('targets', dict(uvalues=['c', 'p'], balanced=True))])
    ])
    dss = list(par.generate(ds))
    # print [ x[x.sa.partitions==2].sa.targets for x in dss ]
    assert_equal(len(dss), 9)
    for ds_ in dss:
        testing = ds[ds_.sa.partitions == 2]
        assert_array_equal(np.unique(testing.sa.targets), ['c', 'p'])
        # and we still have both targets  present in training
        training = ds[ds_.sa.partitions == 1]
        assert_array_equal(np.unique(training.sa.targets), ['c', 'p'])
Example #16
0
def test_exclude_targets_combinations_subjectchunks():
    partitioner = ChainNode([NFoldPartitioner(attr='subjects'),
                             ExcludeTargetsCombinationsPartitioner(
                                 k=1,
                                 targets_attr='chunks',
                                 space='partitions')],
                            space='partitions')
    # targets do not need even to be defined!
    ds = Dataset(np.arange(18).reshape(9, 2),
                 sa={'chunks': np.arange(9) // 3,
                     'subjects': np.arange(9) % 3})
    dss = list(partitioner.generate(ds))
    assert_equal(len(dss), 9)

    testing_subjs, testing_chunks = [], []
    for ds_ in dss:
        testing_partition = ds_.sa.partitions == 2
        training_partition = ds_.sa.partitions == 1
        # must be scalars -- so implicit test here
        # if not -- would be error
        testing_subj = np.asscalar(np.unique(ds_.sa.subjects[testing_partition]))
        testing_subjs.append(testing_subj)
        testing_chunk = np.asscalar(np.unique(ds_.sa.chunks[testing_partition]))
        testing_chunks.append(testing_chunk)
        # and those must not appear for training
        ok_(not testing_subj in ds_.sa.subjects[training_partition])
        ok_(not testing_chunk in ds_.sa.chunks[training_partition])
    # and we should have gone through all chunks/subjs pairs
    testing_pairs = set(zip(testing_subjs, testing_chunks))
    assert_equal(len(testing_pairs), 9)
    # yoh: equivalent to set(itertools.product(range(3), range(3))))
    #      but .product is N/A for python2.5
    assert_equal(testing_pairs, set(zip(*np.where(np.ones((3,3))))))
Example #17
0
 def _forward_dataset(self, ds):
     pos = self._pos
     if pos < 0:
         # support negative/reverse indices
         pos = len(ds.shape) + 1 + pos
     # select all prior axes, but at most all existing axes
     slicer = [slice(None)] * min(pos, len(ds.shape))
     # and as many new axes as necessary afterwards
     slicer += [None] * max(1, pos + 1 - len(ds.shape))
     # there are two special cases that require modification of feature
     # attributes
     if pos == 0:
         # prepend an axis to all sample attributes
         out_sa = dict([(attr, ds.sa[attr].value[None]) for attr in ds.sa])
         # prepend an axis to all FAs and repeat for each previous sample
         out_fa = dict([(attr,
                         np.repeat(ds.fa[attr].value[None], len(ds),
                                   axis=0)) for attr in ds.fa])
     elif pos == 1:
         # prepend an axis to all feature attributes
         out_fa = dict([(attr, ds.fa[attr].value[None]) for attr in ds.fa])
         out_sa = ds.sa
     else:
         out_sa = ds.sa
         out_fa = ds.fa
     out = Dataset(ds.samples.__getitem__(tuple(slicer)),
                   sa=out_sa,
                   fa=out_fa,
                   a=ds.a)
     return out
Example #18
0
    def _call(self, dataset):
        """Extract weights from GPR
        """

        clf = self.clf
        kernel = clf.kernel
        train_fv = clf._train_fv
        if isinstance(kernel, LinearKernel):
            Sigma_p = 1.0
        else:
            Sigma_p = kernel.params.Sigma_p

        weights = Ndot(Sigma_p, Ndot(train_fv.T, clf._alpha))

        if self.ca.is_enabled('variances'):
            # super ugly formulas that can be quite surely improved:
            tmp = np.linalg.inv(clf._L)
            Kyinv = Ndot(tmp.T, tmp)
            # XXX in such lengthy matrix manipulations you might better off
            #     using np.matrix where * is a matrix product
            self.ca.variances = Ndiag(
                Sigma_p -
                Ndot(Sigma_p,
                     Ndot(train_fv.T, Ndot(Kyinv, Ndot(train_fv, Sigma_p)))))
        return Dataset(np.atleast_2d(weights))
Example #19
0
def eep_dataset(samples, targets=None, chunks=None):
    """Create a dataset using an EEP binary file as source.

    EEP files are used by *eeprobe* a software for analysing even-related
    potentials (ERP), which was developed at the Max-Planck Institute for
    Cognitive Neuroscience in Leipzig, Germany.

      http://www.ant-neuro.com/products/eeprobe
    """
    
    if isinstance(samples, str):
        # open the eep file
        eb = EEPBin(samples)
    elif isinstance(samples, EEPBin):
        # nothing special
        eb = samples
    else:
        raise ValueError("eep_dataset takes the filename of an "
              "EEP file or a EEPBin object as 'samples' argument.")

    # init dataset
    ds = Dataset.from_channeltimeseries(
            eb.data, targets=targets, chunks=chunks, t0=eb.t0, dt=eb.dt,
            channelids=eb.channels)
    return ds
Example #20
0
def get_pairwise_accuracies(ds, stat='acc', pairs=None, select=None, space='targets'):
    """Extract pair-wise classification performances as a dataset

    Converts a dataset of classifications for all pairs of
    classifiers (e.g. obtained from raw_predictions_ds of a
    MulticlassClassifier) into a dataset of performances for each
    pair of stimuli categories.  I.e. only the pair-wise results
    where the target label matches one of the targets in the pair
    will contribute to the count.

    Parameters
    ----------
    pairs : 
      Pairs of targets corresponding to the last dimension in the
      provided dataset
    select : list, optional
      Deal only with those targets listed here, omitting the others
    """
    pairs, hits_misses = get_pairwise_hits_misses(
        ds.samples, ds.sa[space].value, pairs=pairs, select=select)
    hits_misses = np.array(hits_misses)
    if stat in ['acc']:
        stat_values = hits_misses[:, 0].astype(float)/np.sum(hits_misses, axis=1)
        stat_fa = [stat]
    elif stat == 'hits_misses':
        stat_values = hits_misses
        stat_fa = ['hits', 'misses']
    else:
        raise NotImplementedError("%s statistic not there yet" % stat)

    return Dataset(stat_values, sa={space: pairs}, fa={'stat': stat_fa})
Example #21
0
def test_bayes_confusion_hyp():
    from mvpa2.clfs.transerror import BayesConfusionHypothesis
    conf = np.array([[10, 0, 5, 5], [0, 10, 5, 5], [5, 5, 10, 0],
                     [5, 5, 0, 10]])
    conf = Dataset(conf, sa={'labels': ['A', 'B', 'C', 'D']})
    bayes = BayesConfusionHypothesis(labels_attr='labels')
    skip_if_no_external('scipy')  # uses factorial from scipy.misc
    hyptest = bayes(conf)
    # by default comes with all hypothesis and posterior probs
    assert_equal(hyptest.shape, (15, 2))
    assert_array_equal(hyptest.fa.stat, ['log(p(C|H))', 'log(p(H|C))'])
    # check order of hypothesis (coarse)
    assert_array_equal(hyptest.sa.hypothesis[0], [['A', 'B', 'C', 'D']])
    assert_array_equal(hyptest.sa.hypothesis[-1], [['A'], ['B'], ['C'], ['D']])
    # now with limited hypothesis (given with literal labels), set and in
    # non-log scale
    bayes = BayesConfusionHypothesis(labels_attr='labels',
                                     log=False,
                                     hypotheses=[[['A', 'B', 'C', 'D']],
                                                 [[
                                                     'A',
                                                     'C',
                                                 ], ['B', 'D']],
                                                 [[
                                                     'A',
                                                     'D',
                                                 ], ['B', 'C']],
                                                 [['A'], ['B'], ['C'], ['D']]])
    hyptest = bayes(conf)
    # also with custom hyp the post-probs must add up to 1
    post_prob = hyptest.samples[:, 1]
    assert_almost_equal(np.sum(post_prob), 1)
    # in this particular case ...
    assert (post_prob[3] - np.sum(post_prob[1:3]) < 0.02)
def get_bold():
    # TODO add second model
    hrf_x = np.linspace(0, 25, 250)
    hrf = double_gamma_hrf(hrf_x) - single_gamma_hrf(hrf_x, 0.8, 1, 0.05)

    samples = 1200
    exp_time = np.linspace(0, 120, samples)

    fast_er_onsets = np.array([50, 240, 340, 590, 640, 940, 960])
    fast_er = np.zeros(samples)
    fast_er[fast_er_onsets] = 1

    model_hr = np.convolve(fast_er, hrf)[:samples]

    tr = 2.0
    model_lr = signal.resample(model_hr, int(samples / tr / 10), window='ham')

    ## moderate noise level
    baseline = 800
    wsignal = baseline + 8.0 \
              * model_lr + np.random.randn(int(samples / tr / 10)) * 4.0
    nsignal = baseline \
              + np.random.randn(int(samples / tr / 10)) * 4.0

    ds = Dataset(samples=np.array([wsignal, nsignal]).T,
                 sa={'model': model_lr})

    return ds
Example #23
0
    def _forward_dataset_helper(self, ds):
        # local binding
        num = self.__num

        pos = None
        if not self.__position_attr is None:
            # we know something about sample position
            pos = ds.sa[self.__position_attr].value
            rsamples, pos = resample(ds.samples,
                                     self.__num,
                                     t=pos,
                                     window=self.__window_args)
        else:
            # we know nothing about samples position
            rsamples = resample(ds.samples,
                                self.__num,
                                t=None,
                                window=self.__window_args)
        # new dataset that reuses that feature and dataset attributes of the
        # source
        mds = Dataset(rsamples, fa=ds.fa, a=ds.a)

        # the tricky part is what to do with the samples attributes, since their
        # number has changes
        if self.__attr_strategy == 'remove':
            # nothing to be done
            pass
        elif self.__attr_strategy == 'sample':
            step = int(len(ds) / num)
            sa = dict([(k, ds.sa[k].value[0::step][:num]) for k in ds.sa])
            mds.sa.update(sa)
        elif self.__attr_strategy == 'resample':
            # resample the attributes themselves
            sa = {}
            for k in ds.sa:
                v = ds.sa[k].value
                if pos is None:
                    sa[k] = resample(v,
                                     self.__num,
                                     t=None,
                                     window=self.__window_args)
                else:
                    if k == self.__position_attr:
                        # position attr will be handled separately at the end
                        continue
                    sa[k] = resample(v,
                                     self.__num,
                                     t=pos,
                                     window=self.__window_args)[0]
            # inject them all
            mds.sa.update(sa)
        else:
            raise ValueError("Unkown attribute handling strategy '%s'." %
                             self.__attr_strategy)

        if not pos is None:
            # we got the new sample positions and can store them
            mds.sa[self.__position_attr] = pos
        return mds
def test_resample():
    time = np.linspace(0, 2 * np.pi, 100)
    ds = Dataset(np.vstack((np.sin(time), np.cos(time))).T,
                 sa={
                     'time': time,
                     'section': np.repeat(range(10), 10)
                 })
    assert_equal(ds.shape, (100, 2))

    # downsample
    num = 10
    rm = FFTResampleMapper(num,
                           window=('gauss', 50),
                           position_attr='time',
                           attr_strategy='sample')
    mds = rm.forward(ds)
    assert_equal(mds.shape, (num, ds.nfeatures))
    # didn't change the orig
    assert_equal(len(ds), 100)

    # check position-based resampling
    ds_partial = ds[0::10]
    mds_partial = rm.forward(ds_partial)
    # despite different input sampling should yield the same output timepoints
    assert_array_almost_equal(mds.sa.time, mds_partial.sa.time)
    # exclude the first points to prevent edge effects, but the data should be
    # very similar too
    assert_array_almost_equal(mds.samples[2:],
                              mds_partial.samples[2:],
                              decimal=2)
    # simple sample of sa's should give meaningful stuff
    assert_array_equal(mds.sa.section, range(10))

    # and now for a dataset with chunks
    cds = vstack([ds.copy(), ds.copy()])
    cds.sa['chunks'] = np.repeat([0, 1], len(ds))
    rm = FFTResampleMapper(num,
                           attr_strategy='sample',
                           chunks_attr='chunks',
                           window=('gauss', 50))
    mcds = rm.forward(cds)
    assert_equal(mcds.shape, (20, 2))
    assert_array_equal(mcds.sa.section, np.tile(range(10), 2))
    # each individual chunks should be identical to previous dataset
    assert_array_almost_equal(mds.samples, mcds.samples[:10])
    assert_array_almost_equal(mds.samples, mcds.samples[10:])
Example #25
0
def test_random_affine_transformation():
    ds = Dataset.from_wizard(np.random.randn(8, 3, 2))
    ds_d = random_affine_transformation(ds)
    # compare original to the inverse of the distortion using reported
    # parameters
    assert_array_almost_equal(
        np.dot((ds_d.samples - ds_d.a.random_shift) / ds_d.a.random_scale,
               ds_d.a.random_rotation.T), ds.samples)
Example #26
0
 def test_linear_kernel(self):
     """Simplistic testing of linear kernel"""
     d1 = Dataset(np.asarray([list(range(5))] * 10, dtype=float))
     lk = npK.LinearKernel()
     lk.compute(d1)
     self.assertTrue(lk._k.shape == (10, 10),
                     "Failure computing LinearKernel (Size mismatch)")
     self.assertTrue((lk._k == 30).all(), "Failure computing LinearKernel")
Example #27
0
def test_random_affine_transformation():
    ds = Dataset.from_wizard(np.random.randn(8,3,2))
    ds_d = random_affine_transformation(ds)
    # compare original to the inverse of the distortion using reported
    # parameters
    assert_array_almost_equal(
        np.dot((ds_d.samples - ds_d.a.random_shift) / ds_d.a.random_scale,
               ds_d.a.random_rotation.T),
        ds.samples)
Example #28
0
    def test_cached_kernel(self):
        nchunks = 5
        n = 50 * nchunks
        d = Dataset(np.random.randn(n, 132))
        d.sa.chunks = np.random.randint(nchunks, size=n)

        # We'll compare against an Rbf just because it has a parameter to change
        rk = npK.RbfKernel(sigma=1.5)

        # Assure two kernels are independent for this test
        ck = CachedKernel(kernel=npK.RbfKernel(sigma=1.5))
        ck.compute(d)  # Initial cache of all data

        self.assertTrue(ck._recomputed,
                        'CachedKernel was not initially computed')

        # Try some splitting
        for chunk in [d[d.sa.chunks == i] for i in range(nchunks)]:
            rk.compute(chunk)
            ck.compute(chunk)
            self.kernel_equiv(rk, ck)  #, accuracy=1e-12)
            self.failIf(ck._recomputed,
                        "CachedKernel incorrectly recomputed it's kernel")

        # Test what happens when a parameter changes
        ck.params.sigma = 3.5
        ck.compute(d)
        self.assertTrue(ck._recomputed,
                        "CachedKernel doesn't recompute on kernel change")
        rk.params.sigma = 3.5
        rk.compute(d)
        self.assertTrue(np.all(rk._k == ck._k),
                        'Cached and rbf kernels disagree after kernel change')

        # Now test handling new data
        d2 = Dataset(np.random.randn(32, 43))
        ck.compute(d2)
        self.assertTrue(
            ck._recomputed,
            "CachedKernel did not automatically recompute new data")
        ck.compute(d)
        self.assertTrue(ck._recomputed,
                        "CachedKernel did not recompute old data which had\n" + \
                        "previously been computed, but had the cache overriden")
Example #29
0
    def __process_roi(self, ds, roi_feature_id, measure, assure_dataset):
        # retrieve the feature ids of all features in the ROI from the query
        # engine
        roi_specs = self._queryengine[roi_feature_id]
        if __debug__:
            debug(
                'SLC_', 'For %r query returned roi_specs %r' %
                (roi_feature_id, roi_specs))
        if is_datasetlike(roi_specs):
            # TODO: unittest
            assert (len(roi_specs) == 1)
            roi_fids = roi_specs.samples[0]
        else:
            roi_fids = roi_specs

        # slice the dataset
        roi = ds[:, roi_fids]
        if is_datasetlike(roi_specs):
            for n, v in roi_specs.fa.iteritems():
                roi.fa[n] = v
        if self.__add_center_fa:
            # add fa to indicate ROI seed if requested
            roi_seed = np.zeros(roi.nfeatures, dtype='bool')
            if roi_feature_id in roi_fids:
                roi_seed[roi_fids.index(roi_feature_id)] = True
            else:
                warning("Center feature attribute id %s not found" %
                        roi_feature_id)
            roi.fa[self.__add_center_fa] = roi_seed

        # compute the datameasure and store in results
        res = measure(roi)
        if assure_dataset and not is_datasetlike(res):
            res = Dataset(np.atleast_1d(res))
        if self.ca.is_enabled('roi_feature_ids'):
            # add roi feature ids to intermediate result dataset for later
            # aggregation
            res.a['roi_feature_ids'] = roi_fids
        if self.ca.is_enabled('roi_sizes'):
            res.a['roi_sizes'] = roi.nfeatures
        if self.ca.is_enabled('roi_center_ids'):
            res.a['roi_center_ids'] = roi_feature_id
        return res, roi
Example #30
0
 def _call(self, dataset):
     # For performance measures -- increase to 50-200
     # np.sum here is just to get some meaningful value in
     # them
     #return np.ones(shape=(2, 2))*np.sum(dataset)
     return Dataset(
         np.array([{
             'd': np.ones(shape=(5, 5)) * np.sum(dataset)
         }],
                  dtype=object))
Example #31
0
 def _call(self, ds):
     y = ds.sa[self.space].value
     if self.numeric or ((self.numeric is None) and y.dtype.char == 'S'):
         y = AttributeMap().to_numeric(y)
     # TODO:
     if not self.uni:
         out = self.fx(ds.samples, y)
     else:
         out = np.array([self.fx(feat, y) for feat in ds.samples.T])
     return Dataset(out[None], fa=ds.fa)
def _fill_in_scattered_results(sl, dataset, roi_ids, results):
    """this requires the searchlight conditional attribute 'roi_feature_ids'
    to be enabled"""
    import numpy as np
    from mvpa2.datasets import Dataset

    resmap = None
    probmap = None
    for resblock in results:
        for res in resblock:
            if resmap is None:
                # prepare the result container
                resmap = np.zeros((len(res), dataset.nfeatures),
                                  dtype=res.samples.dtype)
                if 'null_prob' in res.fa:
                    # initialize the prob map also with zeroes, as p=0 can never
                    # happen as an empirical result
                    probmap = np.zeros((dataset.nfeatures,) + res.fa.null_prob.shape[1:],
                                       dtype=res.samples.dtype)
                observ_counter = np.zeros(dataset.nfeatures, dtype=int)
            # project the result onto all features -- love broadcasting!
            #print "averaging"
            resmap[:, res.a.roi_feature_ids] += res.samples
            if not probmap is None:
                probmap[res.a.roi_feature_ids] += res.fa.null_prob
            # increment observation counter for all relevant features
            observ_counter[res.a.roi_feature_ids] += 1
    # when all results have been added up average them according to the number
    # of observations
    observ_mask = observ_counter > 0
    resmap[:, observ_mask] /= observ_counter[observ_mask]
    result_ds = Dataset(resmap,
                        fa={'observations': observ_counter})
    if not probmap is None:
        # transpose to make broadcasting work -- creates a view, so in-place
        # modification still does the job
        probmap.T[:, observ_mask] /= observ_counter[observ_mask]
        result_ds.fa['null_prob'] = probmap.squeeze()
    if 'mapper' in dataset.a:
        import copy
        result_ds.a['mapper'] = copy.copy(dataset.a.mapper)
    return result_ds
Example #33
0
def _fill_in_scattered_results(sl, dataset, roi_ids, results):
    """this requires the searchlight conditional attribute 'roi_feature_ids'
    to be enabled"""
    import numpy as np
    from mvpa2.datasets import Dataset

    resmap = None
    probmap = None
    for resblock in results:
        for res in resblock:
            if resmap is None:
                # prepare the result container
                resmap = np.zeros((len(res), dataset.nfeatures),
                                  dtype=res.samples.dtype)
                if 'null_prob' in res.fa:
                    # initialize the prob map also with zeroes, as p=0 can never
                    # happen as an empirical result
                    probmap = np.zeros(
                        (dataset.nfeatures, ) + res.fa.null_prob.shape[1:],
                        dtype=res.samples.dtype)
                observ_counter = np.zeros(dataset.nfeatures, dtype=int)
            #project the result onto all features -- love broadcasting!
            resmap[:, res.a.roi_feature_ids] += res.samples
            if not probmap is None:
                probmap[res.a.roi_feature_ids] += res.fa.null_prob
            # increment observation counter for all relevant features
            observ_counter[res.a.roi_feature_ids] += 1
    # when all results have been added up average them according to the number
    # of observations
    observ_mask = observ_counter > 0
    resmap[:, observ_mask] /= observ_counter[observ_mask]
    result_ds = Dataset(resmap, fa={'observations': observ_counter})
    if not probmap is None:
        # transpose to make broadcasting work -- creates a view, so in-place
        # modification still does the job
        probmap.T[:, observ_mask] /= observ_counter[observ_mask]
        result_ds.fa['null_prob'] = probmap.squeeze()
    if 'mapper' in dataset.a:
        import copy
        result_ds.a['mapper'] = copy.copy(dataset.a.mapper)
    return result_ds
Example #34
0
 def test_1d_multispace_searchlight(self):
     ds = Dataset([np.arange(6)])
     ds.fa['coord1'] = np.repeat(np.arange(3), 2)
     # add a second space to the dataset
     ds.fa['coord2'] = np.tile(np.arange(2), 3)
     measure = lambda x: "+".join([str(x) for x in x.samples[0]])
     # simply select each feature once
     res = Searchlight(measure,
                       IndexQueryEngine(coord1=Sphere(0), coord2=Sphere(0)),
                       nproc=1)(ds)
     assert_array_equal(res.samples, [['0', '1', '2', '3', '4', '5']])
     res = Searchlight(measure,
                       IndexQueryEngine(coord1=Sphere(0), coord2=Sphere(1)),
                       nproc=1)(ds)
     assert_array_equal(res.samples,
                        [['0+1', '0+1', '2+3', '2+3', '4+5', '4+5']])
     res = Searchlight(measure,
                       IndexQueryEngine(coord1=Sphere(1), coord2=Sphere(0)),
                       nproc=1)(ds)
     assert_array_equal(res.samples,
                        [['0+2', '1+3', '0+2+4', '1+3+5', '2+4', '3+5']])
Example #35
0
def resting_dmn(sub, ses, in_file=None,
                lh_ctx_file=None, rh_ctx_file=None, sc_file=None,
                schedule_file=None):
    from pipe_hbn_ssi import wb_to_tss
    import os, sys
    import numpy as np
    sys.path.append('/home/bpinsard/data/projects/CoRe')
    import core.mvpa.dataset as cds
    from nipy.modalities.fmri.glm import GeneralLinearModel
    import scipy.ndimage    
    from mvpa2.datasets import Dataset

    sched = np.loadtxt(
        schedule_file, 
        converters = {0:int,1:int,2:str,3:int,4:str,5:str},
        dtype=np.object,
        skiprows=1)
    idx = sched[:,1].tolist().index(ses)
    #scan_no = sched[idx,2].split('-').index('Rest')
    if in_file is None:
        scan_no = [i for i,n in enumerate(lh_ctx_file) if 'RESTING' in n]
    else:
        scan_no = [i for i,n in enumerate(in_file) if 'RESTING' in n]
    scan_no = scan_no[0]
    
    if in_file is None:
        inf = lh_ctx_file[scan_no]
        print(inf)
        ds = Dataset(wb_to_tss(lh_ctx_file[scan_no], rh_ctx_file[scan_no], sc_file[scan_no]))
    else:
        inf = in_file[scan_no]
        print(inf)
        ds = cds.ds_from_ts(in_file[scan_no])
    #cds.preproc_ds(ds, detrend=True)
    ds.samples -= scipy.ndimage.gaussian_filter1d(ds.samples,sigma=8,axis=0,truncate=2)
    
    seed_roi = 9
    cds.add_aparc_ba_fa(ds, sub, pproc_tpl=os.path.join(pipe_hbn_ssi.proc_dir,'moco_multiband','surface_32k','_sub_%d'))
    roi_mask = np.logical_or(ds.fa.aparc==seed_roi+11100, ds.fa.aparc==seed_roi+12100)
    
    mean_roi_ts = ds.samples[:,roi_mask].mean(1)
    mean_roi_ts -= mean_roi_ts.mean()
    
    mtx = np.asarray([mean_roi_ts, np.ones(ds.nsamples)]).T
        
    glm = GeneralLinearModel(mtx)
    glm.fit(ds.samples,model='ols')
    contrast = glm.contrast([1,0], contrast_type='t')
    
    out_file = os.path.abspath('sub%d_ses%d_connectivity_results.npz'%(sub,ses))
    np.savez_compressed(out_file, contrast=contrast, mean_roi_ts=mean_roi_ts)
    #return contrast
    return out_file, inf
Example #36
0
    def _call(self, dataset):
        """Computes featurewise I-RELIEF weights."""
        samples = dataset.samples
        NS, NF = samples.shape[:2]
        if self.w_guess is None:
            self.w = np.ones(NF, 'd')
        # do normalization in all cases to be safe :)
        self.w = self.w / (self.w**2).sum()

        M, H = self.compute_M_H(dataset.targets)

        while True:
            self.k = self.kernel(length_scale=self.kernel_width / self.w)
            d_w_k = self.k.computed(samples).as_raw_np()
            # set d_w_k to zero where distance=0 (i.e. kernel ==
            # 1.0), otherwise I-RELIEF could not converge.
            # XXX Note that kernel==1 for distance=0 only for
            # exponential kernels!!  IMPROVE
            d_w_k[np.abs(d_w_k - 1.0) < 1.0e-15] = 0.0
            ni = np.zeros(NF, 'd')
            for n in range(NS):
                # d_w_k[n,n] could be omitted since == 0.0
                gamma_n = 1.0 - np.nan_to_num(d_w_k[n, M[n]].sum() \
                                / (d_w_k[n, :].sum()-d_w_k[n, n]))
                alpha_n = np.nan_to_num(d_w_k[n, M[n]] /
                                        (d_w_k[n, M[n]].sum()))
                beta_n = np.nan_to_num(d_w_k[n, H[n]] / (d_w_k[n, H[n]].sum()))

                m_n = (np.abs(samples[n, :] - samples[M[n], :]) \
                        * alpha_n[:, None]).sum(0)
                h_n = (np.abs(samples[n, :] - samples[H[n], :]) \
                        * beta_n[:, None]).sum(0)
                ni += gamma_n * (m_n - h_n)
            ni = ni / NS

            ni_plus = np.clip(ni, 0.0,
                              np.inf)  # set all negative elements to zero
            w_new = np.nan_to_num(ni_plus / (np.sqrt((ni_plus**2).sum())))
            change = np.abs(w_new - self.w).sum()
            if __debug__ and 'IRELIEF' in debug.active:
                debug(
                    'IRELIEF',
                    "change=%.4f max=%f min=%.4f mean=%.4f std=%.4f #nan=%d" %
                    (change, w_new.max(), w_new.min(), w_new.mean(),
                     w_new.std(), np.isnan(w_new).sum()))

            # update weights:
            self.w = w_new
            if change < self.threshold:
                break

        return Dataset(self.w[np.newaxis])
Example #37
0
def test_resample():
    time = np.linspace(0, 2*np.pi, 100)
    ds = Dataset(np.vstack((np.sin(time), np.cos(time))).T,
                 sa = {'time': time,
                       'section': np.repeat(range(10), 10)})
    assert_equal(ds.shape, (100, 2))

    # downsample
    num = 10
    rm = FFTResampleMapper(num, window=('gauss', 50),
                           position_attr='time',
                           attr_strategy='sample')
    mds = rm.forward(ds)
    assert_equal(mds.shape, (num, ds.nfeatures))
    # didn't change the orig
    assert_equal(len(ds), 100)

    # check position-based resampling
    ds_partial = ds[0::10]
    mds_partial = rm.forward(ds_partial)
    # despite different input sampling should yield the same output timepoints
    assert_array_almost_equal(mds.sa.time, mds_partial.sa.time)
    # exclude the first points to prevent edge effects, but the data should be
    # very similar too
    assert_array_almost_equal(mds.samples[2:], mds_partial.samples[2:], decimal=2)
    # simple sample of sa's should give meaningful stuff
    assert_array_equal(mds.sa.section, range(10))

    # and now for a dataset with chunks
    cds = vstack([ds.copy(), ds.copy()])
    cds.sa['chunks'] = np.repeat([0,1], len(ds))
    rm = FFTResampleMapper(num, attr_strategy='sample', chunks_attr='chunks',
                           window=('gauss', 50))
    mcds = rm.forward(cds)
    assert_equal(mcds.shape, (20, 2))
    assert_array_equal(mcds.sa.section, np.tile(range(10),2))
    # each individual chunks should be identical to previous dataset
    assert_array_almost_equal(mds.samples, mcds.samples[:10])
    assert_array_almost_equal(mds.samples, mcds.samples[10:])
def intersubject_correlation(dss, reference_ds=0):
    """
    Computes voxelwise inter-subject time series correlation
    in a pairwise fashion for a list of Datasets. Datasets
    must all be the same shape. Resulting dataset of pairwise
    correlations will inherit Dataset attributes from
    reference data set [Default: first data set in list].
    """

    # Check if input list contains Datasets, ndarrays
    dss = [Dataset(ds) if not type(ds) == Dataset else ds for ds in dss]

    ds_shape = dss[reference_ds].shape
    n_features = ds_shape[1]

    for ds in dss:
        assert ds.shape == ds_shape

    # Compute time series correlation per voxel per subject pair
    correlations = []
    for pair in combinations(dss, 2):
        pair_map = []
        for feature in xrange(n_features):
            pair_map.append(
                pearson_correlation(pair[0].samples[:, feature],
                                    pair[1].samples[:, feature]))
        correlations.append(pair_map)

    # Resulting correlation map inherits attributes of referece data set
    correlations_ds = Dataset(correlations,
                              fa=dss[reference_ds].fa,
                              a=dss[reference_ds].a)
    correlations_ds.sa['pairs'] = list(combinations(range(len(dss)), 2))

    assert correlations_ds.shape[0] == len(dss) * (len(dss) - 1) / 2
    assert correlations_ds.shape[1] == n_features

    return correlations_ds
Example #39
0
 def test_1d_multispace_searchlight(self):
     ds = Dataset([np.arange(6)])
     ds.fa['coord1'] = np.repeat(np.arange(3), 2)
     # add a second space to the dataset
     ds.fa['coord2'] = np.tile(np.arange(2), 3)
     measure = lambda x: "+".join([str(x) for x in x.samples[0]])
     # simply select each feature once
     res = Searchlight(measure,
                       IndexQueryEngine(coord1=Sphere(0),
                                        coord2=Sphere(0)),
                       nproc=1)(ds)
     assert_array_equal(res.samples, [['0', '1', '2', '3', '4', '5']])
     res = Searchlight(measure,
                       IndexQueryEngine(coord1=Sphere(0),
                                        coord2=Sphere(1)),
                       nproc=1)(ds)
     assert_array_equal(res.samples,
                        [['0+1', '0+1', '2+3', '2+3', '4+5', '4+5']])
     res = Searchlight(measure,
                       IndexQueryEngine(coord1=Sphere(1),
                                        coord2=Sphere(0)),
                       nproc=1)(ds)
     assert_array_equal(res.samples,
                        [['0+2', '1+3', '0+2+4', '1+3+5', '2+4', '3+5']])
Example #40
0
    def _proc_block(self, block, ds, measure, seed=None, iblock='main'):
        """Little helper to capture the parts of the computation that can be
        parallelized

        Parameters
        ----------
        seed
          RNG seed.  Should be provided e.g. in child process invocations
          to guarantee that they all seed differently to not keep generating
          the same sequencies due to reusing the same copy of numpy's RNG
        block
          Critical for generating non-colliding temp filenames in case
          of hdf5 backend.  Otherwise RNGs of different processes might
          collide in their temporary file names leading to problems.
        """
        if seed is not None:
            mvpa2.seed(seed)
        if __debug__:
            debug_slc_ = 'SLC_' in debug.active
            debug('SLC',
                  "Starting computing block for %i elements" % len(block))
        results = []
        store_roi_feature_ids = self.ca.is_enabled('roi_feature_ids')
        store_roi_sizes = self.ca.is_enabled('roi_sizes')
        store_roi_center_ids = self.ca.is_enabled('roi_center_ids')

        assure_dataset = any([store_roi_feature_ids,
                              store_roi_sizes,
                              store_roi_center_ids])

        # put rois around all features in the dataset and compute the
        # measure within them
        for i, f in enumerate(block):
            # retrieve the feature ids of all features in the ROI from the query
            # engine
            roi_specs = self._queryengine[f]

            if __debug__ and  debug_slc_:
                debug('SLC_', 'For %r query returned roi_specs %r'
                      % (f, roi_specs))

            if is_datasetlike(roi_specs):
                # TODO: unittest
                assert(len(roi_specs) == 1)
                roi_fids = roi_specs.samples[0]
            else:
                roi_fids = roi_specs

            # slice the dataset
            roi = ds[:, roi_fids]

            if is_datasetlike(roi_specs):
                for n, v in roi_specs.fa.iteritems():
                    roi.fa[n] = v

            if self.__add_center_fa:
                # add fa to indicate ROI seed if requested
                roi_seed = np.zeros(roi.nfeatures, dtype='bool')
                if f in roi_fids:
                    roi_seed[roi_fids.index(f)] = True
                else:
                    warning("Center feature attribute id %s not found" % f)
                roi.fa[self.__add_center_fa] = roi_seed

            # compute the datameasure and store in results
            res = measure(roi)

            if assure_dataset and not is_datasetlike(res):
                res = Dataset(np.atleast_1d(res))
            if store_roi_feature_ids:
                # add roi feature ids to intermediate result dataset for later
                # aggregation
                res.a['roi_feature_ids'] = roi_fids
            if store_roi_sizes:
                res.a['roi_sizes'] = roi.nfeatures
            if store_roi_center_ids:
                res.a['roi_center_ids'] = f
            results.append(res)

            if __debug__:
                debug('SLC', "Doing %i ROIs: %i (%i features) [%i%%]" \
                    % (len(block),
                       f + 1,
                       roi.nfeatures,
                       float(i + 1) / len(block) * 100,), cr=True)

        if self.results_postproc_fx:
            if __debug__:
                debug('SLC', "Post-processing %d results in proc_block using %s"
                      % (len(results), self.results_postproc_fx))
            results = self.results_postproc_fx(results)
        if self.results_backend == 'native':
            pass                        # nothing special
        elif self.results_backend == 'hdf5':
            # store results in a temporary file and return a filename
            results_file = tempfile.mktemp(prefix=self.tmp_prefix,
                                           suffix='-%s.hdf5' % iblock)
            if __debug__:
                debug('SLC', "Storing results into %s" % results_file)
            h5save(results_file, results)
            if __debug__:
                debug('SLC_', "Results stored")
            results = results_file
        else:
            raise RuntimeError("Must not reach this point")
        return results
Example #41
0
    def test_surf_queryengine(self, qefn):
        s = surf.generate_plane((0, 0, 0), (0, 1, 0), (0, 0, 1), 4, 5)

        # add scond layer
        s2 = surf.merge(s, (s + (.01, 0, 0)))

        ds = Dataset(samples=np.arange(20)[np.newaxis],
                    fa=dict(node_indices=np.arange(39, 0, -2)))

        # add more features (with shared node indices)
        ds3 = hstack((ds, ds, ds))

        radius = 2.5

        # Note: sweepargs it not used to avoid re-generating the same
        #       surface and dataset multiple times.
        for distance_metric in ('euclidean', 'dijkstra', '<illegal>', None):
            builder = lambda: queryengine.SurfaceQueryEngine(s2, radius,
                                                             distance_metric)
            if distance_metric in ('<illegal>', None):
                assert_raises(ValueError, builder)
                continue

            qe = builder()

            # test i/o and ensure that the untrained instance is not trained
            if externals.exists('h5py'):
                fd, qefn = tempfile.mkstemp('qe.hdf5', 'test'); os.close(fd)
                h5save(qefn, qe)
                qe = h5load(qefn)
                os.remove(qefn)


            # untrained qe should give errors
            assert_raises(ValueError, lambda:qe.ids)
            assert_raises(ValueError, lambda:qe.query_byid(0))

            # node index out of bounds should give error
            ds_ = ds.copy()
            ds_.fa.node_indices[0] = 100
            assert_raises(ValueError, lambda: qe.train(ds_))

            # lack of node indices should give error
            ds_.fa.pop('node_indices')
            assert_raises(ValueError, lambda: qe.train(ds_))


            # train the qe
            qe.train(ds3)

            # test i/o and ensure that the loaded instance is trained
            if externals.exists('h5py'):
                h5save(qefn, qe)
                qe = h5load(qefn)

            for node in np.arange(-1, s2.nvertices + 1):
                if node < 0 or node >= s2.nvertices:
                    assert_raises(KeyError, lambda: qe.query_byid(node))
                    continue

                feature_ids = np.asarray(qe.query_byid(node))

                # node indices relative to ds
                base_ids = feature_ids[feature_ids < 20]

                # should have multiples of 20
                assert_equal(set(feature_ids),
                             set((base_ids[np.newaxis].T + \
                                            [0, 20, 40]).ravel()))



                node_indices = list(s2.circlearound_n2d(node,
                                    radius, distance_metric or 'dijkstra'))

                fa_indices = [fa_index for fa_index, node in
                                    enumerate(ds3.fa.node_indices)
                                    if node in node_indices]


                assert_equal(set(feature_ids), set(fa_indices))

            # smoke tests
            assert_true('SurfaceQueryEngine' in '%s' % qe)
            assert_true('SurfaceQueryEngine' in '%r' % qe)
Example #42
0
    def _proc_block(self, block, ds, measure, iblock='main'):
        """Little helper to capture the parts of the computation that can be
        parallelized

        Parameters
        ----------
        iblock
          Critical for generating non-colliding temp filenames in case
          of hdf5 backend.  Otherwise RNGs of different processes might
          collide in their temporary file names leading to problems.
        """
        if __debug__:
            debug_slc_ = 'SLC_' in debug.active
            debug('SLC',
                  "Starting computing block for %i elements" % len(block))
        if self.ca.is_enabled('roi_sizes'):
            roi_sizes = []
        else:
            roi_sizes = None
        results = []
        # put rois around all features in the dataset and compute the
        # measure within them
        for i, f in enumerate(block):
            # retrieve the feature ids of all features in the ROI from the query
            # engine
            roi_fids = self._queryengine[f]

            if __debug__ and  debug_slc_:
                debug('SLC_', 'For %r query returned ids %r' % (f, roi_fids))

            # slice the dataset
            roi = ds[:, roi_fids]

            if self.__add_center_fa:
                # add fa to indicate ROI seed if requested
                roi_seed = np.zeros(roi.nfeatures, dtype='bool')
                roi_seed[roi_fids.index(f)] = True
                roi.fa[self.__add_center_fa] = roi_seed

            # compute the datameasure and store in results
            res = measure(roi)
            if self.ca.is_enabled('roi_feature_ids'):
                if not is_datasetlike(res):
                    res = Dataset(np.atleast_1d(res))
                # add roi feature ids to intermediate result dataset for later
                # aggregation
                res.a['roi_feature_ids'] = roi_fids
            results.append(res)

            # store the size of the roi dataset
            if not roi_sizes is None:
                roi_sizes.append(roi.nfeatures)

            if __debug__:
                debug('SLC', "Doing %i ROIs: %i (%i features) [%i%%]" \
                    % (len(block),
                       f+1,
                       roi.nfeatures,
                       float(i+1)/len(block)*100,), cr=True)

        if self.results_backend == 'native':
            pass                        # nothing special
        elif self.results_backend == 'hdf5':
            # store results in a temporary file and return a filename
            results_file = tempfile.mktemp(prefix=self.tmp_prefix,
                                           suffix='-%s.hdf5' % iblock)
            if __debug__:
                debug('SLC', "Storing results into %s" % results_file)
            h5save(results_file, results)
            if __debug__:
                debug('SLC_', "Results stored")
            results = results_file
        else:
            raise RuntimeError("Must not reach this point")
        return results, roi_sizes
def movie_dataset(
        subj, preproc=None,
        base_path=os.curdir,
        fname_tmpl='sub-%(subj)s/ses-movie/func/sub-%(subj)s_ses-movie_task-movie_run-%(run)i_recording-eyegaze_physio.tsv.gz'):
    """
    Load eyegaze recordings from all runs a merge into a consecutive timeseries

    When merging intersegment-overlap is removed.

    Parameters
    ----------
    subj : str
      Subject code.
    preproc : callable or None
      Callable to preprocess a record array of the raw timeseries. The record
      array has the field 'x', 'y', 'pupil', and 'movie_frame'. It needs to
      return a record array with the same fields and must not change the
      sampling rate or number of samples.
    base_path : path
      Base directory for input file discovery.
    fname_tmpl : str
      Template expression to match input files. Support dict expansion with
      'subj' and 'run' keys.

    Returns
    -------
    Dataset
      The dataset contains a number of attributes, most of which should be
      self-explanatory. The `ds.a.run_duration_deviation` attribute quantifies
      the eyegaze recording duration difference from the expected value (in
      seconds).
    """
    # in frames (hand-verified by re-assembling in kdenlive -- using MELT
    # underneath)
    seg_offsets = (0, 22150, 43802, 65304, 89305, 112007, 133559, 160261)
    movie_fps = 25.0
    eyegaze_sr = 1000.0  # Hz
    intersegment_overlap = 400  # frames

    segments = []
    for seg, offset in enumerate(seg_offsets):
        raw = np.recfromcsv(
            os.path.join(base_path, fname_tmpl % dict(subj=subj, run=seg + 1)),
            delimiter='\t',
            names=('x', 'y', 'pupil', 'movie_frame'))
        if not preproc is None:
            raw = preproc(raw)
        # glue together to form a dataset
        ds = Dataset(np.array((raw.x, raw.y, raw.pupil)).T,
                     sa=dict(movie_frame=raw.movie_frame))
        ds.sa['movie_run_frame'] = ds.sa.movie_frame.copy()
        # turn into movie frame ID for the entire unsegmented movie
        ds.sa.movie_frame += offset
        ## truncate segment time series to remove overlap
        if seg < 7:
            # cut the end in a safe distance to the actual end, but inside the
            # overlap
            ds = ds[:-int(intersegment_overlap / movie_fps * eyegaze_sr)]
        if seg > 0:
            # cut the beginning to have a seamless start after the previous
            # segment
            ds = ds[ds.sa.movie_frame > segments[-1].sa.movie_frame.max()]
        ds.sa['movie_run'] = [seg + 1] * len(ds)
        segments.append(ds)
    ds = vstack(segments)
    # column names
    ds.fa['name'] = ('x', 'y', 'pupil')
    ds.a['sampling_rate'] = eyegaze_sr
    ds.a['movie_fps'] = movie_fps
    return ds
Example #44
0
def test_datasetmapping():
    # 6 samples, 4X2 features
    data = np.arange(48).reshape(6,4,2)
    ds = Dataset(data,
                 sa={'timepoints': np.arange(6),
                     'multidim': data.copy()},
                 fa={'fid': np.arange(4)})
    # with overlapping and non-overlapping boxcars
    startpoints = [0, 1, 4]
    boxlength = 2
    bm = BoxcarMapper(startpoints, boxlength, space='boxy')
    # train is critical
    bm.train(ds)
    mds = bm.forward(ds)
    assert_equal(len(mds), len(startpoints))
    assert_equal(mds.nfeatures, boxlength)
    # all samples attributes remain, but the can rotated/compressed into
    # multidimensional attributes
    assert_equal(sorted(mds.sa.keys()), ['boxy_onsetidx'] + sorted(ds.sa.keys()))
    assert_equal(mds.sa.multidim.shape,
            (len(startpoints), boxlength) + ds.shape[1:])
    assert_equal(mds.sa.timepoints.shape, (len(startpoints), boxlength))
    assert_array_equal(mds.sa.timepoints.flatten(),
                       np.array([(s, s+1) for s in startpoints]).flatten())
    assert_array_equal(mds.sa.boxy_onsetidx, startpoints)
    # feature attributes also get rotated and broadcasted
    assert_array_equal(mds.fa.fid, [ds.fa.fid, ds.fa.fid])
    # and finally there is a new one
    assert_array_equal(mds.fa.boxy_offsetidx, range(boxlength))

    # now see how it works on reverse()
    rds = bm.reverse(mds)
    # we got at least something of all original attributes back
    assert_equal(sorted(rds.sa.keys()), sorted(ds.sa.keys()))
    assert_equal(sorted(rds.fa.keys()), sorted(ds.fa.keys()))
    # it is not possible to reconstruct the full samples array
    # some samples even might show up multiple times (when there are overlapping
    # boxcars
    assert_array_equal(rds.samples,
                       np.array([[[ 0,  1], [ 2,  3], [ 4,  5], [ 6,  7]],
                                 [[ 8,  9], [10, 11], [12, 13], [14, 15]],
                                 [[ 8,  9], [10, 11], [12, 13], [14, 15]],
                                 [[16, 17], [18, 19], [20, 21], [22, 23]],
                                 [[32, 33], [34, 35], [36, 37], [38, 39]],
                                 [[40, 41], [42, 43], [44, 45], [46, 47]]]))
    assert_array_equal(rds.sa.timepoints, [0, 1, 1, 2, 4, 5])
    assert_array_equal(rds.sa.multidim, ds.sa.multidim[rds.sa.timepoints])
    # but feature attributes should be fully recovered
    assert_array_equal(rds.fa.fid, ds.fa.fid)

    # popular dataset configuration (double flatten + boxcar)
    cm= ChainMapper([FlattenMapper(), bm, FlattenMapper()])
    cm.train(ds)
    bflat = ds.get_mapped(cm)
    assert_equal(bflat.shape, (len(startpoints), boxlength * np.prod(ds.shape[1:])))
    # add attributes
    bflat.fa['testfa'] = np.arange(bflat.nfeatures)
    bflat.sa['testsa'] = np.arange(bflat.nsamples)
    # now try to go back
    bflatrev = bflat.mapper.reverse(bflat)
    # data should be same again, as far as the boxcars match
    assert_array_equal(ds.samples[:2], bflatrev.samples[:2])
    assert_array_equal(ds.samples[-2:], bflatrev.samples[-2:])
    # feature axis should match
    assert_equal(ds.shape[1:], bflatrev.shape[1:])
Example #45
0
def test_polydetrend():
    samples_forwhole = np.array( [[1.0, 2, 3, 4, 5, 6],
                                 [-2.0, -4, -6, -8, -10, -12]], ndmin=2 ).T
    samples_forchunks = np.array( [[1.0, 2, 3, 3, 2, 1],
                                  [-2.0, -4, -6, -6, -4, -2]], ndmin=2 ).T
    chunks = [0, 0, 0, 1, 1, 1]
    chunks_bad = [ 0, 0, 1, 1, 1, 0]
    target_whole = np.array( [[-3.0, -2, -1, 1, 2, 3],
                             [-6, -4, -2,  2, 4, 6]], ndmin=2 ).T
    target_chunked = np.array( [[-1.0, 0, 1, 1, 0, -1],
                               [2, 0, -2, -2, 0, 2]], ndmin=2 ).T


    ds = Dataset(samples_forwhole)

    # this one will auto-train the mapper on first use
    dm = PolyDetrendMapper(polyord=1, space='police')
    mds = dm.forward(ds)
    # features are linear trends, so detrending should remove all
    assert_array_almost_equal(mds.samples, np.zeros(mds.shape))
    # we get the information where each sample is assumed to be in the
    # space spanned by the polynomials
    assert_array_equal(mds.sa.police, np.arange(len(ds)))

    # hackish way to get the previous regressors into a dataset
    ds.sa['opt_reg_const'] = dm._regs[:,0]
    ds.sa['opt_reg_lin'] = dm._regs[:,1]
    # using these precomputed regressors, we should get the same result as
    # before even if we do not generate a regressor for linear
    dm_optreg = PolyDetrendMapper(polyord=0,
                                  opt_regs=['opt_reg_const', 'opt_reg_lin'])
    mds_optreg = dm_optreg.forward(ds)
    assert_array_almost_equal(mds_optreg, np.zeros(mds.shape))


    ds = Dataset(samples_forchunks)
    # 'constant' detrending removes the mean
    mds = PolyDetrendMapper(polyord=0).forward(ds)
    assert_array_almost_equal(
            mds.samples,
            samples_forchunks - np.mean(samples_forchunks, axis=0))
    # if there is no GLOBAL linear trend it should be identical to mean removal
    # even if trying to remove linear
    mds2 = PolyDetrendMapper(polyord=1).forward(ds)
    assert_array_almost_equal(mds, mds2)

    # chunk-wise detrending
    ds = dataset_wizard(samples_forchunks, chunks=chunks)
    dm = PolyDetrendMapper(chunks_attr='chunks', polyord=1, space='police')
    mds = dm.forward(ds)
    # features are chunkswise linear trends, so detrending should remove all
    assert_array_almost_equal(mds.samples, np.zeros(mds.shape))
    # we get the information where each sample is assumed to be in the
    # space spanned by the polynomials, which is the identical linspace in both
    # chunks
    assert_array_equal(mds.sa.police, range(3) * 2)
    # non-matching number of samples cannot be mapped
    assert_raises(ValueError, dm.forward, ds[:-1])
    # however, if the dataset knows about the space it is possible
    ds.sa['police'] = mds.sa.police
    # XXX this should be
    #mds2 = dm(ds[1:-1])
    #assert_array_equal(mds[1:-1], mds2)
    # XXX but right now is
    assert_raises(NotImplementedError, dm.forward, ds[1:-1])

    # Detrend must preserve the size of dataset
    assert_equal(mds.shape, ds.shape)

    # small additional test for break points
    # although they are no longer there
    ds = dataset_wizard(np.array([[1.0, 2, 3, 1, 2, 3]], ndmin=2).T,
                 targets=chunks, chunks=chunks)
    mds = PolyDetrendMapper(chunks_attr='chunks', polyord=1).forward(ds)
    assert_array_almost_equal(mds.samples, np.zeros(mds.shape))

    # test of different polyord on each chunk
    target_mixed = np.array( [[-1.0, 0, 1, 0, 0, 0],
                             [2.0, 0, -2, 0, 0, 0]], ndmin=2 ).T
    ds = dataset_wizard(samples_forchunks.copy(), targets=chunks, chunks=chunks)
    mds = PolyDetrendMapper(chunks_attr='chunks', polyord=[0,1]).forward(ds)
    assert_array_almost_equal(mds, target_mixed)

    # test irregluar spacing of samples, but with corrective time info
    samples_forwhole = np.array( [[1.0, 4, 6, 8, 2, 9],
                                 [-2.0, -8, -12, -16, -4, -18]], ndmin=2 ).T
    ds = Dataset(samples_forwhole, sa={'time': samples_forwhole[:,0]})
    # linear detrending that makes use of temporal info from dataset
    dm = PolyDetrendMapper(polyord=1, space='time')
    mds = dm.forward(ds)
    assert_array_almost_equal(mds.samples, np.zeros(mds.shape))

    # and now the same stuff, but with chunking and ordered by time
    samples_forchunks = np.array( [[1.0, 3, 3, 2, 2, 1],
                                  [-2.0, -6, -6, -4, -4, -2]], ndmin=2 ).T
    chunks = [0, 1, 0, 1, 0, 1]
    time = [4, 4, 12, 8, 8, 12]
    ds = Dataset(samples_forchunks.copy(), sa={'chunks': chunks, 'time': time})
    mds = PolyDetrendMapper(chunks_attr='chunks', polyord=1, space='time').forward(ds)

    # the whole thing must not affect the source data
    assert_array_equal(ds, samples_forchunks)
    # but if done inplace that is no longer true
    poly_detrend(ds, chunks_attr='chunks', polyord=1, space='time')
    assert_array_equal(ds, mds)