def test_query_engine():
    data = np.arange(54)
    # indices in 3D
    ind = np.transpose((np.ones((3, 3, 3)).nonzero()))
    # sphere generator for 3 elements diameter
    sphere = ne.Sphere(1)
    # dataset with just one "space"
    ds = Dataset([data, data], fa={'s_ind': np.concatenate((ind, ind))})
    # and the query engine attaching the generator to the "index-space"
    qe = ne.IndexQueryEngine(s_ind=sphere)
    # cannot train since the engine does not know about the second space
    assert_raises(ValueError, qe.train, ds)
    # now do it again with a full spec
    ds = Dataset([data, data],
                 fa={
                     's_ind': np.concatenate((ind, ind)),
                     't_ind': np.repeat([0, 1], 27)
                 })
    qe = ne.IndexQueryEngine(s_ind=sphere, t_ind=None)
    qe.train(ds)
    # internal representation check
    # YOH: invalid for new implementation with lookup tables (dictionaries)
    #assert_array_equal(qe._searcharray,
    #                   np.arange(54).reshape(qe._searcharray.shape) + 1)
    # should give us one corner, collapsing the 't_ind'
    assert_array_equal(qe(s_ind=(0, 0, 0)), [0, 1, 3, 9, 27, 28, 30, 36])
    # directly specifying an index for 't_ind' without having an ROI
    # generator, should give the same corner, but just once
    assert_array_equal(qe(s_ind=(0, 0, 0), t_ind=0), [0, 1, 3, 9])
    # just out of the mask -- no match
    assert_array_equal(qe(s_ind=(3, 3, 3)), [])
    # also out of the mask -- but single match
    assert_array_equal(qe(s_ind=(2, 2, 3), t_ind=1), [53])
    # query by id
    assert_array_equal(qe(s_ind=(0, 0, 0), t_ind=0), qe[0])
    assert_array_equal(qe(s_ind=(0, 0, 0), t_ind=[0, 1]), qe(s_ind=(0, 0, 0)))
    # should not fail if t_ind is outside
    assert_array_equal(qe(s_ind=(0, 0, 0), t_ind=[0, 1, 10]),
                       qe(s_ind=(0, 0, 0)))

    # should fail if asked about some unknown thing
    assert_raises(ValueError, qe.__call__, s_ind=(0, 0, 0), buga=0)

    # Test by using some literal feature attribute
    ds.fa['lit'] = ['roi1', 'ro2', 'r3'] * 18
    # should work as well as before
    assert_array_equal(qe(s_ind=(0, 0, 0)), [0, 1, 3, 9, 27, 28, 30, 36])
    # should fail if asked about some unknown (yet) thing
    assert_raises(ValueError, qe.__call__, s_ind=(0, 0, 0), lit='roi1')

    # Create qe which can query literals as well
    qe_lit = ne.IndexQueryEngine(s_ind=sphere, t_ind=None, lit=None)
    qe_lit.train(ds)
    # should work as well as before
    assert_array_equal(qe_lit(s_ind=(0, 0, 0)), [0, 1, 3, 9, 27, 28, 30, 36])
    # and subselect nicely -- only the 'roi1' (every 3rd) ones
    assert_array_equal(qe_lit(s_ind=(0, 0, 0), lit='roi1'),
                       [0, 3, 9, 27, 30, 36])
    assert_array_equal(qe_lit(s_ind=(0, 0, 0), lit=['roi1', 'ro2']),
                       [0, 1, 3, 9, 27, 28, 30, 36])
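
For readers wondering where the ids [0, 1, 3, 9, 27, 28, 30, 36] come from, the selection can be reproduced with plain NumPy. This is only a sketch and assumes that Sphere(1) selects the Euclidean radius-1 neighborhood clipped to the grid:

import numpy as np

# same feature attributes as in the test above
ind = np.transpose(np.ones((3, 3, 3)).nonzero())   # 27 grid coordinates
s_ind = np.concatenate((ind, ind))                 # 54 x 3
# radius-1 neighbors of (0, 0, 0) still inside the grid (assumed Sphere(1) behavior)
dist = np.sqrt((ind ** 2).sum(axis=1))
roi = ind[dist <= 1]                               # (0,0,0), (0,0,1), (0,1,0), (1,0,0)
# feature ids whose 's_ind' falls into the ROI, for both values of 't_ind'
ids = [i for i, c in enumerate(s_ind)
       if any((c == r).all() for r in roi)]
print(ids)                                         # [0, 1, 3, 9, 27, 28, 30, 36]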
Example #2
    def _call(self, dataset):
        # This code is based on SciPy's stats.f_oneway()
        # Copyright (c) Gary Strangman.  All rights reserved
        # License: BSD
        #
        # However, it got tweaked and optimized to better fit into PyMVPA.

        # number of groups
        targets_sa = dataset.sa[self._targets_attr]
        labels = targets_sa.value
        ul = targets_sa.unique

        na = len(ul)
        bign = float(dataset.nsamples)
        alldata = dataset.samples

        # total squares of sums
        sostot = np.sum(alldata, axis=0)
        sostot *= sostot
        sostot /= bign

        # total sum of squares
        sstot = np.sum(alldata * alldata, axis=0) - sostot

        # between group sum of squares
        ssbn = 0
        for l in ul:
            # all samples for the respective label
            d = alldata[labels == l]
            sos = np.sum(d, axis=0)
            sos *= sos
            ssbn += sos / float(len(d))

        ssbn -= sostot
        # within
        sswn = sstot - ssbn

        # degrees of freedom
        dfbn = na - 1
        dfwn = bign - na

        # mean sums of squares
        msb = ssbn / float(dfbn)
        msw = sswn / float(dfwn)
        f = msb / msw
        # assure there are no NaNs -- otherwise, instead of a sane
        # unittest failure (a check for NaNs), it leads to a crazy
        #   File "mtrand.pyx", line 1661, in mtrand.shuffle
        #  TypeError: object of type 'numpy.int64' has no len()
        # without any sane backtrace
        f[np.isnan(f)] = 0

        if externals.exists('scipy'):
            from scipy.stats import fprob
            return Dataset(f[np.newaxis], fa={'fprob': fprob(dfbn, dfwn, f)})
        else:
            return Dataset(f[np.newaxis])
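
As a sanity check, the vectorized computation above reproduces scipy.stats.f_oneway for a single feature; a minimal sketch with toy numbers (scipy assumed to be available):

import numpy as np
from scipy.stats import f_oneway

a = np.array([1.0, 2.0, 3.0, 4.0])     # group 1
b = np.array([2.0, 4.0, 6.0, 8.0])     # group 2
f_ref, p_ref = f_oneway(a, b)

# the same quantities, computed as in _call() above
alldata = np.concatenate((a, b))
bign = float(len(alldata))
sostot = alldata.sum() ** 2 / bign
sstot = np.sum(alldata * alldata) - sostot
ssbn = a.sum() ** 2 / len(a) + b.sum() ** 2 / len(b) - sostot
sswn = sstot - ssbn
f = (ssbn / (2 - 1)) / (sswn / (bign - 2))
assert np.allclose(f, f_ref)           # both give F = 3.0 here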
Example #3
    def _call(self, dataset):
        # just for the beauty of it
        X = self._design

        # precompute the transformation if it is not done yet
        if self._inv_design is None:
            self._inv_ip = (X.T * X).I
            self._inv_design = self._inv_ip * X.T

        # get parameter estimations for all features at once
        # (betas x features)
        betas = self._inv_design * dataset.samples

        # charge state
        self.ca.pe = pe = betas.T.A

        # if betas and no z-stats are desired return them right away
        if self._voi != 'pe' or self.ca.is_enabled('zstat'):
            # compute residuals
            residuals = X * betas
            residuals -= dataset.samples

            # estimate the parameter variance and compute zstats,
            # assuming mean(E) == 0 and equal variance
            # XXX next lines ignore off-diagonal elements and hence covariance
            # between regressors. The humble being writing these lines asks the
            # god of statistics for forgiveness, because it knows not what it does
            diag_ip = np.diag(self._inv_ip)
            # (features x betas)
            beta_vars = np.array([ r.var() * diag_ip for r in residuals.T ])
            # (parameter x feature)
            zstat = pe / np.sqrt(beta_vars)

            # charge state
            self.ca.zstat = zstat

        if self._voi == 'pe':
            # return as (beta x feature)
            result = Dataset(pe.T)
        elif self._voi == 'zstat':
            # return as (zstat x feature)
            result = Dataset(zstat.T)
        else:
            # we shall never get to this point
            raise ValueError, \
                  "Unknown variable of interest '%s'" % str(self._voi)
        result.sa['regressor'] = np.arange(len(result))
        return result
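
The parameter estimates above are plain ordinary least squares, so the same betas can be obtained with NumPy's pseudo-inverse; a minimal sketch with a made-up design and noise-free data:

import numpy as np

X = np.column_stack((np.arange(10.0), np.ones(10)))   # regressor + constant
Y = 2.0 * X[:, :1] + 5.0 + np.zeros((10, 3))           # three identical noise-free features

betas = np.linalg.pinv(X).dot(Y)                       # (betas x features), same as (X'X)^-1 X' Y
assert np.allclose(betas[0], 2.0)                      # slope
assert np.allclose(betas[1], 5.0)                      # intercept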
Example #4
def zscore(ds, **kwargs):
    """In-place Z-scoring of a `Dataset` or `ndarray`.

    This function behaves identical to `ZScoreMapper`. The only difference is
    that the actual Z-scoring is done in-place -- potentially causing a
    significant reduction of memory demands.

    Parameters
    ----------
    ds : Dataset or ndarray
      The data that will be Z-scored in-place.
    **kwargs
      For all other arguments, please see the documentation of `ZScoreMapper`.
    """
    zm = ZScoreMapper(**kwargs)
    zm._secret_inplace_zscore = True
    # train
    if isinstance(ds, Dataset):
        zm.train(ds)
    else:
        zm.train(Dataset(ds))
    # map
    mapped = zm(ds)
    # and append the mapper to the dataset
    if isinstance(mapped, Dataset):
        mapped._append_mapper(zm)
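
For readers unfamiliar with the transform itself: Z-scoring subtracts each feature's mean and divides by its standard deviation. A NumPy-only sketch of what the in-place variant amounts to (ignoring ZScoreMapper's extra options, e.g. per-chunk parameter estimation):

import numpy as np

samples = np.random.randn(20, 5) * 3.0 + 10.0
samples -= samples.mean(axis=0)      # remove the per-feature mean
samples /= samples.std(axis=0)       # scale to unit standard deviation
assert np.allclose(samples.mean(axis=0), 0)
assert np.allclose(samples.std(axis=0), 1)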
Example #5
    def test_slicing(self):
        spl = HalfSplitter()
        splits = [(train, test) for (train, test) in spl(self.data)]
        for s in splits:
            # we get slicing all the time
            assert_true(s[0].samples.base is self.data.samples)
            assert_true(s[1].samples.base is self.data.samples)
        spl = HalfSplitter(noslicing=True)
        splits = [(train, test) for (train, test) in spl(self.data)]
        for s in splits:
            # we get no slicing at all
            assert_false(s[0].samples.base is self.data.samples)
            assert_false(s[1].samples.base is self.data.samples)
        spl = NFoldSplitter()
        splits = [(train, test) for (train, test) in spl(self.data)]
        for i, s in enumerate(splits):
            # the training part is a slice (view) only for the first and last split
            if i == 0 or i == len(splits) - 1:
                assert_true(s[0].samples.base is self.data.samples)
            else:
                assert_false(s[0].samples.base is self.data.samples)
            # we get slicing all the time
            assert_true(s[1].samples.base is self.data.samples)
        step_ds = Dataset(np.random.randn(20, 2),
                          sa={'chunks': np.tile([0, 1], 10)})
        spl = OddEvenSplitter()
        splits = [(train, test) for (train, test) in spl(step_ds)]
        assert_equal(len(splits), 2)
        for s in splits:
            # we get slicing all the time
            assert_true(s[0].samples.base is step_ds.samples)
            assert_true(s[1].samples.base is step_ds.samples)
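
The .base checks above rest on a general NumPy property: indexing with a slice returns a view whose .base is the original array, while fancy indexing returns an independent copy. A minimal sketch of the distinction being tested:

import numpy as np

samples = np.random.randn(10, 2)        # owns its data
view = samples[0:5]                     # slicing yields a view into the same memory
copy = samples[[0, 1, 2, 3, 4]]         # fancy indexing yields an independent copy
assert view.base is samples
assert copy.base is None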
Example #6
    def _call(self, dataset):
        """Computes featurewise f-scores using compound comparisons."""

        targets_sa = dataset.sa[self._targets_attr]
        orig_labels = targets_sa.value
        labels = orig_labels.copy()

        # Let's create a very shallow copy of the dataset with just
        # samples and targets_attr
        dataset_mod = Dataset(dataset.samples, sa={self._targets_attr: labels})
        results = []
        for ul in targets_sa.unique:
            labels[orig_labels == ul] = 1
            labels[orig_labels != ul] = 2
            f_ds = OneWayAnova._call(self, dataset_mod)
            if 'fprob' in f_ds.fa:
                # rename the fprob attribute to something label specific
                # to survive final aggregation stage
                f_ds.fa['fprob_' + str(ul)] = f_ds.fa.fprob
                del f_ds.fa['fprob']
            results.append(f_ds)

        results = vstack(results)
        results.sa[self._targets_attr] = targets_sa.unique
        return results
Example #7
    def _call(self, dataset):
        # XXX Hm... it might make sense to unify access functions
        # naming across our swig libsvm wrapper and sg access
        # functions for svm
        clf = self.clf
        sgsvm = clf.svm
        sens_labels = None
        if isinstance(sgsvm, shogun.Classifier.MultiClassSVM):
            sens, biases = [], []
            nsvms = sgsvm.get_num_svms()
            clabels = sorted(clf._attrmap.values())
            nclabels = len(clabels)
            sens_labels = []
            isvm = 0  # index for svm among known

            for i in xrange(nclabels):
                for j in xrange(i + 1, nclabels):
                    sgsvmi = sgsvm.get_svm(isvm)
                    labels_tuple = (clabels[i], clabels[j])
                    # Since we gave the labels in incremental order,
                    # we should always be right -- but it does not
                    # hurt to check that the set of labels is the same
                    if __debug__ and _shogun_exposes_slavesvm_labels:
                        if not sgsvmi.get_labels():
                            # We need to call classify() so labels get assigned
                            # to the multiclass SVM
                            sgsvm.classify()
                        assert (set([
                            sgsvmi.get_label(int(x))
                            for x in sgsvmi.get_support_vectors()
                        ]) == set(labels_tuple))
                    sens1, bias = self.__sg_helper(sgsvmi)
                    sens.append(sens1)
                    biases.append(bias)
                    sens_labels += [labels_tuple[::-1]]  # ??? positive first
                    isvm += 1
            assert (len(sens) == nsvms)  # we should have covered all
        else:
            sens1, bias = self.__sg_helper(sgsvm)
            biases = np.atleast_1d(bias)
            sens = np.atleast_2d(sens1)
            if not clf.__is_regression__:
                assert (set(clf._attrmap.values()) == set([-1.0, 1.0]))
                assert (sens.shape[0] == 1)
                sens_labels = [(-1.0, 1.0)]

        ds = Dataset(np.atleast_2d(sens))
        if sens_labels is not None:
            if isinstance(sens_labels[0], tuple):
                # Need to have them in array of dtype object
                sens_labels = asobjarray(sens_labels)

            if len(clf._attrmap):
                sens_labels = clf._attrmap.to_literal(sens_labels,
                                                      recurse=True)
            ds.sa[clf.params.targets_attr] = sens_labels
        self.ca.biases = biases

        return ds
Example #8
def test_llemapper():
    skip_if_no_external('mdp', min_version='2.4')

    ds = Dataset(
        np.array([[0., 0., 0.], [0., 0., 1.], [0., 1., 0.], [1., 0., 0.],
                  [0., 1., 1.], [1., 0., 1.], [1., 1., 0.], [1., 1., 1.]]))
    pm = LLEMapper(3, output_dim=2)
    pm.train(ds)
    fmapped = pm(ds)
    assert_equal(fmapped.shape, (8, 2))
Example #9
    def _call(self, dataset=None):
        """Extract weights from GLMNET classifier.

        GLMNET always has weights available, so nothing has to be computed here.
        """
        clf = self.clf
        weights = clf.weights

        if __debug__:
            debug('GLMNET',
                  "Extracting weights for GLMNET - "+
                  "Result: min=%f max=%f" %\
                  (np.min(weights), np.max(weights)))

        #return weights
        if clf.params.family == 'multinomial':
            return Dataset(weights.T, sa={clf.params.targets_attr: clf._utargets})
        else:
            return Dataset(weights[np.newaxis])
Example #10
    def _call(self, dataset):
        # first cast to floating point dtype, because noise is most likely
        # floating point as well and '+=' on int would not do the right thing
        if not np.issubdtype(dataset.samples.dtype, np.float):
            ds = dataset.copy(deep=False)
            ds.samples = dataset.samples.astype('float32')
            dataset = ds

        if __debug__:
            nfeatures = dataset.nfeatures

        # using a list here, to be able to handle output of unknown
        # dimensionality
        sens_map = []

        # compute the datameasure on the original dataset
        # this is used as a baseline
        orig_measure = self.__datameasure(dataset)

        # do for every _single_ feature in the dataset
        for feature in xrange(dataset.nfeatures):
            if __debug__:
                debug('PSA', "Analyzing %i features: %i [%i%%]" \
                    % (nfeatures,
                       feature+1,
                       float(feature+1)/nfeatures*100,), cr=True)

            # store current feature to restore it later on
            current_feature = dataset.samples[:, feature].copy()

            # add noise to current feature
            dataset.samples[:, feature] += self.__noise(size=len(dataset))

            # compute the datameasure on the perturbed dataset
            perturbed_measure = self.__datameasure(dataset)

            # restore the current feature
            dataset.samples[:, feature] = current_feature

            # difference from original datameasure is sensitivity
            sens_map.append(perturbed_measure.samples - orig_measure.samples)

        if __debug__:
            debug('PSA', '')

        # turn into an array and get rid of unnecessary axes -- ideally yielding
        # 2D array
        sens_map = np.array(sens_map).squeeze()
        # swap the first two axes: we have nfeatures on the first but want
        # it as the second axis in a dataset
        sens_map = np.swapaxes(sens_map, 0, 1)
        return Dataset(sens_map)
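
The perturbation idea is easy to reproduce outside PyMVPA: add noise to one feature at a time and record how much some measure changes relative to a baseline. A NumPy-only sketch with a made-up scalar measure (summed absolute feature/target correlations):

import numpy as np

rng = np.random.RandomState(0)
samples = rng.randn(50, 4)
target = samples[:, 0] + 0.1 * rng.randn(50)        # only feature 0 carries signal

def measure(data):
    # made-up measure: sum of absolute feature/target correlations
    return sum(abs(np.corrcoef(data[:, f], target)[0, 1])
               for f in range(data.shape[1]))

baseline = measure(samples)
sens = []
for f in range(samples.shape[1]):
    saved = samples[:, f].copy()
    samples[:, f] += rng.randn(len(samples))        # perturb a single feature
    sens.append(measure(samples) - baseline)
    samples[:, f] = saved                           # restore it, as _call() does above
print(sens)    # the informative feature 0 typically shows the largest change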
Example #11
    def _predict(self, data):
        """Predict the class labels for the provided data.

        Returns a list of class labels (one for each data sample).
        """
        # make sure we're talking about arrays
        data = np.asarray(data)

        # checks only in debug mode
        if __debug__:
            if not data.ndim == 2:
                raise ValueError, "Data array must be two-dimensional."

            if not data.shape[1] == self.__data.nfeatures:
                raise ValueError, "Length of data samples (features) does " \
                                  "not match the classifier."

        # compute the distance matrix between training and test data with
        # distances stored row-wise, i.e. distances between test sample [0]
        # and all training samples will end up in row 0
        dists = self.__dfx(self.__data.samples, data).T
        if self.ca.is_enabled('distances'):
            # TODO: theoretically we should have used deepcopy for sa
            #       here
            self.ca.distances = Dataset(dists, fa=self.__data.sa.copy())

        # determine the k nearest neighbors per test sample
        knns = dists.argsort(axis=1)[:, :self.__k]

        # predicted class labels will go here
        predicted = []

        if self.__voting == 'majority':
            vfx = self.get_majority_vote
        elif self.__voting == 'weighted':
            vfx = self.get_weighted_vote
        else:
            raise ValueError, "kNN told to perform unknown voting '%s'." \
                  % self.__voting

        # perform voting
        results = [vfx(knn) for knn in knns]

        # extract predictions
        predicted = [r[0] for r in results]

        # store the predictions in the state. Relies on State._setitem to do
        # nothing if the relevant state member is not enabled
        self.ca.predictions = predicted
        self.ca.estimates = np.array([r[1] for r in results])

        return predicted
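
Stripped of the conditional-attribute handling, the prediction step above boils down to 'distance matrix, argsort, vote'. A minimal NumPy sketch with Euclidean distances and majority voting on toy data:

import numpy as np

train = np.array([[0.0, 0.0], [0.1, 0.2], [5.0, 5.0], [5.2, 4.9]])
train_labels = np.array([0, 0, 1, 1])
test = np.array([[0.2, 0.1], [4.8, 5.1]])
k = 2

# row i holds the distances from test sample i to all training samples
dists = np.sqrt(((test[:, np.newaxis, :] - train[np.newaxis, :, :]) ** 2).sum(axis=2))
knns = dists.argsort(axis=1)[:, :k]

# majority vote among the k nearest training labels
predicted = [np.bincount(train_labels[row]).argmax() for row in knns]
print(predicted)    # [0, 1]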
Example #12
def test_fxmapper():
    origdata = np.arange(24).reshape(3,8)
    ds = Dataset(origdata.copy())
    ds.samples *= -1

    # test a mapper that doesn't change the shape
    # it shouldn't matter along which axis it is applied
    m_s = FxMapper('samples', np.absolute)
    m_f = FxMapper('features', np.absolute)
    a_m = absolute_features()
    assert_array_equal(m_s.forward(ds), origdata)
    assert_array_equal(a_m.forward(ds), origdata)
    assert_array_equal(m_s.forward(ds), m_f.forward(ds))
Example #13
def aggregate_features(dataset, fx=np.mean):
    """Apply a function to each row of the samples matrix of a dataset.

    The functor given as `fx` has to honour an `axis` keyword argument in the
    way NumPy uses it (e.g. np.mean, np.var).

    Returns
    -------
     a new `Dataset` object with the aggregated feature(s).
    """
    agg = fx(dataset.samples, axis=1)

    return Dataset(samples=np.array(agg, ndmin=2).T, sa=dataset.sa)
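
For illustration, the same aggregation on a bare array: applying the functor along axis 1 collapses all features of each sample into a single value.

import numpy as np

samples = np.arange(12).reshape(3, 4)
print(np.mean(samples, axis=1))    # [ 1.5  5.5  9.5]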
Example #14
def test_icamapper():
    # data: 40 sample feature line in 2d space (40x2; samples x features)
    samples = np.vstack([np.arange(40.) for i in range(2)]).T
    samples -= samples.mean()
    samples += np.random.normal(size=samples.shape, scale=0.1)
    ndlin = Dataset(samples)

    pm = ICAMapper()
    pm.train(ndlin.copy())
    assert_equal(pm.proj.shape, (2, 2))

    p = pm.forward(ndlin.copy())
    assert_equal(p.shape, (40, 2))
    # check that the mapped data can be fully recovered by 'reverse()'
    assert_array_almost_equal(pm.reverse(p), ndlin)
Example #15
    def _call(self, dataset=None):
        """Extract weights from LARS classifier.

        LARS always has weights available, so nothing has to be computed here.
        """
        clf = self.clf
        weights = clf.weights

        if __debug__:
            debug('LARS',
                  "Extracting weights for LARS - "+
                  "Result: min=%f max=%f" %\
                  (np.min(weights), np.max(weights)))

        return Dataset(np.atleast_2d(weights))
Example #16
def test_featuregroup_mapper():
    ds = Dataset(np.arange(24).reshape(3,8))
    ds.fa['roi'] = [0, 1] * 4
    # just to check
    ds.sa['chunks'] = np.arange(3)

    # correct results
    csamples = [[3, 4], [11, 12], [19, 20]]
    croi = [0, 1]
    cchunks = np.arange(3)

    m = mean_group_feature(['roi'])
    mds = m.forward(ds)
    assert_equal(mds.shape, (3, 2))
    assert_array_equal(mds.samples, csamples)
    assert_array_equal(mds.fa.roi, np.unique([0, 1] * 4))
    # SAs should simply remain the same
    assert_array_equal(mds.sa.chunks, np.arange(3))
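
The expected samples [[3, 4], [11, 12], [19, 20]] follow directly from averaging the even-indexed and the odd-indexed columns; a quick NumPy check that does not use mean_group_feature:

import numpy as np

samples = np.arange(24).reshape(3, 8)
roi = np.array([0, 1] * 4)
grouped = np.column_stack([samples[:, roi == r].mean(axis=1)
                           for r in np.unique(roi)])
print(grouped)    # [[ 3.  4.] [11. 12.] [19. 20.]]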
Example #17
    def test_anova(self):
        """Additional aspects of OnewayAnova
        """
        oa = OneWayAnova()
        oa_custom = OneWayAnova(targets_attr='custom')

        ds = datasets['uni4large']
        ds_custom = Dataset(ds.samples, sa={'custom': ds.targets})

        r = oa(ds)
        self.failUnlessRaises(KeyError, oa_custom, ds)
        r_custom = oa_custom(ds_custom)

        self.failUnless(np.allclose(r.samples, r_custom.samples))

        # we should get the same results on subsequent runs
        r2 = oa(ds)
        r_custom2 = oa_custom(ds_custom)
        self.failUnless(np.allclose(r.samples, r2.samples))
        self.failUnless(np.allclose(r_custom.samples, r_custom2.samples))
Example #18
def test_pcamapper():
    # data: 40 sample feature line in 20d space (40x20; samples x features)
    ndlin = Dataset(
        np.concatenate([np.arange(40) for i in range(20)]).reshape(20, -1).T)

    pm = PCAMapper()
    # train PCA
    assert_raises(mdp.NodeException, pm.train, ndlin)
    ndlin.samples = ndlin.samples.astype('float')
    ndlin_noise = ndlin.copy()
    ndlin_noise.samples += np.random.random(size=ndlin.samples.shape)
    # we have no variance for more than one PCA component, hence just one
    # actual non-zero eigenvalue
    assert_raises(mdp.NodeException, pm.train, ndlin)
    pm.train(ndlin_noise)
    assert_equal(pm.proj.shape, (20, 20))
    # now project data into PCA space
    p = pm.forward(ndlin.samples)
    assert_equal(p.shape, (40, 20))
    # check that the mapped data can be fully recovered by 'reverse()'
    assert_array_almost_equal(pm.reverse(p), ndlin)
Example #19
    def _call(self, dataset=None):
        """Extract weights from SMLR classifier.

        SMLR always has weights available, so nothing has to be computed here.
        """
        clf = self.clf
        # transpose to have the number of features on the second axis
        # (as usual)
        weights = clf.weights.T

        if clf.params.has_bias:
            self.ca.biases = clf.biases

        if __debug__:
            debug('SMLR',
                  "Extracting weights for %d-class SMLR" %
                  (len(weights) + 1) +
                  "Result: min=%f max=%f" %\
                  (np.min(weights), np.max(weights)))

        # limit the labels to the number of sensitivity sets, to deal
        # with the case of `fit_all_weights=False`
        return Dataset(
            weights, sa={clf.params.targets_attr: clf._ulabels[:len(weights)]})
Example #20
def test_flatten():
    samples_shape = (2, 2, 4)
    data_shape = (4, ) + samples_shape
    data = np.arange(np.prod(data_shape)).reshape(data_shape).view(myarray)
    pristinedata = data.copy()
    target = [[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15],
              [16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31],
              [32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47],
              [48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63]]
    target = np.array(target).view(myarray)
    index_target = np.array([[0, 0, 0], [0, 0, 1], [0, 0, 2], [0, 0, 3],
                             [0, 1, 0], [0, 1, 1], [0, 1, 2], [0, 1, 3],
                             [1, 0, 0], [1, 0, 1], [1, 0, 2], [1, 0, 3],
                             [1, 1, 0], [1, 1, 1], [1, 1, 2], [1, 1, 3]])

    # array subclass survives
    ok_(isinstance(data, myarray))

    # actually, there should be no difference between a plain FlattenMapper and
    # a chain that has a FlattenMapper as its only element
    for fm in [
            FlattenMapper(inspace='voxel'),
            ChainMapper([
                FlattenMapper(inspace='voxel'),
                FeatureSliceMapper(slice(None))
            ])
    ]:
        # not working if untrained
        assert_raises(RuntimeError, fm.forward1,
                      np.arange(np.sum(samples_shape) + 1))

        fm.train(data)

        ok_(isinstance(fm.forward(data), myarray))
        ok_(isinstance(fm.forward1(data[2]), myarray))
        assert_array_equal(fm.forward(data), target)
        assert_array_equal(fm.forward1(data[2]), target[2])
        assert_raises(ValueError, fm.forward, np.arange(4))

        # all of that leaves the data unmodified
        assert_array_equal(data, pristinedata)

        # reverse mapping
        ok_(isinstance(fm.reverse(target), myarray))
        ok_(isinstance(fm.reverse1(target[0]), myarray))
        ok_(isinstance(fm.reverse(target[1:2]), myarray))
        assert_array_equal(fm.reverse(target), data)
        assert_array_equal(fm.reverse1(target[0]), data[0])
        assert_array_equal(fm.reverse(target[1:2]), data[1:2])
        assert_raises(ValueError, fm.reverse, np.arange(14))

        # check one dimensional data, treated as scalar samples
        oned = np.arange(5)
        fm.train(Dataset(oned))
        # needs 2D
        assert_raises(ValueError, fm.forward, oned)
        # doesn't match mapper, since Dataset turns `oned` into (5,1)
        assert_raises(ValueError, fm.forward, oned)
        assert_equal(Dataset(oned).nfeatures, 1)

        # try dataset mode, with some feature attribute
        fattr = np.arange(np.prod(samples_shape)).reshape(samples_shape)
        ds = Dataset(data, fa={'awesome': fattr.copy()})
        assert_equal(ds.samples.shape, data_shape)
        fm.train(ds)
        dsflat = fm.forward(ds)
        ok_(isinstance(dsflat, Dataset))
        ok_(isinstance(dsflat.samples, myarray))
        assert_array_equal(dsflat.samples, target)
        assert_array_equal(dsflat.fa.awesome,
                           np.arange(np.prod(samples_shape)))
        assert_true(isinstance(dsflat.fa['awesome'], ArrayCollectable))
        # test index creation
        assert_array_equal(index_target, dsflat.fa.voxel)

        # and back
        revds = fm.reverse(dsflat)
        ok_(isinstance(revds, Dataset))
        ok_(isinstance(revds.samples, myarray))
        assert_array_equal(revds.samples, data)
        assert_array_equal(revds.fa.awesome, fattr)
        assert_true(isinstance(revds.fa['awesome'], ArrayCollectable))
        assert_false('voxel' in revds.fa)
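
At its core the mapper is bookkeeping around reshape: flattening collapses the per-sample shape into a single feature axis, and the 'voxel' attribute is just the multi-index of every feature. A NumPy sketch of both directions:

import numpy as np

samples_shape = (2, 2, 4)
data = np.arange(4 * np.prod(samples_shape)).reshape((4,) + samples_shape)

flat = data.reshape(len(data), -1)      # forward: (4, 16)
back = flat.reshape(data.shape)         # reverse: original shape again
assert np.all(back == data)

# per-feature multi-indices, equivalent to the 'voxel' attribute checked above
index = np.transpose(np.ones(samples_shape).nonzero())
print(index[:4])    # [[0 0 0] [0 0 1] [0 0 2] [0 0 3]]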
Example #21
    def wrap_samples(obj, data, *args, **kwargs):
        if is_datasetlike(data):
            return fx(obj, data, *args, **kwargs)
        else:
            return fx(obj, Dataset(data), *args, **kwargs)
Example #22
    def _call(self, dataset, callables=[]):
        # local bindings
        clf = self.clf
        model = clf.model

        # Labels for sensitivities to be returned
        sens_labels = None

        if clf.__is_regression__:
            nr_class = None
            svm_labels = None  # shouldn't bother to provide "targets" for regressions
        else:
            nr_class = model.nr_class
            svm_labels = model.labels

        # No need to warn since by default we do not do anything
        # evil and we do provide labels -- so it is up to the user
        # to decide whether to do something silly
        #if nr_class != 2:
        #    warning("You are estimating sensitivity for SVM %s trained on %d" %
        #            (str(clf), nr_class) +
        #            " classes. Make sure that it is what you intended to do" )

        svcoef = np.matrix(model.get_sv_coef())
        svs = np.matrix(model.get_sv())
        rhos = np.asarray(model.get_rho())

        self.ca.biases = rhos
        if self.params.split_weights:
            if nr_class != 2:
                raise NotImplementedError, \
                      "Cannot compute per-class weights for" \
                      " non-binary classification task"
            # libsvm might have different idea on the ordering
            # of labels, so we would need to map them back explicitly
            ds_labels = list(dataset.sa[
                clf.params.targets_attr].unique)  # labels in the dataset
            senses = [None for i in ds_labels]
            # first label is given positive value
            for i, (c, l) in enumerate([(svcoef > 0, lambda x: x),
                                        (svcoef < 0, lambda x: x * -1)]):
                # convert to array, and just take the meaningful dimension
                c_ = c.A[0]
                # NOTE svm_labels are numerical; ds_labels are literal
                senses[ds_labels.index(
                            clf._attrmap.to_literal(svm_labels[i]))] = \
                                (l(svcoef[:, c_] * svs[c_, :])).A[0]
            weights = np.array(senses)
            sens_labels = svm_labels
        else:
            # XXX yoh: .mean() effectively averages across the
            # "sensitivities" of all paired classifiers (I
            # think). See more info on this topic in svm.py on how sv_coefs
            # are stored
            #
            # First multiply SV coefficients with the actual SVs to get
            # weighted impact of SVs on decision, then for each feature
            # take mean across SVs to get a single weight value
            # per feature
            if nr_class is None or nr_class <= 2:
                # as simple as this
                weights = (svcoef * svs).A
                # and only in case of classification
                if nr_class:
                    # ??? The first label seems to correspond to positive
                    sens_labels = [tuple(svm_labels[::-1])]
            else:
                # we need to compose correctly per each pair of classifiers.
                # See docstring for get_sv_coef for more details on internal
                # structure of bloody storage

                # total # of pairs
                npairs = nr_class * (nr_class - 1) / 2
                # # of SVs in each class
                NSVs_perclass = model.get_n_sv()
                # indices where each class starts in each row of SVs
                # name is after similar variable in libsvm internals
                nz_start = np.cumsum([0] + NSVs_perclass[:-1])
                nz_end = nz_start + NSVs_perclass
                # reserve storage
                weights = np.zeros((npairs, svs.shape[1]))
                ipair = 0  # index of the pair
                """
                // classifier (i,j): coefficients with
				// i are in sv_coef[j-1][nz_start[i]...],
				// j are in sv_coef[i][nz_start[j]...]
                """
                sens_labels = []
                for i in xrange(nr_class):
                    for j in xrange(i + 1, nr_class):
                        weights[ipair, :] = np.asarray(
                            svcoef[j - 1, nz_start[i]:nz_end[i]] *
                            svs[nz_start[i]:nz_end[i]] +
                            svcoef[i, nz_start[j]:nz_end[j]] *
                            svs[nz_start[j]:nz_end[j]])
                        # ??? First label corresponds to positive
                        # that is why [j], [i]
                        sens_labels += [(svm_labels[j], svm_labels[i])]
                        ipair += 1  # go to the next pair
                assert (ipair == npairs)

        if __debug__ and 'SVM' in debug.active:
            if nr_class:
                nsvs = model.get_n_sv()
            else:
                nsvs = model.get_total_n_sv()

            debug('SVM',
                  "Extracting weights for %s-class SVM: #SVs=%s, " % \
                  (nr_class, nsvs) + \
                  " SVcoefshape=%s SVs.shape=%s Rhos=%s." % \
                  (svcoef.shape, svs.shape, rhos) + \
                  " Result: min=%f max=%f" % (np.min(weights), np.max(weights)))

        ds_kwargs = {}
        if nr_class:  # for classification only
            # and we should have prepared the labels
            assert (sens_labels is not None)

            if len(clf._attrmap):
                if isinstance(sens_labels[0], tuple):
                    sens_labels = asobjarray(sens_labels)
                sens_labels = clf._attrmap.to_literal(sens_labels,
                                                      recurse=True)

            # NOTE: `weights` is already and always 2D
            ds_kwargs = dict(sa={clf.params.targets_attr: sens_labels})

        weights_ds = Dataset(weights, **ds_kwargs)
        return weights_ds
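
The central quantity extracted here is the primal weight vector of a linear SVM: the support-vector coefficients (alpha_i times y_i) multiplied with the support vectors and summed over SVs. A tiny NumPy sketch with made-up numbers:

import numpy as np

sv_coef = np.array([[0.5, -0.3, -0.2]])   # alpha_i * y_i for the binary case
svs = np.array([[1.0, 0.0],
                [0.0, 1.0],
                [1.0, 1.0]])              # support vectors (SVs x features)
weights = np.dot(sv_coef, svs)            # (1 x features) primal weights
print(weights)                            # [[ 0.3 -0.5]]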
Example #23
    def _call(self, dataset):
        """Train linear SVM on `dataset` and extract weights from classifier.
        """
        sens = self.__mult * (np.arange(dataset.nfeatures) -
                              int(dataset.nfeatures / 2))
        return Dataset(sens[np.newaxis])
Example #24
def fmri_dataset(samples, targets=None, chunks=None, mask=None,
                 sprefix='voxel', tprefix='time', add_fa=None,):
    """Create a dataset from an fMRI timeseries image.

    The timeseries image serves as the samples data, with each volume becoming
    a sample. All 3D volume samples are flattened into one-dimensional feature
    vectors, optionally being masked (i.e. subset of voxels corresponding to
    non-zero elements in a mask image).

    In addition to (optional) samples attributes for targets and chunks the
    returned dataset contains a number of additional attributes:

    Samples attributes (per each volume):

      * volume index (time_indices)
      * volume acquisition time (time_coords)

    Feature attributes (per each voxel):

      * voxel indices (voxel_indices), sometimes referred to as ijk

    Dataset attributes:

      * dump of the NIfTI image header data (imghdr)
      * volume extent (voxel_dim)
      * voxel extent (voxel_eldim)

    The default attribute name is listed in parentheses, but may be altered by
    the corresponding prefix arguments. The validity of the attribute values
    relies on correct settings in the NIfTI image header.

    Parameters
    ----------
    samples : str or NiftiImage or list
      fMRI timeseries, specified either as a filename (single file 4D image),
      an image instance (4D image), or a list of filenames or image instances
      (each list item corresponding to a 3D volume).
    targets : scalar or sequence
      Label attribute for each volume in the timeseries, or a scalar value that
      is assigned to all samples.
    chunks : scalar or sequence
      Chunk attribute for each volume in the timeseries, or a scalar value that
      is assigned to all samples.
    mask : str or NiftiImage
      Filename or image instance of a 3D volume mask. Voxels corresponding to
      non-zero elements in the mask will be selected. The mask has to be in the
      same space (orientation and dimensions) as the timeseries image.
    sprefix : str or None
      Prefix for attribute names describing spatial properties of the
      timeseries. If None, no such attributes are stored in the dataset.
    tprefix : str or None
      Prefix for attribute names describing temporal properties of the
      timeseries. If None, no such attributes are stored in the dataset.
    add_fa : dict or None
      Optional dictionary with additional volumetric data that shall be stored
      as feature attributes in the dataset. The dictionary key serves as the
      feature attribute name. Each value might be of any type supported by the
      'mask' argument of this function.

    Returns
    -------
    Dataset
    """
    # load the samples
    imgdata, imghdr = _load_anyimg(samples, ensure=True, enforce_dim=4)

    # figure out what the mask is, but only handle known cases; the rest
    # goes directly into the mapper, which may know more
    maskimg = _load_anyimg(mask)
    if maskimg is not None:
        # take just the data and ignore the header
        mask = maskimg[0]

    # compile the samples attributes
    sa = {}
    if targets is not None:
        sa['targets'] = _expand_attribute(targets, imgdata.shape[0], 'targets')
    if chunks is not None:
        sa['chunks'] = _expand_attribute(chunks, imgdata.shape[0], 'chunks')

    # create a dataset
    ds = Dataset(imgdata, sa=sa)
    if sprefix is None:
        inspace = None
    else:
        inspace = sprefix + '_indices'
    ds = ds.get_mapped(FlattenMapper(shape=imgdata.shape[1:], inspace=inspace))

    # now apply the mask if any
    if mask is not None:
        flatmask = ds.a.mapper.forward1(mask)
        # direct slicing is possible, and it is potentially more efficient,
        # so let's use it
        #mapper = FeatureSliceMapper(flatmask)
        #ds = ds.get_mapped(FeatureSliceMapper(flatmask))
        ds = ds[:, flatmask != 0]

    # load and store additional feature attributes
    if add_fa is not None:
        for fattr in add_fa:
            value = _load_anyimg(add_fa[fattr], ensure=True)[0]
            ds.fa[fattr] = ds.a.mapper.forward1(value)

    # store interesting props in the dataset
    ds.a['imghdr'] = imghdr
    # If there is a space assigned, store the extent of that space
    if sprefix is not None:
        ds.a[sprefix + '_dim'] = imgdata.shape[1:]
        # 'voxdim' is (x,y,z) while 'samples' are (t,z,y,x)
        ds.a[sprefix + '_eldim'] = _get_voxdim(imghdr)
        # TODO extend with the unit
    if tprefix is not None:
        ds.sa[tprefix + '_indices'] = np.arange(len(ds), dtype='int')
        ds.sa[tprefix + '_coords'] = np.arange(len(ds), dtype='float') \
                                     * _get_dt(imghdr)
        # TODO extend with the unit

    return ds
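
A typical invocation, based only on the signature and attribute names documented above (the file names are hypothetical placeholders):

# file names below are hypothetical placeholders
ds = fmri_dataset(samples='bold.nii.gz',
                  targets='rest', chunks=0,
                  mask='brain_mask.nii.gz')
# ds.nsamples == number of volumes, ds.nfeatures == voxels surviving the mask
# ds.fa.voxel_indices holds the ijk coordinates of every feature
# ds.sa.time_coords holds the acquisition time of every volume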
Example #25
    def _sl_call(self, dataset, roi_ids, nproc):
        """Call to GNBSearchlight
        """
        # Local bindings
        gnb = self._gnb
        params = gnb.params
        splitter = self._splitter
        errorfx = self._errorfx
        qe = self._qe

        ## if False:
        ##     class A(object):
        ##         pass
        ##     self = A()
        ##     import numpy as np
        ##     from mvpa.clfs.gnb import GNB
        ##     from mvpa.datasets.splitters import NFoldSplitter
        ##     from mvpa.misc.errorfx import MeanMismatchErrorFx
        ##     #from mvpa.testing.datasets import datasets
        ##     from mvpa.datasets import Dataset
        ##     from mvpa.misc.neighborhood import IndexQueryEngine, Sphere
        ##     from mvpa.clfs.distance import absmin_distance
        ##     import time
        ##     if __debug__:
        ##         from mvpa.base import debug
        ##         debug.active += ['SLC.*']
        ##         # XXX is it that ugly?
        ##         debug.active.pop(debug.active.index('SLC_'))
        ##         debug.metrics += ['reltime']
        ##     dataset = datasets['3dlarge']
        ##     sphere = Sphere(radius=1,
        ##                     distance_func=absmin_distance)
        ##     qe = IndexQueryEngine(myspace=sphere)

        ##     # Fracisco's data
        ##     dataset = ds_fp
        ##     qe = IndexQueryEngine(voxel_indices=sphere)

        ##     qe.train(dataset)
        ##     roi_ids = np.arange(dataset.nfeatures)
        ##     gnb = GNB()
        ##     params = gnb.params
        ##     splitter = NFoldSplitter()
        ##     errorfx = MeanMismatchErrorFx()

        if __debug__:
            time_start = time.time()

        targets_sa_name = params.targets_attr
        targets_sa = dataset.sa[targets_sa_name]

        if __debug__:
            debug_slc_ = 'SLC_' in debug.active

        # get the dataset information into easy vars
        X = dataset.samples
        if len(X.shape) != 2:
            raise ValueError, \
                  'Unlike GNB, GNBSearchlight (for now) operates on already ' \
                  'flattened datasets'
        labels = targets_sa.value
        ulabels = targets_sa.unique
        nlabels = len(ulabels)
        label2index = dict((l, il) for il, l in enumerate(ulabels))
        labels_numeric = np.array([label2index[l] for l in labels])
        ulabels_numeric = [label2index[l] for l in ulabels]
        # set the feature dimensions
        nsamples = len(X)
        s_shape = X.shape[1:]  # shape of a single sample

        #
        # Everything toward optimization ;)
        #
        # Silly Yarik thinks that it might be worth pre-computing
        # statistics per each feature within a block of the samples
        # which always come together in splits -- most often it is a
        # (chunk, label) combination, but since we simply use a
        # splitter -- who knows! Therefore let's figure out what
        # those blocks are and operate on them instead of the original samples.
        #
        # After additional thinking about this -- probably it would yield
        # just minor additional improvements (i.e. not worth it), but
        # since it is coded already -- let it be so

        # 1. Query splitter for the splits we will have
        if __debug__:
            debug(
                'SLC', 'Phase 1. Initializing splits using %s on %s' %
                (splitter, dataset))
        # check the splitter -- splitcfg isn't sufficient
        # TODO: RF splitters so we could reliably obtain the configuration
        #       splitcfg just returns what to split into the other in terms
        #       of chunks... and we need actual indices
        if splitter.permute_attr is not None:
            raise NotImplementedError, \
                  "Splitters which permute targets aren't supported here"
        # Let's just create a dummy ds which will store the actual sample
        # indices for us
        # XXX we could make it even more lightweight I guess...
        dataset_indices = Dataset(np.arange(nsamples), sa=dataset.sa)
        splits = list(splitter(dataset_indices))
        nsplits = len(splits)
        assert (len(splits[0]) == 2)  # assure that we have only 2
        # splits here for cvte

        # 2. Figure out the new 'chunks x labels' blocks of combinations
        #    of samples
        if __debug__:
            debug(
                'SLC', 'Phase 2. Blocking data for %i splits and %i labels' %
                (nsplits, nlabels))
        # array of indices for label, split1, split2, ...
        # through which we will pass later on to figure out
        # unique combinations
        combinations = np.ones((nsamples, 1 + nsplits), dtype=int) * -1
        # labels
        combinations[:, 0] = labels_numeric
        for isplit, (split1, split2) in enumerate(splits):
            combinations[split1.samples[:, 0], 1 + isplit] = 1
            combinations[split2.samples[:, 0], 1 + isplit] = 2
        # sample descriptions -- should be unique for
        # samples within the same block
        descriptions = [tuple(c) for c in combinations]
        udescriptions = sorted(list(set(descriptions)))
        nblocks = len(udescriptions)
        description2block = dict([(d, i) for i, d in enumerate(udescriptions)])
        # Indices for samples to point to their block
        sample2block = np.array([description2block[d] for d in descriptions])

        # 3. Compute statistics per each block
        #
        if __debug__:
            debug('SLC',
                  'Phase 3. Computing statistics for %i blocks' % (nblocks, ))

        #
        # reusable containers which should stay of the same size
        #

        # sums and sums of squares per each block
        sums = np.zeros((nblocks, ) + s_shape)
        # sums of squares
        sums2 = np.zeros((nblocks, ) + s_shape)

        # per each label:
        means = np.zeros((nlabels, ) + s_shape)
        # means of squares for stddev computation
        means2 = np.zeros((nlabels, ) + s_shape)
        variances = np.zeros((nlabels, ) + s_shape)
        # degenerate dimensions are added for easy broadcasting later on
        nsamples_per_class = np.zeros((nlabels, ) + (1, ) * len(s_shape))

        # results
        results = np.zeros((nsplits, ) + s_shape)

        block_counts = np.zeros((nblocks, ))
        block_labels = [None] * nblocks

        X2 = np.square(X)
        # silly way for now
        for l, s, s2, ib in zip(labels_numeric, X, X2, sample2block):
            sums[ib] += s
            sums2[ib] += s2
            block_counts[ib] += 1
            if block_labels[ib] is None:
                block_labels[ib] = l
            else:
                assert (block_labels[ib] == l)
        block_labels = np.asanyarray(block_labels)
        # additional silly tests for the paranoid
        assert (block_labels.dtype.kind == 'i')

        # 4. Let's deduce all neighbors... might need to be RF'ed into the
        #    parallel part later on
        nrois = len(roi_ids)
        if __debug__:
            debug(
                'SLC', 'Phase 4. Deducing neighbors information for %i ROIs' %
                (nrois, ))
        roi_fids = [qe.query_byid(f) for f in roi_ids]
        nroi_fids = len(roi_fids)
        # makes sense to waste precious ms only if ca is enabled
        if self.ca.is_enabled('roi_sizes'):
            roi_sizes = [len(x) for x in roi_fids]
        else:
            roi_sizes = []

        indexsum = self._indexsum
        if indexsum == 'sparse':
            if __debug__:
                debug(
                    'SLC', 'Phase 4b. Converting neighbors to sparse matrix '
                    'representation')
            # convert to "sparse representation" where column j contains
            # 1s only at the roi_fids[j] indices
            roi_fids = inds_to_coo(roi_fids,
                                   shape=(dataset.nfeatures, nroi_fids))
            indexsum_fx = lastdim_columnsums_spmatrix
        elif indexsum == 'fancy':
            indexsum_fx = lastdim_columnsums_fancy_indexing
        else:
            raise ValueError, \
                  "Do not know how to deal with indexsum=%s" % indexsum

        # 5. Let's do the actual "splitting" and "classification"
        if __debug__:
            debug('SLC', 'Phase 5. Major loop')

        for isplit, split in enumerate(splits):
            if __debug__:
                debug('SLC', ' Split %i out of %i' % (isplit, nsplits))
            # figure out, for the given split, the blocks we want to
            # work with
            # sample indices
            training_sis = split[0].samples[:, 0]
            # convert the training split into blocks
            training_bis = np.unique(sample2block[training_sis])

            # now let's do our GNB business
            training_nsamples = 0
            for il, l in enumerate(ulabels_numeric):
                bis_il = training_bis[block_labels[training_bis] == l]
                nsamples_per_class[il] = N_float = \
                                         float(np.sum(block_counts[bis_il]))
                training_nsamples += N_float
                if N_float == 0.0:
                    variances[il] = means[il] = means2[il] = 0.
                else:
                    means[il] = np.sum(sums[bis_il], axis=0) / N_float
                    # Not yet normed
                    means2[il] = np.sum(sums2[bis_il], axis=0)

            ## Actually compute the non-0 variances
            non0labels = (nsamples_per_class.squeeze() != 0)
            if np.all(non0labels):
                # For a possible tiny speed-up, avoid copying
                # by using (no) slicing
                non0labels = slice(None)

            if params.common_variance:
                variances[:] = \
                    np.sum(means2 - nsamples_per_class*np.square(means),
                           axis=0) \
                    / training_nsamples
            else:
                variances[non0labels] = \
                    (means2 - nsamples_per_class*np.square(means))[non0labels] \
                    / nsamples_per_class[non0labels]

            # assign priors
            priors = gnb._get_priors(nlabels, training_nsamples,
                                     nsamples_per_class)

            # proceed in a way we have in GNB code with logprob=True,
            # i.e. operating within the exponents -- should lead to some
            # performance advantage
            norm_weight = -0.5 * np.log(2 * np.pi * variances)
            # last added dimension would be for ROIs
            logpriors = np.log(priors[:, np.newaxis, np.newaxis])

            if __debug__:
                debug('SLC', "  'Training' is done")

            # Now it is time to "classify" our samples,
            # and for that we first need to compute the corresponding
            # probabilities (or rather unnormalized log-probabilities)
            data = X[split[1].samples[:, 0]]
            targets = labels_numeric[split[1].samples[:, 0]]

            # argument of exponentiation
            scaled_distances = \
                 -0.5 * (((data - means[:, np.newaxis, ...])**2) \
                         / variances[:, np.newaxis, ...])

            # incorporate the normalization from normals
            lprob_csfs = norm_weight[:, np.newaxis, ...] + scaled_distances

            ## First we need to reshape to get class x samples x features
            lprob_csf = lprob_csfs.reshape(lprob_csfs.shape[:2] + (-1, ))

            ## Now we come to naive part which requires looping
            ## through all spheres
            if __debug__:
                debug('SLC', "  Doing 'Searchlight'")
            # resultant logprobs for each class x sample x roi
            lprob_cs_sl = np.zeros(lprob_csfs.shape[:2] + (nroi_fids, ))
            indexsum_fx(lprob_csf, roi_fids, out=lprob_cs_sl)

            lprob_cs_sl += logpriors
            lprob_cs_cp_sl = lprob_cs_sl
            # for each of the ROIs take the class with maximal (log)probability
            predictions = lprob_cs_cp_sl.argmax(axis=0)
            # no need to map back [self.ulabels[c] for c in winners]
            #predictions = winners
            # assess the errors
            if __debug__:
                debug('SLC', "  Assessing accuracies")

            if isinstance(errorfx, MeanMismatchErrorFx):
                results[isplit, :] = \
                    (predictions != targets[:, None]).sum(axis=0) \
                    / float(len(targets))
            else:
                # somewhat silly, but a way which allows using pre-crafted
                # error functions without a chance to screw up
                for i, fpredictions in enumerate(predictions.T):
                    results[isplit, i] = errorfx(fpredictions, targets)

        if __debug__:
            debug(
                'SLC', "GNBSearchlight is done in %.3g sec" %
                (time.time() - time_start))

        return Dataset(results), roi_sizes
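
The 'classification' inside the main loop is ordinary Gaussian naive Bayes scoring: per-class means and variances, Gaussian log-likelihoods summed over features (the log-prior is omitted here since uniform priors do not change the argmax), and an argmax over classes. A compact NumPy sketch for a single ROI with toy data:

import numpy as np

rng = np.random.RandomState(1)
train = np.vstack((rng.randn(20, 3) - 2, rng.randn(20, 3) + 2))
labels = np.repeat([0, 1], 20)
test = np.array([[-2.0, -2.0, -2.0], [2.0, 2.0, 2.0]])

means = np.array([train[labels == l].mean(axis=0) for l in (0, 1)])
variances = np.array([train[labels == l].var(axis=0) for l in (0, 1)])

# log N(x; mean, var), summed over features; one row per class
norm_weight = -0.5 * np.log(2 * np.pi * variances)
diff = test[np.newaxis, :, :] - means[:, np.newaxis, :]     # class x sample x feature
lprob = (norm_weight[:, np.newaxis, :]
         - 0.5 * diff ** 2 / variances[:, np.newaxis, :]).sum(axis=2)
print(lprob.argmax(axis=0))    # [0 1]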
Example #26
    def _call(self, dataset):
        """Perform cross-validation on a dataset.

        'dataset' is passed to the splitter instance and serves as the source
        dataset to generate the splits for the individual cross-validation folds.
        """
        # store the results of the splitprocessor
        results = []
        self.ca.splits = []

        # local bindings
        ca = self.ca
        clf = self.__transerror.clf
        expose_testdataset = self.__expose_testdataset

        # what ca to enable in terr
        terr_enable = []
        for state_var in ['confusion', 'training_confusion', 'samples_error']:
            if ca.is_enabled(state_var):
                terr_enable += [state_var]

        # charge ca with initial values
        summaryClass = clf.__summary_class__
        clf_hastestdataset = hasattr(clf, 'testdataset')

        self.ca.confusion = summaryClass()
        self.ca.training_confusion = summaryClass()
        self.ca.transerrors = []
        if ca.is_enabled('samples_error'):
            dataset.init_origids('samples',
                                 attr=self.__samples_idattr,
                                 mode='existing')
            self.ca.samples_error = dict([
                (id_, []) for id_ in dataset.sa[self.__samples_idattr].value
            ])

        # enable requested ca in child TransferError instance (restored
        # again below)
        if len(terr_enable):
            self.__transerror.ca.change_temporarily(enable_ca=terr_enable)

        # We better ensure that underlying classifier is not trained if we
        # are going to deepcopy transerror
        if ca.is_enabled("transerrors"):
            self.__transerror.untrain()

        # collect some info about the splits that were made for the resulting
        # dataset
        splitinfo = []

        # splitter
        for split in self.__splitter(dataset):
            splitinfo.append("%s->%s" % (','.join([
                str(c) for c in split[0].sa[self.__splitter.splitattr].unique
            ]), ','.join([
                str(c) for c in split[1].sa[self.__splitter.splitattr].unique
            ])))

            # only train classifier if splitter provides something in first
            # element of tuple -- this is the behavior of TransferError
            if ca.is_enabled("splits"):
                self.ca.splits.append(split)

            if ca.is_enabled("transerrors"):
                # copy first and then train, as some classifiers cannot be copied
                # when already trained, e.g. SWIG'ed stuff
                lastsplit = None
                for ds in split:
                    if ds is not None:
                        lastsplit = ds.a.lastsplit
                        break
                if lastsplit:
                    # only if we could deduce that it was last split
                    # use the 'mother' transerror
                    transerror = self.__transerror
                else:
                    # otherwise -- deep copy
                    transerror = deepcopy(self.__transerror)
            else:
                transerror = self.__transerror

            # assign testing dataset if given classifier can digest it
            if clf_hastestdataset and expose_testdataset:
                transerror.clf.testdataset = split[1]

            # run the beast
            result = transerror(split[1], split[0])

            # unbind the testdataset from the classifier
            if clf_hastestdataset and expose_testdataset:
                transerror.clf.testdataset = None

            # next line is important for 'self._harvest' call
            self._harvest(locals())

            # XXX Look below -- may be we should have not auto added .?
            #     then transerrors also could be deprecated
            if ca.is_enabled("transerrors"):
                self.ca.transerrors.append(transerror)

            # XXX: could be merged with next for loop using a utility class
            # that can add dict elements into a list
            if ca.is_enabled("samples_error"):
                for k, v in \
                  transerror.ca.samples_error.iteritems():
                    self.ca.samples_error[k].append(v)

            # pull in child ca
            for state_var in ['confusion', 'training_confusion']:
                if ca.is_enabled(state_var):
                    ca[state_var].value.__iadd__(
                        transerror.ca[state_var].value)

            if __debug__:
                debug("CROSSC", "Split #%d: result %s" \
                      % (len(results), `result`))
            results.append(result)

        # Since we could have operated with a copy -- bind the last used one back
        self.__transerror = transerror

        # put ca of child TransferError back into original config
        if len(terr_enable):
            self.__transerror.ca.reset_changed_temporarily()

        self.ca.results = results
        """Store conditional attribute if it is enabled"""
        results = Dataset(results, sa={'cv_fold': splitinfo})
        return results