Example #1
    def test_custom_targets(self, lrn):
        """Simple test if a learner could cope with custom sa not targets
        """

        # Since we are comparing performances of two learners, we need
        # to assure that if they depend on some random seed -- they
        # would use the same value.  Currently we have such stochastic
        # behavior in SMLR
        if 'seed' in lrn.params:
            from mvpa import _random_seed
            lrn = lrn.clone()  # clone the beast
            lrn.params.seed = _random_seed  # reuse the same seed
        lrn_ = lrn.clone()
        lrn_.params.targets_attr = 'custom'

        te = CrossValidatedTransferError(TransferError(lrn), NFoldSplitter())

        te_ = CrossValidatedTransferError(TransferError(lrn_), NFoldSplitter())
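        # pick a dataset matching the learner: 4 classes if it is tagged
        # 'multiclass', 2 otherwise, and the sine-modulated dataset for
        # regressions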
        nclasses = 2 * (1 + int('multiclass' in lrn.__tags__))
        dsname = ('uni%dsmall' % nclasses,
                  'sin_modulated')[int(lrn.__is_regression__)]
        ds = datasets[dsname]
        ds_ = ds.copy()
        ds_.sa['custom'] = ds_.sa['targets']
        ds_.sa.pop('targets')
        self.failUnless('targets' in ds.sa,
                        msg="'targets' should remain in original ds")

        try:
            cve = te(ds)
            cve_ = te_(ds_)
        except Exception, e:
            self.fail("Failed with %r" % e)
Example #2
    def __test_matthias_question(self):
        rfe_clf = LinearCSVMC(C=1)

        rfesvm_split = SplitClassifier(rfe_clf)
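        # feature-selection classifier: an SVM trained only on the features
        # RFE retains, with RFE driven by the split-SVM's sensitivities and
        # its confusion-based transfer error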
        clf = FeatureSelectionClassifier(
            clf = LinearCSVMC(C=1),
            feature_selection = RFE(
                sensitivity_analyzer = rfesvm_split.get_sensitivity_analyzer(
                    combiner=first_axis_mean,
                    transformer=np.abs),
                transfer_error=ConfusionBasedError(
                    rfesvm_split,
                    confusion_state="confusion"),
                stopping_criterion=FixedErrorThresholdStopCrit(0.20),
                feature_selector=FractionTailSelector(
                    0.2, mode='discard', tail='lower'),
                update_sensitivity=True))

        splitter = NFoldSplitter(cvtype=1)
        no_permutations = 1000

        cv = CrossValidatedTransferError(
            TransferError(clf),
            splitter,
            null_dist=MCNullDist(permutations=no_permutations,
                                 tail='left'),
            enable_ca=['confusion'])
        error = cv(datasets['uni2small'])
        self.failUnless(error < 0.4)
        self.failUnless(cv.ca.null_prob < 0.05)
Example #3
    def test_split_classifier_extended(self, clf_):
        clf2 = clf_.clone()
        ds = datasets['uni2medium']  #self.data_bin_1
        clf = SplitClassifier(
            clf=clf_,  #SameSignClassifier(),
            splitter=NFoldSplitter(1),
            enable_ca=['confusion', 'feature_ids'])
        clf.train(ds)  # train the beast
        error = clf.ca.confusion.error

        cv = CrossValidatedTransferError(
            TransferError(clf2),
            NFoldSplitter(),
            postproc=mean_sample(),
            enable_ca=['confusion', 'training_confusion'])
        cverror = cv(ds).samples.squeeze()

        self.failUnless(
            abs(error - cverror) < 0.01,
            msg="We should get the same error using split classifier as"
            " using CrossValidatedTransferError. Got %s and %s" %
            (error, cverror))

        if cfg.getboolean('tests', 'labile', default='yes'):
            self.failUnless(error < 0.25,
                            msg="clf should generalize more or less fine. "
                            "Got error %s" % error)
        self.failUnlessEqual(len(clf.ca.confusion.sets),
                             len(ds.UC),
                             msg="Should have 1 confusion per each split")
        self.failUnlessEqual(
            len(clf.clfs),
            len(ds.UC),
            msg="Should have number of classifiers equal # of epochs")
Example #4
    def test_chi_square_searchlight(self):
        # only do partial to save time

        # Can't yet do this since test_searchlight isn't yet "under nose"
        #skip_if_no_external('scipy')
        if not externals.exists('scipy'):
            return

        from mvpa.misc.stats import chisquare

        transerror = TransferError(sample_clf_lin)
        cv = CrossValidatedTransferError(
                transerror,
                NFoldSplitter(cvtype=1),
                enable_ca=['confusion'])


        def getconfusion(data):
            cv(data)
            return chisquare(cv.ca.confusion.matrix)[0]

        sl = sphere_searchlight(getconfusion, radius=0,
                                center_ids=[3, 50])

        # run searchlight
        results = sl(self.dataset)
        self.failUnless(results.nfeatures == 2)
Example #5
    def test_simple_n_minus_one_cv(self):
        data = get_mv_pattern(3)
        data.init_origids('samples')

        self.failUnless(data.nsamples == 120)
        self.failUnless(data.nfeatures == 2)
        self.failUnless(
            (data.sa.targets == \
                [0,0,0,0,0,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0] * 6).all())
        self.failUnless(
            (data.sa.chunks == \
                [k for k in range(1, 7) for i in range(20)]).all())
        assert_equal(len(np.unique(data.sa.origids)), data.nsamples)

        transerror = TransferError(sample_clf_nl)
        cv = CrossValidatedTransferError(
            transerror,
            NFoldSplitter(cvtype=1),
            enable_ca=['confusion', 'training_confusion', 'samples_error'])

        results = cv(data)
        self.failUnless((results.samples < 0.2).all()
                        and (results.samples >= 0.0).all())

        # TODO: test accessibility of {training_,}confusion{,s} of
        # CrossValidatedTransferError

        self.failUnless(isinstance(cv.ca.samples_error, dict))
        self.failUnless(len(cv.ca.samples_error) == data.nsamples)
        # one value for each origid
        assert_array_equal(sorted(cv.ca.samples_error.keys()),
                           sorted(data.sa.origids))
        for k, v in cv.ca.samples_error.iteritems():
            self.failUnless(len(v) == 1)
Example #6
    def test_regression_as_classifier(self, regr):
        """Basic tests of metaclass for using regressions as classifiers
        """
        for dsname in 'uni2small', 'uni4small':
            ds = datasets[dsname]

            clf = RegressionAsClassifier(regr, enable_ca=['distances'])
            cv = CrossValidatedTransferError(
                TransferError(clf),
                OddEvenSplitter(),
                postproc=mean_sample(),
                enable_ca=['confusion', 'training_confusion'])

            error = cv(ds).samples.squeeze()

            nlabels = len(ds.uniquetargets)
            if nlabels == 2 \
               and cfg.getboolean('tests', 'labile', default='yes'):
                self.failUnless(error < 0.3)

            # Check that it does not puke on repr and str
            self.failUnless(str(clf) != "")
            self.failUnless(repr(clf) != "")

            self.failUnlessEqual(clf.ca.distances.shape,
                                 (ds.nsamples / 2, nlabels))
Example #7
    def test_split_classifier(self):
        ds = self.data_bin_1
        clf = SplitClassifier(
            clf=SameSignClassifier(),
            splitter=NFoldSplitter(1),
            enable_ca=['confusion', 'training_confusion', 'feature_ids'])
        clf.train(ds)  # train the beast
        error = clf.ca.confusion.error
        tr_error = clf.ca.training_confusion.error

        clf2 = clf.clone()
        cv = CrossValidatedTransferError(
            TransferError(clf2),
            NFoldSplitter(),
            postproc=mean_sample(),
            enable_ca=['confusion', 'training_confusion'])
        cverror = cv(ds).samples.squeeze()
        tr_cverror = cv.ca.training_confusion.error

        self.failUnlessEqual(
            error,
            cverror,
            msg="We should get the same error using split classifier as"
            " using CrossValidatedTransferError. Got %s and %s" %
            (error, cverror))

        self.failUnlessEqual(
            tr_error,
            tr_cverror,
            msg="We should get the same training error using split classifier as"
            " using CrossValidatedTransferError. Got %s and %s" %
            (tr_error, tr_cverror))

        self.failUnlessEqual(clf.ca.confusion.percent_correct,
                             100,
                             msg="Dummy clf should train perfectly")
        self.failUnlessEqual(len(clf.ca.confusion.sets),
                             len(ds.UC),
                             msg="Should have 1 confusion per each split")
        self.failUnlessEqual(
            len(clf.clfs),
            len(ds.UC),
            msg="Should have number of classifiers equal # of epochs")
        self.failUnlessEqual(clf.predict(ds.samples),
                             list(ds.targets),
                             msg="Should classify correctly")

        # feature_ids must be list of lists, and since it is not
        # feature-selecting classifier used - we expect all features
        # to be utilized
        #  NOT ANYMORE -- for BoostedClassifier we have now union of all
        #  used features across slave classifiers. That makes
        #  semantics clear. If you need to get deeper -- use upcoming
        #  harvesting facility ;-)
        # self.failUnlessEqual(len(clf.feature_ids), len(ds.uniquechunks))
        # self.failUnless(np.array([len(ids)==ds.nfeatures
        #                         for ids in clf.feature_ids]).all())

        # Just check if we get it at all ;-)
        summary = clf.summary()
Example #8
    def test_james_problem(self):
        percent = 80
        dataset = datasets['uni2small']
        rfesvm_split = LinearCSVMC()
        fs = \
            RFE(sensitivity_analyzer=rfesvm_split.get_sensitivity_analyzer(),
                transfer_error=TransferError(rfesvm_split),
                feature_selector=FractionTailSelector(
                    percent / 100.0,
                    mode='select', tail='upper'), update_sensitivity=True)

        clf = FeatureSelectionClassifier(
            clf = LinearCSVMC(),
            # on features selected via RFE
            feature_selection = fs)
             # update sensitivity at each step (since we're not using the
             # same CLF as sensitivity analyzer)
        clf.ca.enable('feature_ids')

        cv = CrossValidatedTransferError(
            TransferError(clf),
            NFoldSplitter(cvtype=1),
            postproc=mean_sample(),
            enable_ca=['confusion'],
            expose_testdataset=True)
        #cv = SplitClassifier(clf)
        try:
            error = cv(dataset).samples.squeeze()
        except Exception, e:
            self.fail('CrossValidation cannot handle classifier with RFE '
                      'feature selection. Got exception: %s' % (e,))
Example #9
    def test_tree_classifier(self):
        """Basic tests for TreeClassifier
        """
        ds = datasets['uni4small']
        clfs = clfswh['binary']  # pool of classifiers
        # Lets permute so each time we try some different combination
        # of the classifiers
        clfs = [clfs[i] for i in np.random.permutation(len(clfs))]
        # Test conflicting definition
        tclf = TreeClassifier(clfs[0], {
            'L0+2': (('L0', 'L2'), clfs[1]),
            'L2+3': (('L2', 'L3'), clfs[2])
        })
        self.failUnlessRaises(ValueError, tclf.train, ds)
        """Should raise exception since label 2 is in both"""

        # Test insufficient definition
        tclf = TreeClassifier(clfs[0], {
            'L0+5': (('L0', 'L5'), clfs[1]),
            'L2+3': (('L2', 'L3'), clfs[2])
        })
        self.failUnlessRaises(ValueError, tclf.train, ds)
        """Should raise exception since no group for L1"""

        # proper definition now
        tclf = TreeClassifier(clfs[0], {
            'L0+1': (('L0', 'L1'), clfs[1]),
            'L2+3': (('L2', 'L3'), clfs[2])
        })

        # Lets test train/test cycle using CVTE
        cv = CrossValidatedTransferError(
            TransferError(tclf),
            OddEvenSplitter(),
            postproc=mean_sample(),
            enable_ca=['confusion', 'training_confusion'])
        cverror = cv(ds).samples.squeeze()
        try:
            rtclf = repr(tclf)
        except:
            self.fail(msg="Could not obtain repr for TreeClassifier")

        # Test accessibility of .clfs
        self.failUnless(tclf.clfs['L0+1'] is clfs[1])
        self.failUnless(tclf.clfs['L2+3'] is clfs[2])

        cvtrc = cv.ca.training_confusion
        cvtc = cv.ca.confusion
        if cfg.getboolean('tests', 'labile', default='yes'):
            # just a dummy check to make sure everything is working
            self.failUnless(cvtrc != cvtc)
            self.failUnless(cverror < 0.3)

        # TODO: whenever implemented
        tclf = TreeClassifier(clfs[0], {
            'L0': (('L0', ), clfs[1]),
            'L1+2+3': (('L1', 'L2', 'L3'), clfs[2])
        })
Example #10
    def test_cache_speedup(self):
        skip_if_no_external('shogun', ver_dep='shogun:rev', min_version=4455)

        ck = sgSVM(kernel=CachedKernel(kernel=RbfSGKernel(sigma=2)), C=1)
        sk = sgSVM(kernel=RbfSGKernel(sigma=2), C=1)

        cv_c = CrossValidatedTransferError(TransferError(ck),
                                           splitter=NFoldSplitter())
        cv_s = CrossValidatedTransferError(TransferError(sk),
                                           splitter=NFoldSplitter())

        #data = datasets['uni4large']
        P = 5000
        data = normal_feature_dataset(snr=2,
                                      perlabel=200,
                                      nchunks=10,
                                      means=np.random.randn(2, P),
                                      nfeatures=P)

        t0 = time()
        ck.params.kernel.compute(data)
        cachetime = time() - t0

        t0 = time()
        cached_err = cv_c(data)
        ccv_time = time() - t0

        t0 = time()
        norm_err = cv_s(data)
        ncv_time = time() - t0

        assert_almost_equal(np.asanyarray(cached_err), np.asanyarray(norm_err))
        ok_(cachetime < ncv_time)
        ok_(ccv_time < ncv_time)
        #print 'Regular CV time: %s seconds'%ncv_time
        #print 'Caching time: %s seconds'%cachetime
        #print 'Cached CV time: %s seconds'%ccv_time

        speedup = ncv_time / (ccv_time + cachetime)
        #print 'Speedup factor: %s'%speedup

        # Speedup ideally should be 10, though it's not purely linear
        self.failIf(speedup < 2, 'Problem caching data - too slow!')
Example #11
    def test_cached_kernel_different_datasets(self):
        skip_if_no_external('shogun', ver_dep='shogun:rev', min_version=4455)

        # Inspired by the problem Swaroop ran into
        k = LinearSGKernel(normalizer_cls=False)
        k_ = LinearSGKernel(normalizer_cls=False)  # to be cached
        ck = CachedKernel(k_)

        clf = sgSVM(svm_impl='libsvm', kernel=k, C=-1)
        clf_ = sgSVM(svm_impl='libsvm', kernel=ck, C=-1)

        cvte = CrossValidatedTransferError(TransferError(clf), NFoldSplitter())
        cvte_ = CrossValidatedTransferError(TransferError(clf_),
                                            NFoldSplitter())

        te = TransferError(clf)
        te_ = TransferError(clf_)

        for r in xrange(2):
            ds1 = datasets['uni2medium']
            errs1 = cvte(ds1)
            ck.compute(ds1)
            ok_(ck._recomputed)
            errs1_ = cvte_(ds1)
            ok_(not ck._recomputed)
            assert_array_equal(errs1, errs1_)

            ds2 = datasets['uni3small']
            errs2 = cvte(ds2)
            ck.compute(ds2)
            ok_(ck._recomputed)
            errs2_ = cvte_(ds2)
            ok_(not ck._recomputed)
            assert_array_equal(errs2, errs2_)

            ssel = np.round(datasets['uni2large'].samples[:5, 0]).astype(int)
            terr = te(datasets['uni3small_test'][ssel],
                      datasets['uni3small_train'][::2])
            terr_ = te_(datasets['uni3small_test'][ssel],
                        datasets['uni3small_train'][::2])
            ok_(not ck._recomputed)
            ok_(terr == terr_)
Example #12
    def test_harvesting(self):
        # get a dataset with a very high SNR
        data = get_mv_pattern(10)
        # do crossval with default errorfx and 'mean' combiner
        transerror = TransferError(clfswh['linear'][0])
        cv = CrossValidatedTransferError(
            transerror,
            NFoldSplitter(cvtype=1),
            harvest_attribs=['transerror.clf.ca.training_time'])
        result = cv(data)
        ok_(cv.ca.harvested.has_key('transerror.clf.ca.training_time'))
        assert_equal(len(cv.ca.harvested['transerror.clf.ca.training_time']),
                     len(data.UC))
Example #13
    def test_noise_classification(self):
        # get a dataset with a very high SNR
        data = get_mv_pattern(10)

        # do crossval with default errorfx and 'mean' combiner
        transerror = TransferError(sample_clf_nl)
        cv = CrossValidatedTransferError(transerror, NFoldSplitter(cvtype=1))

        # must return a scalar value
        result = cv(data)
        # must be perfect
        self.failUnless((result.samples < 0.05).all())

        # do crossval with permuted regressors
        cv = CrossValidatedTransferError(
            transerror,
            NFoldSplitter(cvtype=1, permute_attr='targets', nrunspersplit=10))
        results = cv(data)

        # must be at chance level
        pmean = np.array(results).mean()
        self.failUnless(pmean < 0.58 and pmean > 0.42)
Example #14
    def test_vstack_and_origids_issue(self):
        # That is actually what swaroop hit
        skip_if_no_external('shogun', ver_dep='shogun:rev', min_version=4455)

        # Inspired by the problem Swaroop ran into
        k = LinearSGKernel(normalizer_cls=False)
        k_ = LinearSGKernel(normalizer_cls=False)  # to be cached
        ck = CachedKernel(k_)

        clf = sgSVM(svm_impl='libsvm', kernel=k, C=-1)
        clf_ = sgSVM(svm_impl='libsvm', kernel=ck, C=-1)

        cvte = CrossValidatedTransferError(TransferError(clf), NFoldSplitter())
        cvte_ = CrossValidatedTransferError(TransferError(clf_),
                                            NFoldSplitter())

        ds = datasets['uni2large_test'].copy(deep=True)
        ok_('origids' not in ds.sa)  # assure that there are none yet
        ck.compute(ds)  # so we initialize origids
        ok_('origids' in ds.sa)
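        # make a second deep copy: it shares ds's origids but carries all-zero
        # samples, so vstacking the two produces non-unique origids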
        ds2 = ds.copy(deep=True)
        ds2.samples = np.zeros(ds2.shape)
        from mvpa.base.dataset import vstack
        ds_vstacked = vstack((ds2, ds))
        # should complain now since the samples' origids would
        # no longer be unique
        if __debug__:
            assert_raises(ValueError, ck.compute, ds_vstacked)

        ds_vstacked.init_origids('samples')  # reset origids
        ck.compute(ds_vstacked)

        errs = cvte(ds_vstacked)
        errs_ = cvte_(ds_vstacked)
        # Following test would have failed since origids
        # were just ints, and then non-unique after vstack
        assert_array_equal(errs.samples, errs_.samples)
Example #15
    def test_classifier_generalization(self, clf):
        """Simple test if classifiers can generalize ok on simple data
        """
        te = CrossValidatedTransferError(TransferError(clf),
                                         NFoldSplitter(),
                                         postproc=mean_sample())
        # check the default
        self.failUnless(isinstance(te.transerror.errorfx, MeanMismatchErrorFx))

        nclasses = 2 * (1 + int('multiclass' in clf.__tags__))

        ds = datasets['uni%dmedium' % nclasses]
        try:
            cve = te(ds).samples.squeeze()
        except Exception, e:
            self.fail("Failed with %s" % e)
Example #16
    def test_values(self, clf):
        if isinstance(clf, MulticlassClassifier):
            # TODO: handle those values correctly
            return
        ds = datasets['uni2small']
        clf.ca.change_temporarily(enable_ca=['estimates'])
        cv = CrossValidatedTransferError(
            TransferError(clf),
            OddEvenSplitter(),
            enable_ca=['confusion', 'training_confusion'])
        _ = cv(ds)
        #print clf.descr, clf.values[0]
        # basic test: do we get one set of estimates per test sample
        self.failUnlessEqual(len(clf.ca.estimates), ds.nsamples / 2)

        clf.ca.reset_changed_temporarily()
Example #17
    def test_regressions_classifiers(self, clf):
        """Simple tests on regressions being used as classifiers
        """
        # check if we get values set correctly
        clf.ca.change_temporarily(enable_ca=['estimates'])
        self.failUnlessRaises(UnknownStateError, clf.ca['estimates']._get)
        cv = CrossValidatedTransferError(
            TransferError(clf),
            NFoldSplitter(),
            enable_ca=['confusion', 'training_confusion'])
        ds = datasets['uni2small'].copy()
        # we want numeric labels to maintain the previous behavior, especially
        # since we deal with regressions here
        ds.sa.targets = AttributeMap().to_numeric(ds.targets)
        cverror = cv(ds)

        self.failUnless(len(clf.ca.estimates) == ds[ds.chunks == 1].nsamples)
        clf.ca.reset_changed_temporarily()
Example #18
    def test_partial_searchlight_with_full_report(self):
        # compute N-1 cross-validation for each sphere
        transerror = TransferError(sample_clf_lin)
        cv = CrossValidatedTransferError(
                transerror,
                NFoldSplitter(cvtype=1))
        # construct a diameter 1 (or just radius 0) searchlight
        sl = sphere_searchlight(cv, radius=0,
                                center_ids=[3, 50])

        # run searchlight
        results = sl(self.dataset)

        # only two spheres but error for all CV-folds
        self.failUnlessEqual(results.shape, (len(self.dataset.UC), 2))

        # test if we graciously puke if center_ids are out of bounds
        dataset0 = self.dataset[:, :50] # so we have no 50th feature
        self.failUnlessRaises(IndexError, sl, dataset0)
Example #19
    def test_null_dist_prob(self, l_clf):
        train = datasets['uni2medium']

        num_perm = 10
        # define class to estimate NULL distribution of errors
        # use the left tail of the distribution since the error function is
        # MeanMismatchErrorFx and lower is better
        terr = TransferError(clf=l_clf,
                             null_dist=MCNullDist(permutations=num_perm,
                                                  tail='left'))

        # check reasonable error range
        err = terr(train, train)
        self.failUnless(err < 0.4)

        # Lets do the same for CVTE
        cvte = CrossValidatedTransferError(TransferError(clf=l_clf),
                                           OddEvenSplitter(),
                                           null_dist=MCNullDist(
                                               permutations=num_perm,
                                               tail='left',
                                               enable_ca=['dist_samples']),
                                           postproc=mean_sample())
        cv_err = cvte(train)

        # check that the result is highly significant since we know that the
        # data has signal
        null_prob = terr.ca.null_prob
        if cfg.getboolean('tests', 'labile', default='yes'):
            self.failUnless(
                null_prob <= 0.1,
                msg="Failed to check that the result is highly significant "
                "(got %f) since we know that the data has signal" % null_prob)

            self.failUnless(
                cvte.ca.null_prob <= 0.1,
                msg="Failed to check that the result is highly significant "
                "(got p(cvte)=%f) since we know that the data has signal" %
                cvte.ca.null_prob)

            # and we should be able to access the actual samples of the distribution
            self.failUnlessEqual(len(cvte.null_dist.ca.dist_samples), num_perm)
Example #20
    def test_ifs(self, svm):

        # data measure and transfer error quantifier use the SAME clf!
        trans_error = TransferError(svm)
        data_measure = CrossValidatedTransferError(trans_error,
                                                   NFoldSplitter(1),
                                                   postproc=mean_sample())

        ifs = IFS(data_measure,
                  trans_error,
                  # go for lower tail selection as data_measure will return
                  # errors -> low is good
                  feature_selector=FixedNElementTailSelector(
                      1, tail='lower', mode='select'),
                  )
        wdata = self.get_data()
        wdata_nfeatures = wdata.nfeatures
        tdata = self.get_data()
        tdata_nfeatures = tdata.nfeatures

        sdata, stdata = ifs(wdata, tdata)

        # fail if orig datasets are changed
        self.failUnless(wdata.nfeatures == wdata_nfeatures)
        self.failUnless(tdata.nfeatures == tdata_nfeatures)

        # check that the features set with the least error is selected
        self.failUnless(len(ifs.ca.errors))
        e = np.array(ifs.ca.errors)
        self.failUnless(sdata.nfeatures == e.argmin() + 1)

        # repeat with dataset where selection order is known
        signal = datasets['dumb2']
        sdata, stdata = ifs(signal, signal)
        self.failUnless((sdata.samples[:, 0] == signal.samples[:, 0]).all())
Example #21
    def test_auc(self, clf):
        """Test AUC computation
        """
        if isinstance(clf, MulticlassClassifier):
            # TODO: handle those values correctly
            return
        clf.ca.change_temporarily(enable_ca=['estimates'])
        # uni2 dataset with reordered labels
        ds2 = datasets['uni2small'].copy()
        # revert labels
        ds2.sa['targets'].value = ds2.targets[::-1].copy()
        # same with uni3
        ds3 = datasets['uni3small'].copy()
        ul = ds3.sa['targets'].unique
        nl = ds3.targets.copy()
        for l in xrange(3):
            nl[ds3.targets == ul[l]] = ul[(l + 1) % 3]
        ds3.sa.targets = nl
        for ds in [datasets['uni2small'], ds2, datasets['uni3small'], ds3]:
            cv = CrossValidatedTransferError(
                TransferError(clf),
                OddEvenSplitter(),
                enable_ca=['confusion', 'training_confusion'])
            cverror = cv(ds)
            stats = cv.ca.confusion.stats
            Nlabels = len(ds.uniquetargets)
            # so we at least do slightly above chance
            self.failUnless(stats['ACC'] > 1.2 / Nlabels)
            auc = stats['AUC']
            if (Nlabels == 2) or (Nlabels > 2 and auc[0] is not np.nan):
                mauc = np.min(stats['AUC'])
                if cfg.getboolean('tests', 'labile', default='yes'):
                    self.failUnless(
                        mauc > 0.55,
                        msg='All AUCs must be above chance. Got minimal '
                        'AUC=%.2g among %s' % (mauc, stats['AUC']))
        clf.ca.reset_changed_temporarily()
Example #22
    def test_spatial_searchlight(self, common_variance):
        """Tests both generic and GNBSearchlight
        Test of GNBSearchlight anyways requires a ground-truth
        comparison to the generic version, so we are doing sweepargs here
        """
        # compute N-1 cross-validation for each sphere
        # YOH: unfortunately sample_clf_lin is not guaranteed
        #      to provide exactly the same results due to inherent
        #      iterative process.  Therefore lets use something quick
        #      and pure Python
        gnb = GNB(common_variance=common_variance)
        transerror = TransferError(gnb)
        cv = CrossValidatedTransferError(
                transerror,
                NFoldSplitter(cvtype=1))

        skwargs = dict(radius=1, enable_ca=['roi_sizes', 'raw_results'])
        sls = [sphere_searchlight(cv, **skwargs),
               #GNBSearchlight(gnb, NFoldSplitter(cvtype=1))
               sphere_gnbsearchlight(gnb, NFoldSplitter(cvtype=1),
                                     indexsum='fancy', **skwargs)
               ]

        if externals.exists('scipy'):
            sls += [ sphere_gnbsearchlight(gnb, NFoldSplitter(cvtype=1),
                                           indexsum='sparse', **skwargs)]

        # Just test nproc whenever common_variance is True
        if externals.exists('pprocess') and common_variance:
            sls += [sphere_searchlight(cv, nproc=2, **skwargs)]

        all_results = []
        ds = datasets['3dsmall'].copy()
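        # sphere_searchlight uses the 'voxel_indices' feature attribute by
        # default, so alias the test dataset's 'myspace' coordinates to it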
        ds.fa['voxel_indices'] = ds.fa.myspace
        for sl in sls:
            # run searchlight
            results = sl(ds)
            all_results.append(results)

            # check for correct number of spheres
            self.failUnless(results.nfeatures == 106)
            # and measures (one per xfold)
            self.failUnless(len(results) == len(ds.UC))

            # check for chance-level performance across all spheres
            self.failUnless(0.4 < results.samples.mean() < 0.6)

            mean_errors = results.samples.mean(axis=0)
            # that we do get different errors ;)
            self.failUnless(len(np.unique(mean_errors)) > 3)

            # check reasonable sphere sizes
            self.failUnless(len(sl.ca.roi_sizes) == 106)
            self.failUnless(max(sl.ca.roi_sizes) == 7)
            self.failUnless(min(sl.ca.roi_sizes) == 4)

            # check base-class state
            self.failUnlessEqual(sl.ca.raw_results.nfeatures, 106)

        if len(all_results) > 1:
            # if we had multiple searchlights, we can check whether they all
            # gave the same result (they should have)
            aresults = np.array([a.samples for a in all_results])
            dresults = np.abs(aresults - aresults.mean(axis=0))
            dmax = np.max(dresults)
            self.failUnless(dmax <= 1e-13)
Example #23
    def test_regressions(self, regr):
        """Simple tests on regressions
        """
        ds = datasets['chirp_linear']
        # we want numeric labels to maintain the previous behavior, especially
        # since we deal with regressions here
        ds.sa.targets = AttributeMap().to_numeric(ds.targets)

        cve = CrossValidatedTransferError(
            TransferError(regr),
            splitter=NFoldSplitter(),
            postproc=mean_sample(),
            enable_ca=['training_confusion', 'confusion'])
        # check the default
        self.failUnless(isinstance(cve.transerror.errorfx, CorrErrorFx))
        corr = np.asscalar(cve(ds).samples)

        # Our CorrErrorFx should never return NaN
        self.failUnless(not np.isnan(corr))
        self.failUnless(corr == cve.ca.confusion.stats['CCe'])

        splitregr = SplitClassifier(
            regr,
            splitter=OddEvenSplitter(),
            enable_ca=['training_confusion', 'confusion'])
        splitregr.train(ds)
        split_corr = splitregr.ca.confusion.stats['CCe']
        split_corr_tr = splitregr.ca.training_confusion.stats['CCe']

        for confusion, error in (
            (cve.ca.confusion, corr),
            (splitregr.ca.confusion, split_corr),
            (splitregr.ca.training_confusion, split_corr_tr),
        ):
            #TODO: test confusion statistics
            # Part of it for now -- CCe
            for conf in confusion.summaries:
                stats = conf.stats
                if cfg.getboolean('tests', 'labile', default='yes'):
                    self.failUnless(stats['CCe'] < 0.5)
                self.failUnlessEqual(stats['CCe'], stats['Summary CCe'])

            s0 = confusion.as_string(short=True)
            s1 = confusion.as_string(short=False)

            for s in [s0, s1]:
                self.failUnless(len(s) > 10,
                                msg="We should get some string representation "
                                "of regression summary. Got %s" % s)
            if cfg.getboolean('tests', 'labile', default='yes'):
                self.failUnless(
                    error < 0.2,
                    msg="Regressions should perform well on a simple "
                    "dataset. Got correlation error of %s " % error)

            # Test access to summary statistics
            # YOH: lets start making testing more reliable.
            #      p-value for such accident to have is verrrry tiny,
            #      so if regression works -- it better has at least 0.5 ;)
            #      otherwise fix it! ;)
            # YOH: not now -- issues with libsvr in SG and linear kernel
            if cfg.getboolean('tests', 'labile', default='yes'):
                self.failUnless(confusion.stats['CCe'] < 0.5)

        # just to check if it works fine
        split_predictions = splitregr.predict(ds.samples)
Example #24
import numpy as np

from mvpa.datasets.splitters import OddEvenSplitter
from mvpa.clfs.svm import LinearCSVMC
from mvpa.clfs.transerror import TransferError
from mvpa.algorithms.cvtranserror import CrossValidatedTransferError
from mvpa.measures.searchlight import sphere_searchlight
from mvpa.testing.datasets import datasets
from mvpa.mappers.fx import mean_sample
"""For the sake of simplicity, let's use a small artificial dataset."""

# Lets just use our tiny 4D dataset from testing battery
dataset = datasets['3dlarge']
"""Now it only takes three lines for a searchlight analysis."""

# setup measure to be computed in each sphere (cross-validated
# generalization error on odd/even splits)
cv = CrossValidatedTransferError(TransferError(LinearCSVMC()),
                                 OddEvenSplitter())

# setup searchlight with 2 voxels radius and measure configured above
sl = sphere_searchlight(cv, radius=2, space='myspace', postproc=mean_sample())

# run searchlight on dataset
sl_map = sl(dataset)

print 'Best performing sphere error:', np.min(sl_map.samples)
"""
If this analysis is done on a fMRI dataset using `NiftiDataset` the resulting
searchlight map (`sl_map`) can be mapped back into the original dataspace
and viewed as a brain overlay. :ref:`Another example <example_searchlight>`
shows a typical application of this algorithm.

.. Mention the fact that it also is a special `SensitivityAnalyzer`
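
The reverse mapping mentioned above is not part of the snippet. A minimal
sketch of that step, assuming the dataset had been loaded from an actual fMRI
image (e.g. via fmri_dataset) rather than from the testing battery; map2nifti
and the mvpa.datasets.mri module path are assumptions that may vary across
PyMVPA releases:

# hypothetical continuation -- only meaningful for a dataset loaded from NIfTI
from mvpa.datasets.mri import map2nifti

# reverse-map the per-sphere errors into voxel space
nimg = map2nifti(dataset, sl_map)
# write an overlay; use nimg.to_filename(...) instead if your installation
# returns a nibabel image rather than a pynifti NiftiImage
nimg.save('sl_errors.nii.gz')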
Example #25
    def test_cper_class(self, clf):
        if not (clf.params.has_key('C')):
            # skip those without C
            return

        ds = datasets['uni2medium'].copy()
        ds__ = datasets['uni2medium'].copy()
        #
        # balanced set
        # Lets add a bit of noise to drive the classifier nuts; the same
        # should be done for the disbalanced set
        ds__.samples = ds__.samples + \
                       0.5 * np.random.normal(size=(ds__.samples.shape))
        #
        # disbalanced set
        # lets overpopulate label 0
        times = 20
        ds_ = ds[(range(ds.nsamples) + range(ds.nsamples/2) * times)]
        ds_.samples = ds_.samples + \
                      0.5 * np.random.normal(size=(ds_.samples.shape))
        spl = get_nsamples_per_attr(ds_, 'targets') #_.samplesperlabel
        #print ds_.targets, ds_.chunks

        cve = CrossValidatedTransferError(TransferError(clf), NFoldSplitter(),
                                          enable_ca='confusion')
        # on balanced
        e = cve(ds__)
        tpr_1 = cve.ca.confusion.stats["TPR"][1]

        # on disbalanced
        e = cve(ds_)
        tpr_2 =  cve.ca.confusion.stats["TPR"][1]

        # Set '1 C per label'
        # recreate cvte since previous might have operated on copies
        cve = CrossValidatedTransferError(TransferError(clf), NFoldSplitter(),
                                          enable_ca='confusion')
        oldC = clf.params.C
        # TODO: provide clf.params.C not with a tuple but dictionary
        #       with C per label (now order is deduced in a cruel way)
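        # scale the two C values by the square root of the per-label sample
        # ratio so the heavily overrepresented label does not dominate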
        ratio = np.sqrt(float(spl[ds_.UT[0]])/spl[ds_.UT[1]])
        clf.params.C = (-1/ratio, -1*ratio)
        try:
            # on disbalanced but with balanced C
            e_ = cve(ds_)
            # reassign C
            clf.params.C = oldC
        except:
            clf.params.C = oldC
            raise
        tpr_3 = cve.ca.confusion.stats["TPR"][1]

        # Actual tests
        if cfg.getboolean('tests', 'labile', default='yes'):
            self.failUnless(tpr_1 > 0.25,
                            msg="Without disbalance we should have some "
                            "hits, but got TPR=%.3f" % tpr_1)

            self.failUnless(tpr_2 < 0.25,
                            msg="With disbalance we should have almost no "
                            "hits for minor, but got TPR=%.3f" % tpr_2)

            self.failUnless(tpr_3 > 0.25,
                            msg="With disbalanced data but ratio-based Cs "
                            "we should have some hits for minor, but got "
                            "TPR=%.3f" % tpr_3)