Example #1
    def test_split_classifier_extended(self, clf_):
        clf2 = clf_.clone()
        ds = datasets['uni2%s' % self._get_clf_ds(clf2)]
        clf = SplitClassifier(clf=clf_, #SameSignClassifier(),
                enable_ca=['stats', 'feature_ids'])
        clf.train(ds)                   # train the beast
        error = clf.ca.stats.error

        cv = CrossValidation(clf2, NFoldPartitioner(), postproc=mean_sample(),
            enable_ca=['stats', 'training_stats'])
        cverror = cv(ds).samples.squeeze()

        self.failUnless(abs(error-cverror)<0.01,
                msg="We should get the same error using split classifier as"
                    " using CrossValidation. Got %s and %s"
                    % (error, cverror))

        if cfg.getboolean('tests', 'labile', default='yes'):
            self.failUnless(error < 0.25,
                msg="clf should generalize more or less fine. "
                    "Got error %s" % error)
        self.failUnlessEqual(len(clf.ca.stats.sets), len(ds.UC),
            msg="Should have 1 confusion per each split")
        self.failUnlessEqual(len(clf.clfs), len(ds.UC),
            msg="Should have number of classifiers equal # of epochs")
Example #2
    def test_split_classifier_extended(self, clf_):
        clf2 = clf_.clone()
        ds = datasets['uni2medium']  #self.data_bin_1
        clf = SplitClassifier(
            clf=clf_,  #SameSignClassifier(),
            splitter=NFoldSplitter(1),
            enable_ca=['confusion', 'feature_ids'])
        clf.train(ds)  # train the beast
        error = clf.ca.confusion.error

        cv = CrossValidatedTransferError(
            TransferError(clf2),
            NFoldSplitter(),
            postproc=mean_sample(),
            enable_ca=['confusion', 'training_confusion'])
        cverror = cv(ds).samples.squeeze()

        self.failUnless(
            abs(error - cverror) < 0.01,
            msg="We should get the same error using split classifier as"
            " using CrossValidatedTransferError. Got %s and %s" %
            (error, cverror))

        if cfg.getboolean('tests', 'labile', default='yes'):
            self.failUnless(error < 0.25,
                            msg="clf should generalize more or less fine. "
                            "Got error %s" % error)
        self.failUnlessEqual(len(clf.ca.confusion.sets),
                             len(ds.UC),
                             msg="Should have 1 confusion per each split")
        self.failUnlessEqual(
            len(clf.clfs),
            len(ds.UC),
            msg="Should have number of classifiers equal # of epochs")
Example #3
    def test_james_problem(self):
        percent = 80
        dataset = datasets['uni2small']
        rfesvm_split = LinearCSVMC()
        fs = \
            RFE(rfesvm_split.get_sensitivity_analyzer(),
                ProxyMeasure(rfesvm_split,
                             postproc=BinaryFxNode(mean_mismatch_error,
                                                   'targets')),
                Splitter('train'),
                fselector=FractionTailSelector(
                    percent / 100.0,
                    mode='select', tail='upper'), update_sensitivity=True)

        clf = FeatureSelectionClassifier(
            LinearCSVMC(),
            # on features selected via RFE
            fs)
             # update sensitivity at each step (since we're not using the
             # same CLF as sensitivity analyzer)

        cv = CrossValidation(clf, NFoldPartitioner(), postproc=mean_sample(),
            enable_ca=['confusion'])
        #cv = SplitClassifier(clf)
        try:
            error = cv(dataset).samples.squeeze()
        except Exception, e:
            self.fail('CrossValidation cannot handle classifier with RFE '
                      'feature selection. Got exception: %s' % (e,))
Example #4
    def test_regression_as_classifier(self, regr):
        """Basic tests of metaclass for using regressions as classifiers
        """
        for dsname in 'uni2small', 'uni4small':
            ds = datasets[dsname]

            clf = RegressionAsClassifier(regr, enable_ca=['distances'])
            cv = CrossValidatedTransferError(
                TransferError(clf),
                OddEvenSplitter(),
                postproc=mean_sample(),
                enable_ca=['confusion', 'training_confusion'])

            error = cv(ds).samples.squeeze()

            nlabels = len(ds.uniquetargets)
            if nlabels == 2 \
               and cfg.getboolean('tests', 'labile', default='yes'):
                self.failUnless(error < 0.3)

            # Check if does not puke on repr and str
            self.failUnless(str(clf) != "")
            self.failUnless(repr(clf) != "")

            self.failUnlessEqual(clf.ca.distances.shape,
                                 (ds.nsamples / 2, nlabels))
Example #5
    def test_regression_as_classifier(self, regr):
        """Basic tests of metaclass for using regressions as classifiers
        """
        for dsname in 'uni2small', 'uni4small':
            ds = datasets[dsname]

            clf = RegressionAsClassifier(regr, enable_ca=['distances'])
            cv = CrossValidation(clf, OddEvenPartitioner(),
                    postproc=mean_sample(),
                    enable_ca=['stats', 'training_stats'])

            error = cv(ds).samples.squeeze()

            nlabels = len(ds.uniquetargets)
            if nlabels == 2 \
               and cfg.getboolean('tests', 'labile', default='yes'):
                self.failUnless(error < 0.3,
                                msg="Got error %.2f on %s dataset"
                                % (error, dsname))

            # Check if does not puke on repr and str
            self.failUnless(str(clf) != "")
            self.failUnless(repr(clf) != "")

            self.failUnlessEqual(clf.ca.distances.shape,
                                 (ds.nsamples / 2, nlabels))
Example #6
    def test_james_problem(self):
        percent = 80
        dataset = datasets["uni2small"]
        rfesvm_split = LinearCSVMC()
        fs = RFE(
            sensitivity_analyzer=rfesvm_split.get_sensitivity_analyzer(),
            transfer_error=TransferError(rfesvm_split),
            feature_selector=FractionTailSelector(percent / 100.0, mode="select", tail="upper"),
            update_sensitivity=True,
        )

        clf = FeatureSelectionClassifier(
            clf=LinearCSVMC(),
            # on features selected via RFE
            feature_selection=fs,
        )
        # update sensitivity at each step (since we're not using the
        # same CLF as sensitivity analyzer)
        clf.ca.enable("feature_ids")

        cv = CrossValidatedTransferError(
            TransferError(clf),
            NFoldSplitter(cvtype=1),
            postproc=mean_sample(),
            enable_ca=["confusion"],
            expose_testdataset=True,
        )
        # cv = SplitClassifier(clf)
        try:
            error = cv(dataset).samples.squeeze()
        except Exception, e:
            self.fail(
                "CrossValidation cannot handle classifier with RFE " "feature selection. Got exception: %s" % (e,)
            )
Example #7
    def test_james_problem(self):
        percent = 80
        dataset = datasets['uni2small']
        rfesvm_split = LinearCSVMC()
        fs = \
            RFE(sensitivity_analyzer=rfesvm_split.get_sensitivity_analyzer(),
                transfer_error=TransferError(rfesvm_split),
                feature_selector=FractionTailSelector(
                    percent / 100.0,
                    mode='select', tail='upper'), update_sensitivity=True)

        clf = FeatureSelectionClassifier(
            clf = LinearCSVMC(),
            # on features selected via RFE
            feature_selection = fs)
             # update sensitivity at each step (since we're not using the
             # same CLF as sensitivity analyzer)
        clf.ca.enable('feature_ids')

        cv = CrossValidatedTransferError(
            TransferError(clf),
            NFoldSplitter(cvtype=1),
            postproc=mean_sample(),
            enable_ca=['confusion'],
            expose_testdataset=True)
        #cv = SplitClassifier(clf)
        try:
            error = cv(dataset).samples.squeeze()
        except Exception, e:
            self.fail('CrossValidation cannot handle classifier with RFE '
                      'feature selection. Got exception: %s' % (e,))
Example #8
    def test_split_classifier(self):
        ds = self.data_bin_1
        clf = SplitClassifier(
            clf=SameSignClassifier(),
            splitter=NFoldSplitter(1),
            enable_ca=['confusion', 'training_confusion', 'feature_ids'])
        clf.train(ds)  # train the beast
        error = clf.ca.confusion.error
        tr_error = clf.ca.training_confusion.error

        clf2 = clf.clone()
        cv = CrossValidatedTransferError(
            TransferError(clf2),
            NFoldSplitter(),
            postproc=mean_sample(),
            enable_ca=['confusion', 'training_confusion'])
        cverror = cv(ds).samples.squeeze()
        tr_cverror = cv.ca.training_confusion.error

        self.failUnlessEqual(
            error,
            cverror,
            msg="We should get the same error using split classifier as"
            " using CrossValidatedTransferError. Got %s and %s" %
            (error, cverror))

        self.failUnlessEqual(
            tr_error,
            tr_cverror,
            msg="We should get the same training error using split classifier as"
            " using CrossValidatedTransferError. Got %s and %s" %
            (tr_error, tr_cverror))

        self.failUnlessEqual(clf.ca.confusion.percent_correct,
                             100,
                             msg="Dummy clf should train perfectly")
        self.failUnlessEqual(len(clf.ca.confusion.sets),
                             len(ds.UC),
                             msg="Should have 1 confusion per each split")
        self.failUnlessEqual(
            len(clf.clfs),
            len(ds.UC),
            msg="Should have number of classifiers equal # of epochs")
        self.failUnlessEqual(clf.predict(ds.samples),
                             list(ds.targets),
                             msg="Should classify correctly")

        # feature_ids must be list of lists, and since it is not
        # feature-selecting classifier used - we expect all features
        # to be utilized
        #  NOT ANYMORE -- for BoostedClassifier we have now union of all
        #  used features across slave classifiers. That makes
        #  semantics clear. If you need to get deeper -- use upcoming
        #  harvesting facility ;-)
        # self.failUnlessEqual(len(clf.feature_ids), len(ds.uniquechunks))
        # self.failUnless(np.array([len(ids)==ds.nfeatures
        #                         for ids in clf.feature_ids]).all())

        # Just check if we get it at all ;-)
        summary = clf.summary()
Example #9
    def test_tree_classifier(self):
        """Basic tests for TreeClassifier
        """
        ds = datasets['uni4small']
        clfs = clfswh['binary']  # pool of classifiers
        # Lets permute so each time we try some different combination
        # of the classifiers
        clfs = [clfs[i] for i in np.random.permutation(len(clfs))]
        # Test conflicting definition
        tclf = TreeClassifier(clfs[0], {
            'L0+2': (('L0', 'L2'), clfs[1]),
            'L2+3': (('L2', 'L3'), clfs[2])
        })
        self.failUnlessRaises(ValueError, tclf.train, ds)
        """Should raise exception since label 2 is in both"""

        # Test insufficient definition
        tclf = TreeClassifier(clfs[0], {
            'L0+5': (('L0', 'L5'), clfs[1]),
            'L2+3': (('L2', 'L3'), clfs[2])
        })
        self.failUnlessRaises(ValueError, tclf.train, ds)
        """Should raise exception since no group for L1"""

        # proper definition now
        tclf = TreeClassifier(clfs[0], {
            'L0+1': (('L0', 'L1'), clfs[1]),
            'L2+3': (('L2', 'L3'), clfs[2])
        })

        # Lets test train/test cycle using CVTE
        cv = CrossValidatedTransferError(
            TransferError(tclf),
            OddEvenSplitter(),
            postproc=mean_sample(),
            enable_ca=['confusion', 'training_confusion'])
        cverror = cv(ds).samples.squeeze()
        try:
            rtclf = repr(tclf)
        except:
            self.fail(msg="Could not obtain repr for TreeClassifier")

        # Test accessibility of .clfs
        self.failUnless(tclf.clfs['L0+1'] is clfs[1])
        self.failUnless(tclf.clfs['L2+3'] is clfs[2])

        cvtrc = cv.ca.training_confusion
        cvtc = cv.ca.confusion
        if cfg.getboolean('tests', 'labile', default='yes'):
            # just a dummy check to make sure everything is working
            self.failUnless(cvtrc != cvtc)
            self.failUnless(cverror < 0.3)

        # TODO: whenever implemented
        tclf = TreeClassifier(clfs[0], {
            'L0': (('L0', ), clfs[1]),
            'L1+2+3': (('L1', 'L2', 'L3'), clfs[2])
        })
Example #10
    def test_tree_classifier(self):
        """Basic tests for TreeClassifier
        """
        ds = datasets['uni4small']
        clfs = clfswh['binary']         # pool of classifiers
        # Lets permute so each time we try some different combination
        # of the classifiers
        clfs = [clfs[i] for i in np.random.permutation(len(clfs))]
        # Test conflicting definition
        tclf = TreeClassifier(clfs[0], {
            'L0+2' : (('L0', 'L2'), clfs[1]),
            'L2+3' : (('L2', 'L3'), clfs[2])})
        self.failUnlessRaises(ValueError, tclf.train, ds)
        """Should raise exception since label 2 is in both"""

        # Test insufficient definition
        tclf = TreeClassifier(clfs[0], {
            'L0+5' : (('L0', 'L5'), clfs[1]),
            'L2+3' : (('L2', 'L3'),       clfs[2])})
        self.failUnlessRaises(ValueError, tclf.train, ds)
        """Should raise exception since no group for L1"""

        # proper definition now
        tclf = TreeClassifier(clfs[0], {
            'L0+1' : (('L0', 'L1'), clfs[1]),
            'L2+3' : (('L2', 'L3'), clfs[2])})

        # Lets test train/test cycle using CVTE
        cv = CrossValidatedTransferError(
            TransferError(tclf),
            OddEvenSplitter(),
            postproc=mean_sample(),
            enable_ca=['confusion', 'training_confusion'])
        cverror = cv(ds).samples.squeeze()
        try:
            rtclf = repr(tclf)
        except:
            self.fail(msg="Could not obtain repr for TreeClassifier")

        # Test accessibility of .clfs
        self.failUnless(tclf.clfs['L0+1'] is clfs[1])
        self.failUnless(tclf.clfs['L2+3'] is clfs[2])

        cvtrc = cv.ca.training_confusion
        cvtc = cv.ca.confusion
        if cfg.getboolean('tests', 'labile', default='yes'):
            # just a dummy check to make sure everything is working
            self.failUnless(cvtrc != cvtc)
            self.failUnless(cverror < 0.3)

        # TODO: whenever implemented
        tclf = TreeClassifier(clfs[0], {
            'L0' : (('L0',), clfs[1]),
            'L1+2+3' : (('L1', 'L2', 'L3'),    clfs[2])})
Example #11
    def test_null_dist_prob(self, l_clf):
        train = datasets['uni2medium']

        num_perm = 10
        permutator = AttributePermutator('targets', count=num_perm)
        # define class to estimate NULL distribution of errors
        # use left tail of the distribution since we use MeanMatchFx as error
        # function and lower is better
        terr = TransferMeasure(
            l_clf,
            Repeater(count=2),
            postproc=BinaryFxNode(mean_mismatch_error, 'targets'),
            null_dist=MCNullDist(permutator,
                                 tail='left'))

        # check reasonable error range
        err = terr(train)
        self.failUnless(np.mean(err) < 0.4)

        # Lets do the same for CVTE
        cvte = CrossValidation(l_clf, OddEvenPartitioner(),
            null_dist=MCNullDist(permutator,
                                 tail='left',
                                 enable_ca=['dist_samples']),
            postproc=mean_sample())
        cv_err = cvte(train)

        # check that the result is highly significant since we know that the
        # data has signal
        null_prob = np.asscalar(terr.ca.null_prob)

        if cfg.getboolean('tests', 'labile', default='yes'):
            self.failUnless(null_prob <= 0.1,
                msg="Failed to check that the result is highly significant "
                    "(got %f) since we know that the data has signal"
                    % null_prob)

            self.failUnless(np.asscalar(cvte.ca.null_prob) <= 0.1,
                msg="Failed to check that the result is highly significant "
                    "(got p(cvte)=%f) since we know that the data has signal"
                    % np.asscalar(cvte.ca.null_prob))

        # we should be able to access the actual samples of the distribution
        # yoh: why it is 3D really?
        # mih: because these are the distribution samples for the ONE error
        #      collapsed into ONE value across all folds. It will also be
        #      3d if the return value of the measure isn't a scalar and it is
        #      not collapsed across folds. it simply corresponds to the shape
        #      of the output dataset of the respective measure (+1 axis)
        # Some permutations could have been skipped since classifier failed
        # to train due to degenerate situation etc, thus accounting for them
        self.failUnlessEqual(cvte.null_dist.ca.dist_samples.shape[2],
                             num_perm - cvte.null_dist.ca.skipped)
Example #12
    def test_split_classifier(self):
        ds = self.data_bin_1
        clf = SplitClassifier(clf=SameSignClassifier(),
                splitter=NFoldSplitter(1),
                enable_ca=['confusion', 'training_confusion',
                               'feature_ids'])
        clf.train(ds)                   # train the beast
        error = clf.ca.confusion.error
        tr_error = clf.ca.training_confusion.error

        clf2 = clf.clone()
        cv = CrossValidatedTransferError(
            TransferError(clf2),
            NFoldSplitter(),
            postproc=mean_sample(),
            enable_ca=['confusion', 'training_confusion'])
        cverror = cv(ds).samples.squeeze()
        tr_cverror = cv.ca.training_confusion.error

        self.failUnlessEqual(error, cverror,
                msg="We should get the same error using split classifier as"
                    " using CrossValidatedTransferError. Got %s and %s"
                    % (error, cverror))

        self.failUnlessEqual(tr_error, tr_cverror,
                msg="We should get the same training error using split classifier as"
                    " using CrossValidatedTransferError. Got %s and %s"
                    % (tr_error, tr_cverror))

        self.failUnlessEqual(clf.ca.confusion.percent_correct,
                             100,
                             msg="Dummy clf should train perfectly")
        self.failUnlessEqual(len(clf.ca.confusion.sets),
                             len(ds.UC),
                             msg="Should have 1 confusion per each split")
        self.failUnlessEqual(len(clf.clfs), len(ds.UC),
                             msg="Should have number of classifiers equal # of epochs")
        self.failUnlessEqual(clf.predict(ds.samples), list(ds.targets),
                             msg="Should classify correctly")

        # feature_ids must be list of lists, and since it is not
        # feature-selecting classifier used - we expect all features
        # to be utilized
        #  NOT ANYMORE -- for BoostedClassifier we have now union of all
        #  used features across slave classifiers. That makes
        #  semantics clear. If you need to get deeper -- use upcoming
        #  harvesting facility ;-)
        # self.failUnlessEqual(len(clf.feature_ids), len(ds.uniquechunks))
        # self.failUnless(np.array([len(ids)==ds.nfeatures
        #                         for ids in clf.feature_ids]).all())

        # Just check if we get it at all ;-)
        summary = clf.summary()
Example #13
def test_function_ptrs():
    ds = load_example_fmri_dataset()
    # add a mapper with a function ptr inside
    ds = ds.get_mapped(mean_sample())
    f = tempfile.NamedTemporaryFile()
    h5save(f.name, ds)
    ds_loaded = h5load(f.name)
    fresh = load_example_fmri_dataset().O
    # check that the reconstruction function pointer in the FxMapper points
    # to the right one
    assert_array_equal(ds_loaded.a.mapper.forward(fresh),
                        ds.samples)
Example #14
    def test_classifier_generalization(self, clf):
        """Simple test if classifiers can generalize ok on simple data
        """
        te = CrossValidatedTransferError(TransferError(clf), NFoldSplitter(),
                                         postproc=mean_sample())
        nclasses = 2 * (1 + int('multiclass' in clf.__tags__))

        ds = datasets['uni%dmedium' % nclasses]
        try:
            cve = te(ds).samples.squeeze()
        except Exception, e:
            self.fail("Failed with %s" % e)
Example #15
def test_function_ptrs():
    if not externals.exists('nifti') and not externals.exists('nibabel'):
        raise SkipTest
    ds = load_example_fmri_dataset()
    # add a mapper with a function ptr inside
    ds = ds.get_mapped(mean_sample())
    f = tempfile.NamedTemporaryFile()
    h5save(f.name, ds)
    ds_loaded = h5load(f.name)
    fresh = load_example_fmri_dataset().O
    # check that the reconstruction function pointer in the FxMapper points
    # to the right one
    assert_array_equal(ds_loaded.a.mapper.forward(fresh), ds.samples)
Example #16
    def test_classifier_generalization(self, clf):
        """Simple test if classifiers can generalize ok on simple data
        """
        te = CrossValidation(clf, NFoldPartitioner(), postproc=mean_sample())
        # check the default
        #self.failUnless(te.transerror.errorfx is mean_mismatch_error)

        nclasses = 2 * (1 + int('multiclass' in clf.__tags__))

        ds = datasets['uni%d%s' % (nclasses, self._get_clf_ds(clf))]
        try:
            cve = te(ds).samples.squeeze()
        except Exception, e:
            self.fail("Failed with %s" % e)
Example #17
    def test_classifier_generalization(self, clf):
        """Simple test if classifiers can generalize ok on simple data
        """
        te = CrossValidatedTransferError(TransferError(clf),
                                         NFoldSplitter(),
                                         postproc=mean_sample())
        # check the default
        self.failUnless(isinstance(te.transerror.errorfx, MeanMismatchErrorFx))

        nclasses = 2 * (1 + int('multiclass' in clf.__tags__))

        ds = datasets['uni%dmedium' % nclasses]
        try:
            cve = te(ds).samples.squeeze()
        except Exception, e:
            self.fail("Failed with %s" % e)
Example #18
    def test_null_dist_prob(self, l_clf):
        train = datasets['uni2medium']

        num_perm = 10
        permutator = AttributePermutator('targets', count=num_perm)
        # define class to estimate NULL distribution of errors
        # use left tail of the distribution since we use MeanMatchFx as error
        # function and lower is better
        terr = TransferMeasure(
            l_clf,
            Splitter(None, count=2),
            postproc=BinaryFxNode(mean_mismatch_error, 'targets'),
            null_dist=MCNullDist(permutator,
                                 tail='left'))

        # check reasonable error range
        err = terr(train)
        self.failUnless(np.mean(err) < 0.4)

        # Lets do the same for CVTE
        cvte = CrossValidation(l_clf, OddEvenPartitioner(),
            null_dist=MCNullDist(permutator,
                                 tail='left',
                                 enable_ca=['dist_samples']),
            postproc=mean_sample())
        cv_err = cvte(train)

        # check that the result is highly significant since we know that the
        # data has signal
        null_prob = terr.ca.null_prob
        if cfg.getboolean('tests', 'labile', default='yes'):
            self.failUnless(null_prob <= 0.1,
                msg="Failed to check that the result is highly significant "
                    "(got %f) since we know that the data has signal"
                    % null_prob)

            self.failUnless(cvte.ca.null_prob <= 0.1,
                msg="Failed to check that the result is highly significant "
                    "(got p(cvte)=%f) since we know that the data has signal"
                    % cvte.ca.null_prob)

            # and we should be able to access the actual samples of the distribution
            self.failUnlessEqual(len(cvte.null_dist.ca.dist_samples),
                                 num_perm)
Example #19
    def _run_core(self):
        """
        Core routine: run a cross-validated searchlight analysis on the
        input fMRI dataset and save the resulting error map as a NIfTI
        overlay.
        """
        attr = SampleAttributes(self.inputs.attributes_file)

        dataset = fmri_dataset(
            samples=self.inputs.samples_file,
            labels=attr.labels,
            chunks=attr.chunks,
            mask=self.inputs.mask_file)

        if 'rest' in dataset.uniquelabels:
            dataset = dataset[dataset.sa.labels != 'rest']

        # zscore dataset relative to baseline ('rest') mean
        zscore(dataset, chunks_attr=True, dtype='float32')

        # choose classifier
        clf = LinearCSVMC()

        # setup measure to be computed by Searchlight
        # cross-validated mean transfer using an N-fold dataset splitter
        cv = CrossValidatedTransferError(TransferError(clf),
                                         NFoldSplitter())


        sl = sphere_searchlight(cv, radius=self.inputs.radius,
                                space='voxel_indices',
                                nproc=2, mapper=mean_sample())

        ds = dataset.copy(deep=False,
                      sa=['labels', 'chunks'], fa=['voxel_indices'], a=[])

        sl_map = sl(ds)
        # map sensitivity map into original dataspace
        orig_sl_map = dataset.map2nifti(sl_map)

        orig_sl_map.save(self._get_output_filename())
Example #20
    def test_ifs(self, svm):

        # measure for feature selection criterion and performance assesment
        # use the SAME clf!
        errorfx = mean_mismatch_error
        fmeasure = CrossValidation(svm, NFoldPartitioner(), postproc=mean_sample())
        pmeasure = ProxyMeasure(svm, postproc=BinaryFxNode(errorfx, 'targets'))

        ifs = IFS(fmeasure,
                  pmeasure,
                  Splitter('purpose', attr_values=['train', 'test']),
                  # go for lower tail selection as data_measure will
                  # return errors -> low is good
                  fselector=FixedNElementTailSelector(1, tail='lower', mode='select'))
        wdata = self.get_data()
        wdata.sa['purpose'] = np.repeat('train', len(wdata))
        tdata = self.get_data()
        tdata.sa['purpose'] = np.repeat('test', len(tdata))
        ds = vstack((wdata, tdata))
        orig_nfeatures = ds.nfeatures

        ifs.train(ds)
        resds = ifs(ds)

        # fail if orig datasets are changed
        self.failUnless(ds.nfeatures == orig_nfeatures)

        # check that the features set with the least error is selected
        self.failUnless(len(ifs.ca.errors))
        e = np.array(ifs.ca.errors)
        self.failUnless(resds.nfeatures == e.argmin() + 1)


        # repeat with dataset where selection order is known
        wsignal = datasets['dumb2'].copy()
        wsignal.sa['purpose'] = np.repeat('train', len(wsignal))
        tsignal = datasets['dumb2'].copy()
        tsignal.sa['purpose'] = np.repeat('test', len(tsignal))
        signal = vstack((wsignal, tsignal))
        ifs.train(signal)
        resds = ifs(signal)
        self.failUnless((resds.samples[:,0] == signal.samples[:,0]).all())
Example #21
    def test_null_dist_prob(self, l_clf):
        train = datasets['uni2medium']

        num_perm = 10
        # define class to estimate NULL distribution of errors
        # use left tail of the distribution since we use MeanMatchFx as error
        # function and lower is better
        terr = TransferError(clf=l_clf,
                             null_dist=MCNullDist(permutations=num_perm,
                                                  tail='left'))

        # check reasonable error range
        err = terr(train, train)
        self.failUnless(err < 0.4)

        # Lets do the same for CVTE
        cvte = CrossValidatedTransferError(TransferError(clf=l_clf),
                                           OddEvenSplitter(),
                                           null_dist=MCNullDist(
                                               permutations=num_perm,
                                               tail='left',
                                               enable_ca=['dist_samples']),
                                           postproc=mean_sample())
        cv_err = cvte(train)

        # check that the result is highly significant since we know that the
        # data has signal
        null_prob = terr.ca.null_prob
        if cfg.getboolean('tests', 'labile', default='yes'):
            self.failUnless(
                null_prob <= 0.1,
                msg="Failed to check that the result is highly significant "
                "(got %f) since we know that the data has signal" % null_prob)

            self.failUnless(
                cvte.ca.null_prob <= 0.1,
                msg="Failed to check that the result is highly significant "
                "(got p(cvte)=%f) since we know that the data has signal" %
                cvte.ca.null_prob)

            # and we should be able to access the actual samples of the distribution
            self.failUnlessEqual(len(cvte.null_dist.ca.dist_samples), num_perm)
Example #22
    def test_ifs(self, svm):

        # data measure and transfer error quantifier use the SAME clf!
        trans_error = TransferError(svm)
        data_measure = CrossValidatedTransferError(trans_error,
                                                   NFoldSplitter(1),
                                                   postproc=mean_sample())

        ifs = IFS(data_measure,
                  trans_error,
                  # go for lower tail selection as data_measure will
                  # return errors -> low is good
                  feature_selector=FixedNElementTailSelector(1, tail='lower', mode='select'))
        wdata = self.get_data()
        wdata_nfeatures = wdata.nfeatures
        tdata = self.get_data()
        tdata_nfeatures = tdata.nfeatures

        sdata, stdata = ifs(wdata, tdata)

        # fail if orig datasets are changed
        self.failUnless(wdata.nfeatures == wdata_nfeatures)
        self.failUnless(tdata.nfeatures == tdata_nfeatures)

        # check that the features set with the least error is selected
        self.failUnless(len(ifs.ca.errors))
        e = np.array(ifs.ca.errors)
        self.failUnless(sdata.nfeatures == e.argmin() + 1)

        # repeat with dataset where selection order is known
        signal = datasets['dumb2']
        sdata, stdata = ifs(signal, signal)
        self.failUnless((sdata.samples[:, 0] == signal.samples[:, 0]).all())
Example #23
    def test_ifs(self, svm):

        # data measure and transfer error quantifier use the SAME clf!
        trans_error = TransferError(svm)
        data_measure = CrossValidatedTransferError(trans_error,
                                                   NFoldSplitter(1),
                                                   postproc=mean_sample())

        ifs = IFS(data_measure,
                  trans_error,
                  # go for lower tail selection as data_measure will
                  # return errors -> low is good
                  feature_selector=FixedNElementTailSelector(1, tail='lower', mode='select'))
        wdata = self.get_data()
        wdata_nfeatures = wdata.nfeatures
        tdata = self.get_data()
        tdata_nfeatures = tdata.nfeatures

        sdata, stdata = ifs(wdata, tdata)

        # fail if orig datasets are changed
        self.failUnless(wdata.nfeatures == wdata_nfeatures)
        self.failUnless(tdata.nfeatures == tdata_nfeatures)

        # check that the features set with the least error is selected
        self.failUnless(len(ifs.ca.errors))
        e = np.array(ifs.ca.errors)
        self.failUnless(sdata.nfeatures == e.argmin() + 1)


        # repeat with dataset where selection order is known
        signal = datasets['dumb2']
        sdata, stdata = ifs(signal, signal)
        self.failUnless((sdata.samples[:,0] == signal.samples[:,0]).all())
Example #24
    def test_regressions(self, regr):
        """Simple tests on regressions
        """
        ds = datasets['chirp_linear']
        # we want numeric labels to maintain the previous behavior, especially
        # since we deal with regressions here
        ds.sa.targets = AttributeMap().to_numeric(ds.targets)

        cve = CrossValidatedTransferError(
            TransferError(regr),
            splitter=NFoldSplitter(),
            postproc=mean_sample(),
            enable_ca=['training_confusion', 'confusion'])
        # check the default
        self.failUnless(isinstance(cve.transerror.errorfx,
                                   CorrErrorFx))
        corr = np.asscalar(cve(ds).samples)

        # Our CorrErrorFx should never return NaN
        self.failUnless(not np.isnan(corr))
        self.failUnless(corr == cve.ca.confusion.stats['CCe'])

        splitregr = SplitClassifier(
            regr, splitter=OddEvenSplitter(),
            enable_ca=['training_confusion', 'confusion'])
        splitregr.train(ds)
        split_corr = splitregr.ca.confusion.stats['CCe']
        split_corr_tr = splitregr.ca.training_confusion.stats['CCe']

        for confusion, error in (
            (cve.ca.confusion, corr),
            (splitregr.ca.confusion, split_corr),
            (splitregr.ca.training_confusion, split_corr_tr),
            ):
            #TODO: test confusion statistics
            # Part of it for now -- CCe
            for conf in confusion.summaries:
                stats = conf.stats
                if cfg.getboolean('tests', 'labile', default='yes'):
                    self.failUnless(stats['CCe'] < 0.5)
                self.failUnlessEqual(stats['CCe'], stats['Summary CCe'])

            s0 = confusion.as_string(short=True)
            s1 = confusion.as_string(short=False)

            for s in [s0, s1]:
                self.failUnless(len(s) > 10,
                                msg="We should get some string representation "
                                "of regression summary. Got %s" % s)
            if cfg.getboolean('tests', 'labile', default='yes'):
                self.failUnless(error < 0.2,
                            msg="Regressions should perform well on a simple "
                            "dataset. Got correlation error of %s " % error)

            # Test access to summary statistics
            # YOH: lets start making testing more reliable.
            #      p-value for such accident to have is verrrry tiny,
            #      so if regression works -- it better has at least 0.5 ;)
            #      otherwise fix it! ;)
            # YOH: not now -- issues with libsvr in SG and linear kernel
            if cfg.getboolean('tests', 'labile', default='yes'):
                self.failUnless(confusion.stats['CCe'] < 0.5)

        # just to check if it works fine
        split_predictions = splitregr.predict(ds.samples)
Example #25
from mvpa.algorithms.cvtranserror import CrossValidatedTransferError
from mvpa.measures.searchlight import sphere_searchlight
from mvpa.testing.datasets import datasets
from mvpa.mappers.fx import mean_sample
"""For the sake of simplicity, let's use a small artificial dataset."""

# Lets just use our tiny 4D dataset from testing battery
dataset = datasets['3dlarge']
"""Now it only takes three lines for a searchlight analysis."""

# setup measure to be computed in each sphere (cross-validated
# generalization error on odd/even splits)
cv = CrossValidatedTransferError(TransferError(LinearCSVMC()),
                                 OddEvenSplitter())

# setup searchlight with 2 voxels radius and measure configured above
sl = sphere_searchlight(cv, radius=2, space='myspace', postproc=mean_sample())

# run searchlight on dataset
sl_map = sl(dataset)

print 'Best performing sphere error:', np.min(sl_map.samples)
"""
If this analysis is done on an fMRI dataset using `NiftiDataset`, the resulting
searchlight map (`sl_map`) can be mapped back into the original dataspace
and viewed as a brain overlay. :ref:`Another example <example_searchlight>`
shows a typical application of this algorithm.

.. Mention the fact that it also is a special `SensitivityAnalyzer`
"""
Example #26
    def test_rfe(self, clf):

        # sensitivity analyser and transfer error quantifier use the SAME clf!
        sens_ana = clf.get_sensitivity_analyzer(postproc=maxofabs_sample())
        pmeasure = ProxyMeasure(clf, postproc=BinaryFxNode(mean_mismatch_error,
                                                           'targets'))
        cvmeasure = CrossValidation(clf, NFoldPartitioner(),
                                    errorfx=mean_mismatch_error,
                                    postproc=mean_sample())

        rfesvm_split = SplitClassifier(clf, OddEvenPartitioner())

        # explore few recipes
        for rfe, data in [
            # because the clf is already trained when computing the sensitivity
            # map, prevent retraining for transfer error calculation
            # Use absolute of the svm weights as sensitivity
            (RFE(sens_ana,
                pmeasure,
                Splitter('train'),
                fselector=FixedNElementTailSelector(1),
                train_pmeasure=False),
             self.get_data()),
            # use cross-validation within training to get error for the stopping point
            # but use full training data to derive sensitivity
            (RFE(sens_ana,
                 cvmeasure,
                 Repeater(2),            # give the same full dataset to sens_ana and cvmeasure
                 fselector=FractionTailSelector(
                     0.70,
                     mode='select', tail='upper'),
                train_pmeasure=True),
             normal_feature_dataset(perlabel=20, nchunks=5, nfeatures=200,
                                    nonbogus_features=[0, 1], snr=1.5)),
            # use cross-validation (via SplitClassifier) and get mean
            # of normed sensitivities across those splits
            (RFE(rfesvm_split.get_sensitivity_analyzer(
                    postproc=ChainMapper([ FxMapper('features', l2_normed),
                                           FxMapper('samples', np.mean),
                                           FxMapper('samples', np.abs)])),
                 ConfusionBasedError(rfesvm_split, confusion_state='stats'),
                 Repeater(2),             #  we will use the same full cv-training dataset
                 fselector=FractionTailSelector(
                     0.50,
                     mode='select', tail='upper'),
                 stopping_criterion=NBackHistoryStopCrit(BestDetector(), 10),
                 train_pmeasure=False,    # we just extract it from existing confusion
                 update_sensitivity=True),
             normal_feature_dataset(perlabel=28, nchunks=7, nfeatures=200,
                                    nonbogus_features=[0, 1], snr=1.5))
            ]:
            # prep data
            # data = datasets['uni2medium']
            data_nfeatures = data.nfeatures

            rfe.train(data)
            resds = rfe(data)

            # fail if orig datasets are changed
            self.failUnless(data.nfeatures == data_nfeatures)

            # check that the features set with the least error is selected
            if len(rfe.ca.errors):
                e = np.array(rfe.ca.errors)
                if isinstance(rfe._fselector, FixedNElementTailSelector):
                    self.failUnless(resds.nfeatures == data_nfeatures - e.argmin())
                else:
                    # in this case we can even check if we had actual
                    # going down/up trend... although -- why up???
                    imin = np.argmin(e)
                    self.failUnless( 1 < imin < len(e) - 1 )
            else:
                self.failUnless(resds.nfeatures == data_nfeatures)

            # silly check if nfeatures is in decreasing order
            nfeatures = np.array(rfe.ca.nfeatures).copy()
            nfeatures.sort()
            self.failUnless( (nfeatures[::-1] == rfe.ca.nfeatures).all() )

            # check if history has elements for every step
            self.failUnless(set(rfe.ca.history)
                            == set(range(len(np.array(rfe.ca.errors)))))

            # Last (the largest number) can be present multiple times even
            # if we remove 1 feature at a time -- just need to stop well
            # in advance when we have more than 1 feature left ;)
            self.failUnless(rfe.ca.nfeatures[-1]
                            == len(np.where(rfe.ca.history
                                           ==max(rfe.ca.history))[0]))
Example #27
    def test_regressions(self, regr):
        """Simple tests on regressions
        """
        ds = datasets['chirp_linear']
        # we want numeric labels to maintain the previous behavior, especially
        # since we deal with regressions here
        ds.sa.targets = AttributeMap().to_numeric(ds.targets)

        cve = CrossValidatedTransferError(
            TransferError(regr),
            splitter=NFoldSplitter(),
            postproc=mean_sample(),
            enable_ca=['training_confusion', 'confusion'])
        # check the default
        self.failUnless(isinstance(cve.transerror.errorfx, CorrErrorFx))
        corr = np.asscalar(cve(ds).samples)

        # Our CorrErrorFx should never return NaN
        self.failUnless(not np.isnan(corr))
        self.failUnless(corr == cve.ca.confusion.stats['CCe'])

        splitregr = SplitClassifier(
            regr,
            splitter=OddEvenSplitter(),
            enable_ca=['training_confusion', 'confusion'])
        splitregr.train(ds)
        split_corr = splitregr.ca.confusion.stats['CCe']
        split_corr_tr = splitregr.ca.training_confusion.stats['CCe']

        for confusion, error in (
            (cve.ca.confusion, corr),
            (splitregr.ca.confusion, split_corr),
            (splitregr.ca.training_confusion, split_corr_tr),
        ):
            #TODO: test confusion statistics
            # Part of it for now -- CCe
            for conf in confusion.summaries:
                stats = conf.stats
                if cfg.getboolean('tests', 'labile', default='yes'):
                    self.failUnless(stats['CCe'] < 0.5)
                self.failUnlessEqual(stats['CCe'], stats['Summary CCe'])

            s0 = confusion.as_string(short=True)
            s1 = confusion.as_string(short=False)

            for s in [s0, s1]:
                self.failUnless(len(s) > 10,
                                msg="We should get some string representation "
                                "of regression summary. Got %s" % s)
            if cfg.getboolean('tests', 'labile', default='yes'):
                self.failUnless(
                    error < 0.2,
                    msg="Regressions should perform well on a simple "
                    "dataset. Got correlation error of %s " % error)

            # Test access to summary statistics
            # YOH: lets start making testing more reliable.
            #      p-value for such accident to have is verrrry tiny,
            #      so if regression works -- it better has at least 0.5 ;)
            #      otherwise fix it! ;)
            # YOH: not now -- issues with libsvr in SG and linear kernel
            if cfg.getboolean('tests', 'labile', default='yes'):
                self.failUnless(confusion.stats['CCe'] < 0.5)

        # just to check if it works fine
        split_predictions = splitregr.predict(ds.samples)
Example #28
    def test_tree_classifier(self):
        """Basic tests for TreeClassifier
        """
        ds = datasets['uni4medium']
        # make it simpler for the beast -- take only the informative features,
        # because classifiers for the tree are selected randomly, so
        # performance varies a lot and we just need to check on
        # correct operation
        ds = ds[:, ds.fa.nonbogus_targets != [None]]

        clfs = clfswh['binary']         # pool of classifiers
        # Lets permute so each time we try some different combination
        # of the classifiers but exclude those operating on %s of
        # features since we might not have enough for that
        clfs = [clfs[i] for i in np.random.permutation(len(clfs))
                if not '%' in str(clfs[i])]

        # Test conflicting definition
        tclf = TreeClassifier(clfs[0], {
            'L0+2' : (('L0', 'L2'), clfs[1]),
            'L2+3' : (('L2', 'L3'), clfs[2])})
        self.failUnlessRaises(ValueError, tclf.train, ds)
        """Should raise exception since label 2 is in both"""

        # Test insufficient definition
        tclf = TreeClassifier(clfs[0], {
            'L0+5' : (('L0', 'L5'), clfs[1]),
            'L2+3' : (('L2', 'L3'),       clfs[2])})
        self.failUnlessRaises(ValueError, tclf.train, ds)
        """Should raise exception since no group for L1"""

        # proper definition now
        tclf = TreeClassifier(clfs[0], {
            'L0+1' : (('L0', 'L1'), clfs[1]),
            'L2+3' : (('L2', 'L3'), clfs[2])})

        # Lets test train/test cycle using CVTE
        cv = CrossValidation(tclf, OddEvenPartitioner(), postproc=mean_sample(),
            enable_ca=['stats', 'training_stats'])
        cverror = cv(ds).samples.squeeze()
        try:
            rtclf = repr(tclf)
        except:
            self.fail(msg="Could not obtain repr for TreeClassifier")

        # Test accessibility of .clfs
        self.failUnless(tclf.clfs['L0+1'] is clfs[1])
        self.failUnless(tclf.clfs['L2+3'] is clfs[2])

        cvtrc = cv.ca.training_stats
        cvtc = cv.ca.stats

        if cfg.getboolean('tests', 'labile', default='yes'):
            # just a dummy check to make sure everything is working
            self.failUnless(cvtrc != cvtc)
            self.failUnless(cverror < 0.3,
                            msg="Got too high error = %s using %s"
                            % (cverror, tclf))

        # Test trailing nodes with no classifier

        # NB: It is necessary that the same classifier was not used at
        # different nodes, since it would be re-trained for a new set
        # of targets, thus leading to incorrect behavior/high error.
        # That is why we use separate pool of classifiers here
        clfs_mc = clfswh['multiclass']         # pool of classifiers
        clfs_mc = [clfs_mc[i] for i in np.random.permutation(len(clfs_mc))
                   if not '%' in str(clfs_mc[i])]

        tclf = TreeClassifier(clfs_mc[0], {
            'L0' : (('L0',), None),
            'L1+2+3' : (('L1', 'L2', 'L3'), clfs_mc[1])})

        cv = CrossValidation(tclf,
                             OddEvenPartitioner(),
                             postproc=mean_sample(),
                             enable_ca=['stats', 'training_stats'])
        cverror = np.asscalar(cv(ds))
        if cfg.getboolean('tests', 'labile', default='yes'):
            self.failUnless(cverror < 0.3,
                            msg="Got too high error = %s using %s"
                            % (cverror, tclf))
Example #29
from mvpa.mappers.fx import mean_sample

"""For the sake of simplicity, let's use a small artificial dataset."""

# Lets just use our tiny 4D dataset from testing battery
dataset = datasets['3dlarge']

"""Now it only takes three lines for a searchlight analysis."""

# setup measure to be computed in each sphere (cross-validated
# generalization error on odd/even splits)
cv = CrossValidation(LinearCSVMC(), OddEvenPartitioner())

# setup searchlight with 2 voxels radius and measure configured above
sl = sphere_searchlight(cv, radius=2, space='myspace',
                        postproc=mean_sample())

# run searchlight on dataset
sl_map = sl(dataset)

print 'Best performing sphere error:', np.min(sl_map.samples)

"""
If this analysis is done on an fMRI dataset using `NiftiDataset`, the resulting
searchlight map (`sl_map`) can be mapped back into the original dataspace
and viewed as a brain overlay. :ref:`Another example <example_searchlight>`
shows a typical application of this algorithm.

.. Mention the fact that it also is a special `SensitivityAnalyzer`
"""
Example #30
"""For the sake of simplicity, let's use a small artificial dataset."""

# Lets just use our tiny 4D dataset from testing battery
dataset = datasets['3dlarge']

"""Now it only takes three lines for a searchlight analysis."""

# setup measure to be computed in each sphere (cross-validated
# generalization error on odd/even splits)
cv = CrossValidatedTransferError(
         TransferError(LinearCSVMC()),
         OddEvenSplitter())

# setup searchlight with 2 voxels radius and measure configured above
sl = sphere_searchlight(cv, radius=2, space='myspace',
                        postproc=mean_sample())

# run searchlight on dataset
sl_map = sl(dataset)

print 'Best performing sphere error:', np.min(sl_map.samples)

"""
If this analysis is done on an fMRI dataset using `NiftiDataset`, the resulting
searchlight map (`sl_map`) can be mapped back into the original dataspace
and viewed as a brain overlay. :ref:`Another example <example_searchlight>`
shows a typical application of this algorithm.

.. Mention the fact that it also is a special `SensitivityAnalyzer`
"""