Example #1
    def test_custom_split(self):
        # simulate a half splitter
        hs = CustomPartitioner([(None, [0, 1, 2, 3, 4]),
                                (None, [5, 6, 7, 8, 9])])
        spl = Splitter(attr='partitions')
        splits = [list(spl.generate(p)) for p in hs.generate(self.data)]
        self.assertTrue(len(splits) == 2)

        for i, p in enumerate(splits):
            self.assertTrue(len(p) == 2)
            self.assertTrue(p[0].nsamples == 50)
            self.assertTrue(p[1].nsamples == 50)

        assert_array_equal(splits[0][1].sa['chunks'].unique, [0, 1, 2, 3, 4])
        assert_array_equal(splits[0][0].sa['chunks'].unique, [5, 6, 7, 8, 9])
        assert_array_equal(splits[1][1].sa['chunks'].unique, [5, 6, 7, 8, 9])
        assert_array_equal(splits[1][0].sa['chunks'].unique, [0, 1, 2, 3, 4])


        # check fully customized split with working and validation set specified
        cs = CustomPartitioner([([0, 3, 4], [5, 9])])
        # we want to discard the unselected partition of the data, hence
        # attr_values / ignore_values below;
        # these two splitters should do exactly the same thing
        splitters = (Splitter(attr='partitions', attr_values=[1,2]),
                     Splitter(attr='partitions', ignore_values=(0,)))
        for spl in splitters:
            splits = [list(spl.generate(p)) for p in cs.generate(self.data)]
            self.assertTrue(len(splits) == 1)

            for i, p in enumerate(splits):
                self.assertTrue(len(p) == 2)
                self.assertTrue(p[0].nsamples == 30)
                self.assertTrue(p[1].nsamples == 20)

            self.assertTrue((splits[0][1].sa['chunks'].unique == [5, 9]).all())
            self.assertTrue((splits[0][0].sa['chunks'].unique == [0, 3, 4]).all())
Example #2
    def test_slicing(self):
        hs = HalfPartitioner()
        spl = Splitter(attr='partitions')
        splits = list(hs.generate(self.data))
        for s in splits:
            # the partitioned dataset shares the data
            assert_true(s.samples.base is self.data.samples)
        splits = [list(spl.generate(p)) for p in hs.generate(self.data)]

        # with numpy 1.7.0b1 "chaining" was deprecated, so let's create a
        # check function appropriate for the given numpy version
        _a = np.arange(5)
        __a = _a[:4][:3]
        if __a.base is _a:
            # 1.7.0b1
            def is_the_same_base(x, base=self.data.samples):
                return x.base is base
        elif __a.base.base is _a:
            # prior 1.7.0b1
            def is_the_same_base(x, base=self.data.samples):
                return x.base.base is base
        else:
            raise RuntimeError("Unknown handling of .base by numpy")

        for s in splits:
            # we get slicing all the time
            assert_true(is_the_same_base(s[0].samples))
            assert_true(is_the_same_base(s[1].samples))
        spl = Splitter(attr='partitions', noslicing=True)
        splits = [list(spl.generate(p)) for p in hs.generate(self.data)]
        for s in splits:
            # we get no slicing at all
            assert_false(s[0].samples.base is self.data.samples)
            assert_false(s[1].samples.base is self.data.samples)
        nfs = NFoldPartitioner()
        spl = Splitter(attr='partitions')
        splits = [list(spl.generate(p)) for p in nfs.generate(self.data)]
        for i, s in enumerate(splits):
            # training part is a slice only for the first and last split
            if i == 0 or i == len(splits) - 1:
                assert_true(is_the_same_base(s[0].samples))
            else:
                assert_true(s[0].samples.base is None)
            # the testing part is always sliced
            assert_true(s[1].samples.base.base is self.data.samples)
        step_ds = Dataset(np.random.randn(20, 2),
                          sa={'chunks': np.tile([0, 1], 10)})
        oes = OddEvenPartitioner()
        spl = Splitter(attr='partitions')
        splits = list(oes.generate(step_ds))
        for s in splits:
            # the partitioned dataset shares the data
            assert_true(s.samples.base is step_ds.samples)
        splits = [list(spl.generate(p)) for p in oes.generate(step_ds)]
        assert_equal(len(splits), 2)
        for s in splits:
            # we get slicing all the time
            assert_true(s[0].samples.base.base is step_ds.samples)
            assert_true(s[1].samples.base.base is step_ds.samples)
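
Note: the version check in this example only probes how many .base levels numpy keeps when views are chained. A minimal standalone sketch of the same probe, using plain numpy with arbitrary values:

import numpy as np

a = np.arange(5)
b = a[:4][:3]  # a view of a view

if b.base is a:
    # numpy >= 1.7.0b1 collapses chained views; .base points straight
    # at the original array
    base_depth = 1
elif b.base is not None and b.base.base is a:
    # older numpy keeps one .base level per slicing step
    base_depth = 2
else:
    raise RuntimeError("Unexpected .base handling by this numpy version")

print("chained view reaches the original array after %d .base hop(s)"
      % base_depth)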
Example #3
    def test_clf_transfer_measure(self):
        # and now on a classifier
        clf = SMLR()
        enode = BinaryFxNode(mean_mismatch_error, 'targets')
        tm = TransferMeasure(clf, Splitter('chunks', count=2),
                             enable_ca=['stats'])
        res = tm(self.dataset)
        manual_error = np.mean(res.samples.squeeze() != res.sa.targets)
        postproc_error = enode(res)
        tm_err = TransferMeasure(clf, Splitter('chunks', count=2),
                                 postproc=enode)
        auto_error = tm_err(self.dataset)
        ok_(manual_error == postproc_error.samples[0, 0])
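
The manual error above is simply the proportion of mismatching predictions, which is what mean_mismatch_error computes. A tiny standalone illustration with made-up labels:

import numpy as np

predictions = np.array(['a', 'b', 'b', 'a'])
targets = np.array(['a', 'a', 'b', 'b'])
# two of the four predictions are wrong
manual_error = np.mean(predictions != targets)
assert manual_error == 0.5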
Example #4
def test_splitter():
    ds = give_data()
    # split with defaults
    spl1 = Splitter('chunks')
    assert_raises(NotImplementedError, spl1, ds)

    splits = list(spl1.generate(ds))
    assert_equal(len(splits), len(ds.sa['chunks'].unique))

    for split in splits:
        # it should have performed basic slicing!
        assert_true(split.samples.base is ds.samples)
        assert_equal(len(split.sa['chunks'].unique), 1)
        assert_true('lastsplit' in split.a)
    assert_true(splits[-1].a.lastsplit)

    # now again, more customized
    spl2 = Splitter('targets',
                    attr_values=[0, 1, 1, 2, 3, 3, 3],
                    count=4,
                    noslicing=True)
    splits = list(spl2.generate(ds))
    assert_equal(len(splits), 4)
    for split in splits:
        # it should NOT have performed basic slicing!
        assert_false(split.samples.base is ds.samples)
        assert_equal(len(split.sa['targets'].unique), 1)
        assert_equal(len(split.sa['chunks'].unique), 10)
    assert_true(splits[-1].a.lastsplit)

    # two should be identical
    assert_array_equal(splits[1].samples, splits[2].samples)

    # now go wild and split by feature attribute
    ds.fa['roi'] = np.repeat([0, 1], 5)
    # splitter should auto-detect that this is a feature attribute
    spl3 = Splitter('roi')
    splits = list(spl3.generate(ds))
    assert_equal(len(splits), 2)
    for split in splits:
        assert_true(split.samples.base is ds.samples)
        assert_equal(len(split.fa['roi'].unique), 1)
        assert_equal(split.shape, (100, 5))

    # and finally test chained splitters
    cspl = ChainNode([spl2, spl3, spl1])
    splits = list(cspl.generate(ds))
    # 4 target splits and 2 roi splits each and 10 chunks each
    assert_equal(len(splits), 80)
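
The core pattern exercised by test_splitter can be reduced to a few lines. A minimal sketch, assuming a PyMVPA 2.x environment; the Splitter import path matches Example #30, while importing Dataset from mvpa2.datasets is an assumption:

import numpy as np
from mvpa2.datasets import Dataset          # assumed import path
from mvpa2.generators.splitters import Splitter

# toy dataset: 20 samples, 2 features, 4 chunks of 5 samples each
ds = Dataset(np.random.randn(20, 2),
             sa={'chunks': np.repeat([0, 1, 2, 3], 5)})

# Splitter is a generator node: calling it directly raises
# NotImplementedError, so it has to be driven through .generate()
spl = Splitter('chunks')
splits = list(spl.generate(ds))

assert len(splits) == 4                     # one split per unique chunk value
assert all(len(s.sa['chunks'].unique) == 1 for s in splits)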
Example #5
    def test_odd_even_split(self):
        oes = OddEvenPartitioner()
        spl = Splitter(attr='partitions')

        splits = [list(spl.generate(p)) for p in oes.generate(self.data)]

        self.assertTrue(len(splits) == 2)

        for i, p in enumerate(splits):
            self.assertTrue(len(p) == 2)
            self.assertTrue(p[0].nsamples == 50)
            self.assertTrue(p[1].nsamples == 50)

        assert_array_equal(splits[0][1].sa['chunks'].unique, [1, 3, 5, 7, 9])
        assert_array_equal(splits[0][0].sa['chunks'].unique, [0, 2, 4, 6, 8])
        assert_array_equal(splits[1][0].sa['chunks'].unique, [1, 3, 5, 7, 9])
        assert_array_equal(splits[1][1].sa['chunks'].unique, [0, 2, 4, 6, 8])

        # check if it works on pure odd and even chunk ids
        moresplits = [
            list(spl.generate(p)) for p in oes.generate(splits[0][0])
        ]

        for split in moresplits:
            self.assertTrue(split[0] is not None)
            self.assertTrue(split[1] is not None)
Example #6
    def test_repeated_features(self):
        class CountFeatures(Measure):
            is_trained = True

            def _call(self, ds):
                return Dataset([ds.nfeatures],
                               fa={
                                   'nonbogus_targets':
                                   list(ds.fa['nonbogus_targets'].unique)
                               })

        cf = CountFeatures()
        spl = Splitter('fa.nonbogus_targets')
        nsplits = len(list(spl.generate(self.dataset)))
        assert_equal(nsplits, 3)
        rm = RepeatedMeasure(cf, spl, concat_as='features')
        res = rm(self.dataset)
        assert_equal(res.shape, (1, nsplits))
        # due to https://github.com/numpy/numpy/issues/641 we are
        # using a list(set(...)) construct, so the order of
        # nonbogus_targets.unique can vary from run to run; thus there
        # is no guarantee that we would get 18 first, which is a
        # questionable assumption anyway, so we perform checks
        # which do not require any specific order.
        # And yet, due to another issue,
        # https://github.com/numpy/numpy/issues/3759,
        # we can't simply compare against None to build the bool mask
        None_fa = np.array([x is None for x in res.fa.nonbogus_targets])
        assert_array_equal(res.samples[0, None_fa], [18])
        assert_array_equal(res.samples[0, ~None_fa], [1, 1])

        if sys.version_info[0] < 3:
            # with python2 order seems to be consistent
            assert_array_equal(res.samples[0], [18, 1, 1])
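
The explicit "x is None" list comprehension above sidesteps element-wise comparison quirks when an object array holds None next to real values. A minimal standalone illustration (the values are arbitrary):

import numpy as np

fa_values = np.array([None, 18, 3], dtype=object)
# build the boolean mask explicitly instead of relying on fa_values == None
none_mask = np.array([x is None for x in fa_values])
assert none_mask.tolist() == [True, False, False]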
Example #7
    def test_cv_no_generator_custom_splitter(self):
        ds = Dataset(np.arange(4),
                     sa={
                         'category': ['to', 'to', 'from', 'from'],
                         'targets': ['a', 'b', 'c', 'd']
                     })

        class Measure(Classifier):
            def _train(self, ds_):
                assert_array_equal(ds_.samples, ds.samples[2:])
                assert_array_equal(ds_.sa.category, ['from'] * len(ds_))

            def _predict(self, ds_):
                assert (ds_ is not ds)  # we pass a shallow copy
                # could be called to predict training or testing data
                if np.all(ds_.sa.targets != ['c', 'd']):
                    assert_array_equal(ds_.samples, ds.samples[:2])
                    assert_array_equal(ds_.sa.category, ['to'] * len(ds_))
                else:
                    assert_array_equal(ds_.sa.category, ['from'] * len(ds_))

                return ['c', 'd']

        measure = Measure()
        cv = CrossValidation(measure,
                             splitter=Splitter('category', ['from', 'to']))
        res = cv(ds)
        assert_array_equal(res, [[1]])  # failed perfectly ;-)
Example #8
    def test_confusion_based_error(self, l_clf):
        train = datasets['uni2medium']
        train = train[train.sa.train == 1]
        # to check if we fail to classify for 3 labels
        test3 = datasets['uni3medium']
        test3 = test3[test3.sa.train == 1]
        err = ConfusionBasedError(clf=l_clf)
        terr = TransferMeasure(l_clf,
                               Splitter('train', attr_values=[1, 1]),
                               postproc=BinaryFxNode(mean_mismatch_error,
                                                     'targets'))

        self.assertRaises(UnknownStateError, err, None)
        """Shouldn't be able to access the state yet"""

        l_clf.train(train)
        e, te = err(None), terr(train)
        te = np.asscalar(te)
        self.assertTrue(
            abs(e - te) < 1e-10,
            msg="ConfusionBasedError (%.2g) should be equal to TransferError "
            "(%.2g) on traindataset" % (e, te))

        # this will print nasty WARNING but it is ok -- it is just checking code
        # NB warnings are not printed while doing whole testing
        warning("Don't worry about the following warning.")
        if 'multiclass' in l_clf.__tags__:
            self.assertFalse(terr(test3) is None)

        # try copying the beast
        terr_copy = copy(terr)
Example #9
def test_exclude_targets_combinations():
    partitioner = ChainNode(
        [NFoldPartitioner(),
         ExcludeTargetsCombinationsPartitioner(
             k=2, targets_attr='targets', space='partitions')],
        space='partitions')
    from mvpa2.misc.data_generators import normal_feature_dataset
    ds = normal_feature_dataset(snr=0.,
                                nlabels=4,
                                perlabel=3,
                                nchunks=3,
                                nonbogus_features=[0, 1, 2, 3],
                                nfeatures=4)
    partitions = list(partitioner.generate(ds))
    assert_equal(len(partitions), 3 * 6)
    splitter = Splitter('partitions')
    combs = []
    comb_chunks = []
    for p in partitions:
        trds, teds = list(splitter.generate(p))[:2]
        comb = tuple(np.unique(teds.targets))
        combs.append(comb)
        comb_chunks.append(comb + tuple(np.unique(teds.chunks)))
    # just 6 possible combinations of 2 out of 4
    assert_equal(len(set(combs)), 6)
    assert_equal(len(set(comb_chunks)), 3 * 6)  # all unique
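
The expected count of 3 * 6 follows from the three NFoldPartitioner folds times the number of 2-element target combinations. A quick arithmetic check; the label names are only illustrative, assuming normal_feature_dataset with nlabels=4 produces four distinct targets:

from itertools import combinations

target_pairs = list(combinations(['L0', 'L1', 'L2', 'L3'], 2))
assert len(target_pairs) == 6        # C(4, 2) pairs that can be held out
assert 3 * len(target_pairs) == 18   # matches len(partitions) in the test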
Example #10
    def test_split_featurewise_dataset_measure(self):
        ds = datasets['uni3small']
        sana = RepeatedMeasure(
            SMLR(fit_all_weights=True).get_sensitivity_analyzer(),
            ChainNode(
                [NFoldPartitioner(),
                 Splitter('partitions', attr_values=[1])]))

        sens = sana(ds)
        # a sensitivity for each chunk and each label combination
        assert_equal(sens.shape, (len(ds.sa['chunks'].unique) *
                                  len(ds.sa['targets'].unique), ds.nfeatures))

        # Let's try a more complex example with 'boosting'
        ds = datasets['uni3medium']
        ds.init_origids('samples')
        sana = RepeatedMeasure(
            SMLR(fit_all_weights=True).get_sensitivity_analyzer(),
            Balancer(amount=0.25, count=2, apply_selection=True),
            enable_ca=['datasets', 'repetition_results'])
        sens = sana(ds)

        assert_equal(sens.shape,
                     (2 * len(ds.sa['targets'].unique), ds.nfeatures))
        splits = sana.ca.datasets
        self.assertEqual(len(splits), 2)
        self.assertTrue(
            np.all([s.nsamples == ds.nsamples // 4 for s in splits]))
        # should have used different samples
        self.assertTrue(np.any([splits[0].sa.origids != splits[1].sa.origids]))
        # and should have got different sensitivities
        self.assertTrue(np.any(sens[0] != sens[3]))
Example #11
    def _train(self, dataset):
        pmeasure = ProxyMeasure(
            self.lrn,
            postproc=BinaryFxNode(self.errorfx, self.lrn.space),
            skip_train=True  # do not train here since fmeasure will do it
        )

        # First we need to replicate our RFE construct but this time
        # with pmeasure for the classifier
        rfe = RFE(
            self.fmeasure,
            pmeasure,
            Splitter('partitions'),
            fselector=self.fselector,
            bestdetector=None,
            train_pmeasure=False,
            stopping_criterion=None,  # full "track"
            update_sensitivity=self.update_sensitivity,
            enable_ca=['errors', 'nfeatures'])

        errors, nfeatures = [], []

        if __debug__:
            debug("RFEC", "Stage 1: initial nested CV/RFE for %s", (dataset, ))

        for partition in self.partitioner.generate(dataset):
            rfe.train(partition)
            errors.append(rfe.ca.errors)
            nfeatures.append(rfe.ca.nfeatures)

        # mean errors across splits and find optimal number
        errors_mean = np.mean(errors, axis=0)
        nfeatures_mean = np.mean(nfeatures, axis=0)
        # we will take the "mean location" of the min to stay
        # within the most 'stable' choice

        mins_idx = np.where(errors_mean == np.min(errors_mean))[0]
        min_idx = mins_idx[int(len(mins_idx) / 2)]
        min_error = errors_mean[min_idx]
        assert (min_error == np.min(errors_mean))
        nfeatures_min = nfeatures_mean[min_idx]

        if __debug__:
            debug(
                "RFEC", "Choosing among %d choices to have %d features with "
                "mean error=%.2g (initial mean error %.2g)",
                (len(mins_idx), nfeatures_min, min_error, errors_mean[0]))

        self.nfeatures_min = nfeatures_min

        if __debug__:
            debug(
                "RFEC", "Stage 2: running RFE on full training dataset to "
                "distil best %d features" % nfeatures_min)

        super(SplitRFE, self)._train(dataset)
Example #12
    def test_label_splitter(self):
        oes = OddEvenPartitioner(attr='targets')
        spl = Splitter(attr='partitions')

        splits = [list(spl.generate(p)) for p in oes.generate(self.data)]

        assert_array_equal(splits[0][0].sa['targets'].unique, [0, 2])
        assert_array_equal(splits[0][1].sa['targets'].unique, [1, 3])
        assert_array_equal(splits[1][0].sa['targets'].unique, [1, 3])
        assert_array_equal(splits[1][1].sa['targets'].unique, [0, 2])
Example #13
    def test_splitter_gnbsearchlight(self):
        ds1 = datasets['3dsmall'].copy(deep=True)

        gnb_sl = GNBSearchlight(GNB(),
                                generator=CustomPartitioner([([0], [1])]),
                                qe=IndexQueryEngine(myspace=Sphere(2)),
                                splitter=Splitter(attr='partitions',
                                                  attr_values=[1, 2]),
                                errorfx=None)
        res = gnb_sl(ds1)
        assert_equal(res.nsamples, (ds1.chunks == 1).sum())
Example #14
    def test_transfer_measure(self):
        # come up with my own measure that only checks if training data
        # and test data are the same
        class MyMeasure(Measure):
            def _train(self, ds):
                self._tds = ds

            def _call(self, ds):
                return Dataset(ds.samples == self._tds.samples)

        tm = TransferMeasure(MyMeasure(), Splitter('chunks', count=2))
        # result should not be all True (== identical)
        assert_true((tm(self.dataset).samples == False).any())
Example #15
def get_partitioner(split_attr='group_split'):

    splitter = Splitter(attr='partitions', attr_values=(2, 3))

    if split_attr == 'group_split':

        splitrule = [
            # (leave, training, testing)
            (['3', '4'], ['1'], ['2']),
            (['3', '4'], ['2'], ['1']),
            (['1'], ['2'], ['3', '4']),
            (['2'], ['1'], ['3', '4']),
            (['1', '2'], ['3'], ['4']),
            (['1', '2'], ['4'], ['3']),
            (['3'], ['4'], ['1', '2']),
            (['4'], ['3'], ['1', '2']),
        ]
        partitioner = CustomPartitioner(splitrule=splitrule, attr=split_attr)

    elif split_attr == 'subject':

        partitioner = MemoryGroupSubjectPartitioner(group_attr='group_split',
                                                    subject_attr=split_attr,
                                                    attr=split_attr)

    elif split_attr == "group_mdm":
        partitioner = LeaveOneSubjectPerGroupPartitioner(
            group_attr='group', subject_attr="subject", attr="subject")

    elif split_attr == "subject_ofp":

        partitioner = NFoldPartitioner(attr="subject")
        splitter = Splitter(attr="partitions")

    elif split_attr == 'group':

        partitioner = NFoldPartitioner(attr=split_attr)
        splitter = Splitter(attr='partitions')

    return partitioner, splitter
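
A sketch of how the returned pair is typically consumed; the nested generate() loops mirror the pattern used throughout the examples above, and ds stands for a hypothetical dataset carrying the attributes the chosen partitioner expects (e.g. 'group'):

partitioner, splitter = get_partitioner('group')

for partition in partitioner.generate(ds):
    # the splitter turns each partitioned dataset into (training, testing)
    train_ds, test_ds = list(splitter.generate(partition))[:2]
    # ... fit on train_ds, evaluate on test_ds ...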
Example #16
    def test_james_problem_multiclass(self):
        percent = 80
        dataset = datasets['uni4large']
        #dataset = dataset[:, dataset.a.nonbogus_features]

        rfesvm_split = LinearCSVMC()
        fs = RFE(
            rfesvm_split.get_sensitivity_analyzer(
                postproc=ChainMapper([
                    # FxMapper('features', l2_normed),
                    # FxMapper('samples', np.mean),
                    # FxMapper('samples', np.abs)
                    FxMapper('features', lambda x: np.argsort(np.abs(x))),
                    # maxofabs_sample()
                    mean_sample()
                ])),
            ProxyMeasure(rfesvm_split,
                         postproc=BinaryFxNode(mean_mismatch_error,
                                               'targets')),
            Splitter('train'),
            fselector=FractionTailSelector(percent / 100.0,
                                           mode='select', tail='upper'),
            update_sensitivity=True)

        clf = FeatureSelectionClassifier(
            LinearCSVMC(),
            # on features selected via RFE
            fs)

        # update sensitivity at each step (since we're not using the
        # same CLF as sensitivity analyzer)

        class StoreResults(object):
            def __init__(self):
                self.storage = []

            def __call__(self, data, node, result):
                self.storage.append((node.measure.mapper.ca.history,
                                     node.measure.mapper.ca.errors))

        cv_storage = StoreResults()
        cv = CrossValidation(clf,
                             NFoldPartitioner(),
                             postproc=mean_sample(),
                             callback=cv_storage,
                             enable_ca=['stats'])
        #cv = SplitClassifier(clf)
        try:
            error = cv(dataset).samples.squeeze()
        except Exception as e:
            self.fail('CrossValidation cannot handle classifier with RFE '
                      'feature selection. Got exception: %s' % (e, ))
Example #17
    def test_simplest_cv_pat_gen(self):
        # create the generator
        nfs = NFoldPartitioner(cvtype=1)
        spl = Splitter(attr='partitions')
        # now get the xval pattern sets (One-Fold CV)
        xvpat = [list(spl.generate(p)) for p in nfs.generate(self.data)]

        self.assertTrue(len(xvpat) == 10)

        for i, p in enumerate(xvpat):
            self.assertTrue(len(p) == 2)
            self.assertTrue(p[0].nsamples == 90)
            self.assertTrue(p[1].nsamples == 10)
            self.assertTrue(p[1].chunks[0] == i)
Example #18
    def test_pseudo_cv_measure(self):
        clf = SMLR()
        enode = BinaryFxNode(mean_mismatch_error, 'targets')
        tm = TransferMeasure(clf, Splitter('partitions'), postproc=enode)
        cvgen = NFoldPartitioner()
        rm = RepeatedMeasure(tm, cvgen)
        res = rm(self.dataset)
        # one error per fold
        assert_equal(res.shape, (len(self.dataset.sa['chunks'].unique), 1))

        # we can do the same with CrossValidation
        cv = CrossValidation(clf, cvgen, enable_ca=['stats', 'training_stats',
                                                    'datasets'])
        res = cv(self.dataset)
        assert_equal(res.shape, (len(self.dataset.sa['chunks'].unique), 1))
Example #19
    def test_james_problem(self):
        percent = 80
        dataset = datasets['uni2small']
        rfesvm_split = LinearCSVMC()
        fs = \
            RFE(rfesvm_split.get_sensitivity_analyzer(),
                ProxyMeasure(rfesvm_split,
                             postproc=BinaryFxNode(mean_mismatch_error,
                                                   'targets')),
                Splitter('train'),
                fselector=FractionTailSelector(
                    percent / 100.0,
                    mode='select', tail='upper'), update_sensitivity=True)

        clf = FeatureSelectionClassifier(
            LinearCSVMC(),
            # on features selected via RFE
            fs)

        # update sensitivity at each step (since we're not using the
        # same CLF as sensitivity analyzer)

        class StoreResults(object):
            def __init__(self):
                self.storage = []

            def __call__(self, data, node, result):
                self.storage.append((node.measure.mapper.ca.history,
                                     node.measure.mapper.ca.errors))

        cv_storage = StoreResults()
        cv = CrossValidation(clf,
                             NFoldPartitioner(),
                             postproc=mean_sample(),
                             callback=cv_storage,
                             enable_ca=['confusion'])  # TODO -- it is stats
        #cv = SplitClassifier(clf)
        try:
            error = cv(dataset).samples.squeeze()
        except Exception as e:
            self.fail('CrossValidation cannot handle classifier with RFE '
                      'feature selection. Got exception: %s' % (e, ))

        assert (len(cv_storage.storage) == len(dataset.sa['chunks'].unique))
        assert (len(cv_storage.storage[0]) == 2)
        assert (len(cv_storage.storage[0][0]) == dataset.nfeatures)

        self.assertTrue(error < 0.2)
Example #20
    def test_single_class(self, clf):
        """Test if binary and multiclass can handle single class training/testing
        """
        ds = datasets['uni2small']
        ds = ds[ds.sa.targets == 'L0']  #  only 1 label
        assert(ds.sa['targets'].unique == ['L0'])

        ds_ = list(OddEvenPartitioner().generate(ds))[0]
        # Here is our "nice" PyMVPA-0.6 substitute for TransferError:
        trerr = TransferMeasure(clf, Splitter('train'),
                                postproc=BinaryFxNode(mean_mismatch_error,
                                                      'targets'))
        try:
            err = np.asscalar(trerr(ds_))
        except Exception as e:
            self.fail(str(e))
Example #21
    def test_repeated_features(self):
        print(self.dataset)
        print(self.dataset.fa.nonbogus_targets)

        class CountFeatures(Measure):
            is_trained = True

            def _call(self, ds):
                return ds.nfeatures

        cf = CountFeatures()
        spl = Splitter('fa.nonbogus_targets')
        nsplits = len(list(spl.generate(self.dataset)))
        assert_equal(nsplits, 3)
        rm = RepeatedMeasure(cf, spl, concat_as='features')
        res = rm(self.dataset)
        assert_equal(res.shape, (1, nsplits))
        assert_array_equal(res.samples[0], [18, 1, 1])
Example #22
    def test_ifs(self, svm):

        # measure for feature selection criterion and performance assessment
        # use the SAME clf!
        errorfx = mean_mismatch_error
        fmeasure = CrossValidation(svm,
                                   NFoldPartitioner(),
                                   postproc=mean_sample())
        pmeasure = ProxyMeasure(svm, postproc=BinaryFxNode(errorfx, 'targets'))

        ifs = IFS(fmeasure,
                  pmeasure,
                  Splitter('purpose', attr_values=['train', 'test']),
                  # go for lower tail selection as data_measure will return
                  # errors -> low is good
                  fselector=FixedNElementTailSelector(1, tail='lower',
                                                      mode='select'),
                  )
        wdata = self.get_data()
        wdata.sa['purpose'] = np.repeat('train', len(wdata))
        tdata = self.get_data()
        tdata.sa['purpose'] = np.repeat('test', len(tdata))
        ds = vstack((wdata, tdata))
        orig_nfeatures = ds.nfeatures

        ifs.train(ds)
        resds = ifs(ds)

        # fail if orig datasets are changed
        self.assertTrue(ds.nfeatures == orig_nfeatures)

        # check that the feature set with the least error is selected
        self.assertTrue(len(ifs.ca.errors))
        e = np.array(ifs.ca.errors)
        self.assertTrue(resds.nfeatures == e.argmin() + 1)

        # repeat with dataset where selection order is known
        wsignal = datasets['dumb2'].copy()
        wsignal.sa['purpose'] = np.repeat('train', len(wsignal))
        tsignal = datasets['dumb2'].copy()
        tsignal.sa['purpose'] = np.repeat('test', len(tsignal))
        signal = vstack((wsignal, tsignal))
        ifs.train(signal)
        resds = ifs(signal)
        self.assertTrue((resds.samples[:, 0] == signal.samples[:, 0]).all())
Example #23
    def test_gnb(self):
        gnb = GNB()
        gnb_nc = GNB(common_variance=False)
        gnb_n = GNB(normalize=True)
        gnb_n_nc = GNB(normalize=True, common_variance=False)
        gnb_lin = GNB(common_variance=True)

        ds = datasets['uni2medium']

        # Generic silly coverage just to assure that it works in all
        # possible scenarios:
        bools = (True, False)
        # There should be a better way... heh
        for cv in bools:  # common_variance?
            for prior in ('uniform', 'laplacian_smoothing', 'ratio'):
                tp = None  # predictions -- all of the above should
                # result in the same predictions
                for n in bools:  # normalized?
                    for ls in bools:  # logspace?
                        for es in ((), ('estimates',)):
                            gnb_ = GNB(common_variance=cv,
                                       prior=prior,
                                       normalize=n,
                                       logprob=ls,
                                       enable_ca=es)
                            tm = TransferMeasure(gnb_, Splitter('train'))
                            predictions = tm(ds).samples[:, 0]
                            if tp is None:
                                tp = predictions
                            assert_array_equal(predictions, tp)
                            # if normalized -- check that estimates sum to 1
                            if n and 'estimates' in es:
                                v = gnb_.ca.estimates
                                if ls:  # in log space -- take exp ;)
                                    v = np.exp(v)
                                d1 = np.sum(v, axis=1) - 1.0
                                self.assertTrue(np.max(np.abs(d1)) < 1e-5)
                            # smoke test: requesting a sensitivity analyzer
                            # should work when the GNB classifier is linear
                            # (common variance) and raise NotImplementedError
                            # when it is not
                            if cv:
                                assert 'has_sensitivity' in gnb_.__tags__
                                gnb_.get_sensitivity_analyzer()
                            if not cv:
                                with self.assertRaises(NotImplementedError):
                                    gnb_.get_sensitivity_analyzer()
Example #24
    def test_counted_splitting(self):
        spl = Splitter(attr='partitions')
        # when count > #chunks, we should still get only nchunks (10) splits
        nchunks = len(self.data.sa['chunks'].unique)
        for strategy in Partitioner._STRATEGIES:
            for count, target in [(nchunks * 2, nchunks), (nchunks, nchunks),
                                  (nchunks - 1, nchunks - 1), (3, 3), (0, 0),
                                  (1, 1)]:
                nfs = NFoldPartitioner(cvtype=1,
                                       count=count,
                                       selection_strategy=strategy)
                splits = [
                    list(spl.generate(p)) for p in nfs.generate(self.data)
                ]
                self.assertTrue(len(splits) == target)
                chosenchunks = [int(s[1].uniquechunks) for s in splits]

                # Test if configuration matches as well
                nsplits_cfg = len(nfs.get_partition_specs(self.data))
                self.assertEqual(nsplits_cfg, target)

                # Check if "lastsplit" dsattr was assigned appropriately
                nsplits = len(splits)
                if nsplits > 0:
                    # dummy-proof testing of last split
                    for ds_ in splits[-1]:
                        self.assertTrue(ds_.a.lastpartitionset)
                    # test all now
                    for isplit, split in enumerate(splits):
                        for ds_ in split:
                            self.assertEqual(ds_.a.lastpartitionset,
                                             isplit == nsplits - 1)

                # Check results of different strategies
                if strategy == 'first':
                    self.assertEqual(chosenchunks, list(range(target)))
                elif strategy == 'equidistant':
                    if target == 3:
                        self.assertEqual(chosenchunks, [0, 3, 7])
                elif strategy == 'random':
                    # none is selected twice
                    self.assertTrue(
                        len(set(chosenchunks)) == len(chosenchunks))
                    self.assertTrue(target == len(chosenchunks))
                else:
                    raise RuntimeError("Add unittest for strategy %s"
                                       % strategy)
Example #25
    def test_svms(self, clf):
        knows_probabilities = \
            'probabilities' in clf.ca.keys() and clf.params.probability
        enable_ca = ['estimates']
        if knows_probabilities:
            enable_ca += ['probabilities']

        clf.ca.change_temporarily(enable_ca=enable_ca)
        spl = Splitter('train', count=2)
        traindata, testdata = list(spl.generate(datasets['uni2small']))
        clf.train(traindata)
        predicts = clf.predict(testdata.samples)
        # values should be different from predictions for SVMs we have
        self.assertTrue(np.any(predicts != clf.ca.estimates))

        if knows_probabilities and clf.ca.is_set('probabilities'):
            # XXX test more thoroughly what we are getting here ;-)
            self.assertEqual(len(clf.ca.probabilities), len(testdata.samples))
        clf.ca.reset_changed_temporarily()
Example #26
    def _forward_dataset(self, ds):
        if self.__chunks_attr is None:
            return self._forward_dataset_helper(ds)
        else:
            # strip down the dataset to speed up local processing
            if self.__attr_strategy == 'remove':
                keep_sa = []
            else:
                keep_sa = None
            proc_ds = ds.copy(deep=False, sa=keep_sa, fa=[], a=[])
            # process all chunks individually,
            # using a custom splitter to speed up splitting
            spl = Splitter(self.__chunks_attr)
            dses = [self._forward_dataset_helper(d)
                    for d in spl.generate(proc_ds)]
            # and merge them again
            mds = vstack(dses)
            # put back attributes
            mds.fa.update(ds.fa)
            mds.a.update(ds.a)
            return mds
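
The chunk-wise idiom used in _forward_dataset (split by an attribute, process each piece, vstack the results) can be shown standalone. A minimal sketch, assuming PyMVPA 2.x with Dataset importable from mvpa2.datasets (the vstack and Splitter paths match Example #30):

import numpy as np
from mvpa2.datasets import Dataset, vstack   # Dataset path is an assumption
from mvpa2.generators.splitters import Splitter

ds = Dataset(np.random.randn(12, 3),
             sa={'chunks': np.repeat([0, 1, 2], 4)})

def demean(d):
    # toy per-chunk processing: remove the chunk mean from every feature
    d = d.copy(deep=True)
    d.samples -= d.samples.mean(axis=0)
    return d

spl = Splitter('chunks')
processed = [demean(part) for part in spl.generate(ds)]
merged = vstack(processed)        # stitch the chunks back together
assert merged.nsamples == ds.nsamples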
Example #27
    def test_gnbsearchlight_3partitions_and_splitter(self):
        ds = self.dataset[:, :20]
        # custom partitioner which provides 3 partitions
        part = CustomPartitioner([([2], [3], [1])])
        gnb_sl = sphere_gnbsearchlight(GNB(), part)
        res_gnb_sl = gnb_sl(ds)

        # compare results to full blown searchlight
        sl = sphere_searchlight(CrossValidation(GNB(), part))
        res_sl = sl(ds)

        assert_datasets_equal(res_gnb_sl, res_sl)

        # and theoretically for this simple single cross-validation we could
        # just use Splitter
        splitter = Splitter('chunks', [2, 3])
        # we have to pass an explicit None for the generator since it
        # cannot simply be omitted as a kwarg here
        gnb_sl_ = sphere_gnbsearchlight(GNB(), None, splitter=splitter)
        res_gnb_sl_ = gnb_sl_(ds)
        assert_datasets_equal(res_gnb_sl, res_gnb_sl_)
Example #28
    def test_half_split(self):
        hs = HalfPartitioner()
        spl = Splitter(attr='partitions')

        splits = [list(spl.generate(p)) for p in hs.generate(self.data)]

        self.assertTrue(len(splits) == 2)

        for i, p in enumerate(splits):
            self.assertTrue(len(p) == 2)
            self.assertTrue(p[0].nsamples == 50)
            self.assertTrue(p[1].nsamples == 50)

        assert_array_equal(splits[0][1].sa['chunks'].unique, [0, 1, 2, 3, 4])
        assert_array_equal(splits[0][0].sa['chunks'].unique, [5, 6, 7, 8, 9])
        assert_array_equal(splits[1][1].sa['chunks'].unique, [5, 6, 7, 8, 9])
        assert_array_equal(splits[1][0].sa['chunks'].unique, [0, 1, 2, 3, 4])

        # check if it works on pure odd and even chunk ids
        moresplits = [list(spl.generate(p))
                      for p in hs.generate(splits[0][0])]

        for split in moresplits:
            self.assertTrue(split[0] is not None)
            self.assertTrue(split[1] is not None)
Example #29
    def test_gnb(self):
        gnb = GNB()
        gnb_nc = GNB(common_variance=False)
        gnb_n = GNB(normalize=True)
        gnb_n_nc = GNB(normalize=True, common_variance=False)

        ds = datasets['uni2medium']

        # Generic silly coverage just to assure that it works in all
        # possible scenarios:
        bools = (True, False)
        # There should be a better way... heh
        for cv in bools:  # common_variance?
            for prior in ('uniform', 'laplacian_smoothing', 'ratio'):
                tp = None  # predictions -- all of the above should
                # result in the same predictions
                for n in bools:  # normalized?
                    for ls in bools:  # logspace?
                        for es in ((), ('estimates',)):
                            gnb_ = GNB(common_variance=cv,
                                       prior=prior,
                                       normalize=n,
                                       logprob=ls,
                                       enable_ca=es)
                            tm = TransferMeasure(gnb_, Splitter('train'))
                            predictions = tm(ds).samples[:, 0]
                            if tp is None:
                                tp = predictions
                            assert_array_equal(predictions, tp)
                            # if normalized -- check that estimates sum to 1
                            if n and 'estimates' in es:
                                v = gnb_.ca.estimates
                                if ls:  # in log space -- take exp ;)
                                    v = np.exp(v)
                                d1 = np.sum(v, axis=1) - 1.0
                                self.assertTrue(np.max(np.abs(d1)) < 1e-5)
Example #30
def test_rfe_sensmap():
    # http://lists.alioth.debian.org/pipermail/pkg-exppsy-pymvpa/2013q3/002538.html
    # just a smoke test; it used to fail (see the thread above)
    from mvpa2.clfs.svm import LinearCSVMC
    from mvpa2.clfs.meta import FeatureSelectionClassifier
    from mvpa2.measures.base import CrossValidation, RepeatedMeasure
    from mvpa2.generators.splitters import Splitter
    from mvpa2.generators.partition import NFoldPartitioner
    from mvpa2.misc.errorfx import mean_mismatch_error
    from mvpa2.mappers.fx import mean_sample
    from mvpa2.mappers.fx import maxofabs_sample
    from mvpa2.generators.base import Repeater
    from mvpa2.featsel.rfe import RFE
    from mvpa2.featsel.helpers import FractionTailSelector, BestDetector
    from mvpa2.featsel.helpers import NBackHistoryStopCrit
    from mvpa2.datasets import vstack

    from mvpa2.misc.data_generators import normal_feature_dataset

    # Let's simulate the beast -- 6 categories total, grouped into 3
    # super-ordinate ones, and actually without any 'superordinate' effect
    # since the subordinate categories are independent
    fds = normal_feature_dataset(nlabels=3,
                                 snr=1, # 100,   # pure signal! ;)
                                 perlabel=9,
                                 nfeatures=6,
                                 nonbogus_features=range(3),
                                 nchunks=3)
    clfsvm = LinearCSVMC()

    rfesvm = RFE(clfsvm.get_sensitivity_analyzer(postproc=maxofabs_sample()),
                 CrossValidation(
                     clfsvm,
                     NFoldPartitioner(),
                     errorfx=mean_mismatch_error, postproc=mean_sample()),
                 Repeater(2),
                 fselector=FractionTailSelector(0.70, mode='select', tail='upper'),
                 stopping_criterion=NBackHistoryStopCrit(BestDetector(), 10),
                 update_sensitivity=True)

    fclfsvm = FeatureSelectionClassifier(clfsvm, rfesvm)

    sensanasvm = fclfsvm.get_sensitivity_analyzer(postproc=maxofabs_sample())


    # manually repeating/splitting so we do both RFE sensitivity and classification
    senses, errors = [], []
    for i, pset in enumerate(NFoldPartitioner().generate(fds)):
        # split partitioned dataset
        split = [d for d in Splitter('partitions').generate(pset)]
        # this also trains the classifier, so we can ask it about the error
        senses.append(sensanasvm(split[0]))
        errors.append(mean_mismatch_error(fclfsvm.predict(split[1]),
                                          split[1].targets))

    senses = vstack(senses)
    errors = vstack(errors)

    # Let's compare against rerunning the beast simply for classification with CV
    errors_cv = CrossValidation(fclfsvm, NFoldPartitioner(), errorfx=mean_mismatch_error)(fds)
    # and they should match
    assert_array_equal(errors, errors_cv)

    # buggy!
    cv_sensana_svm = RepeatedMeasure(sensanasvm, NFoldPartitioner())
    senses_rm = cv_sensana_svm(fds)

    #print senses.samples, senses_rm.samples
    #print errors, errors_cv.samples
    assert_raises(AssertionError,
                  assert_array_almost_equal,
                  senses.samples, senses_rm.samples)
    raise SkipTest("Known failure for repeated measures: https://github.com/PyMVPA/PyMVPA/issues/117")