Example #1
def test_BinaryFxFeatureMeasure_mi(fx, ds_mi):
    ds, (mi_min, mi_max) = ds_mi
    res = fx(ds)
    assert_equal(res.shape, (1, ds.nfeatures))
    mi = res.samples[0]
    assert_array_less(mi_min, mi)
    assert_array_less(mi, mi_max)
    fx_str = str(fx)
    if not (__debug__ and 'ID_IN_REPR' in debug.active):
        assert_equal(fx_str, "<BinaryFxFeaturewiseMeasure: %s>" % fx.fx.__name__)
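
The fixture ds_mi above is assumed to supply a dataset together with lower/upper bounds on the expected per-feature mutual information. As a rough illustration of the kind of binary function such a measure can wrap, here is a minimal histogram-based MI estimate in plain NumPy (a hypothetical helper, not part of PyMVPA):

import numpy as np

def mutual_information(x, y, bins=10):
    # plug-in MI estimate from a joint histogram of one feature x and targets y
    # (illustrative sketch only)
    pxy, _, _ = np.histogram2d(x, y, bins=bins)
    pxy = pxy / pxy.sum()                  # joint distribution
    px = pxy.sum(axis=1, keepdims=True)    # marginal over x bins
    py = pxy.sum(axis=0, keepdims=True)    # marginal over y bins
    nz = pxy > 0                           # skip empty cells to avoid log(0)
    return float(np.sum(pxy[nz] * np.log(pxy[nz] / (px @ py)[nz])))
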
Example #2
def test_BinaryFxFeatureMeasure(ds):
    if not isinstance(ds.samples, np.ndarray):
        return
    # some simple function
    f = lambda x, y: np.sum((x.T*y).T, axis=0)
    fx = BinaryFxFeaturewiseMeasure(f, uni=False, numeric=True)
    fx_uni = BinaryFxFeaturewiseMeasure(f, uni=True, numeric=True)
    out = fx(ds)
    out_uni = fx_uni(ds)
    assert(len(out) == 1)
    assert_array_almost_equal(out.samples, out_uni)
    assert_equal(out.fa, out_uni.fa)
    ok_(str(fx).startswith("<BinaryFxFeaturewiseMeasure: lambda x, y:"))
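
For reference, the lambda above scales every sample (row) of x by the matching entry of y and sums over samples, i.e. it reduces to a per-feature dot product with y. A quick standalone NumPy check of that equivalence (toy data, not part of the test suite):

import numpy as np

x = np.random.randn(6, 4)                  # samples x features
y = np.array([1., -1., 1., -1., 1., -1.])  # one value per sample
f = lambda x, y: np.sum((x.T * y).T, axis=0)
assert np.allclose(f(x, y), y @ x)         # featurewise result equals y @ x
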
Example #3
    def test_SplitRFE(self):
        # just a smoke test ATM
        from mvpa2.clfs.svm import LinearCSVMC
        from mvpa2.clfs.meta import MappedClassifier
        from mvpa2.misc.data_generators import normal_feature_dataset
        #import mvpa2.featsel.rfe
        #reload(mvpa2.featsel.rfe)
        from mvpa2.featsel.rfe import RFE, SplitRFE
        from mvpa2.generators.partition import NFoldPartitioner
        from mvpa2.featsel.helpers import FractionTailSelector
        from mvpa2.testing import ok_, assert_equal

        clf = LinearCSVMC(C=1)
        dataset = normal_feature_dataset(perlabel=20,
                                         nlabels=2,
                                         nfeatures=30,
                                         snr=1.,
                                         nonbogus_features=[1, 5])
        # flip one of the meaningful features around to see
        # if we are still getting proper selection
        dataset.samples[:, dataset.a.nonbogus_features[1]] *= -1
        # 4 partitions should be enough for testing
        partitioner = NFoldPartitioner(count=4)

        rfeclf = MappedClassifier(
            clf,
            SplitRFE(clf,
                     partitioner,
                     fselector=FractionTailSelector(0.2,
                                                    mode='discard',
                                                    tail='lower')))
        r0 = repr(rfeclf)

        ok_(rfeclf.mapper.nfeatures_min == 0)
        rfeclf.train(dataset)
        ok_(rfeclf.mapper.nfeatures_min > 0)
        predictions = rfeclf(dataset).samples

        # at least 1 of the nonbogus-features should be chosen
        ok_(
            len(
                set(dataset.a.nonbogus_features).intersection(
                    rfeclf.mapper.slicearg)) > 0)
        # check repr to have all needed pieces
        r = repr(rfeclf)
        s = str(rfeclf)
        ok_(('partitioner=NFoldP' in r) or
            ('partitioner=mvpa2.generators.partition.NFoldPartitioner' in r))
        ok_('lrn=' in r)
        ok_(not 'slicearg=' in r)
        assert_equal(r, r0)
Example #4
    def test_SplitRFE(self):
        # just a smoke test ATM
        from mvpa2.clfs.svm import LinearCSVMC
        from mvpa2.clfs.meta import MappedClassifier
        from mvpa2.misc.data_generators import normal_feature_dataset
        #import mvpa2.featsel.rfe
        #reload(mvpa2.featsel.rfe)
        from mvpa2.featsel.rfe import RFE, SplitRFE
        from mvpa2.generators.partition import NFoldPartitioner
        from mvpa2.featsel.helpers import FractionTailSelector
        from mvpa2.testing import ok_, assert_equal

        clf = LinearCSVMC(C=1)
        dataset = normal_feature_dataset(perlabel=20, nlabels=2, nfeatures=30,
                                         snr=1., nonbogus_features=[1,5])
        # flip one of the meaningful features around to see
        # if we are still getting proper selection
        dataset.samples[:, dataset.a.nonbogus_features[1]] *= -1
        # 4 partitions should be enough for testing
        partitioner = NFoldPartitioner(count=4)

        rfeclf = MappedClassifier(
            clf, SplitRFE(clf,
                          partitioner,
                          fselector=FractionTailSelector(
                              0.2, mode='discard', tail='lower')))
        r0 = repr(rfeclf)

        ok_(rfeclf.mapper.nfeatures_min == 0)
        rfeclf.train(dataset)
        ok_(rfeclf.mapper.nfeatures_min > 0)
        predictions = rfeclf(dataset).samples

        # at least 1 of the nonbogus-features should be chosen
        ok_(len(set(dataset.a.nonbogus_features).intersection(
                rfeclf.mapper.slicearg)) > 0)
        # check repr to have all needed pieces
        r = repr(rfeclf)
        s = str(rfeclf)
        ok_(('partitioner=NFoldP' in r) or
            ('partitioner=mvpa2.generators.partition.NFoldPartitioner' in r))
        ok_('lrn=' in r)
        ok_(not 'slicearg=' in r)
        assert_equal(r, r0)
Example #5
def test_cluster_count():
    if externals.versions['scipy'] < '0.10':
        raise SkipTest
    # we get a single ZERO cluster size if there are no clusters at all;
    # this is needed to keep track of the number of bootstrap samples that yield
    # no cluster at all (high threshold) in order to compute p-values when there is
    # no actual cluster size histogram
    assert_equal(gct._get_map_cluster_sizes([0, 0, 0, 0]), [0])
    # if there is at least one cluster: no ZERO count
    assert_equal(gct._get_map_cluster_sizes([0, 0, 1, 0]), [1])
    for i in range(2):  # rerun tests for bool type of test_M
        test_M = np.array([[1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0],
                           [0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1],
                           [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1],
                           [0, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1],
                           [1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0],
                           [0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0],
                           [1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0],
                           [0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0],
                           [1, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0]])
        expected_result = [5, 4, 3, 3, 2, 0, 2]  # 5 clusters of size 1,
                                                 # 4 clusters of size 2 ...

        test_ds = Dataset([test_M])
        if i == 1:
            test_M = test_M.astype(bool)

        test_M_3d = np.hstack((test_M.flatten(),
                               test_M.flatten())).reshape(2, 9, 16)
        test_ds_3d = Dataset([test_M_3d])
        # same clusters as expected_result, but sizes doubled (two identical slices stacked)
        expected_result_3d = np.array([0, 5, 0, 4, 0, 3, 0,
                                       3, 0, 2, 0, 0, 0, 2])

        size = 10000  # how many times bigger than test_M_3d
        test_M_3d_big = np.hstack((test_M_3d.flatten(), np.zeros(144)))
        test_M_3d_big = np.hstack([test_M_3d_big for i in range(size)]
                                  ).reshape(3 * size, 9, 16)
        test_ds_3d_big = Dataset([test_M_3d_big])
        expected_result_3d_big = expected_result_3d * size

        # check basic cluster size determination for plain arrays and datasets
        # with a single sample
        for t, e in ((test_M, expected_result),
                     (test_ds, expected_result),
                     (test_M_3d, expected_result_3d),
                     (test_ds_3d, expected_result_3d),
                     (test_M_3d_big, expected_result_3d_big),
                     (test_ds_3d_big, expected_result_3d_big)):
            assert_array_equal(np.bincount(gct._get_map_cluster_sizes(t))[1:],
                               e)
        # old
        M = np.vstack([test_M_3d.flatten()] * 10)
        # new
        ds = dataset_wizard([test_M_3d] * 10)
        assert_array_equal(M, ds)
        expected_result = Counter(np.hstack([gct._get_map_cluster_sizes(test_M_3d)] * 10))
        assert_array_equal(expected_result,
                           gct.get_cluster_sizes(ds))

        # test the same with some arbitrary per-feature threshold
        thr = 4
        labels, num = measurements.label(test_M_3d)
        area = measurements.sum(test_M_3d, labels,
                                index=np.arange(labels.max() + 1))
        cluster_sizes_map = area[labels]  # .astype(int)
        thresholded_cluster_sizes_map = cluster_sizes_map > thr
        # old
        M = np.vstack([cluster_sizes_map.flatten()] * 10)
        # new
        ds = dataset_wizard([cluster_sizes_map] * 10)
        assert_array_equal(M, ds)
        expected_result = Counter(np.hstack(
            [gct._get_map_cluster_sizes(thresholded_cluster_sizes_map)] * 10))
        th_map = np.ones(cluster_sizes_map.flatten().shape) * thr
        # threshold dataset by hand
        ds.samples = ds.samples > th_map
        assert_array_equal(expected_result,
                           gct.get_cluster_sizes(ds))
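
A minimal sketch of the cluster-size bookkeeping this test exercises, written directly against scipy.ndimage. This is an illustration under the assumption that gct._get_map_cluster_sizes behaves as the asserts above describe, not the actual implementation:

import numpy as np
from scipy import ndimage

def map_cluster_sizes(m):
    # sizes of connected non-zero clusters; a single 0 if the map has none,
    # so that "no cluster" bootstrap samples still leave a record
    labels, nclusters = ndimage.label(np.asanyarray(m))
    if nclusters == 0:
        return [0]
    return list(np.bincount(labels.ravel())[1:])   # drop background label 0

assert map_cluster_sizes([0, 0, 0, 0]) == [0]
assert map_cluster_sizes([0, 0, 1, 0]) == [1]
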
Example #6
def test_group_clusterthreshold_simple(n_proc):
    if n_proc > 1 and not externals.exists('joblib'):
        raise SkipTest
    feature_thresh_prob = 0.005
    nsubj = 10
    # make a nice 1D blob and a speck
    blob = np.array([0, 0, .5, 3, 5, 3, 3, 0, 2, 0])
    blob = Dataset([blob])
    # and some nice random permutations
    nperms = 100 * nsubj
    perm_samples = np.random.randn(nperms, blob.nfeatures)
    perms = Dataset(perm_samples,
                    sa=dict(chunks=np.repeat(range(nsubj), len(perm_samples) // nsubj)),
                    fa=dict(fid=range(perm_samples.shape[1])))
    # the algorithm instance
    # scale number of bootstraps to match desired probability
    # plus a safety margin to minimize bad luck in sampling
    clthr = gct.GroupClusterThreshold(n_bootstrap=int(3. / feature_thresh_prob),
                                      feature_thresh_prob=feature_thresh_prob,
                                      fwe_rate=0.01, n_blocks=3, n_proc=n_proc)
    clthr.train(perms)
    # get the FE thresholds
    thr = clthr._thrmap
    # perms are standard normal, hence the CDF value at the mean threshold should be
    # close to the target probability; the std of the per-feature mean scales as 1/sqrt(nsubj)
    assert_true(np.abs(
        feature_thresh_prob - (1 - norm.cdf(thr.mean(),
                                            loc=0,
                                            scale=1. / np.sqrt(nsubj)))) < 0.01)

    clstr_sizes = clthr._null_cluster_sizes
    # getting anything but a lonely one feature cluster is very unlikely
    assert_true(max([c[0] for c in clstr_sizes.keys()]) <= 1)
    # threshold orig map
    res = clthr(blob)
    #
    # check output
    #
    # samples unchanged
    assert_array_equal(blob.samples, res.samples)
    # need to find the big cluster
    assert_true(len(res.a.clusterstats) > 0)
    assert_equal(len(res.a.clusterstats), res.fa.clusters_featurewise_thresh.max())
    # probs need to decrease with size, clusters are sorted by size (decreasing)
    assert_true(res.a.clusterstats['prob_raw'][0] <= res.a.clusterstats['prob_raw'][1])
    # corrected probs for every uncorrected cluster
    assert_true('prob_corrected' in res.a.clusterstats.dtype.names)
    # fwe correction always increases the p-values (if anything)
    assert_true(np.all(res.a.clusterstats['prob_raw'] <= res.a.clusterstats['prob_corrected']))
    # fwe thresholding only ever removes clusters
    assert_true(np.all(np.abs(res.fa.clusters_featurewise_thresh - res.fa.clusters_fwe_thresh) >= 0))
    # FWE should kill the small one
    assert_greater(res.fa.clusters_featurewise_thresh.max(),
                   res.fa.clusters_fwe_thresh.max())

    # check that the cluster results don't depend on the actual location of
    # the clusters
    shifted_blob = Dataset([[.5, 3, 5, 3, 3, 0, 0, 0, 2, 0]])
    shifted_res = clthr(shifted_blob)
    assert_array_equal(res.a.clusterstats, shifted_res.a.clusterstats)

    # check that it averages multi-sample datasets
    # also checks that scenarios work where all features are part of one big
    # cluster
    multisamp = Dataset(np.arange(30).reshape(3, 10) + 100)
    avgres = clthr(multisamp)
    assert_equal(len(avgres), 1)
    assert_array_equal(avgres.samples[0], np.mean(multisamp.samples, axis=0))

    # retrain, this time with data from only a single subject
    perms = Dataset(perm_samples,
                    sa=dict(chunks=np.repeat(1, len(perm_samples))),
                    fa=dict(fid=range(perms.shape[1])))
    clthr.train(perms)
    # same blob -- 1st this should work without issues
    sglres = clthr(blob)
    # NULL estimation does no averaging
    # -> more noise -> fewer clusters -> higher p
    assert_greater_equal(len(res.a.clusterstats), len(sglres.a.clusterstats))
    assert_greater_equal(np.round(sglres.a.clusterstats[0]['prob_raw'], 4),
                         np.round(res.a.clusterstats[0]['prob_raw'], 4))
    # now again for real scientists: no FWE correction
    superclthr = gct.GroupClusterThreshold(
        n_bootstrap=int(3. / feature_thresh_prob),
        feature_thresh_prob=feature_thresh_prob,
        multicomp_correction=None, n_blocks=3,
        n_proc=n_proc)
    superclthr.train(perms)
    superres = superclthr(blob)
    assert_true('prob_corrected' in res.a.clusterstats.dtype.names)
    assert_true('clusters_fwe_thresh' in res.fa)
    assert_false('prob_corrected' in superres.a.clusterstats.dtype.names)
    assert_false('clusters_fwe_thresh' in superres.fa)

    # check validity test
    assert_raises(ValueError, gct.GroupClusterThreshold,
                  n_bootstrap=10, feature_thresh_prob=.09, n_proc=n_proc)
    # check mapped datasets
    blob = np.array([[0, 0, .5, 3, 5, 3, 3, 0, 2, 0],
                     [0, 0,  0, 0, 0, 0, 0, 0, 0, 0]])
    blob = dataset_wizard([blob])
    # and some nice random permutations
    nperms = 100 * nsubj
    perm_samples = np.random.randn(*((nperms,) + blob.shape))
    perms = dataset_wizard(
        perm_samples, chunks=np.repeat(range(nsubj), len(perm_samples) // nsubj))
    clthr.train(perms)
    twodres = clthr(blob)
    # finds two clusters of the same size
    assert_array_equal(twodres.a.clusterstats['size'],
                       res.a.clusterstats['size'])
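
The feature-wise thresholds held in clthr._thrmap can be pictured as per-feature quantiles of bootstrapped across-chunk means of the permutation maps. A rough NumPy sketch under that assumption (not GroupClusterThreshold's actual estimator), reproducing the 1/sqrt(nsubj) reasoning from the comment above:

import numpy as np
from scipy.stats import norm

nsubj, nfeatures = 10, 10
feature_thresh_prob = 0.005
n_bootstrap = int(3. / feature_thresh_prob)
perm_samples = np.random.randn(100 * nsubj, nfeatures)

# draw nsubj permutation maps per bootstrap and average them per feature
idx = np.random.randint(0, len(perm_samples), size=(n_bootstrap, nsubj))
boot_means = perm_samples[idx].mean(axis=1)          # (n_bootstrap, nfeatures)
thrmap = np.percentile(boot_means, 100 * (1 - feature_thresh_prob), axis=0)

# the mean of nsubj standard-normal values has std 1/sqrt(nsubj), so the
# empirical threshold should sit near the matching normal quantile
print(thrmap.mean(), norm.ppf(1 - feature_thresh_prob, scale=1. / np.sqrt(nsubj)))
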
Example #7
    def test_SplitRFE(self, fmeasure):
        # just a smoke test ATM
        from mvpa2.clfs.svm import LinearCSVMC
        from mvpa2.clfs.meta import MappedClassifier
        from mvpa2.misc.data_generators import normal_feature_dataset
        #import mvpa2.featsel.rfe
        #reload(mvpa2.featsel.rfe)
        from mvpa2.featsel.rfe import RFE, SplitRFE
        from mvpa2.generators.partition import NFoldPartitioner
        from mvpa2.featsel.helpers import FractionTailSelector
        from mvpa2.testing import ok_, assert_equal

        clf = LinearCSVMC(C=1)
        dataset = normal_feature_dataset(perlabel=20, nlabels=2, nfeatures=11,
                                         snr=1., nonbogus_features=[1, 5])
        # flip one of the meaningful features around to see
        # if we are still getting proper selection
        dataset.samples[:, dataset.a.nonbogus_features[1]] *= -1
        # 3 partitions should be enough for testing
        partitioner = NFoldPartitioner(count=3)

        rfeclf = MappedClassifier(
            clf, SplitRFE(clf,
                          partitioner,
                          fselector=FractionTailSelector(
                              0.5, mode='discard', tail='lower'),
                          fmeasure=fmeasure,
                          # need to update only when using clf's sens anal
                          update_sensitivity=fmeasure is None))
        r0 = repr(rfeclf)

        ok_(rfeclf.mapper.nfeatures_min == 0)
        rfeclf.train(dataset)
        ok_(rfeclf.mapper.nfeatures_min > 0)
        predictions = rfeclf(dataset).samples

        # at least 1 of the nonbogus-features should be chosen
        ok_(len(set(dataset.a.nonbogus_features).intersection(
                rfeclf.mapper.slicearg)) > 0)

        # check repr to have all needed pieces
        r = repr(rfeclf)
        s = str(rfeclf)
        ok_(('partitioner=NFoldP' in r) or
            ('partitioner=mvpa2.generators.partition.NFoldPartitioner' in r))
        ok_('lrn=' in r)
        ok_(not 'slicearg=' in r)
        assert_equal(r, r0)

        if externals.exists('joblib'):
            rfeclf.mapper.nproc = -1
            # compare results against the ones run in parallel
            _slicearg = rfeclf.mapper.slicearg
            _predictions = predictions
            rfeclf.train(dataset)
            predictions = rfeclf(dataset).samples
            assert_array_equal(predictions, _predictions)
            assert_array_equal(_slicearg, rfeclf.mapper.slicearg)

        # Test that we can collect stats from conditional attributes (ca) within cross-validation
        sensitivities = []
        nested_errors = []
        nested_nfeatures = []
        def store_me(data, node, result):
            sens = node.measure.get_sensitivity_analyzer(force_train=False)(data)
            sensitivities.append(sens)
            nested_errors.append(node.measure.mapper.ca.nested_errors)
            nested_nfeatures.append(node.measure.mapper.ca.nested_nfeatures)
        cv = CrossValidation(rfeclf, NFoldPartitioner(count=1), callback=store_me,
                             enable_ca=['stats'])
        _ = cv(dataset)
        # just to make sure we collected them
        assert_equal(len(sensitivities), 1)
        assert_equal(len(nested_errors), 1)
        assert_equal(len(nested_nfeatures), 1)
Example #8
def test_cluster_count():
    skip_if_no_external('scipy', min_version='0.10')
    # we get a single ZERO cluster size if there are no clusters at all;
    # this is needed to keep track of the number of bootstrap samples that yield
    # no cluster at all (high threshold) in order to compute p-values when there is
    # no actual cluster size histogram
    assert_equal(gct._get_map_cluster_sizes([0, 0, 0, 0]), [0])
    # if there is at least one cluster: no ZERO count
    assert_equal(gct._get_map_cluster_sizes([0, 0, 1, 0]), [1])
    for i in range(2):  # rerun tests for bool type of test_M
        test_M = np.array([[1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0],
                           [0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1],
                           [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1],
                           [0, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1],
                           [1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0],
                           [0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0],
                           [1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0],
                           [0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0],
                           [1, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0]])
        expected_result = [5, 4, 3, 3, 2, 0, 2]  # 5 clusters of size 1,
        # 4 clusters of size 2 ...

        test_ds = Dataset([test_M])
        if i == 1:
            test_M = test_M.astype(bool)

        test_M_3d = np.hstack(
            (test_M.flatten(), test_M.flatten())).reshape(2, 9, 16)
        test_ds_3d = Dataset([test_M_3d])
        # same clusters as expected_result, but sizes doubled (two identical slices stacked)
        expected_result_3d = np.array(
            [0, 5, 0, 4, 0, 3, 0, 3, 0, 2, 0, 0, 0, 2])

        size = 10000  # how many times bigger than test_M_3d
        test_M_3d_big = np.hstack((test_M_3d.flatten(), np.zeros(144)))
        test_M_3d_big = np.hstack(
            [test_M_3d_big for i in range(size)]).reshape(3 * size, 9, 16)
        test_ds_3d_big = Dataset([test_M_3d_big])
        expected_result_3d_big = expected_result_3d * size

        # check basic cluster size determination for plain arrays and datasets
        # with a single sample
        for t, e in ((test_M, expected_result), (test_ds, expected_result),
                     (test_M_3d, expected_result_3d), (test_ds_3d,
                                                       expected_result_3d),
                     (test_M_3d_big, expected_result_3d_big),
                     (test_ds_3d_big, expected_result_3d_big)):
            assert_array_equal(
                np.bincount(gct._get_map_cluster_sizes(t))[1:], e)
        # old
        M = np.vstack([test_M_3d.flatten()] * 10)
        # new
        ds = dataset_wizard([test_M_3d] * 10)
        assert_array_equal(M, ds)
        expected_result = Counter(
            np.hstack([gct._get_map_cluster_sizes(test_M_3d)] * 10))
        assert_array_equal(expected_result, gct.get_cluster_sizes(ds))

        # test the same with some arbitrary per-feature threshold
        thr = 4
        labels, num = measurements.label(test_M_3d)
        area = measurements.sum(test_M_3d,
                                labels,
                                index=np.arange(labels.max() + 1))
        cluster_sizes_map = area[labels]  # .astype(int)
        thresholded_cluster_sizes_map = cluster_sizes_map > thr
        # old
        M = np.vstack([cluster_sizes_map.flatten()] * 10)
        # new
        ds = dataset_wizard([cluster_sizes_map] * 10)
        assert_array_equal(M, ds)
        expected_result = Counter(
            np.hstack(
                [gct._get_map_cluster_sizes(thresholded_cluster_sizes_map)] *
                10))
        th_map = np.ones(cluster_sizes_map.flatten().shape) * thr
        # threshold dataset by hand
        ds.samples = ds.samples > th_map
        assert_array_equal(expected_result, gct.get_cluster_sizes(ds))
Example #9
def test_group_clusterthreshold_simple(n_proc):
    if n_proc > 1:
        skip_if_no_external('joblib')
    feature_thresh_prob = 0.005
    nsubj = 10
    # make a nice 1D blob and a speck
    blob = np.array([0, 0, .5, 3, 5, 3, 3, 0, 2, 0])
    blob = Dataset([blob])
    # and some nice random permutations
    nperms = 100 * nsubj
    perm_samples = np.random.randn(nperms, blob.nfeatures)
    perms = Dataset(perm_samples,
                    sa=dict(chunks=np.repeat(range(nsubj),
                                             len(perm_samples) // nsubj)),
                    fa=dict(fid=range(perm_samples.shape[1])))
    # the algorithm instance
    # scale number of bootstraps to match desired probability
    # plus a safety margin to minimize bad luck in sampling
    clthr = gct.GroupClusterThreshold(n_bootstrap=int(3. /
                                                      feature_thresh_prob),
                                      feature_thresh_prob=feature_thresh_prob,
                                      fwe_rate=0.01,
                                      n_blocks=3,
                                      n_proc=n_proc)
    clthr.train(perms)
    # get the FE thresholds
    thr = clthr._thrmap
    # perms are standard normal, hence the CDF value at the mean threshold should be
    # close to the target probability; the std of the per-feature mean scales as 1/sqrt(nsubj)
    assert_true(
        np.abs(feature_thresh_prob -
               (1 - norm.cdf(thr.mean(), loc=0, scale=1. / np.sqrt(nsubj)))) <
        0.01)

    clstr_sizes = clthr._null_cluster_sizes
    # getting anything but a lonely one feature cluster is very unlikely
    assert_true(max([c[0] for c in clstr_sizes.keys()]) <= 1)
    # threshold orig map
    res = clthr(blob)
    #
    # check output
    #
    # samples unchanged
    assert_array_equal(blob.samples, res.samples)
    # need to find the big cluster
    assert_true(len(res.a.clusterstats) > 0)
    assert_equal(len(res.a.clusterstats),
                 res.fa.clusters_featurewise_thresh.max())
    # probs need to decrease with size, clusters are sorted by size (decreasing)
    assert_true(
        res.a.clusterstats['prob_raw'][0] <= res.a.clusterstats['prob_raw'][1])
    # corrected probs for every uncorrected cluster
    assert_true('prob_corrected' in res.a.clusterstats.dtype.names)
    # fwe correction always increases the p-values (if anything)
    assert_true(
        np.all(res.a.clusterstats['prob_raw'] <=
               res.a.clusterstats['prob_corrected']))
    # check expected cluster sizes, ordered large -> small
    assert_array_equal(res.a.clusterstats['size'], [4, 1])
    # check max position
    assert_array_equal(res.a.clusterlocations['max'], [[4], [8]])
    # center of mass: eyeballed
    assert_array_almost_equal(res.a.clusterlocations['center_of_mass'],
                              [[4.429], [8]], 3)
    # other simple stats
    #[0, 0, .5, 3, 5, 3, 3, 0, 2, 0]
    assert_array_equal(res.a.clusterstats['mean'], [3.5, 2])
    assert_array_equal(res.a.clusterstats['min'], [3, 2])
    assert_array_equal(res.a.clusterstats['max'], [5, 2])
    assert_array_equal(res.a.clusterstats['median'], [3, 2])
    assert_array_almost_equal(res.a.clusterstats['std'], [0.866, 0], 3)

    # fwe thresholding only ever removes clusters
    assert_true(
        np.all(
            np.abs(res.fa.clusters_featurewise_thresh -
                   res.fa.clusters_fwe_thresh) >= 0))
    # FWE should kill the small one
    assert_greater(res.fa.clusters_featurewise_thresh.max(),
                   res.fa.clusters_fwe_thresh.max())

    # check that the cluster results don't depend on the actual location of
    # the clusters
    shifted_blob = Dataset([[.5, 3, 5, 3, 3, 0, 0, 0, 2, 0]])
    shifted_res = clthr(shifted_blob)
    assert_array_equal(res.a.clusterstats, shifted_res.a.clusterstats)

    # check that it averages multi-sample datasets
    # also checks that scenarios work where all features are part of one big
    # cluster
    multisamp = Dataset(np.arange(30).reshape(3, 10) + 100)
    avgres = clthr(multisamp)
    assert_equal(len(avgres), 1)
    assert_array_equal(avgres.samples[0], np.mean(multisamp.samples, axis=0))

    # retrain, this time with data from only a single subject
    perms = Dataset(perm_samples,
                    sa=dict(chunks=np.repeat(1, len(perm_samples))),
                    fa=dict(fid=range(perms.shape[1])))
    clthr.train(perms)
    # same blob -- 1st this should work without issues
    sglres = clthr(blob)
    # NULL estimation does no averaging
    # -> more noise -> fewer clusters -> higher p
    assert_greater_equal(len(res.a.clusterstats), len(sglres.a.clusterstats))
    assert_greater_equal(np.round(sglres.a.clusterstats[0]['prob_raw'], 4),
                         np.round(res.a.clusterstats[0]['prob_raw'], 4))
    # now again for real scientists: no FWE correction
    superclthr = gct.GroupClusterThreshold(
        n_bootstrap=int(3. / feature_thresh_prob),
        feature_thresh_prob=feature_thresh_prob,
        multicomp_correction=None,
        n_blocks=3,
        n_proc=n_proc)
    superclthr.train(perms)
    superres = superclthr(blob)
    assert_true('prob_corrected' in res.a.clusterstats.dtype.names)
    assert_true('clusters_fwe_thresh' in res.fa)
    assert_false('prob_corrected' in superres.a.clusterstats.dtype.names)
    assert_false('clusters_fwe_thresh' in superres.fa)

    # check validity test
    assert_raises(ValueError,
                  gct.GroupClusterThreshold,
                  n_bootstrap=10,
                  feature_thresh_prob=.09,
                  n_proc=n_proc)
    # check mapped datasets
    blob = np.array([[0, 0, .5, 3, 5, 3, 3, 0, 2, 0],
                     [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])
    blob = dataset_wizard([blob])
    # and some nice random permutations
    nperms = 100 * nsubj
    perm_samples = np.random.randn(*((nperms, ) + blob.shape))
    perms = dataset_wizard(perm_samples,
                           chunks=np.repeat(range(nsubj),
                                            len(perm_samples) // nsubj))
    clthr.train(perms)
    twodres = clthr(blob)
    # finds two clusters of the same size
    assert_array_equal(twodres.a.clusterstats['size'],
                       res.a.clusterstats['size'])
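
The prob_raw values asserted above follow the usual null-histogram logic: the raw p-value of an observed cluster is the fraction of null (bootstrap) cluster sizes at least as large, which is why larger clusters carry smaller probabilities and why maps without any cluster must still contribute a zero-size entry. A standalone sketch of that computation (illustrative, not gct's code):

import numpy as np

def cluster_prob_raw(observed_size, null_sizes):
    # fraction of null cluster sizes >= the observed size
    null_sizes = np.asarray(null_sizes)
    return (null_sizes >= observed_size).mean()

null_sizes = [0, 0, 1, 1, 1, 2, 3]       # pooled bootstrap cluster sizes (toy values)
print(cluster_prob_raw(4, null_sizes))   # big cluster -> small p
print(cluster_prob_raw(1, null_sizes))   # small cluster -> larger p
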
Example #10
    def test_SplitRFE(self, fmeasure):
        # just a smoke test ATM
        from mvpa2.clfs.svm import LinearCSVMC
        from mvpa2.clfs.meta import MappedClassifier
        from mvpa2.misc.data_generators import normal_feature_dataset
        #import mvpa2.featsel.rfe
        #reload(mvpa2.featsel.rfe)
        from mvpa2.featsel.rfe import RFE, SplitRFE
        from mvpa2.generators.partition import NFoldPartitioner
        from mvpa2.featsel.helpers import FractionTailSelector
        from mvpa2.testing import ok_, assert_equal

        clf = LinearCSVMC(C=1)
        dataset = normal_feature_dataset(perlabel=20,
                                         nlabels=2,
                                         nfeatures=11,
                                         snr=1.,
                                         nonbogus_features=[1, 5])
        # flip one of the meaningful features around to see
        # if we are still getting proper selection
        dataset.samples[:, dataset.a.nonbogus_features[1]] *= -1
        # 3 partitions should be enough for testing
        partitioner = NFoldPartitioner(count=3)

        rfeclf = MappedClassifier(
            clf,
            SplitRFE(
                clf,
                partitioner,
                fselector=FractionTailSelector(0.5,
                                               mode='discard',
                                               tail='lower'),
                fmeasure=fmeasure,
                # need to update only when using clf's sens anal
                update_sensitivity=fmeasure is None))
        r0 = repr(rfeclf)

        ok_(rfeclf.mapper.nfeatures_min == 0)
        rfeclf.train(dataset)
        ok_(rfeclf.mapper.nfeatures_min > 0)
        predictions = rfeclf(dataset).samples

        # at least 1 of the nonbogus-features should be chosen
        ok_(
            len(
                set(dataset.a.nonbogus_features).intersection(
                    rfeclf.mapper.slicearg)) > 0)

        # check repr to have all needed pieces
        r = repr(rfeclf)
        s = str(rfeclf)
        ok_(('partitioner=NFoldP' in r) or
            ('partitioner=mvpa2.generators.partition.NFoldPartitioner' in r))
        ok_('lrn=' in r)
        ok_(not 'slicearg=' in r)
        assert_equal(r, r0)

        if externals.exists('joblib'):
            rfeclf.mapper.nproc = -1
            # compare results against the ones run in parallel
            _slicearg = rfeclf.mapper.slicearg
            _predictions = predictions
            rfeclf.train(dataset)
            predictions = rfeclf(dataset).samples
            assert_array_equal(predictions, _predictions)
            assert_array_equal(_slicearg, rfeclf.mapper.slicearg)

        # Test that we can collect stats from conditional attributes (ca) within cross-validation
        sensitivities = []
        nested_errors = []
        nested_nfeatures = []

        def store_me(data, node, result):
            sens = node.measure.get_sensitivity_analyzer(
                force_train=False)(data)
            sensitivities.append(sens)
            nested_errors.append(node.measure.mapper.ca.nested_errors)
            nested_nfeatures.append(node.measure.mapper.ca.nested_nfeatures)

        cv = CrossValidation(rfeclf,
                             NFoldPartitioner(count=1),
                             callback=store_me,
                             enable_ca=['stats'])
        _ = cv(dataset)
        # just to make sure we collected them
        assert_equal(len(sensitivities), 1)
        assert_equal(len(nested_errors), 1)
        assert_equal(len(nested_nfeatures), 1)