Example #1
def test_simple_cluster_level_thresholding():
    nf = 13
    nperms = 100
    pthr_feature = 0.5  # just for testing
    pthr_cluster = 0.5
    rand_acc = np.random.normal(size=(nperms, nf))
    acc = np.random.normal(size=(1, nf))

    # Step 1: fit a Nonparametric null distribution for each feature
    from mvpa2.clfs.stats import Nonparametric
    dists = [Nonparametric(samples) for samples in rand_acc.T]
    # we should be able to obtain a p-value for every random sample of every feature
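    # (rcdf is the right-tail CDF of the fitted empirical distribution, i.e.
    # roughly the fraction of permutation samples at or above a given value)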
    rand_acc_p = np.array([dist.rcdf(v)
                           for dist, v in zip(dists, rand_acc.T)]).T

    rand_acc_p_slow = np.array(
        [[dist.rcdf(v) for dist, v in zip(dists, sample)]
         for sample in rand_acc])
    assert_array_equal(rand_acc_p_slow, rand_acc_p)

    assert_equal(rand_acc_p.shape, rand_acc.shape)
    assert np.all(rand_acc_p <= 1)
    assert np.all(rand_acc_p > 0)

    # 2: apply the same to our acc
    acc_p = np.array([dist.rcdf(v) for dist, v in zip(dists, acc[0])])[None, :]
    assert np.all(acc_p <= 1)
    assert np.all(acc_p > 0)

    skip_if_no_external('scipy')
    # Now we need to do our fancy cluster level madness
    from mvpa2.algorithms.group_clusterthr import \
        get_cluster_sizes, _transform_to_pvals, get_cluster_pvals, \
        get_thresholding_map, repeat_cluster_vals
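    # (rough glossary of these helpers, inferred from their use below:
    #  get_cluster_sizes -> Counter of suprathreshold cluster sizes,
    #  get_thresholding_map -> per-feature threshold at a given p,
    #  repeat_cluster_vals -> size histogram expanded to one entry per cluster,
    #  _transform_to_pvals / get_cluster_pvals -> per-cluster p-values)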

    rand_acc_p_thr = rand_acc_p < pthr_feature
    acc_p_thr = acc_p < pthr_feature

    rand_cluster_sizes = get_cluster_sizes(rand_acc_p_thr)
    acc_cluster_sizes = get_cluster_sizes(acc_p_thr)

    # This is how the present implementation computes it.  The result will be
    # a bit different (since, if I got it right, it does not account for the
    # target value), and it works only for accuracies.
    thr_map = get_thresholding_map(rand_acc, pthr_feature)
    rand_cluster_sizes_ = get_cluster_sizes(rand_acc > thr_map)
    acc_cluster_sizes_ = get_cluster_sizes(acc > thr_map)

    assert_equal(rand_cluster_sizes, rand_cluster_sizes_)
    assert_equal(acc_cluster_sizes, acc_cluster_sizes_)


    # That is how it is done in group_clusterthr at the moment:
    # store the cluster size histogram for later p-value evaluation.
    # A sparse matrix is used for easy consumption (the max dimension is the
    # number of features, i.e. the biggest possible cluster)
    from scipy.sparse import dok_matrix
    scl = dok_matrix((1, nf + 1), dtype=int)
    for s in rand_cluster_sizes:
        scl[0, s] = rand_cluster_sizes[s]

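    # expand the {cluster_size: count} histogram into a flat vector with one
    # entry per observed cluster, so each cluster gets its own p-value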
    test_count_sizes = repeat_cluster_vals(acc_cluster_sizes)
    test_pvals = _transform_to_pvals(test_count_sizes, scl.astype('float'))
    # needs conversion to array for comparisons
    test_pvals = np.asanyarray(test_pvals)
    # the critical cluster-level threshold (without family-wise correction
    # across clusters) would be the smallest cluster size that passed
    clusters_passed_threshold = test_count_sizes[test_pvals <= pthr_cluster]
    if len(clusters_passed_threshold):
        thr_cluster_size = min(clusters_passed_threshold)

    acc_cluster_ps = get_cluster_pvals(acc_cluster_sizes, rand_cluster_sizes)

    for test_pval, test_count_size in zip(test_pvals, test_count_sizes):
        assert_almost_equal(acc_cluster_ps[test_count_size], test_pval)
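
For reference, here is a minimal self-contained sketch of the cluster-size null-distribution idea this test exercises, written against plain numpy and scipy.ndimage. The helper cluster_sizes and all variable names are illustrative rather than part of the PyMVPA API, and the p-value convention (fraction of null clusters at least as large) may differ in detail from group_clusterthr's exact accounting.

import numpy as np
from scipy.ndimage import label

def cluster_sizes(mask):
    # label connected runs of True values and return their sizes
    labeled, num = label(mask)
    return [int((labeled == i).sum()) for i in range(1, num + 1)]

rng = np.random.RandomState(0)
# 100 permutation maps over 13 features, thresholded at an arbitrary cutoff
null_maps = rng.normal(size=(100, 13)) > 0.5
# pool null cluster sizes; a map without clusters contributes a 0, matching
# the [0] convention of _get_map_cluster_sizes
null_sizes = np.hstack([cluster_sizes(m) or [0] for m in null_maps])
observed = rng.normal(size=13) > 0.5

for s in cluster_sizes(observed):
    # p-value for a cluster: fraction of null cluster sizes >= its size
    print("cluster of size %d: p = %.3f" % (s, np.mean(null_sizes >= s)))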

Example #2
def test_cluster_count():
    skip_if_no_external('scipy', min_version='0.10')
    # we get a single ZERO cluster size if there are no clusters at all;
    # this is needed to keep track of the number of bootstrap samples that
    # yield no cluster at all (a high threshold), in order to compute
    # p-values when there is no actual cluster size histogram
    assert_equal(gct._get_map_cluster_sizes([0, 0, 0, 0]), [0])
    # if there is at least one cluster: no ZERO count
    assert_equal(gct._get_map_cluster_sizes([0, 0, 1, 0]), [1])
    for i in range(2):  # rerun tests for bool type of test_M
        test_M = np.array([[1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0],
                           [0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1],
                           [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1],
                           [0, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1],
                           [1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0],
                           [0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0],
                           [1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0],
                           [0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0],
                           [1, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0]])
        expected_result = [5, 4, 3, 3, 2, 0, 2]  # 5 clusters of size 1,
                                                 # 4 clusters of size 2, ...

        test_ds = Dataset([test_M])
        if i == 1:
            test_M = test_M.astype(bool)

        test_M_3d = np.hstack(
            (test_M.flatten(), test_M.flatten())).reshape(2, 9, 16)
        test_ds_3d = Dataset([test_M_3d])
        # same clusters as expected_result, but each size doubled (the map is
        # two stacked copies), hence the interleaved zero counts
        expected_result_3d = np.array(
            [0, 5, 0, 4, 0, 3, 0, 3, 0, 2, 0, 0, 0, 2])

        size = 10000  # how many times bigger than test_M_3d
        test_M_3d_big = np.hstack((test_M_3d.flatten(), np.zeros(144)))
        # a list, not a generator: np.hstack needs a sequence of arrays
        test_M_3d_big = np.hstack(
            [test_M_3d_big for i in range(size)]).reshape(3 * size, 9, 16)
        test_ds_3d_big = Dataset([test_M_3d_big])
        expected_result_3d_big = expected_result_3d * size

        # check basic cluster size determination for plain arrays and datasets
        # with a single sample
        for t, e in ((test_M, expected_result), (test_ds, expected_result),
                     (test_M_3d, expected_result_3d), (test_ds_3d,
                                                       expected_result_3d),
                     (test_M_3d_big, expected_result_3d_big),
                     (test_ds_3d_big, expected_result_3d_big)):
            assert_array_equal(
                np.bincount(gct._get_map_cluster_sizes(t))[1:], e)
        # 'old' interface: plain stacked samples array
        M = np.vstack([test_M_3d.flatten()] * 10)
        # 'new' interface: the same samples wrapped in a Dataset
        ds = dataset_wizard([test_M_3d] * 10)
        assert_array_equal(M, ds)
        expected_result = Counter(
            np.hstack([gct._get_map_cluster_sizes(test_M_3d)] * 10))
        assert_array_equal(expected_result, gct.get_cluster_sizes(ds))

        # test the same with some arbitrary per-feature threshold
        thr = 4
        labels, num = measurements.label(test_M_3d)
        area = measurements.sum(test_M_3d,
                                labels,
                                index=np.arange(labels.max() + 1))
        cluster_sizes_map = area[labels]
        thresholded_cluster_sizes_map = cluster_sizes_map > thr
        # 'old' interface: plain stacked samples array
        M = np.vstack([cluster_sizes_map.flatten()] * 10)
        # 'new' interface: the same samples wrapped in a Dataset
        ds = dataset_wizard([cluster_sizes_map] * 10)
        assert_array_equal(M, ds)
        expected_result = Counter(
            np.hstack(
                [gct._get_map_cluster_sizes(thresholded_cluster_sizes_map)] *
                10))
        th_map = np.ones(cluster_sizes_map.flatten().shape) * thr
        # threshold dataset by hand
        ds.samples = ds.samples > th_map
        assert_array_equal(expected_result, gct.get_cluster_sizes(ds))
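
As a standalone sanity check on the bincount bookkeeping above, the same cluster sizes can be recovered directly with scipy.ndimage. The helper sizes_of is an illustrative stand-in for _get_map_cluster_sizes, including its [0] convention for maps without any cluster.

import numpy as np
from scipy.ndimage import label

def sizes_of(mask):
    labeled, num = label(mask)
    # np.bincount over the label map counts members per label;
    # slot 0 is the background, so drop it
    return list(np.bincount(labeled.ravel())[1:]) if num else [0]

row = np.array([1, 0, 1, 1, 0, 0, 1, 1, 1, 0])
print(sizes_of(row))                   # -> [1, 2, 3]
print(np.bincount(sizes_of(row))[1:])  # -> [1 1 1]: one cluster each of size 1, 2, 3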