Example #1
    def test_surface_face_normal(self, do_deterioriate_surface):
        vec1 = np.random.normal(size=(3,))
        vec2 = np.random.normal(size=(3,))

        vec_normal = -np.cross(vec1, vec2)

        plane = generate_plane((0, 0, 0), vec1, vec2, 10, 10)

        if do_deterioriate_surface:
            plane = SurfingSurfaceTests.deterioriate_surface(plane)

        plane_face_normals = plane.face_normals

        has_non_nan = False

        for f_n in plane_face_normals:
            if np.any(np.isnan(f_n)):
                continue

            assert_vector_direction_almost_equal(f_n, vec_normal, decimal=0)
            assert_almost_equal(f_n, surf.normalized(
                plane.nanmean_face_normal), decimal=0)

            has_non_nan = True

        if not has_non_nan:
            assert False, "Test should include faces with non-NaN normals"
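
The assertion against vec_normal rests on a simple geometric fact: the cross product of the two spanning vectors is orthogonal to both of them, so every face of the generated plane shares that normal direction (the leading minus presumably just matches the vertex winding that generate_plane uses). A minimal standalone sketch of that fact with plain numpy:

import numpy as np
from numpy.testing import assert_allclose

vec1 = np.random.normal(size=(3,))
vec2 = np.random.normal(size=(3,))
normal = np.cross(vec1, vec2)

# the cross product is orthogonal to both spanning vectors
assert_allclose(np.dot(normal, vec1), 0, atol=1e-8)
assert_allclose(np.dot(normal, vec2), 0, atol=1e-8)
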
Example #3
def test_balancer():
    ds = give_data()
    # only mark the selection in an attribute
    bal = Balancer()
    res = bal(ds)
    # we get a new dataset, with shared samples
    assert_false(ds is res)
    assert_true(ds.samples is res.samples.base)
    # should kick out 2 samples in each chunk of 10
    assert_almost_equal(np.mean(res.sa.balanced_set), 0.8)
    # same as above, but actually apply the selection
    bal = Balancer(apply_selection=True, count=5)
    # just run it once
    res = bal(ds)
    # we get a new dataset, with shared samples
    assert_false(ds is res)
    # should kick out 2 samples in each chunk of 10
    assert_equal(len(res), int(0.8 * len(ds)))
    # now use it as a generator
    dses = list(bal.generate(ds))
    assert_equal(len(dses), 5)
    # with limit
    bal = Balancer(limit={'chunks': 3}, apply_selection=True)
    res = bal(ds)
    assert_equal(res.sa['chunks'].unique, (3, ))
    assert_equal(get_nelements_per_value(res.sa.targets).values(), [2] * 4)
    # same but include all offlimit samples
    bal = Balancer(limit={'chunks': 3},
                   include_offlimit=True,
                   apply_selection=True)
    res = bal(ds)
    assert_array_equal(res.sa['chunks'].unique, range(10))
    # chunk three still balanced, but the rest is not, i.e. all samples included
    assert_equal(
        get_nelements_per_value(res[res.sa.chunks == 3].sa.targets).values(),
        [2] * 4)
    assert_equal(
        get_nelements_per_value(res.sa.chunks).values(),
        [10, 10, 10, 8, 10, 10, 10, 10, 10, 10])
    # fixed amount
    bal = Balancer(amount=1, limit={'chunks': 3}, apply_selection=True)
    res = bal(ds)
    assert_equal(get_nelements_per_value(res.sa.targets).values(), [1] * 4)
    # fraction
    bal = Balancer(amount=0.499, limit=None, apply_selection=True)
    res = bal(ds)
    assert_array_equal(
        np.round(
            np.array(get_nelements_per_value(ds.sa.targets).values()) * 0.5),
        np.array(get_nelements_per_value(res.sa.targets).values()))
    # check on feature attribute
    ds.fa['one'] = np.tile([1, 2], 5)
    ds.fa['chk'] = np.repeat([1, 2], 5)
    bal = Balancer(attr='one', amount=2, limit='chk', apply_selection=True)
    res = bal(ds)
    assert_equal(get_nelements_per_value(res.fa.one).values(), [4] * 2)
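
The balancing these assertions exercise can be pictured with plain numpy: per value of the balanced attribute, keep only as many samples as the rarest value has, so every value ends up equally represented. This is a sketch of the idea with hypothetical label counts, not the Balancer implementation:

import numpy as np

rng = np.random.RandomState(0)
targets = np.repeat([0, 1, 2, 3], [12, 10, 9, 11])    # hypothetical, unbalanced counts
smallest = min(np.sum(targets == c) for c in np.unique(targets))
keep = np.concatenate([rng.permutation(np.where(targets == c)[0])[:smallest]
                       for c in np.unique(targets)])
counts = [int(np.sum(targets[keep] == c)) for c in np.unique(targets)]
print(counts)    # [9, 9, 9, 9] -- every class cut down to the rarest class size
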
Example #4
def test_balancer():
    ds = give_data()
    # only mark the selection in an attribute
    bal = Balancer()
    res = bal(ds)
    # we get a new dataset, with shared samples
    assert_false(ds is res)
    assert_true(ds.samples is res.samples.base)
    # should kick out 2 samples in each chunk of 10
    assert_almost_equal(np.mean(res.sa.balanced_set), 0.8)
    # same as above, but actually apply the selection
    bal = Balancer(apply_selection=True, count=5)
    # just run it once
    res = bal(ds)
    # we get a new dataset, with shared samples
    assert_false(ds is res)
    # should kick out 2 samples in each chunk of 10
    assert_equal(len(res), int(0.8 * len(ds)))
    # now use it as a generator
    dses = list(bal.generate(ds))
    assert_equal(len(dses), 5)
    # with limit
    bal = Balancer(limit={'chunks': 3}, apply_selection=True)
    res = bal(ds)
    assert_equal(res.sa['chunks'].unique, (3,))
    assert_equal(get_nelements_per_value(res.sa.targets).values(),
                 [2] * 4)
    # same but include all offlimit samples
    bal = Balancer(limit={'chunks': 3}, include_offlimit=True,
                   apply_selection=True)
    res = bal(ds)
    assert_array_equal(res.sa['chunks'].unique, range(10))
    # chunk three still balanced, but the rest is not, i.e. all samples included
    assert_equal(get_nelements_per_value(res[res.sa.chunks == 3].sa.targets).values(),
                 [2] * 4)
    assert_equal(get_nelements_per_value(res.sa.chunks).values(),
                 [10, 10, 10, 8, 10, 10, 10, 10, 10, 10])
    # fixed amount
    bal = Balancer(amount=1, limit={'chunks': 3}, apply_selection=True)
    res = bal(ds)
    assert_equal(get_nelements_per_value(res.sa.targets).values(),
                 [1] * 4)
    # fraction
    bal = Balancer(amount=0.499, limit=None, apply_selection=True)
    res = bal(ds)
    assert_array_equal(
            np.round(np.array(get_nelements_per_value(ds.sa.targets).values()) * 0.5),
            np.array(get_nelements_per_value(res.sa.targets).values()))
    # check on feature attribute
    ds.fa['one'] = np.tile([1,2], 5)
    ds.fa['chk'] = np.repeat([1,2], 5)
    bal = Balancer(attr='one', amount=2, limit='chk', apply_selection=True)
    res = bal(ds)
    assert_equal(get_nelements_per_value(res.fa.one).values(),
                 [4] * 2)
Example #5
    def test_average_node_edge_length_tiny(self):
        a = np.random.uniform(low=2, high=5)
        b = np.random.uniform(low=2, high=5)
        c = (a ** 2 + b ** 2) ** .5

        vertices = [(0, 0, 0), (0, 0, a), (0, b, 0)]
        faces = [(0, 1, 2)]

        s = Surface(vertices, faces)
        expected_avg = [(a + b) / 2, (a + c) / 2, (b + c) / 2]
        assert_almost_equal(s.average_node_edge_length, expected_avg)
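
The expected values follow from the triangle's three edge lengths: |v0 - v1| = a, |v0 - v2| = b and |v1 - v2| = c, with each node averaging the lengths of its two incident edges (which is what the test implies average_node_edge_length computes). A quick sketch of that arithmetic for one concrete pair:

import numpy as np

a, b = 3.0, 4.0
c = np.hypot(a, b)                            # third edge, 5.0 for this pair
edges = {(0, 1): a, (0, 2): b, (1, 2): c}     # edge lengths of the single triangle
per_node = [float(np.mean([l for (i, j), l in edges.items() if node in (i, j)]))
            for node in range(3)]
print(per_node)                               # [3.5, 4.0, 4.5]
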
Example #7
def test_mean_tpr():
    # Let's test now on some disbalanced sets
    assert_raises(ValueError, mean_tpr, [1], [])
    assert_raises(ValueError, mean_tpr, [], [1])
    assert_raises(ValueError, mean_tpr, [], [])

    # now an interesting one: a label appears in the predictions but never in the targets
    assert_raises(ValueError, mean_tpr, [1], [0])
    assert_raises(ValueError, mean_tpr, [0, 1], [0, 0])
    # but it should be ok to have some targets not present in prediction
    assert_equal(mean_tpr([0, 0], [0, 1]), .5)
    # the same regardless how many samples in 0-class, if all misclassified
    # (winner by # of samples takes all)
    assert_equal(mean_tpr([0, 0, 0], [0, 0, 1]), .5)
    # whereas mean accuracy would be different
    assert_almost_equal(mean_match_accuracy([0, 0, 0], [0, 0, 1]), 2 / 3.)
Example #8
def test_mean_tpr():
    # Let's test now on some disbalanced sets
    assert_raises(ValueError, mean_tpr, [1], [])
    assert_raises(ValueError, mean_tpr, [], [1])
    assert_raises(ValueError, mean_tpr, [], [])

    # now an interesting one: a label appears in the predictions but never in the targets
    assert_raises(ValueError, mean_tpr, [1], [0])
    assert_raises(ValueError, mean_tpr, [0, 1], [0, 0])
    # but it should be ok to have some targets not present in prediction
    assert_equal(mean_tpr([0, 0], [0, 1]), .5)
    # the same regardless how many samples in 0-class, if all misclassified
    # (winner by # of samples takes all)
    assert_equal(mean_tpr([0, 0, 0], [0, 0, 1]), .5)
    # whereas mean accuracy would be different
    assert_almost_equal(mean_match_accuracy([0, 0, 0], [0, 0, 1]), 2/3.)
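
The .5 values above can be reproduced by hand if mean TPR is taken as the unweighted mean of per-class recalls over the classes present in the targets, and the arguments follow the usual errorfx order of (predicted, targets); both are assumptions here, but they are consistent with the comments and assertions above. A rough sketch:

import numpy as np

predicted = np.array([0, 0, 0])
targets = np.array([0, 0, 1])
# recall per target class: class 0 -> 2/2, class 1 -> 0/1
recalls = [np.mean(predicted[targets == c] == c) for c in np.unique(targets)]
print(np.mean(recalls))                 # (1.0 + 0.0) / 2 = 0.5
print(np.mean(predicted == targets))    # plain accuracy: 2/3
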
Example #9
def test_mean_tpr_balanced():
    # in case of the balanced sets we should expect to match mean_match_accuracy
    for nclass in range(2, 4):
        for nsample in range(1, 3):
            target = np.repeat(np.arange(nclass), nsample)
            # perfect match
            assert_equal(mean_match_accuracy(target, target), 1.0)
            assert_equal(mean_tpr(target, target), 1.0)
            # perfect mismatch -- shift by nsample, so no target matches
            estimate = np.roll(target, nsample)
            assert_equal(mean_match_accuracy(target, estimate), 0)
            assert_equal(mean_tpr(target, estimate), 0)
            # do few permutations and see if both match
            for i in range(5):
                np.random.shuffle(estimate)
                assert_equal(mean_tpr(target, estimate),
                             mean_match_accuracy(target, estimate))
                assert_almost_equal(mean_tpr(target, estimate),
                                    1 - mean_fnr(target, estimate))
Example #10
def test_mean_tpr_balanced():
    # in case of the balanced sets we should expect to match mean_match_accuracy
    for nclass in range(2, 4):
        for nsample in range(1, 3):
            target = np.repeat(np.arange(nclass), nsample)
            # perfect match
            assert_equal(mean_match_accuracy(target, target), 1.0)
            assert_equal(mean_tpr(target, target), 1.0)
            # perfect mismatch -- shift by nsample, so no target matches
            estimate = np.roll(target, nsample)
            assert_equal(mean_match_accuracy(target, estimate), 0)
            assert_equal(mean_tpr(target, estimate), 0)
            # do few permutations and see if both match
            for i in range(5):
                np.random.shuffle(estimate)
                assert_equal(
                    mean_tpr(target, estimate),
                    mean_match_accuracy(target, estimate))
                assert_almost_equal(
                    mean_tpr(target, estimate), 1-mean_fnr(target, estimate))
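
The equality being tested is easy to check directly: with the same number of samples in every class, the unweighted mean of per-class recalls and the plain matching accuracy give every sample the same weight, so they coincide. A small numeric check along those lines:

import numpy as np

target = np.repeat(np.arange(3), 4)            # 3 classes, 4 samples each
estimate = np.random.permutation(target)
recalls = [np.mean(estimate[target == c] == c) for c in np.unique(target)]
assert np.isclose(np.mean(recalls), np.mean(estimate == target))
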
Example #11
    def _assert_rotation_maps_vector(r, x, y):
        # rotation must be 3x3 numpy array
        assert_equal(r.shape, (3, 3))
        assert_is_instance(r, np.ndarray)

        # rotation applied to x must yield direction of y
        # (modulo rounding errors)
        def normed(v):
            n_v = np.linalg.norm(v)

            return 0 if n_v == 0 else v / n_v

        rx = r.dot(x)

        rx_normed = normed(rx)
        y_normed = normed(y)
        assert_vector_direction_almost_equal(rx_normed, y_normed)

        # since it is a rotation, the result must have the same
        # L2 norm as the input
        assert_almost_equal(np.linalg.norm(x), np.linalg.norm(rx))
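
A concrete instance of the property being asserted: a 90-degree rotation about the z axis maps the x axis onto the y axis while leaving the L2 norm unchanged.

import numpy as np

theta = np.pi / 2
r = np.array([[np.cos(theta), -np.sin(theta), 0.0],
              [np.sin(theta),  np.cos(theta), 0.0],
              [0.0,            0.0,           1.0]])
x = np.array([2.0, 0.0, 0.0])
rx = r.dot(x)
assert np.allclose(rx, [0.0, 2.0, 0.0])                    # now points along y
assert np.isclose(np.linalg.norm(rx), np.linalg.norm(x))   # rotation preserves the norm
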
Example #13
def test_gifti_dataset_with_anatomical_surface(fn, format_, include_nodes):
    ds = _get_test_dataset(include_nodes)

    nsamples, nfeatures = ds.shape
    vertices = np.random.normal(size=(nfeatures, 3))
    faces = np.asarray([i + np.arange(3)
                        for i in range(2 * nfeatures)]) % nfeatures
    surf = Surface(vertices, faces)

    img = map2gifti(ds, surface=surf)

    arr_index = 0

    if include_nodes:
        # check node indices
        node_arr = img.darrays[arr_index]
        assert_equal(node_arr.intent,
                     intent_codes.code['NIFTI_INTENT_NODE_INDEX'])
        assert_equal(node_arr.coordsys, None)
        assert_equal(node_arr.data.dtype, np.int32)
        assert_equal(node_arr.datatype, data_type_codes['int32'])

        arr_index += 1

    for sample in ds.samples:
        # check sample content
        arr = img.darrays[arr_index]
        data = arr.data
        assert_almost_equal(data, sample)
        assert_equal(arr.coordsys, None)
        assert_equal(arr.data.dtype, np.float32)
        assert_equal(arr.datatype, data_type_codes['float32'])

        arr_index += 1

    # check vertices
    vertex_arr = img.darrays[arr_index]
    assert_almost_equal(vertex_arr.data, vertices)
    assert_equal(vertex_arr.data.dtype, np.float32)
    assert_equal(vertex_arr.datatype, data_type_codes['float32'])

    # check faces
    arr_index += 1
    face_arr = img.darrays[arr_index]
    assert_almost_equal(face_arr.data, faces)
    assert_equal(face_arr.data.dtype, np.int32)
    assert_equal(face_arr.datatype, data_type_codes['int32'])

    # getting the functional data should ignore faces and vertices
    ds_again = gifti_dataset(img)
    assert_datasets_almost_equal(ds, ds_again)
Example #14
def test_gifti_dataset_with_anatomical_surface(fn, format_, include_nodes):
    ds = _get_test_dataset(include_nodes)

    nsamples, nfeatures = ds.shape
    vertices = np.random.normal(size=(nfeatures, 3))
    faces = np.asarray([i + np.arange(3) for i in xrange(2 * nfeatures)]) % nfeatures
    surf = Surface(vertices, faces)

    img = map2gifti(ds, surface=surf)

    arr_index = 0

    if include_nodes:
        # check node indices
        node_arr = img.darrays[arr_index]
        assert_equal(node_arr.intent,
                     intent_codes.code['NIFTI_INTENT_NODE_INDEX'])
        assert_equal(node_arr.coordsys, None)
        assert_equal(node_arr.data.dtype, np.int32)
        assert_equal(node_arr.datatype, data_type_codes['int32'])

        arr_index += 1

    for sample in ds.samples:
        # check sample content
        arr = img.darrays[arr_index]
        data = arr.data
        assert_almost_equal(data, sample)
        assert_equal(arr.coordsys, None)
        assert_equal(arr.data.dtype, np.float32)
        assert_equal(arr.datatype, data_type_codes['float32'])

        arr_index += 1

    # check vertices
    vertex_arr = img.darrays[arr_index]
    assert_almost_equal(vertex_arr.data, vertices)
    assert_equal(vertex_arr.data.dtype, np.float32)
    assert_equal(vertex_arr.datatype, data_type_codes['float32'])

    # check faces
    arr_index += 1
    face_arr = img.darrays[arr_index]
    assert_almost_equal(face_arr.data, faces)
    assert_equal(face_arr.data.dtype, np.int32)
    assert_equal(face_arr.datatype, data_type_codes['int32'])

    # getting the functional data should ignore faces and vertices
    ds_again = gifti_dataset(img)
    assert_datasets_almost_equal(ds, ds_again)
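
The traversal above assumes a fixed darray layout in the image: an optional node-index array (when include_nodes is set), one data array per sample, then the vertex array and the face array. A count-only sketch with hypothetical values:

# hypothetical sample count and flag, only to illustrate the expected layout
nsamples, include_nodes = 4, True
expected_n_darrays = (1 if include_nodes else 0) + nsamples + 2   # + vertices + faces
print(expected_n_darrays)    # 7
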
Example #15
def test_balancer():
    ds = give_data()
    ds.sa['ids'] = np.arange(len(ds))  # some sa to ease tracking of samples

    # only mark the selection in an attribute
    bal = Balancer()
    res = bal(ds)
    # we get a new dataset, with shared samples
    assert_false(ds is res)
    assert_true(ds.samples is res.samples.base)
    # should kick out 2 samples in each chunk of 10
    assert_almost_equal(np.mean(res.sa.balanced_set), 0.8)
    # same as above, but actually apply the selection
    bal = Balancer(apply_selection=True, count=5)
    # just run it once
    res = bal(ds)
    # we get a new dataset, with shared samples
    assert_false(ds is res)
    # should kick out 2 samples in each chunk of 10
    assert_equal(len(res), int(0.8 * len(ds)))
    # now use it as a generator
    dses = list(bal.generate(ds))
    assert_equal(len(dses), 5)

    # if we rerun again, it would be a different selection
    res2 = bal(ds)
    assert_true(np.any(res.sa.ids != bal(ds).sa.ids))

    # but if we create a balancer providing seed rng int,
    # should be identical results
    bal = Balancer(apply_selection=True, count=5, rng=1)
    assert_false(np.any(bal(ds).sa.ids != bal(ds).sa.ids))

    # But results should differ if we use .generate to produce those multiple
    # balanced datasets
    b = Balancer(apply_selection=True, count=3, rng=1)
    balanced = list(b.generate(ds))
    assert_false(all(balanced[0].sa.ids == balanced[1].sa.ids))
    assert_false(all(balanced[0].sa.ids == balanced[2].sa.ids))
    assert_false(all(balanced[1].sa.ids == balanced[2].sa.ids))

    # And should be exactly the same
    for ds_a, ds_b in zip(balanced, b.generate(ds)):
        assert_datasets_equal(ds_a, ds_b)

    # Contribution by Chris Markiewicz
    # And interleaving __call__ and generator fetches
    gen1 = b.generate(ds)
    gen2 = b.generate(ds)

    seq1, seq2, seq3 = [], [], []

    for i in xrange(3):
        seq1.append(gen1.next())
        seq2.append(gen2.next())
        seq3.append(b(ds))

    # Produces expected sequences

    for i in xrange(3):
        assert_datasets_equal(balanced[i], seq1[i])
        assert_datasets_equal(balanced[i], seq2[i])

    # And all __call__s return the same result
    ds_a = seq3[0]
    for ds_b in seq3[1:]:
        assert_array_equal(ds_a.sa.ids, ds_b.sa.ids)

    # with limit
    bal = Balancer(limit={'chunks': 3}, apply_selection=True)
    res = bal(ds)
    assert_equal(res.sa['chunks'].unique, (3,))
    assert_equal(get_nelements_per_value(res.sa.targets).values(),
                 [2] * 4)
    # same but include all offlimit samples
    bal = Balancer(limit={'chunks': 3}, include_offlimit=True,
                   apply_selection=True)
    res = bal(ds)
    assert_array_equal(res.sa['chunks'].unique, range(10))
    # chunk three still balanced, but the rest is not, i.e. all samples included
    assert_equal(get_nelements_per_value(res[res.sa.chunks == 3].sa.targets).values(),
                 [2] * 4)
    assert_equal(get_nelements_per_value(res.sa.chunks).values(),
                 [10, 10, 10, 8, 10, 10, 10, 10, 10, 10])
    # fixed amount
    bal = Balancer(amount=1, limit={'chunks': 3}, apply_selection=True)
    res = bal(ds)
    assert_equal(get_nelements_per_value(res.sa.targets).values(),
                 [1] * 4)
    # fraction
    bal = Balancer(amount=0.499, limit=None, apply_selection=True)
    res = bal(ds)
    assert_array_equal(
            np.round(np.array(get_nelements_per_value(ds.sa.targets).values()) * 0.5),
            np.array(get_nelements_per_value(res.sa.targets).values()))
    # check on feature attribute
    ds.fa['one'] = np.tile([1, 2], 5)
    ds.fa['chk'] = np.repeat([1, 2], 5)
    bal = Balancer(attr='one', amount=2, limit='chk', apply_selection=True)
    res = bal(ds)
    assert_equal(get_nelements_per_value(res.fa.one).values(),
                 [4] * 2)
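
The rng=1 assertions rely on standard seeded-RNG behaviour: the same integer seed reproduces the same draws, while successive draws from a single generator keep changing. A sketch of just that seeding semantics with numpy (an illustration of what the test presumably relies on, not of Balancer internals):

import numpy as np

a, b = np.random.RandomState(1), np.random.RandomState(1)
assert np.array_equal(a.uniform(size=5), b.uniform(size=5))       # same seed, same draws
c = np.random.RandomState(1)
assert not np.array_equal(c.uniform(size=5), c.uniform(size=5))   # successive draws differ
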
Example #16
def assert_vector_direction_almost_equal(x, y, *args, **kwargs):
    n_x = np.linalg.norm(x)
    n_y = np.linalg.norm(y)

    assert_almost_equal(np.dot(x, y), n_x * n_y, *args, **kwargs)
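
The helper relies on the equality case of the Cauchy-Schwarz inequality: np.dot(x, y) equals |x| * |y| exactly when x and y point in the same direction, so positive rescaling keeps the assertion true while any change of direction breaks it. For example:

import numpy as np

x = np.array([1.0, 2.0, 2.0])
y = 3.0 * x                     # same direction, different length
assert np.isclose(np.dot(x, y), np.linalg.norm(x) * np.linalg.norm(y))
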
Example #18
def test_simple_cluster_level_thresholding():
    nf = 13
    nperms = 100
    pthr_feature = 0.5  # just for testing
    pthr_cluster = 0.5
    rand_acc = np.random.normal(size=(nperms, nf))
    acc = np.random.normal(size=(1, nf))

    # Step 1 is to "fit" "Nonparametrics" per each of the features
    from mvpa2.clfs.stats import Nonparametric
    dists = [Nonparametric(samples) for samples in rand_acc.T]
    # we should be able to assess the "p" value of each random sample for each feature
    rand_acc_p = np.array([dist.rcdf(v)
                           for dist, v in zip(dists, rand_acc.T)]).T

    rand_acc_p_slow = np.array(
        [[dist.rcdf(v) for dist, v in zip(dists, sample)]
         for sample in rand_acc])
    assert_array_equal(rand_acc_p_slow, rand_acc_p)

    assert_equal(rand_acc_p.shape, rand_acc.shape)
    assert (np.all(rand_acc_p <= 1))
    assert (np.all(rand_acc_p > 0))

    # 2: apply the same to our acc
    acc_p = np.array([dist.rcdf(v) for dist, v in zip(dists, acc[0])])[None, :]
    assert (np.all(acc_p <= 1))
    assert (np.all(acc_p > 0))

    skip_if_no_external('scipy')
    # Now we need to do our fancy cluster level madness
    from mvpa2.algorithms.group_clusterthr import \
        get_cluster_sizes, _transform_to_pvals, get_cluster_pvals, \
        get_thresholding_map, repeat_cluster_vals

    rand_acc_p_thr = rand_acc_p < pthr_feature
    acc_p_thr = acc_p < pthr_feature

    rand_cluster_sizes = get_cluster_sizes(rand_acc_p_thr)
    acc_cluster_sizes = get_cluster_sizes(acc_p_thr)

    # This is how we can compute it within present implementation.
    # It will be a bit different (since it doesn't account for target value if
    # I got it right), and would work only for accuracies
    thr_map = get_thresholding_map(rand_acc, pthr_feature)
    rand_cluster_sizes_ = get_cluster_sizes(rand_acc > thr_map)
    acc_cluster_sizes_ = get_cluster_sizes(acc > thr_map)

    assert_equal(rand_cluster_sizes, rand_cluster_sizes_)
    assert_equal(acc_cluster_sizes, acc_cluster_sizes_)

    #print rand_cluster_sizes
    #print acc_cluster_sizes

    # That is how it is done in group_clusterthr atm
    # store cluster size histogram for later p-value evaluation
    # use a sparse matrix for easy consumption (max dim is the number of
    # features, i.e. biggest possible cluster)
    from scipy.sparse import dok_matrix
    scl = dok_matrix((1, nf + 1), dtype=int)
    for s in rand_cluster_sizes:
        scl[0, s] = rand_cluster_sizes[s]

    test_count_sizes = repeat_cluster_vals(acc_cluster_sizes)
    test_pvals = _transform_to_pvals(test_count_sizes, scl.astype('float'))
    # needs conversion to array for comparisons
    test_pvals = np.asanyarray(test_pvals)
    # critical cluster_level threshold (without FW correction between clusters)
    # would be
    clusters_passed_threshold = test_count_sizes[test_pvals <= pthr_cluster]

    if len(clusters_passed_threshold):
        thr_cluster_size = min(clusters_passed_threshold)
        #print("Min cluster size which passed threshold: %d" % thr_cluster_size)
    else:
        #print("No clusters passed threshold")
        pass
    #print test_count_sizes, test_pvals

    acc_cluster_ps = get_cluster_pvals(acc_cluster_sizes, rand_cluster_sizes)

    for test_pval, test_count_size in zip(test_pvals, test_count_sizes):
        assert_almost_equal(acc_cluster_ps[test_count_size], test_pval)
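
The rcdf calls above produce empirical right-tail p-values: the probability, under the permutation distribution, of seeing a value at least as large as the observed one. A rough plain-numpy sketch of that idea (the actual Nonparametric implementation in mvpa2.clfs.stats may handle ties and tails differently):

import numpy as np

null = np.random.normal(size=1000)    # stand-in for one feature's permutation values
value = 1.5
p_right = np.mean(null >= value)      # fraction of null samples >= the observed value
print(p_right)                        # small p -> value sits far in the right tail
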
Example #19
def test_balancer():
    ds = give_data()
    ds.sa['ids'] = np.arange(len(ds))  # some sa to ease tracking of samples

    # only mark the selection in an attribute
    bal = Balancer()
    res = bal(ds)
    # we get a new dataset, with shared samples
    assert_false(ds is res)
    assert_true(ds.samples is res.samples.base)
    # should kick out 2 samples in each chunk of 10
    assert_almost_equal(np.mean(res.sa.balanced_set), 0.8)
    # same as above, but actually apply the selection
    bal = Balancer(apply_selection=True, count=5)
    # just run it once
    res = bal(ds)
    # we get a new dataset, with shared samples
    assert_false(ds is res)
    # should kick out 2 samples in each chunk of 10
    assert_equal(len(res), int(0.8 * len(ds)))
    # now use it as a generator
    dses = list(bal.generate(ds))
    assert_equal(len(dses), 5)

    # if we rerun again, it would be a different selection
    res2 = bal(ds)
    assert_true(np.any(res.sa.ids != bal(ds).sa.ids))

    # but if we create a balancer providing seed rng int,
    # should be identical results
    bal = Balancer(apply_selection=True, count=5, rng=1)
    assert_false(np.any(bal(ds).sa.ids != bal(ds).sa.ids))

    # But results should differ if we use .generate to produce those multiple
    # balanced datasets
    b = Balancer(apply_selection=True, count=3, rng=1)
    balanced = list(b.generate(ds))
    assert_false(all(balanced[0].sa.ids == balanced[1].sa.ids))
    assert_false(all(balanced[0].sa.ids == balanced[2].sa.ids))
    assert_false(all(balanced[1].sa.ids == balanced[2].sa.ids))

    # And should be exactly the same
    for ds_a, ds_b in zip(balanced, b.generate(ds)):
        assert_datasets_equal(ds_a, ds_b)

    # Contribution by Chris Markiewicz
    # And interleaving __call__ and generator fetches
    gen1 = b.generate(ds)
    gen2 = b.generate(ds)

    seq1, seq2, seq3 = [], [], []

    for i in xrange(3):
        seq1.append(gen1.next())
        seq2.append(gen2.next())
        seq3.append(b(ds))

    # Produces expected sequences

    for i in xrange(3):
        assert_datasets_equal(balanced[i], seq1[i])
        assert_datasets_equal(balanced[i], seq2[i])

    # And all __call__s return the same result
    ds_a = seq3[0]
    for ds_b in seq3[1:]:
        assert_array_equal(ds_a.sa.ids, ds_b.sa.ids)

    # with limit
    bal = Balancer(limit={'chunks': 3}, apply_selection=True)
    res = bal(ds)
    assert_equal(res.sa['chunks'].unique, (3, ))
    assert_equal(get_nelements_per_value(res.sa.targets).values(), [2] * 4)
    # same but include all offlimit samples
    bal = Balancer(limit={'chunks': 3},
                   include_offlimit=True,
                   apply_selection=True)
    res = bal(ds)
    assert_array_equal(res.sa['chunks'].unique, range(10))
    # chunk three still balanced, but the rest is not, i.e. all samples included
    assert_equal(
        get_nelements_per_value(res[res.sa.chunks == 3].sa.targets).values(),
        [2] * 4)
    assert_equal(
        get_nelements_per_value(res.sa.chunks).values(),
        [10, 10, 10, 8, 10, 10, 10, 10, 10, 10])
    # fixed amount
    bal = Balancer(amount=1, limit={'chunks': 3}, apply_selection=True)
    res = bal(ds)
    assert_equal(get_nelements_per_value(res.sa.targets).values(), [1] * 4)
    # fraction
    bal = Balancer(amount=0.499, limit=None, apply_selection=True)
    res = bal(ds)
    assert_array_equal(
        np.round(
            np.array(get_nelements_per_value(ds.sa.targets).values()) * 0.5),
        np.array(get_nelements_per_value(res.sa.targets).values()))
    # check on feature attribute
    ds.fa['one'] = np.tile([1, 2], 5)
    ds.fa['chk'] = np.repeat([1, 2], 5)
    bal = Balancer(attr='one', amount=2, limit='chk', apply_selection=True)
    res = bal(ds)
    assert_equal(get_nelements_per_value(res.fa.one).values(), [4] * 2)
Example #20
def test_simple_cluster_level_thresholding():
    nf = 13
    nperms = 100
    pthr_feature = 0.5  # just for testing
    pthr_cluster = 0.5
    rand_acc = np.random.normal(size=(nperms, nf))
    acc = np.random.normal(size=(1, nf))

    # Step 1 is to "fit" "Nonparametrics" per each of the features
    from mvpa2.clfs.stats import Nonparametric
    dists = [Nonparametric(samples) for samples in rand_acc.T]
    # we should be able to assess the "p" value of each random sample for each feature
    rand_acc_p = np.array(
        [dist.rcdf(v) for dist, v in zip(dists, rand_acc.T)]
        ).T

    rand_acc_p_slow = np.array([
        [dist.rcdf(v) for dist, v in zip(dists, sample)]
         for sample in rand_acc])
    assert_array_equal(rand_acc_p_slow, rand_acc_p)

    assert_equal(rand_acc_p.shape, rand_acc.shape)
    assert(np.all(rand_acc_p <= 1))
    assert(np.all(rand_acc_p > 0))

    # 2: apply the same to our acc
    acc_p = np.array([dist.rcdf(v) for dist, v in zip(dists, acc[0])])[None, :]
    assert(np.all(acc_p <= 1))
    assert(np.all(acc_p > 0))

    skip_if_no_external('scipy')
    # Now we need to do our fancy cluster level madness
    from mvpa2.algorithms.group_clusterthr import \
        get_cluster_sizes, _transform_to_pvals, get_cluster_pvals, \
        get_thresholding_map, repeat_cluster_vals

    rand_acc_p_thr = rand_acc_p < pthr_feature
    acc_p_thr = acc_p < pthr_feature

    rand_cluster_sizes = get_cluster_sizes(rand_acc_p_thr)
    acc_cluster_sizes = get_cluster_sizes(acc_p_thr)

    # This is how we can compute it within present implementation.
    # It will be a bit different (since it doesn't account for target value if
    # I got it right), and would work only for accuracies
    thr_map = get_thresholding_map(rand_acc, pthr_feature)
    rand_cluster_sizes_ = get_cluster_sizes(rand_acc > thr_map)
    acc_cluster_sizes_ = get_cluster_sizes(acc > thr_map)

    assert_equal(rand_cluster_sizes, rand_cluster_sizes_)
    assert_equal(acc_cluster_sizes, acc_cluster_sizes_)

    #print rand_cluster_sizes
    #print acc_cluster_sizes

    # That is how it is done in group_clusterthr atm
    # store cluster size histogram for later p-value evaluation
    # use a sparse matrix for easy consumption (max dim is the number of
    # features, i.e. biggest possible cluster)
    from scipy.sparse import dok_matrix
    scl = dok_matrix((1, nf + 1), dtype=int)
    for s in rand_cluster_sizes:
        scl[0, s] = rand_cluster_sizes[s]

    test_count_sizes = repeat_cluster_vals(acc_cluster_sizes)
    test_pvals = _transform_to_pvals(test_count_sizes, scl.astype('float'))
    # needs conversion to array for comparisons
    test_pvals = np.asanyarray(test_pvals)
    # critical cluster_level threshold (without FW correction between clusters)
    # would be
    clusters_passed_threshold = test_count_sizes[test_pvals <= pthr_cluster]

    if len(clusters_passed_threshold):
        thr_cluster_size = min(clusters_passed_threshold)
        #print("Min cluster size which passed threshold: %d" % thr_cluster_size)
    else:
        #print("No clusters passed threshold")
        pass
    #print test_count_sizes, test_pvals


    acc_cluster_ps = get_cluster_pvals(acc_cluster_sizes, rand_cluster_sizes)

    for test_pval, test_count_size in zip(test_pvals, test_count_sizes):
        assert_almost_equal(acc_cluster_ps[test_count_size], test_pval)