Exemple #1
0
def test_factorialpartitioner():
    # Test against sifter and chainmap implemented in test_usecases
    # -- code below copied from test_usecases --
    # Let's simulate the beast -- 6 categories total groupped into 3
    # super-ordinate, and actually without any 'superordinate' effect
    # since subordinate categories independent
    ds = normal_feature_dataset(
        nlabels=6,
        snr=100,  # pure signal! ;)
        perlabel=30,
        nfeatures=6,
        nonbogus_features=range(6),
        nchunks=5)
    ds.sa['subord'] = ds.sa.targets.copy()
    ds.sa['superord'] = ['super%d' % (int(i[1]) % 3, )
                         for i in ds.targets]  # 3 superord categories
    # let's override original targets just to be sure that we aren't relying on them
    ds.targets[:] = 0

    # let's make two other datasets to test later
    # one superordinate category only
    ds_1super = ds.copy()
    ds_1super.sa['superord'] = ['super1' for i in ds_1super.targets]

    # one superordinate category has only one subordinate
    #ds_unbalanced = ds.copy()
    #nsuper1 = np.sum(ds_unbalanced.sa.superord == 'super1')
    #mask_superord = ds_unbalanced.sa.superord == 'super1'
    #uniq_subord = np.unique(ds_unbalanced.sa.subord[mask_superord])
    #ds_unbalanced.sa.subord[mask_superord] = [uniq_subord[0] for i in range(nsuper1)]
    ds_unbalanced = Dataset(range(4),
                            sa={
                                'subord': [0, 0, 1, 2],
                                'superord': [1, 1, 2, 2]
                            })

    npart = ChainNode(
        [
            ## so we split based on superord
            NFoldPartitioner(len(ds.sa['superord'].unique), attr='subord'),
            ## so it should select only those splits where we took 1 from
            ## each of the superord categories leaving things in balance
            Sifter([('partitions', 2),
                    ('superord', {
                        'uvalues': ds.sa['superord'].unique,
                        'balanced': True
                    })]),
        ],
        space='partitions')

    # now the new implementation
    factpart = FactorialPartitioner(NFoldPartitioner(attr='subord'),
                                    attr='superord')

    partitions_npart = [p.sa.partitions for p in npart.generate(ds)]
    partitions_factpart = [p.sa.partitions for p in factpart.generate(ds)]

    assert_array_equal(np.sort(partitions_npart), np.sort(partitions_factpart))

    # now let's check it behaves correctly if we have only one superord class
    nfold = NFoldPartitioner(attr='subord')
    partitions_nfold = [p.sa.partitions for p in nfold.generate(ds_1super)]
    partitions_factpart = [
        p.sa.partitions for p in factpart.generate(ds_1super)
    ]
    assert_array_equal(np.sort(partitions_nfold), np.sort(partitions_factpart))

    # smoke test for unbalanced subord classes
    warning_msg = 'One or more superordinate attributes do not have the same '\
                  'number of subordinate attributes. This could yield to '\
                  'unbalanced partitions.'
    with assert_warnings([(RuntimeWarning, warning_msg)]):
        partitions_factpart = [
            p.sa.partitions for p in factpart.generate(ds_unbalanced)
        ]

    partitions_unbalanced = [np.array([2, 2, 2, 1]), np.array([2, 2, 1, 2])]
    superord_unbalanced = [([2], [1, 1, 2]), ([2], [1, 1, 2])]
    subord_unbalanced = [([2], [0, 0, 1]), ([1], [0, 0, 2])]

    for out_part, true_part, super_out, sub_out in \
            zip(partitions_factpart, partitions_unbalanced,
                superord_unbalanced, subord_unbalanced):
        assert_array_equal(out_part, true_part)
        assert_array_equal((ds_unbalanced[out_part == 1].sa.superord.tolist(),
                            ds_unbalanced[out_part == 2].sa.superord.tolist()),
                           super_out)
        assert_array_equal((ds_unbalanced[out_part == 1].sa.subord.tolist(),
                            ds_unbalanced[out_part == 2].sa.subord.tolist()),
                           sub_out)

    # now let's test on a dummy dataset
    ds_dummy = Dataset(range(4),
                       sa={
                           'subord': range(4),
                           'superord': [1, 2] * 2
                       })
    partitions_factpart = [
        p.sa.partitions for p in factpart.generate(ds_dummy)
    ]
    assert_array_equal(
        partitions_factpart,
        [[2, 2, 1, 1], [2, 1, 1, 2], [1, 2, 2, 1], [1, 1, 2, 2]])
Exemple #2
0
def test_factorialpartitioner():
    # Test against sifter and chainmap implemented in test_usecases
    # -- code below copied from test_usecases --
    # Let's simulate the beast -- 6 categories total groupped into 3
    # super-ordinate, and actually without any 'superordinate' effect
    # since subordinate categories independent
    ds = normal_feature_dataset(
        nlabels=6, snr=100, perlabel=30, nfeatures=6, nonbogus_features=range(6), nchunks=5  # pure signal! ;)
    )
    ds.sa["subord"] = ds.sa.targets.copy()
    ds.sa["superord"] = ["super%d" % (int(i[1]) % 3,) for i in ds.targets]  # 3 superord categories
    # let's override original targets just to be sure that we aren't relying on them
    ds.targets[:] = 0

    # let's make two other datasets to test later
    # one superordinate category only
    ds_1super = ds.copy()
    ds_1super.sa["superord"] = ["super1" for i in ds_1super.targets]

    # one superordinate category has only one subordinate
    # ds_unbalanced = ds.copy()
    # nsuper1 = np.sum(ds_unbalanced.sa.superord == 'super1')
    # mask_superord = ds_unbalanced.sa.superord == 'super1'
    # uniq_subord = np.unique(ds_unbalanced.sa.subord[mask_superord])
    # ds_unbalanced.sa.subord[mask_superord] = [uniq_subord[0] for i in range(nsuper1)]
    ds_unbalanced = Dataset(range(4), sa={"subord": [0, 0, 1, 2], "superord": [1, 1, 2, 2]})

    npart = ChainNode(
        [
            ## so we split based on superord
            NFoldPartitioner(len(ds.sa["superord"].unique), attr="subord"),
            ## so it should select only those splits where we took 1 from
            ## each of the superord categories leaving things in balance
            Sifter([("partitions", 2), ("superord", {"uvalues": ds.sa["superord"].unique, "balanced": True})]),
        ],
        space="partitions",
    )

    # now the new implementation
    factpart = FactorialPartitioner(NFoldPartitioner(attr="subord"), attr="superord")

    partitions_npart = [p.sa.partitions for p in npart.generate(ds)]
    partitions_factpart = [p.sa.partitions for p in factpart.generate(ds)]

    assert_array_equal(np.sort(partitions_npart), np.sort(partitions_factpart))

    # now let's check it behaves correctly if we have only one superord class
    nfold = NFoldPartitioner(attr="subord")
    partitions_nfold = [p.sa.partitions for p in nfold.generate(ds_1super)]
    partitions_factpart = [p.sa.partitions for p in factpart.generate(ds_1super)]
    assert_array_equal(np.sort(partitions_nfold), np.sort(partitions_factpart))

    # smoke test for unbalanced subord classes
    warning_msg = (
        "One or more superordinate attributes do not have the same "
        "number of subordinate attributes. This could yield to "
        "unbalanced partitions."
    )
    with assert_warnings([(RuntimeWarning, warning_msg)]):
        partitions_factpart = [p.sa.partitions for p in factpart.generate(ds_unbalanced)]

    partitions_unbalanced = [np.array([2, 2, 2, 1]), np.array([2, 2, 1, 2])]
    superord_unbalanced = [([2], [1, 1, 2]), ([2], [1, 1, 2])]
    subord_unbalanced = [([2], [0, 0, 1]), ([1], [0, 0, 2])]

    for out_part, true_part, super_out, sub_out in zip(
        partitions_factpart, partitions_unbalanced, superord_unbalanced, subord_unbalanced
    ):
        assert_array_equal(out_part, true_part)
        assert_array_equal(
            (ds_unbalanced[out_part == 1].sa.superord.tolist(), ds_unbalanced[out_part == 2].sa.superord.tolist()),
            super_out,
        )
        assert_array_equal(
            (ds_unbalanced[out_part == 1].sa.subord.tolist(), ds_unbalanced[out_part == 2].sa.subord.tolist()), sub_out
        )

    # now let's test on a dummy dataset
    ds_dummy = Dataset(range(4), sa={"subord": range(4), "superord": [1, 2] * 2})
    partitions_factpart = [p.sa.partitions for p in factpart.generate(ds_dummy)]
    assert_array_equal(partitions_factpart, [[2, 2, 1, 1], [2, 1, 1, 2], [1, 2, 2, 1], [1, 1, 2, 2]])
def test_factorialpartitioner():
    # Test against sifter and chainmap implemented in test_usecases
    # -- code below copied from test_usecases --
    # Let's simulate the beast -- 6 categories total groupped into 3
    # super-ordinate, and actually without any 'superordinate' effect
    # since subordinate categories independent
    ds = normal_feature_dataset(
        nlabels=6,
        snr=100,  # pure signal! ;)
        perlabel=30,
        nfeatures=6,
        nonbogus_features=range(6),
        nchunks=5)
    ds.sa['subord'] = ds.sa.targets.copy()
    ds.sa['superord'] = ['super%d' % (int(i[1]) % 3, )
                         for i in ds.targets]  # 3 superord categories
    # let's override original targets just to be sure that we aren't relying on them
    ds.targets[:] = 0

    # let's make two other datasets to test later
    # one superordinate category only
    ds_1super = ds.copy()
    ds_1super.sa['superord'] = ['super1' for i in ds_1super.targets]

    # one superordinate category has only one subordinate
    #ds_unbalanced = ds.copy()
    #nsuper1 = np.sum(ds_unbalanced.sa.superord == 'super1')
    #mask_superord = ds_unbalanced.sa.superord == 'super1'
    #uniq_subord = np.unique(ds_unbalanced.sa.subord[mask_superord])
    #ds_unbalanced.sa.subord[mask_superord] = [uniq_subord[0] for i in range(nsuper1)]
    ds_unbalanced = Dataset(range(4),
                            sa={
                                'subord': [0, 0, 1, 2],
                                'superord': [1, 1, 2, 2]
                            })

    npart = ChainNode(
        [
            ## so we split based on superord
            NFoldPartitioner(len(ds.sa['superord'].unique), attr='subord'),
            ## so it should select only those splits where we took 1 from
            ## each of the superord categories leaving things in balance
            Sifter([('partitions', 2),
                    ('superord', {
                        'uvalues': ds.sa['superord'].unique,
                        'balanced': True
                    })]),
        ],
        space='partitions')

    def partition(partitioner, ds_=ds):
        return [p.sa.partitions for p in partitioner.generate(ds_)]

    # now the new implementation
    # common kwargs
    factkw = dict(partitioner=NFoldPartitioner(attr='subord'), attr='superord')

    fpart = FactorialPartitioner(**factkw)
    p_npart = partition(npart)
    p_fpart = partition(fpart)

    assert_array_equal(np.sort(p_npart), np.sort(p_fpart))

    fpart2 = FactorialPartitioner(count=2,
                                  selection_strategy='first',
                                  **factkw)
    p_fpart2 = partition(fpart2)
    assert_equal(len(p_fpart), 8)
    assert_equal(len(p_fpart2), 2)
    assert_array_equal(p_fpart[:2], p_fpart2)

    # 1 equidistant -- should be the first one
    fpart1 = FactorialPartitioner(count=1, **factkw)
    p_fpart1 = partition(fpart1)
    assert_equal(len(p_fpart1), 1)
    assert_array_equal(p_fpart[:1], p_fpart1)

    # 2 equidistant
    fpart2 = FactorialPartitioner(count=2, **factkw)
    p_fpart2 = partition(fpart2)
    assert_equal(len(p_fpart2), 2)
    assert_array_equal(p_fpart[::4], p_fpart2)

    # without count -- should be all of them in original order
    fpartr = FactorialPartitioner(selection_strategy='random', **factkw)
    assert_array_equal(p_fpart, partition(fpartr))

    # but if with a count we should get some selection
    fpartr2 = FactorialPartitioner(selection_strategy='random',
                                   count=2,
                                   **factkw)
    # Let's generate a number of random selections:
    rand2_partitions = [partition(fpartr2) for i in xrange(10)]
    for p in rand2_partitions:
        assert_equal(len(p), 2)
    # majority of them must be different
    assert len(set([tuple(map(tuple, x)) for x in rand2_partitions])) >= 5

    # now let's check it behaves correctly if we have only one superord class
    nfold = NFoldPartitioner(attr='subord')
    p_nfold = partition(nfold, ds_1super)
    p_fpart = partition(fpart, ds_1super)
    assert_array_equal(np.sort(p_nfold), np.sort(p_fpart))

    # smoke test for unbalanced subord classes
    warning_msg = 'One or more superordinate attributes do not have the same '\
                  'number of subordinate attributes. This could yield to '\
                  'unbalanced partitions.'
    with assert_warnings([(RuntimeWarning, warning_msg)]):
        p_fpart = partition(fpart, ds_unbalanced)

    p_unbalanced = [np.array([2, 2, 2, 1]), np.array([2, 2, 1, 2])]
    superord_unbalanced = [([2], [1, 1, 2]), ([2], [1, 1, 2])]
    subord_unbalanced = [([2], [0, 0, 1]), ([1], [0, 0, 2])]

    for out_part, true_part, super_out, sub_out in \
            zip(p_fpart, p_unbalanced,
                superord_unbalanced, subord_unbalanced):
        assert_array_equal(out_part, true_part)
        assert_array_equal((ds_unbalanced[out_part == 1].sa.superord.tolist(),
                            ds_unbalanced[out_part == 2].sa.superord.tolist()),
                           super_out)
        assert_array_equal((ds_unbalanced[out_part == 1].sa.subord.tolist(),
                            ds_unbalanced[out_part == 2].sa.subord.tolist()),
                           sub_out)

    # now let's test on a dummy dataset
    ds_dummy = Dataset(range(4),
                       sa={
                           'subord': range(4),
                           'superord': [1, 2] * 2
                       })
    p_fpart = partition(fpart, ds_dummy)
    assert_array_equal(
        p_fpart, [[2, 2, 1, 1], [2, 1, 1, 2], [1, 2, 2, 1], [1, 1, 2, 2]])
Exemple #4
0
def test_factorialpartitioner():
    # Test against sifter and chainmap implemented in test_usecases
    # -- code below copied from test_usecases --
    # Let's simulate the beast -- 6 categories total groupped into 3
    # super-ordinate, and actually without any 'superordinate' effect
    # since subordinate categories independent
    ds = normal_feature_dataset(nlabels=6,
                                snr=100,   # pure signal! ;)
                                perlabel=30,
                                nfeatures=6,
                                nonbogus_features=range(6),
                                nchunks=5)
    ds.sa['subord'] = ds.sa.targets.copy()
    ds.sa['superord'] = ['super%d' % (int(i[1])%3,)
                         for i in ds.targets]   # 3 superord categories
    # let's override original targets just to be sure that we aren't relying on them
    ds.targets[:] = 0

    # let's make two other datasets to test later
    # one superordinate category only
    ds_1super = ds.copy()
    ds_1super.sa['superord'] = ['super1' for i in ds_1super.targets]

    # one superordinate category has only one subordinate
    #ds_unbalanced = ds.copy()
    #nsuper1 = np.sum(ds_unbalanced.sa.superord == 'super1')
    #mask_superord = ds_unbalanced.sa.superord == 'super1'
    #uniq_subord = np.unique(ds_unbalanced.sa.subord[mask_superord])
    #ds_unbalanced.sa.subord[mask_superord] = [uniq_subord[0] for i in range(nsuper1)]
    ds_unbalanced = Dataset(range(4), sa={'subord': [0, 0, 1, 2],
                                          'superord': [1, 1, 2, 2]})

    npart = ChainNode([
        ## so we split based on superord
        NFoldPartitioner(len(ds.sa['superord'].unique),
                         attr='subord'),
        ## so it should select only those splits where we took 1 from
        ## each of the superord categories leaving things in balance
        Sifter([('partitions', 2),
                ('superord',
                 { 'uvalues': ds.sa['superord'].unique,
                   'balanced': True})
                ]),
    ], space='partitions')

    def partition(partitioner, ds_=ds):
        return [p.sa.partitions for p in partitioner.generate(ds_)]

    # now the new implementation
    # common kwargs
    factkw = dict(partitioner=NFoldPartitioner(attr='subord'), attr='superord')

    fpart = FactorialPartitioner(**factkw)
    p_npart = partition(npart)
    p_fpart = partition(fpart)

    assert_array_equal(np.sort(p_npart), np.sort(p_fpart))

    fpart2 = FactorialPartitioner(count=2, selection_strategy='first', **factkw)
    p_fpart2 = partition(fpart2)
    assert_equal(len(p_fpart), 8)
    assert_equal(len(p_fpart2), 2)
    assert_array_equal(p_fpart[:2], p_fpart2)

    # 1 equidistant -- should be the first one
    fpart1 = FactorialPartitioner(count=1, **factkw)
    p_fpart1 = partition(fpart1)
    assert_equal(len(p_fpart1), 1)
    assert_array_equal(p_fpart[:1], p_fpart1)

    # 2 equidistant
    fpart2 = FactorialPartitioner(count=2, **factkw)
    p_fpart2 = partition(fpart2)
    assert_equal(len(p_fpart2), 2)
    assert_array_equal(p_fpart[::4], p_fpart2)

    # without count -- should be all of them in original order
    fpartr = FactorialPartitioner(selection_strategy='random', **factkw)
    assert_array_equal(p_fpart, partition(fpartr))

    # but if with a count we should get some selection
    fpartr2 = FactorialPartitioner(selection_strategy='random', count=2, **factkw)
    # Let's generate a number of random selections:
    rand2_partitions = [partition(fpartr2) for i in xrange(10)]
    for p in rand2_partitions:
        assert_equal(len(p), 2)
    # majority of them must be different
    assert len(set([tuple(map(tuple, x)) for x in rand2_partitions])) >= 5

    # now let's check it behaves correctly if we have only one superord class
    nfold = NFoldPartitioner(attr='subord')
    p_nfold = partition(nfold, ds_1super)
    p_fpart = partition(fpart, ds_1super)
    assert_array_equal(np.sort(p_nfold), np.sort(p_fpart))

    # smoke test for unbalanced subord classes
    warning_msg = 'One or more superordinate attributes do not have the same '\
                  'number of subordinate attributes. This could yield to '\
                  'unbalanced partitions.'
    with assert_warnings([(RuntimeWarning, warning_msg)]):
        p_fpart = partition(fpart, ds_unbalanced)

    p_unbalanced = [np.array([2, 2, 2, 1]), np.array([2, 2, 1, 2])]
    superord_unbalanced = [([2], [1, 1, 2]), ([2], [1, 1, 2])]
    subord_unbalanced = [([2], [0, 0, 1]), ([1], [0, 0, 2])]

    for out_part, true_part, super_out, sub_out in \
            zip(p_fpart, p_unbalanced,
                superord_unbalanced, subord_unbalanced):
        assert_array_equal(out_part, true_part)
        assert_array_equal((ds_unbalanced[out_part == 1].sa.superord.tolist(),
                            ds_unbalanced[out_part == 2].sa.superord.tolist()),
                           super_out)
        assert_array_equal((ds_unbalanced[out_part == 1].sa.subord.tolist(),
                            ds_unbalanced[out_part == 2].sa.subord.tolist()),
                           sub_out)

    # now let's test on a dummy dataset
    ds_dummy = Dataset(range(4), sa={'subord': range(4),
                                     'superord': [1,2]*2})
    p_fpart = partition(fpart, ds_dummy)
    assert_array_equal(p_fpart,
                       [[2, 2, 1, 1],
                        [2, 1, 1, 2],
                        [1, 2, 2, 1],
                        [1, 1, 2, 2]])