def test_sifter_with_balancing():
    # extended previous test which was already
    # "... somewhat duplicating the doctest"
    ds = Dataset(samples=np.arange(12).reshape((-1, 2)),
                 sa={'chunks': [0, 1, 2, 3, 4, 5],
                     'targets': ['c', 'c', 'c', 'p', 'p', 'p']})

    # Without sifter -- just to assure that we do get all of them
    # i.e. 6*5*4*3/(4!) = 15
    par = ChainNode([NFoldPartitioner(cvtype=4, attr='chunks')])
    assert_equal(len(list(par.generate(ds))), 15)

    # so we will take 4 chunks out of the available 6, but would care only
    # about those partitions where we have a balanced number of 'c' and 'p'
    # entries
    assert_raises(ValueError,
                  lambda x: list(Sifter([('targets',
                                          dict(wrong=1))]).generate(x)),
                  ds)

    par = ChainNode([NFoldPartitioner(cvtype=4, attr='chunks'),
                     Sifter([('partitions', 2),
                             ('targets',
                              dict(uvalues=['c', 'p'],
                                   balanced=True))])])
    dss = list(par.generate(ds))
    # print [ x[x.sa.partitions==2].sa.targets for x in dss ]
    assert_equal(len(dss), 9)
    for ds_ in dss:
        testing = ds[ds_.sa.partitions == 2]
        assert_array_equal(np.unique(testing.sa.targets), ['c', 'p'])
        # and we still have both targets present in training
        training = ds[ds_.sa.partitions == 1]
        assert_array_equal(np.unique(training.sa.targets), ['c', 'p'])
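# Illustrative cross-check (our sketch, not part of the original suite): the
# counts asserted above follow from plain combinatorics.  Picking the 4
# testing chunks out of 6 gives C(6, 4) = 15 partitionings, and a balanced
# pick needs exactly two 'c' chunks and two 'p' chunks, C(3, 2) * C(3, 2) = 9.
# The helper name below is ours.
def _count_balanced_chunk_picks():
    from itertools import combinations
    chunk_targets = ['c', 'c', 'c', 'p', 'p', 'p']  # target of chunks 0..5
    picks = list(combinations(range(6), 4))
    assert len(picks) == 15
    # balanced == exactly two 'c' (and hence two 'p') chunks in testing
    return len([p for p in picks
                if [chunk_targets[i] for i in p].count('c') == 2])  # == 9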
def test_permute_superord():
    from mvpa2.base.node import ChainNode
    from mvpa2.generators.partition import NFoldPartitioner
    from mvpa2.generators.base import Sifter
    from mvpa2.generators.permutation import AttributePermutator

    ds = _get_superord_dataset()
    # mvpa2.seed(1)
    part = ChainNode([
        ## so we split based on superord
        NFoldPartitioner(len(ds.sa['superord'].unique),
                         attr='subord'),
        ## so it should select only those splits where we took 1 from
        ## each of the superord categories leaving things in balance
        Sifter([('partitions', 2),
                ('superord', {'uvalues': ds.sa['superord'].unique,
                              'balanced': True})]),
        AttributePermutator(['superord'], limit=['partitions', 'chunks']),
    ], space='partitions')

    for ds_perm in part.generate(ds):
        # it does permutation
        assert np.sum(ds_perm.sa.superord != ds.sa.superord) != 0
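# A minimal sketch (ours, not the PyMVPA API) of what limiting the
# AttributePermutator to ['partitions', 'chunks'] means: labels get shuffled
# only within each group cell, so per-cell label counts stay intact while the
# assignment itself is randomized.  Here 'groups' stands for the combined
# (partition, chunk) id of each sample.
def _permute_within_groups(labels, groups, rng=None):
    import numpy as np
    rng = np.random if rng is None else rng
    labels = np.array(labels)
    groups = np.asarray(groups)
    for g in np.unique(groups):
        idx = np.flatnonzero(groups == g)
        # reassign this cell's labels among this cell's samples only
        labels[idx] = labels[rng.permutation(idx)]
    return labels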
def _get_superord_dataset():
    # Let's simulate the beast -- 6 categories total grouped into 3
    # superordinate ones, and actually without any 'superordinate' effect
    # since the subordinate categories are independent
    from mvpa2.misc.data_generators import normal_feature_dataset
    ds = normal_feature_dataset(nlabels=6,
                                snr=100,   # pure signal! ;)
                                perlabel=30,
                                nfeatures=6,
                                nonbogus_features=range(6),
                                nchunks=5)
    ds.sa['subord'] = ds.sa.targets.copy()
    ds.sa['superord'] = ['super%d' % (int(i[1]) % 3,)
                         for i in ds.targets]   # 3 superord categories
    # let's override original targets just to be sure that we aren't
    # relying on them
    ds.targets[:] = 0
    return ds
def test_sifter():
    # somewhat duplicating the doctest
    ds = Dataset(samples=np.arange(8).reshape((4, 2)),
                 sa={'chunks': [0, 1, 2, 3],
                     'targets': ['c', 'c', 'p', 'p']})
    for sift_targets_definition in (['c', 'p'],
                                    dict(uvalues=['c', 'p'])):
        par = ChainNode([NFoldPartitioner(cvtype=2, attr='chunks'),
                         Sifter([('partitions', 2),
                                 ('targets', sift_targets_definition)])])
        dss = list(par.generate(ds))
        assert_equal(len(dss), 4)
        for ds_ in dss:
            testing = ds[ds_.sa.partitions == 2]
            assert_array_equal(np.unique(testing.sa.targets), ['c', 'p'])
            # and we still have both targets present in training
            training = ds[ds_.sa.partitions == 1]
            assert_array_equal(np.unique(training.sa.targets), ['c', 'p'])
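# Side note (illustrative): the len(dss) == 4 above is again combinatorial --
# cvtype=2 over 4 chunks yields C(4, 2) = 6 partitionings, of which the
# balanced ones put one 'c' chunk and one 'p' chunk into testing,
# C(2, 1) * C(2, 1) = 4.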
def test_sifter_superord_usecase():
    from mvpa2.clfs.svm import LinearCSVMC  # fast one to use for tests
    from mvpa2.measures.base import CrossValidation
    from mvpa2.base.node import ChainNode
    from mvpa2.generators.partition import NFoldPartitioner
    from mvpa2.generators.base import Sifter

    ds = _get_superord_dataset()

    npart = ChainNode([
        ## so we split based on superord
        NFoldPartitioner(len(ds.sa['superord'].unique),
                         attr='subord'),
        ## so it should select only those splits where we took 1 from
        ## each of the superord categories leaving things in balance
        Sifter([('partitions', 2),
                ('superord', {'uvalues': ds.sa['superord'].unique,
                              'balanced': True})]),
    ], space='partitions')

    # and then do your normal analysis, where the clf operates in
    # space='superord'
    clf = LinearCSVMC(space='superord')
    cvte_regular = CrossValidation(clf, NFoldPartitioner(),
                                   errorfx=lambda p, t: np.mean(p == t))
    cvte_super = CrossValidation(clf, npart,
                                 errorfx=lambda p, t: np.mean(p == t))

    accs_regular = cvte_regular(ds)
    accs_super = cvte_super(ds)

    # With sifting we should get only 2^3 = 8 splits
    assert len(accs_super) == 8
    # I don't think that this would ever fail, so not marking it labile
    assert np.mean(accs_regular) > .8
    assert np.mean(accs_super) < .6
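# Illustrative cross-check (standalone sketch, not the PyMVPA API): with two
# subordinate categories per superordinate one, NFoldPartitioner(3) yields
# C(6, 3) = 20 candidate testing sets, and the Sifter keeps only those with
# exactly one subordinate per superordinate: 2 * 2 * 2 = 8.  The helper name
# and the sub % 3 mapping mirror the dataset construction above.
def _count_balanced_superord_picks():
    from itertools import combinations
    superord_of = dict((sub, sub % 3) for sub in range(6))
    return sum(1 for pick in combinations(range(6), 3)
               if len(set(superord_of[s] for s in pick)) == 3)  # == 8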
def test_factorialpartitioner():
    # Test against sifter and chainmap implemented in test_usecases
    # -- code below copied from test_usecases --

    # Let's simulate the beast -- 6 categories total grouped into 3
    # superordinate ones, and actually without any 'superordinate' effect
    # since the subordinate categories are independent
    ds = normal_feature_dataset(nlabels=6,
                                snr=100,   # pure signal! ;)
                                perlabel=30,
                                nfeatures=6,
                                nonbogus_features=range(6),
                                nchunks=5)
    ds.sa['subord'] = ds.sa.targets.copy()
    ds.sa['superord'] = ['super%d' % (int(i[1]) % 3,)
                         for i in ds.targets]   # 3 superord categories
    # let's override original targets just to be sure that we aren't
    # relying on them
    ds.targets[:] = 0

    # let's make two other datasets to test later
    # one superordinate category only
    ds_1super = ds.copy()
    ds_1super.sa['superord'] = ['super1' for i in ds_1super.targets]

    # one superordinate category has only one subordinate
    #ds_unbalanced = ds.copy()
    #nsuper1 = np.sum(ds_unbalanced.sa.superord == 'super1')
    #mask_superord = ds_unbalanced.sa.superord == 'super1'
    #uniq_subord = np.unique(ds_unbalanced.sa.subord[mask_superord])
    #ds_unbalanced.sa.subord[mask_superord] = [uniq_subord[0]
    #                                          for i in range(nsuper1)]
    ds_unbalanced = Dataset(range(4), sa={'subord': [0, 0, 1, 2],
                                          'superord': [1, 1, 2, 2]})

    npart = ChainNode([
        ## so we split based on superord
        NFoldPartitioner(len(ds.sa['superord'].unique),
                         attr='subord'),
        ## so it should select only those splits where we took 1 from
        ## each of the superord categories leaving things in balance
        Sifter([('partitions', 2),
                ('superord', {'uvalues': ds.sa['superord'].unique,
                              'balanced': True})]),
    ], space='partitions')

    def partition(partitioner, ds_=ds):
        return [p.sa.partitions for p in partitioner.generate(ds_)]

    # now the new implementation
    # common kwargs
    factkw = dict(partitioner=NFoldPartitioner(attr='subord'),
                  attr='superord')

    fpart = FactorialPartitioner(**factkw)
    p_npart = partition(npart)
    p_fpart = partition(fpart)
    assert_array_equal(np.sort(p_npart), np.sort(p_fpart))

    fpart2 = FactorialPartitioner(count=2,
                                  selection_strategy='first',
                                  **factkw)
    p_fpart2 = partition(fpart2)
    assert_equal(len(p_fpart), 8)
    assert_equal(len(p_fpart2), 2)
    assert_array_equal(p_fpart[:2], p_fpart2)

    # 1 equidistant -- should be the first one
    fpart1 = FactorialPartitioner(count=1, **factkw)
    p_fpart1 = partition(fpart1)
    assert_equal(len(p_fpart1), 1)
    assert_array_equal(p_fpart[:1], p_fpart1)

    # 2 equidistant
    fpart2 = FactorialPartitioner(count=2, **factkw)
    p_fpart2 = partition(fpart2)
    assert_equal(len(p_fpart2), 2)
    assert_array_equal(p_fpart[::4], p_fpart2)

    # without count -- should be all of them in original order
    fpartr = FactorialPartitioner(selection_strategy='random', **factkw)
    assert_array_equal(p_fpart, partition(fpartr))

    # but with a count we should get some selection
    fpartr2 = FactorialPartitioner(selection_strategy='random',
                                   count=2, **factkw)
    # Let's generate a number of random selections:
    rand2_partitions = [partition(fpartr2) for i in xrange(10)]
    for p in rand2_partitions:
        assert_equal(len(p), 2)
    # the majority of them must be different
    assert len(set([tuple(map(tuple, x)) for x in rand2_partitions])) >= 5

    # now let's check it behaves correctly if we have only one superord class
    nfold = NFoldPartitioner(attr='subord')
    p_nfold = partition(nfold, ds_1super)
    p_fpart = partition(fpart, ds_1super)
    assert_array_equal(np.sort(p_nfold), np.sort(p_fpart))

    # smoke test for unbalanced subord classes
    warning_msg = 'One or more superordinate attributes do not have the same '\
                  'number of subordinate attributes. This could yield to '\
                  'unbalanced partitions.'
    with assert_warnings([(RuntimeWarning, warning_msg)]):
        p_fpart = partition(fpart, ds_unbalanced)

    p_unbalanced = [np.array([2, 2, 2, 1]),
                    np.array([2, 2, 1, 2])]
    superord_unbalanced = [([2], [1, 1, 2]),
                           ([2], [1, 1, 2])]
    subord_unbalanced = [([2], [0, 0, 1]),
                         ([1], [0, 0, 2])]

    for out_part, true_part, super_out, sub_out in \
            zip(p_fpart, p_unbalanced,
                superord_unbalanced, subord_unbalanced):
        assert_array_equal(out_part, true_part)
        assert_array_equal((ds_unbalanced[out_part == 1].sa.superord.tolist(),
                            ds_unbalanced[out_part == 2].sa.superord.tolist()),
                           super_out)
        assert_array_equal((ds_unbalanced[out_part == 1].sa.subord.tolist(),
                            ds_unbalanced[out_part == 2].sa.subord.tolist()),
                           sub_out)

    # now let's test on a dummy dataset
    ds_dummy = Dataset(range(4), sa={'subord': range(4),
                                     'superord': [1, 2] * 2})
    p_fpart = partition(fpart, ds_dummy)
    assert_array_equal(p_fpart,
                       [[2, 2, 1, 1], [2, 1, 1, 2],
                        [1, 2, 2, 1], [1, 1, 2, 2]])
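# Note on the p_fpart[::4] comparison above: with 8 generated partitions and
# count=2, an equidistant selection lands on indices 0 and 4.  A minimal
# sketch of that spacing (our illustration; the actual selection strategy in
# PyMVPA may round differently for counts that do not divide evenly):
def _equidistant_indices(n, count):
    step = n // count
    return list(range(0, n, step))  # _equidistant_indices(8, 2) -> [0, 4]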