def test_sifter_with_balancing():
    """Sifter with ``balanced=True`` keeps only target-balanced testing folds.

    Extends ``test_sifter`` (which itself largely mirrors the doctest) to a
    dataset of six chunks: three labelled 'c' followed by three labelled 'p'.
    """
    both_targets = ['c', 'p']
    ds = Dataset(
        samples=np.arange(12).reshape((-1, 2)),
        sa={
            'chunks': [0, 1, 2, 3, 4, 5],
            'targets': ['c', 'c', 'c', 'p', 'p', 'p'],
        })

    # Baseline without any sifting: all ways of picking 4 of the 6 chunks
    # must show up, i.e. 6*5*4*3/(4!) = 15 folds.
    unsifted = ChainNode([NFoldPartitioner(cvtype=4, attr='chunks')])
    assert_equal(len(list(unsifted.generate(ds))), 15)

    # A specification dict carrying an unexpected key must raise ValueError.
    def sift_with_bad_spec(dataset):
        return list(Sifter([('targets', {'wrong': 1})]).generate(dataset))

    assert_raises(ValueError, sift_with_bad_spec, ds)

    # Now take 4 chunks out of the available 6, but keep only those folds
    # whose testing part (partitions == 2) has a balanced number of 'c' and
    # 'p' entries.
    balanced_spec = {'uvalues': ['c', 'p'], 'balanced': True}
    sifted = ChainNode([
        NFoldPartitioner(cvtype=4, attr='chunks'),
        Sifter([('partitions', 2), ('targets', balanced_spec)]),
    ])
    folds = list(sifted.generate(ds))
    # print [ x[x.sa.partitions==2].sa.targets for x in folds ]
    assert_equal(len(folds), 9)

    for fold in folds:
        # both targets must be present in the testing part ...
        testing = ds[fold.sa.partitions == 2]
        assert_array_equal(np.unique(testing.sa.targets), both_targets)
        # ... and still be present in the training part
        training = ds[fold.sa.partitions == 1]
        assert_array_equal(np.unique(training.sa.targets), both_targets)
def test_sifter_with_balancing():
    """Check that a balancing Sifter filters NFold partitions as expected.

    Builds on ``test_sifter`` ("... somewhat duplicating the doctest") using
    six chunks, the first three with target 'c' and the last three with 'p'.
    """
    expected_targets = ['c', 'p']
    samples = np.arange(12).reshape((-1, 2))
    sample_attrs = {
        'chunks': [0, 1, 2, 3, 4, 5],
        'targets': ['c', 'c', 'c', 'p', 'p', 'p'],
    }
    ds = Dataset(samples=samples, sa=sample_attrs)

    # Without a sifter we must get every 4-out-of-6 selection of chunks,
    # i.e. 6*5*4*3/(4!) = 15 of them.
    plain = ChainNode([NFoldPartitioner(cvtype=4, attr='chunks')])
    n_plain = len(list(plain.generate(ds)))
    assert_equal(n_plain, 15)

    # An unsupported keyword in the sifting specification is an error.
    assert_raises(
        ValueError,
        lambda x: list(Sifter([('targets', dict(wrong=1))]).generate(x)),
        ds)

    # Take 4 chunks out of the available 6, but only care about partitions
    # where the testing part has a balanced number of 'c' and 'p' entries.
    partitioner = NFoldPartitioner(cvtype=4, attr='chunks')
    sifter = Sifter([
        ('partitions', 2),
        ('targets', dict(uvalues=['c', 'p'], balanced=True)),
    ])
    selected = list(ChainNode([partitioner, sifter]).generate(ds))
    assert_equal(len(selected), 9)

    for candidate in selected:
        in_testing = candidate.sa.partitions == 2
        assert_array_equal(np.unique(ds[in_testing].sa.targets),
                           expected_targets)
        # both targets must also remain available for training
        in_training = candidate.sa.partitions == 1
        assert_array_equal(np.unique(ds[in_training].sa.targets),
                           expected_targets)
def test_permute_superord():
    """Permuting 'superord' inside sifted folds must alter the labels."""
    from mvpa2.base.node import ChainNode
    from mvpa2.generators.base import Sifter
    from mvpa2.generators.partition import NFoldPartitioner
    from mvpa2.generators.permutation import AttributePermutator

    ds = _get_superord_dataset()
    # mvpa2.seed(1)
    superord_values = ds.sa['superord'].unique

    ## split on 'subord', leaving out as many of them as there are
    ## superord categories
    partitioner = NFoldPartitioner(len(superord_values), attr='subord')
    ## keep only those splits where one was taken from each of the superord
    ## categories, leaving things in balance
    sifter = Sifter([
        ('partitions', 2),
        ('superord', {'uvalues': superord_values, 'balanced': True}),
    ])
    permutator = AttributePermutator(['superord'],
                                     limit=['partitions', 'chunks'])
    part = ChainNode([partitioner, sifter, permutator], space='partitions')

    for ds_perm in part.generate(ds):
        # at least one label has to differ from the original assignment
        n_changed = np.sum(ds_perm.sa.superord != ds.sa.superord)
        assert (n_changed != 0)
def test_permute_superord():
    """Verify that AttributePermutator really shuffles 'superord' per fold.

    The folds come from an NFold partitioning over 'subord' that is sifted
    for a balanced 'superord' composition of the testing part.
    """
    from mvpa2.base.node import ChainNode
    from mvpa2.generators.partition import NFoldPartitioner
    from mvpa2.generators.base import Sifter
    from mvpa2.generators.permutation import AttributePermutator

    ds = _get_superord_dataset()
    # mvpa2.seed(1)
    balanced_superord = {
        'uvalues': ds.sa['superord'].unique,
        'balanced': True,
    }
    nodes = [
        ## partition over 'subord', taking out one per superord category
        NFoldPartitioner(len(ds.sa['superord'].unique), attr='subord'),
        ## retain only the splits that leave superord categories in balance
        Sifter([('partitions', 2), ('superord', balanced_superord)]),
        AttributePermutator(['superord'], limit=['partitions', 'chunks']),
    ]
    part = ChainNode(nodes, space='partitions')

    for permuted in part.generate(ds):
        # it does permutation
        mismatches = permuted.sa.superord != ds.sa.superord
        assert(np.sum(mismatches) != 0)
def test_exclude_targets_combinations_subjectchunks():
    """NFold over 'subjects' chained with chunk-combination exclusion.

    Each generated fold must test exactly one (subject, chunk) pair, neither
    of which may show up in the training part, and all 3 x 3 pairs must be
    visited exactly once.
    """
    partitioner = ChainNode(
        [NFoldPartitioner(attr='subjects'),
         ExcludeTargetsCombinationsPartitioner(
             k=1, targets_attr='chunks', space='partitions')],
        space='partitions')
    # targets do not need even to be defined!
    ds = Dataset(np.arange(18).reshape(9, 2),
                 sa={'chunks': np.arange(9) // 3,
                     'subjects': np.arange(9) % 3})
    dss = list(partitioner.generate(ds))
    assert_equal(len(dss), 9)

    testing_subjs, testing_chunks = [], []
    for ds_ in dss:
        testing_partition = ds_.sa.partitions == 2
        training_partition = ds_.sa.partitions == 1
        # Implicit test: ndarray.item() raises unless there is exactly one
        # unique value. It replaces np.asscalar(), which was deprecated in
        # NumPy 1.16 and later removed, and which simply called .item().
        testing_subj = np.unique(ds_.sa.subjects[testing_partition]).item()
        testing_subjs.append(testing_subj)
        testing_chunk = np.unique(ds_.sa.chunks[testing_partition]).item()
        testing_chunks.append(testing_chunk)
        # and those must not appear for training
        ok_(testing_subj not in ds_.sa.subjects[training_partition])
        ok_(testing_chunk not in ds_.sa.chunks[training_partition])

    # and we should have gone through all chunks/subjs pairs
    testing_pairs = set(zip(testing_subjs, testing_chunks))
    assert_equal(len(testing_pairs), 9)
    # yoh: equivalent to set(itertools.product(range(3), range(3))))
    #      but .product is N/A for python2.5
    assert_equal(testing_pairs, set(zip(*np.where(np.ones((3, 3))))))
def test_exclude_targets_combinations():
    """Excluding pairs of targets on top of NFold yields 3 * 6 unique folds."""
    from mvpa2.misc.data_generators import normal_feature_dataset

    excluder = ExcludeTargetsCombinationsPartitioner(
        k=2, targets_attr='targets', space='partitions')
    partitioner = ChainNode([NFoldPartitioner(), excluder],
                            space='partitions')
    ds = normal_feature_dataset(snr=0., nlabels=4, perlabel=3, nchunks=3,
                                nonbogus_features=[0, 1, 2, 3], nfeatures=4)
    partitions = list(partitioner.generate(ds))
    assert_equal(len(partitions), 3 * 6)

    splitter = Splitter('partitions')
    seen_combs = set()
    seen_comb_chunks = set()
    for partitioned in partitions:
        # only the second of the two leading splits is inspected
        _, teds = list(splitter.generate(partitioned))[:2]
        comb = tuple(np.unique(teds.targets))
        seen_combs.add(comb)
        seen_comb_chunks.add(comb + tuple(np.unique(teds.chunks)))

    # just 6 possible combinations of 2 out of 4
    assert_equal(len(seen_combs), 6)
    # all unique
    assert_equal(len(seen_comb_chunks), 3 * 6)
def test_split_clf_on_chainpartitioner(self):
    """Smoke test for #156: SplitClassifier driven by a chained partitioner."""
    ds = datasets['uni2small']
    balancer = Balancer(attr='targets', count=2, limit='partitions',
                        apply_selection=True)
    part = ChainNode([NFoldPartitioner(cvtype=1), balancer])
    n_partitions = len(list(part.generate(ds)))

    sclf = SplitClassifier(sample_clf_lin, part,
                           enable_ca=['stats', 'splits'])
    sclf.train(ds)
    predictions = sclf.predict(ds)
    # rudimentary check
    assert_equal(len(predictions), len(ds))
    # one recorded split and one trained classifier per generated partition
    assert_equal(len(sclf.ca.splits), n_partitions)
    assert_equal(len(sclf.clfs), n_partitions)

    # now let's do sensitivity analyzer just in case
    sclf.untrain()
    sens = sclf.get_sensitivity_analyzer()(ds)

    # basic check that sensitivities varied across splits
    from mvpa2.mappers.fx import FxMapper
    std_mapper = FxMapper('samples', np.std, uattrs=['targets'])
    assert_true(np.any(std_mapper(sens) != 0))
def test_splitter():
    """Exercise Splitter on sample and feature attributes, alone and chained."""
    ds = give_data()

    # --- defaults: split along 'chunks' ---
    by_chunks = Splitter('chunks')
    # calling the splitter directly is not supported, only .generate()
    assert_raises(NotImplementedError, by_chunks, ds)
    pieces = list(by_chunks.generate(ds))
    assert_equal(len(pieces), len(ds.sa['chunks'].unique))
    for piece in pieces:
        # plain slicing is expected, so samples are a view on the original
        assert_true(piece.samples.base is ds.samples)
        assert_equal(len(piece.sa['chunks'].unique), 1)
        assert_true('lastsplit' in piece.a)
    assert_true(pieces[-1].a.lastsplit)

    # --- customized: explicit (repeating) values, limited count, no slicing
    by_targets = Splitter('targets', attr_values=[0, 1, 1, 2, 3, 3, 3],
                          count=4, noslicing=True)
    pieces = list(by_targets.generate(ds))
    assert_equal(len(pieces), 4)
    for piece in pieces:
        # no slicing requested, so samples must NOT be a view
        assert_false(piece.samples.base is ds.samples)
        assert_equal(len(piece.sa['targets'].unique), 1)
        assert_equal(len(piece.sa['chunks'].unique), 10)
    assert_true(pieces[-1].a.lastsplit)
    # attribute value 1 was requested twice, hence two identical splits
    assert_array_equal(pieces[1].samples, pieces[2].samples)

    # --- now go wild and split by a feature attribute ---
    ds.fa['roi'] = np.repeat([0, 1], 5)
    # splitter should auto-detect that this is a feature attribute
    by_roi = Splitter('roi')
    pieces = list(by_roi.generate(ds))
    assert_equal(len(pieces), 2)
    for piece in pieces:
        assert_true(piece.samples.base is ds.samples)
        assert_equal(len(piece.fa['roi'].unique), 1)
        assert_equal(piece.shape, (100, 5))

    # --- and finally chained splitters ---
    chained = ChainNode([by_targets, by_roi, by_chunks])
    pieces = list(chained.generate(ds))
    # 4 target splits and 2 roi splits each and 10 chunks each
    assert_equal(len(pieces), 80)
def test_splitter():
    """Test Splitter with defaults, custom values, feature attrs and chains."""
    ds = give_data()

    # split with defaults
    chunk_splitter = Splitter('chunks')
    assert_raises(NotImplementedError, chunk_splitter, ds)
    chunk_splits = list(chunk_splitter.generate(ds))
    n_chunks = len(ds.sa['chunks'].unique)
    assert_equal(len(chunk_splits), n_chunks)
    for part in chunk_splits:
        # basic slicing should have been performed
        assert_true(part.samples.base is ds.samples)
        assert_equal(len(part.sa['chunks'].unique), 1)
        assert_true('lastsplit' in part.a)
    assert_true(chunk_splits[-1].a.lastsplit)

    # now again, more customized
    requested_targets = [0, 1, 1, 2, 3, 3, 3]
    target_splitter = Splitter('targets', attr_values=requested_targets,
                               count=4, noslicing=True)
    target_splits = list(target_splitter.generate(ds))
    assert_equal(len(target_splits), 4)
    for part in target_splits:
        # basic slicing should NOT have been performed
        assert_false(part.samples.base is ds.samples)
        assert_equal(len(part.sa['targets'].unique), 1)
        assert_equal(len(part.sa['chunks'].unique), 10)
    assert_true(target_splits[-1].a.lastsplit)
    # two should be identical
    assert_array_equal(target_splits[1].samples, target_splits[2].samples)

    # now go wild and split by feature attribute
    ds.fa['roi'] = np.repeat([0, 1], 5)
    # splitter should auto-detect that this is a feature attribute
    roi_splitter = Splitter('roi')
    roi_splits = list(roi_splitter.generate(ds))
    assert_equal(len(roi_splits), 2)
    for part in roi_splits:
        assert_true(part.samples.base is ds.samples)
        assert_equal(len(part.fa['roi'].unique), 1)
        assert_equal(part.shape, (100, 5))

    # and finally test chained splitters
    chain = ChainNode([target_splitter, roi_splitter, chunk_splitter])
    # 4 target splits and 2 roi splits each and 10 chunks each
    assert_equal(len(list(chain.generate(ds))), 80)
def test_sifter():
    """Sifter keeps only folds whose testing part contains both targets.

    Somewhat duplicates the doctest.
    """
    expected = ['c', 'p']
    ds = Dataset(
        samples=np.arange(8).reshape((4, 2)),
        sa={'chunks': [0, 1, 2, 3],
            'targets': ['c', 'c', 'p', 'p']})
    partitioner = NFoldPartitioner(cvtype=2, attr='chunks')
    sifter = Sifter([('partitions', 2), ('targets', ['c', 'p'])])
    folds = list(ChainNode([partitioner, sifter]).generate(ds))
    assert_equal(len(folds), 4)

    for fold in folds:
        testing = ds[fold.sa.partitions == 2]
        assert_array_equal(np.unique(testing.sa.targets), expected)
        # and we still have both targets present in training
        training = ds[fold.sa.partitions == 1]
        assert_array_equal(np.unique(training.sa.targets), expected)
def test_sifter():
    """Sifter accepts a plain value list or a ``uvalues`` dict alike.

    Somewhat duplicates the doctest; both ways of specifying the targets
    must lead to the same four folds with 'c' and 'p' on either side.
    """
    both = ["c", "p"]
    ds = Dataset(
        samples=np.arange(8).reshape((4, 2)),
        sa={"chunks": [0, 1, 2, 3],
            "targets": ["c", "c", "p", "p"]})
    definitions = (["c", "p"], dict(uvalues=["c", "p"]))

    for sift_targets_definition in definitions:
        sifter = Sifter([("partitions", 2),
                         ("targets", sift_targets_definition)])
        par = ChainNode([NFoldPartitioner(cvtype=2, attr="chunks"), sifter])
        folds = list(par.generate(ds))
        assert_equal(len(folds), 4)
        for fold in folds:
            testing = ds[fold.sa.partitions == 2]
            assert_array_equal(np.unique(testing.sa.targets), both)
            # and we still have both targets present in training
            training = ds[fold.sa.partitions == 1]
            assert_array_equal(np.unique(training.sa.targets), both)
def test_discarded_boundaries(self):
    """Samples around chunk boundaries are stripped from both odd/even folds."""
    ds = datasets["hollow"]
    # four runs of ten samples each
    # NOTE(review): this assigns into the shared `datasets` fixture --
    # confirm other tests do not depend on its original 'chunks'.
    ds.sa["chunks"] = np.repeat(np.arange(4), 10)

    # odd/even splitting gives lots of boundaries in few splits
    stripper = StripBoundariesSamples("chunks", 1, 2)
    part = ChainNode([OddEvenPartitioner(), stripper])
    parts = [fold.samples.sid for fold in part.generate(ds)]

    # both datasets should have the same samples, because the boundaries
    # are identical and the same samples should be stripped
    first = parts[0]
    assert_array_equal(first, parts[1])

    # we strip 3 samples per boundary, and there are 3 boundaries
    n_stripped = 3 * 3
    assert_equal(len(first), len(ds) - n_stripped)
    for boundary in (10, 20, 30):
        for sid in (boundary - 1, boundary, boundary + 1):
            assert_false(sid in first)
def test_discarded_boundaries(self):
    """StripBoundariesSamples drops the samples next to each chunk boundary."""
    stripped_ids = [9, 10, 11, 19, 20, 21, 29, 30, 31]

    ds = datasets['hollow']
    # four runs
    # NOTE(review): writes into the module-level `datasets` entry -- verify
    # that this side effect is acceptable for the other tests.
    ds.sa['chunks'] = np.repeat(np.arange(4), 10)

    # do odd even splitting for lots of boundaries in few splits
    nodes = [OddEvenPartitioner(), StripBoundariesSamples('chunks', 1, 2)]
    part = ChainNode(nodes)
    parts = []
    for generated in part.generate(ds):
        parts.append(generated.samples.sid)

    # identical boundaries in both folds => identical surviving samples
    assert_array_equal(parts[0], parts[1])
    # we strip 3 samples per boundary
    assert_equal(len(parts[0]), len(ds) - (3 * 3))
    for sid in stripped_ids:
        assert_false(sid in parts[0])
def test_sifter():
    """Check Sifter for both supported ways of listing the wanted targets.

    Somewhat duplicates the doctest.
    """
    wanted = ['c', 'p']
    samples = np.arange(8).reshape((4, 2))
    ds = Dataset(samples=samples,
                 sa={'chunks': [0, 1, 2, 3],
                     'targets': ['c', 'c', 'p', 'p']})

    # a bare list of values and a dict with 'uvalues' must behave the same
    for sift_targets_definition in (['c', 'p'], dict(uvalues=['c', 'p'])):
        par = ChainNode([
            NFoldPartitioner(cvtype=2, attr='chunks'),
            Sifter([('partitions', 2),
                    ('targets', sift_targets_definition)]),
        ])
        dss = list(par.generate(ds))
        assert_equal(len(dss), 4)
        for sifted in dss:
            in_testing = sifted.sa.partitions == 2
            assert_array_equal(np.unique(ds[in_testing].sa.targets), wanted)
            # and we still have both targets present in training
            in_training = sifted.sa.partitions == 1
            assert_array_equal(np.unique(ds[in_training].sa.targets), wanted)
def test_exclude_targets_combinations():
    """Chained NFold + target-pair exclusion must give 3 * 6 distinct folds."""
    nodes = [
        NFoldPartitioner(),
        ExcludeTargetsCombinationsPartitioner(
            k=2, targets_attr="targets", space="partitions"),
    ]
    partitioner = ChainNode(nodes, space="partitions")

    from mvpa2.misc.data_generators import normal_feature_dataset

    ds = normal_feature_dataset(
        snr=0.0, nlabels=4, perlabel=3, nchunks=3,
        nonbogus_features=[0, 1, 2, 3], nfeatures=4)
    partitions = list(partitioner.generate(ds))
    n_expected = 3 * 6
    assert_equal(len(partitions), n_expected)

    splitter = Splitter("partitions")
    combs, comb_chunks = [], []
    for partitioned in partitions:
        leading_splits = list(splitter.generate(partitioned))[:2]
        # the first split is not examined; the second one is checked below
        _, teds = leading_splits
        target_comb = tuple(np.unique(teds.targets))
        combs.append(target_comb)
        comb_chunks.append(target_comb + tuple(np.unique(teds.chunks)))

    # just 6 possible combinations of 2 out of 4
    assert_equal(len(set(combs)), 6)
    # all unique
    assert_equal(len(set(comb_chunks)), n_expected)
def test_split_clf_on_chainpartitioner(self):
    """Pretty much a smoke test for #156 (SplitClassifier + ChainNode)."""
    ds = datasets['uni2small']
    nodes = [
        NFoldPartitioner(cvtype=1),
        Balancer(attr='targets', count=2, limit='partitions',
                 apply_selection=True),
    ]
    part = ChainNode(nodes)
    partitions = list(part.generate(ds))
    expected_count = len(partitions)

    sclf = SplitClassifier(sample_clf_lin, part,
                           enable_ca=['stats', 'splits'])
    sclf.train(ds)
    pred = sclf.predict(ds)
    assert_equal(len(pred), len(ds))  # rudimentary check
    for per_split in (sclf.ca.splits, sclf.clfs):
        # as many stored splits / classifiers as generated partitions
        assert_equal(len(per_split), expected_count)

    # now let's do sensitivity analyzer just in case
    sclf.untrain()
    sensana = sclf.get_sensitivity_analyzer()
    sens = sensana(ds)

    # basic check that sensitivities varied across splits
    from mvpa2.mappers.fx import FxMapper
    sens_stds = FxMapper('samples', np.std, uattrs=['targets'])(sens)
    varied = np.any(sens_stds != 0)
    assert_true(varied)
def test_factorialpartitioner():
    """Compare FactorialPartitioner with the Sifter-based chain and NFold.

    The first part is copied from test_usecases: 6 categories in total,
    grouped into 3 super-ordinate ones, without any 'superordinate' effect
    since the subordinate categories are independent.
    """
    ds = normal_feature_dataset(
        nlabels=6,
        snr=100,  # pure signal! ;)
        perlabel=30,
        nfeatures=6,
        nonbogus_features=range(6),
        nchunks=5)
    ds.sa['subord'] = ds.sa.targets.copy()
    # 3 superord categories
    ds.sa['superord'] = ['super%d' % (int(label[1]) % 3, )
                         for label in ds.targets]
    # override original targets just to be sure we aren't relying on them
    ds.targets[:] = 0

    # Two more datasets for later:
    # (1) only a single superordinate category
    ds_1super = ds.copy()
    ds_1super.sa['superord'] = ['super1'] * len(ds_1super.targets)
    # (2) one superordinate category has only one subordinate
    ds_unbalanced = Dataset(range(4),
                            sa={'subord': [0, 0, 1, 2],
                                'superord': [1, 1, 2, 2]})

    superord_values = ds.sa['superord'].unique
    npart = ChainNode(
        [
            ## NFold over subord, leaving out as many as there are superords
            NFoldPartitioner(len(superord_values), attr='subord'),
            ## select only those splits where we took 1 from each of the
            ## superord categories, leaving things in balance
            Sifter([('partitions', 2),
                    ('superord', {'uvalues': superord_values,
                                  'balanced': True})]),
        ],
        space='partitions')

    # now the new implementation
    factpart = FactorialPartitioner(NFoldPartitioner(attr='subord'),
                                    attr='superord')

    from_chain = [p.sa.partitions for p in npart.generate(ds)]
    from_factorial = [p.sa.partitions for p in factpart.generate(ds)]
    assert_array_equal(np.sort(from_chain), np.sort(from_factorial))

    # with only one superord class it must match a plain NFold on subord
    nfold = NFoldPartitioner(attr='subord')
    from_nfold = [p.sa.partitions for p in nfold.generate(ds_1super)]
    from_factorial = [p.sa.partitions for p in factpart.generate(ds_1super)]
    assert_array_equal(np.sort(from_nfold), np.sort(from_factorial))

    # smoke test for unbalanced subord classes
    warning_msg = ('One or more superordinate attributes do not have the same '
                   'number of subordinate attributes. This could yield to '
                   'unbalanced partitions.')
    with assert_warnings([(RuntimeWarning, warning_msg)]):
        from_factorial = [p.sa.partitions
                          for p in factpart.generate(ds_unbalanced)]

    # per fold: (partitions, superord split by partition, subord likewise)
    expected_unbalanced = [
        (np.array([2, 2, 2, 1]), ([2], [1, 1, 2]), ([2], [0, 0, 1])),
        (np.array([2, 2, 1, 2]), ([2], [1, 1, 2]), ([1], [0, 0, 2])),
    ]
    for out_part, (true_part, super_out, sub_out) in zip(from_factorial,
                                                         expected_unbalanced):
        assert_array_equal(out_part, true_part)
        part1 = ds_unbalanced[out_part == 1]
        part2 = ds_unbalanced[out_part == 2]
        assert_array_equal(
            (part1.sa.superord.tolist(), part2.sa.superord.tolist()),
            super_out)
        assert_array_equal(
            (part1.sa.subord.tolist(), part2.sa.subord.tolist()),
            sub_out)

    # now let's test on a dummy dataset
    ds_dummy = Dataset(range(4),
                       sa={'subord': range(4), 'superord': [1, 2] * 2})
    from_factorial = [p.sa.partitions for p in factpart.generate(ds_dummy)]
    assert_array_equal(
        from_factorial,
        [[2, 2, 1, 1], [2, 1, 1, 2], [1, 2, 2, 1], [1, 1, 2, 2]])
def plot_feature_hist(dataset, xlim=None, noticks=True,
                      targets_attr='targets', chunks_attr=None,
                      **kwargs):
    """Plot histograms of feature values for each label.

    Parameters
    ----------
    dataset : Dataset
    xlim : None or 2-tuple
      Common x-axis limits for all histograms.
    noticks : bool
      If True, no axis ticks will be plotted. This is useful to save
      space in large plots.
    targets_attr : string, optional
      Name of samples attribute to be used as targets
    chunks_attr : None or string
      If a string, a histogram will be plotted per each target and each
      chunk (as defined in sa named `chunks_attr`), resulting in a
      histogram grid (targets x chunks). If None, a single row with one
      histogram per target is plotted.
    **kwargs
      Any additional arguments are passed to matplotlib's hist().
    """
    lsplit = ChainNode([NFoldPartitioner(1, attr=targets_attr),
                        Splitter('partitions', attr_values=[2])])
    nrows = len(dataset.sa[targets_attr].unique)

    # The chunk-wise splitter and the column count are only meaningful when
    # a chunks attribute was given. Building them unconditionally meant that
    # the default chunks_attr=None triggered a lookup of dataset.sa[None].
    if chunks_attr:
        csplit = ChainNode([NFoldPartitioner(1, attr=chunks_attr),
                            Splitter('partitions', attr_values=[2])])
        ncols = len(dataset.sa[chunks_attr].unique)

    def doplot(data):
        """Just a little helper which plots the histogram and removes
        ticks etc"""
        pl.hist(data, **kwargs)

        if xlim is not None:
            pl.xlim(xlim)

        if noticks:
            pl.yticks([])
            pl.xticks([])

    # subplot indices are 1-based
    fig = 1

    # for all labels
    for row, ds in enumerate(lsplit.generate(dataset)):
        if chunks_attr:
            # one grid row per label, one column per chunk
            for col, d in enumerate(csplit.generate(ds)):
                pl.subplot(nrows, ncols, fig)
                doplot(d.samples.ravel())

                # chunk titles only on the top row, label names only in
                # the first column
                if row == 0:
                    pl.title('C:' + str(d.sa[chunks_attr].unique[0]))
                if col == 0:
                    pl.ylabel('L:' + str(d.sa[targets_attr].unique[0]))

                fig += 1
        else:
            # a single row holding one histogram per label
            pl.subplot(1, nrows, fig)
            doplot(ds.samples)
            pl.title('L:' + str(ds.sa[targets_attr].unique[0]))
            fig += 1
def test_factorialpartitioner():
    """FactorialPartitioner must agree with the sifter/chain implementation.

    Test against sifter and chainmap implemented in test_usecases; the set-up
    code is copied from there. It simulates 6 categories in total grouped
    into 3 super-ordinate ones, without any 'superordinate' effect since the
    subordinate categories are independent.
    """
    ds = normal_feature_dataset(
        nlabels=6, snr=100, perlabel=30, nfeatures=6,
        nonbogus_features=range(6), nchunks=5  # pure signal! ;)
    )
    ds.sa["subord"] = ds.sa.targets.copy()
    superord_labels = []
    for target in ds.targets:
        # 3 superord categories
        superord_labels.append("super%d" % (int(target[1]) % 3,))
    ds.sa["superord"] = superord_labels
    # let's override original targets just to be sure that we aren't
    # relying on them
    ds.targets[:] = 0

    # let's make two other datasets to test later
    # one superordinate category only
    ds_1super = ds.copy()
    ds_1super.sa["superord"] = ["super1" for _ in ds_1super.targets]
    # one superordinate category has only one subordinate
    ds_unbalanced = Dataset(
        range(4), sa={"subord": [0, 0, 1, 2], "superord": [1, 1, 2, 2]})

    ## so we split based on superord
    subord_nfold = NFoldPartitioner(len(ds.sa["superord"].unique),
                                    attr="subord")
    ## so it should select only those splits where we took 1 from
    ## each of the superord categories leaving things in balance
    balance_sifter = Sifter([
        ("partitions", 2),
        ("superord", {"uvalues": ds.sa["superord"].unique,
                      "balanced": True}),
    ])
    npart = ChainNode([subord_nfold, balance_sifter], space="partitions")

    # now the new implementation
    factpart = FactorialPartitioner(NFoldPartitioner(attr="subord"),
                                    attr="superord")

    partitions_npart = [p.sa.partitions for p in npart.generate(ds)]
    partitions_factpart = [p.sa.partitions for p in factpart.generate(ds)]
    assert_array_equal(np.sort(partitions_npart),
                       np.sort(partitions_factpart))

    # now let's check it behaves correctly if we have only one superord class
    nfold = NFoldPartitioner(attr="subord")
    partitions_nfold = [p.sa.partitions for p in nfold.generate(ds_1super)]
    partitions_factpart = [p.sa.partitions
                           for p in factpart.generate(ds_1super)]
    assert_array_equal(np.sort(partitions_nfold),
                       np.sort(partitions_factpart))

    # smoke test for unbalanced subord classes
    warning_msg = (
        "One or more superordinate attributes do not have the same "
        "number of subordinate attributes. This could yield to "
        "unbalanced partitions."
    )
    with assert_warnings([(RuntimeWarning, warning_msg)]):
        partitions_factpart = [p.sa.partitions
                               for p in factpart.generate(ds_unbalanced)]

    # expectations per fold, kept side by side
    unbalanced_cases = [
        # (partitions, (superord by partition), (subord by partition))
        (np.array([2, 2, 2, 1]), ([2], [1, 1, 2]), ([2], [0, 0, 1])),
        (np.array([2, 2, 1, 2]), ([2], [1, 1, 2]), ([1], [0, 0, 2])),
    ]
    for out_part, case in zip(partitions_factpart, unbalanced_cases):
        true_part, super_out, sub_out = case
        assert_array_equal(out_part, true_part)
        assert_array_equal(
            (ds_unbalanced[out_part == 1].sa.superord.tolist(),
             ds_unbalanced[out_part == 2].sa.superord.tolist()),
            super_out)
        assert_array_equal(
            (ds_unbalanced[out_part == 1].sa.subord.tolist(),
             ds_unbalanced[out_part == 2].sa.subord.tolist()),
            sub_out)

    # now let's test on a dummy dataset
    ds_dummy = Dataset(
        range(4), sa={"subord": range(4), "superord": [1, 2] * 2})
    partitions_factpart = [p.sa.partitions
                           for p in factpart.generate(ds_dummy)]
    expected_dummy = [[2, 2, 1, 1], [2, 1, 1, 2], [1, 2, 2, 1], [1, 1, 2, 2]]
    assert_array_equal(partitions_factpart, expected_dummy)
def plot_feature_hist(dataset, xlim=None, noticks=True,
                      targets_attr='targets', chunks_attr=None,
                      **kwargs):
    """Plot histograms of feature values for each label.

    Parameters
    ----------
    dataset : Dataset
    xlim : None or 2-tuple
      Common x-axis limits for all histograms.
    noticks : bool
      If True, no axis ticks will be plotted. This is useful to save
      space in large plots.
    targets_attr : string, optional
      Name of samples attribute to be used as targets
    chunks_attr : None or string
      If a string, a histogram will be plotted per each target and each
      chunk (as defined in sa named `chunks_attr`), resulting in a
      histogram grid (targets x chunks). If None, a single row with one
      histogram per target is plotted.
    **kwargs
      Any additional arguments are passed to matplotlib's hist().
    """
    def make_leave_one_out_split(attr):
        # Partition on `attr`, then keep only the part labelled 2.
        return ChainNode([NFoldPartitioner(1, attr=attr),
                          Splitter('partitions', attr_values=[2])])

    def doplot(data):
        """Just a little helper which plots the histogram and removes
        ticks etc"""
        pl.hist(data, **kwargs)

        if xlim is not None:
            pl.xlim(xlim)

        if noticks:
            pl.yticks([])
            pl.xticks([])

    lsplit = make_leave_one_out_split(targets_attr)
    nrows = len(dataset.sa[targets_attr].unique)

    # Only set up the chunk dimension when it was requested. Doing this
    # unconditionally looked up dataset.sa[None] for the default
    # chunks_attr=None, although the target-only layout below needs neither
    # `csplit` nor `ncols`.
    if chunks_attr:
        csplit = make_leave_one_out_split(chunks_attr)
        ncols = len(dataset.sa[chunks_attr].unique)

    # running 1-based subplot index
    fig = 1

    # for all labels
    for row, ds in enumerate(lsplit.generate(dataset)):
        if not chunks_attr:
            # target-only layout: one row, one histogram per label
            pl.subplot(1, nrows, fig)
            doplot(ds.samples)
            pl.title('L:' + str(ds.sa[targets_attr].unique[0]))
            fig += 1
            continue

        # grid layout: rows are labels, columns are chunks
        for col, d in enumerate(csplit.generate(ds)):
            pl.subplot(nrows, ncols, fig)
            doplot(d.samples.ravel())

            # annotate chunks along the top and labels down the left side
            if row == 0:
                pl.title('C:' + str(d.sa[chunks_attr].unique[0]))
            if col == 0:
                pl.ylabel('L:' + str(d.sa[targets_attr].unique[0]))

            fig += 1