def test_nfold_random_counted_selection_partitioner(self):
    """Exhaustively verify random counted selection of n-fold partitions.

    A selection fraction of 0.5 over 10 unique chunks must mean 5 of 10
    chunks per split, i.e. C(10, 5) == 252 distinct splits in total.
    """
    def partitions_of(partitioner):
        # materialize every generated split as a hashable tuple of labels
        return [tuple(d.sa.partitions)
                for d in partitioner.generate(self.data)]

    exhaustive = partitions_of(NFoldPartitioner(0.5))
    # C(10, 5) == 252 combinations in the exhaustive sweep
    assert_equal(len(exhaustive), 252)
    # ... and no two of them coincide
    assert_equal(len(set(exhaustive)), 252)

    # now restrict the query to 10 randomly selected splits, once by
    # absolute count (5 chunks) and once by fraction (0.5)
    opts = dict(count=10, selection_strategy='random')
    random_by_count = partitions_of(NFoldPartitioner(5, **opts))
    random_by_fraction = partitions_of(NFoldPartitioner(0.5, **opts))

    # sanity: sets of tuples behave as expected -- 10 unique splits
    assert_equal(len(set(random_by_count)), 10)
    assert_equal(len(random_by_count), 10)
    assert_equal(len(random_by_fraction), 10)
    # two independent random draws coinciding is possible but vanishingly
    # unlikely
    assert_not_equal(random_by_count, random_by_fraction)
    # every randomly selected split must belong to the known exhaustive set
    for chosen in (random_by_count, random_by_fraction):
        assert_equal(set(exhaustive).intersection(chosen), set(chosen))
def test_nfold_random_counted_selection_partitioner(self):
    """Exhaustive check of random counted selection of partitions.

    NOTE(review): this duplicate of the test above was disabled with a bare
    ``return`` immediately after the ``def`` line, which makes the runner
    report it as *passing* while exercising nothing.  Raise ``SkipTest``
    instead so the skip shows up in test reports; delete the ``raise`` to
    re-enable the body below.
    """
    # unittest.SkipTest is honored by both unittest and nose runners
    from unittest import SkipTest
    raise SkipTest("test disabled (was a silent bare 'return')")

    # -- original test body, currently unreachable ----------------------
    # 0.5 must correspond to 50%, in our case 5 out of 10 unique chunks
    split_partitions = [
        tuple(x.sa.partitions)
        for x in NFoldPartitioner(0.5).generate(self.data)]
    # 252 is # of combinations of 5 from 10
    assert_equal(len(split_partitions), 252)
    # verify that all of them are unique
    assert_equal(len(set(split_partitions)), 252)

    # now let's limit our query
    kwargs = dict(count=10, selection_strategy='random')
    split10_partitions = [
        tuple(x.sa.partitions)
        for x in NFoldPartitioner(5, **kwargs).generate(self.data)]
    split10_partitions_ = [
        tuple(x.sa.partitions)
        for x in NFoldPartitioner(0.5, **kwargs).generate(self.data)]
    # to make sure that I deal with sets of tuples correctly:
    assert_equal(len(set(split10_partitions)), 10)
    assert_equal(len(split10_partitions), 10)
    assert_equal(len(split10_partitions_), 10)
    # and they must differ (same ones are possible but very very unlikely)
    assert_not_equal(split10_partitions, split10_partitions_)
    # but every one of them must be within known exhaustive set
    assert_equal(set(split_partitions).intersection(split10_partitions),
                 set(split10_partitions))
    assert_equal(set(split_partitions).intersection(split10_partitions_),
                 set(split10_partitions_))
def test_attrpermute():
    """Exercise AttributePermutator: limits, strategies, and generators."""
    # Was about to use borrowkwargs but didn't work out. Test doesn't hurt
    doc = AttributePermutator.__init__.__doc__
    assert_in('limit : ', doc)
    assert_not_in('collection : ', doc)

    ds = give_data()
    ds.sa['ids'] = range(len(ds))
    pristine_data = ds.samples.copy()
    permutation = AttributePermutator(['targets', 'ids'], assure=True)
    pds = permutation(ds)
    # should not touch the data
    assert_array_equal(pristine_data, pds.samples)
    # even keep the very same array
    assert_true(pds.samples.base is ds.samples)
    # there is no way that it can be the same attribute
    assert_false(np.all(pds.sa.ids == ds.sa.ids))
    # ids should reflect permutation setup
    assert_array_equal(pds.sa.targets, ds.sa.targets[pds.sa.ids])
    # other attribute should remain intact
    assert_array_equal(pds.sa.chunks, ds.sa.chunks)

    # now chunk-wise permutation
    permutation = AttributePermutator('ids', limit='chunks')
    pds = permutation(ds)
    # first ten should remain first ten
    assert_false(np.any(pds.sa.ids[:10] > 9))

    # verify that implausible assure=True would not work
    permutation = AttributePermutator('targets', limit='ids', assure=True)
    assert_raises(RuntimeError, permutation, ds)

    # same thing, but only permute single chunk
    permutation = AttributePermutator('ids', limit={'chunks': 3})
    pds = permutation(ds)
    # one chunk should change
    assert_false(np.any(pds.sa.ids[30:40] > 39))
    assert_false(np.any(pds.sa.ids[30:40] < 30))
    # the rest not
    assert_array_equal(pds.sa.ids[:30], range(30))

    # or a list of chunks
    permutation = AttributePermutator('ids', limit={'chunks': [3, 4]})
    pds = permutation(ds)
    # two chunks should change
    assert_false(np.any(pds.sa.ids[30:50] > 49))
    assert_false(np.any(pds.sa.ids[30:50] < 30))
    # the rest not
    assert_array_equal(pds.sa.ids[:30], range(30))

    # and now try generating more permutations
    nruns = 2
    permutation = AttributePermutator(['targets', 'ids'],
                                      assure=True, count=nruns)
    pds = list(permutation.generate(ds))
    assert_equal(len(pds), nruns)
    for p in pds:
        assert_false(np.all(p.sa.ids == ds.sa.ids))

    # permute feature attrs
    ds.fa['ids'] = range(ds.shape[1])
    permutation = AttributePermutator('fa.ids', assure=True)
    pds = permutation(ds)
    assert_false(np.all(pds.fa.ids == ds.fa.ids))

    # now chunk-wise uattrs strategy (reassignment)
    permutation = AttributePermutator('targets', limit='chunks',
                                      strategy='uattrs', assure=True)
    pds = permutation(ds)
    # Due to assure above -- we should have changed things
    # BUG FIX: compare materialized lists; on Python 3 bare zip() objects
    # always compare unequal (by identity), making this assertion vacuous
    assert_not_equal(list(zip(ds.targets)), list(zip(pds.targets)))
    # in each chunk we should have unique remappings
    for c in ds.UC:
        chunk_idx = ds.C == c
        otargets, ptargets = ds.targets[chunk_idx], pds.sa.targets[chunk_idx]
        # we still have the same targets
        assert_equal(set(ptargets), set(otargets))
        # we have only 1-to-1 mappings
        # BUG FIX: was assert_true(x, y), which treated y as a failure
        # *message* and passed for any non-empty set -- assert equality
        assert_equal(len(set(zip(otargets, ptargets))), len(set(otargets)))

    ds.sa['odds'] = ds.sa.ids % 2
    # test combinations
    permutation = AttributePermutator(['targets', 'odds'], limit='chunks',
                                      strategy='uattrs', assure=True)
    pds = permutation(ds)
    # Due to assure above -- we should have changed things
    # BUG FIX: materialize zips for a meaningful comparison (see above)
    assert_not_equal(list(zip(ds.targets, ds.sa.odds)),
                     list(zip(pds.targets, pds.sa.odds)))
    # In each chunk we should have unique remappings
    for c in ds.UC:
        chunk_idx = ds.C == c
        otargets, ptargets = ds.targets[chunk_idx], pds.sa.targets[chunk_idx]
        oodds, podds = ds.sa.odds[chunk_idx], pds.sa.odds[chunk_idx]
        # we still have the same targets
        assert_equal(set(ptargets), set(otargets))
        assert_equal(set(oodds), set(podds))
        # at the end we have the same mapping
        assert_equal(set(zip(otargets, oodds)), set(zip(ptargets, podds)))
def test_attrpermute():
    """Exercise AttributePermutator: limits, strategies, and generators."""
    ds = give_data()
    ds.sa['ids'] = range(len(ds))
    pristine_data = ds.samples.copy()
    permutation = AttributePermutator(['targets', 'ids'], assure=True)
    pds = permutation(ds)
    # should not touch the data
    assert_array_equal(pristine_data, pds.samples)
    # even keep the very same array
    assert_true(pds.samples.base is ds.samples)
    # there is no way that it can be the same attribute
    assert_false(np.all(pds.sa.ids == ds.sa.ids))
    # ids should reflect permutation setup
    assert_array_equal(pds.sa.targets, ds.sa.targets[pds.sa.ids])
    # other attribute should remain intact
    assert_array_equal(pds.sa.chunks, ds.sa.chunks)

    # now chunk-wise permutation
    permutation = AttributePermutator('ids', limit='chunks')
    pds = permutation(ds)
    # first ten should remain first ten
    assert_false(np.any(pds.sa.ids[:10] > 9))

    # verify that implausible assure=True would not work
    permutation = AttributePermutator('targets', limit='ids', assure=True)
    assert_raises(RuntimeError, permutation, ds)

    # same thing, but only permute single chunk
    permutation = AttributePermutator('ids', limit={'chunks': 3})
    pds = permutation(ds)
    # one chunk should change
    assert_false(np.any(pds.sa.ids[30:40] > 39))
    assert_false(np.any(pds.sa.ids[30:40] < 30))
    # the rest not
    assert_array_equal(pds.sa.ids[:30], range(30))

    # or a list of chunks
    permutation = AttributePermutator('ids', limit={'chunks': [3, 4]})
    pds = permutation(ds)
    # two chunks should change
    assert_false(np.any(pds.sa.ids[30:50] > 49))
    assert_false(np.any(pds.sa.ids[30:50] < 30))
    # the rest not
    assert_array_equal(pds.sa.ids[:30], range(30))

    # and now try generating more permutations
    nruns = 2
    permutation = AttributePermutator(['targets', 'ids'],
                                      assure=True, count=nruns)
    pds = list(permutation.generate(ds))
    assert_equal(len(pds), nruns)
    for p in pds:
        assert_false(np.all(p.sa.ids == ds.sa.ids))

    # permute feature attrs
    ds.fa['ids'] = range(ds.shape[1])
    permutation = AttributePermutator('fa.ids', assure=True)
    pds = permutation(ds)
    assert_false(np.all(pds.fa.ids == ds.fa.ids))

    # now chunk-wise uattrs strategy (reassignment)
    permutation = AttributePermutator('targets', limit='chunks',
                                      strategy='uattrs', assure=True)
    pds = permutation(ds)
    # Due to assure above -- we should have changed things
    # BUG FIX: compare materialized lists; on Python 3 bare zip() objects
    # always compare unequal (by identity), making this assertion vacuous
    assert_not_equal(list(zip(ds.targets)), list(zip(pds.targets)))
    # in each chunk we should have unique remappings
    for c in ds.UC:
        chunk_idx = ds.C == c
        otargets, ptargets = ds.targets[chunk_idx], pds.sa.targets[chunk_idx]
        # we still have the same targets
        assert_equal(set(ptargets), set(otargets))
        # we have only 1-to-1 mappings
        # BUG FIX: was assert_true(x, y), which treated y as a failure
        # *message* and passed for any non-empty set -- assert equality
        assert_equal(len(set(zip(otargets, ptargets))), len(set(otargets)))

    ds.sa['odds'] = ds.sa.ids % 2
    # test combinations
    permutation = AttributePermutator(['targets', 'odds'], limit='chunks',
                                      strategy='uattrs', assure=True)
    pds = permutation(ds)
    # Due to assure above -- we should have changed things
    # BUG FIX: materialize zips for a meaningful comparison (see above)
    assert_not_equal(list(zip(ds.targets, ds.sa.odds)),
                     list(zip(pds.targets, pds.sa.odds)))
    # In each chunk we should have unique remappings
    for c in ds.UC:
        chunk_idx = ds.C == c
        otargets, ptargets = ds.targets[chunk_idx], pds.sa.targets[chunk_idx]
        oodds, podds = ds.sa.odds[chunk_idx], pds.sa.odds[chunk_idx]
        # we still have the same targets
        assert_equal(set(ptargets), set(otargets))
        assert_equal(set(oodds), set(podds))
        # at the end we have the same mapping
        assert_equal(set(zip(otargets, oodds)), set(zip(ptargets, podds)))
def test_attrpermute():
    """Exercise AttributePermutator: limits, strategies, seeding, generators."""
    # Was about to use borrowkwargs but didn't work out. Test doesn't hurt
    doc = AttributePermutator.__init__.__doc__
    assert_in('limit : ', doc)
    assert_not_in('collection : ', doc)

    ds = give_data()
    ds.sa['ids'] = range(len(ds))
    pristine_data = ds.samples.copy()
    permutation = AttributePermutator(['targets', 'ids'], assure=True)
    pds = permutation(ds)
    # should not touch the data
    assert_array_equal(pristine_data, pds.samples)
    # even keep the very same array
    assert_true(pds.samples.base is ds.samples)
    # there is no way that it can be the same attribute
    assert_false(np.all(pds.sa.ids == ds.sa.ids))
    # ids should reflect permutation setup
    assert_array_equal(pds.sa.targets, ds.sa.targets[pds.sa.ids])
    # other attribute should remain intact
    assert_array_equal(pds.sa.chunks, ds.sa.chunks)

    # now chunk-wise permutation
    permutation = AttributePermutator('ids', limit='chunks')
    pds = permutation(ds)
    # first ten should remain first ten
    assert_false(np.any(pds.sa.ids[:10] > 9))

    # verify that implausible assure=True would not work
    permutation = AttributePermutator('targets', limit='ids', assure=True)
    assert_raises(RuntimeError, permutation, ds)

    # same thing, but only permute single chunk
    permutation = AttributePermutator('ids', limit={'chunks': 3})
    pds = permutation(ds)
    # one chunk should change
    assert_false(np.any(pds.sa.ids[30:40] > 39))
    assert_false(np.any(pds.sa.ids[30:40] < 30))
    # the rest not
    assert_array_equal(pds.sa.ids[:30], range(30))

    # or a list of chunks
    permutation = AttributePermutator('ids', limit={'chunks': [3, 4]})
    pds = permutation(ds)
    # two chunks should change
    assert_false(np.any(pds.sa.ids[30:50] > 49))
    assert_false(np.any(pds.sa.ids[30:50] < 30))
    # the rest not
    assert_array_equal(pds.sa.ids[:30], range(30))

    # and now try generating more permutations
    nruns = 2

    def assert_all_different_permutations(pds):
        # each generated dataset must be permuted vs the original AND
        # differ from every other generated dataset
        assert_equal(len(pds), nruns)
        for i, p in enumerate(pds):
            assert_false(np.all(p.sa.ids == ds.sa.ids))
            for p_ in pds[i + 1:]:
                assert_false(np.all(p.sa.ids == p_.sa.ids))

    permutation = AttributePermutator(['targets', 'ids'],
                                      assure=True, count=nruns)
    pds = list(permutation.generate(ds))
    assert_all_different_permutations(pds)

    # if we provide seeding, and generate, it should also return
    # different datasets
    permutation = AttributePermutator(['targets', 'ids'], count=nruns, rng=1)
    pds1 = list(permutation.generate(ds))
    # BUG FIX: was re-checking ``pds`` (the previous, unseeded result)
    # instead of the freshly generated seeded ``pds1``
    assert_all_different_permutations(pds1)

    # but if we regenerate -- should all be the same to before
    pds2 = list(permutation.generate(ds))
    assert_equal(len(pds1), len(pds2))
    for p1, p2 in zip(pds1, pds2):
        assert_datasets_equal(p1, p2)

    # permute feature attrs
    ds.fa['ids'] = range(ds.shape[1])
    permutation = AttributePermutator('fa.ids', assure=True)
    pds = permutation(ds)
    assert_false(np.all(pds.fa.ids == ds.fa.ids))

    # now chunk-wise uattrs strategy (reassignment)
    permutation = AttributePermutator('targets', limit='chunks',
                                      strategy='uattrs', assure=True)
    pds = permutation(ds)
    # Due to assure above -- we should have changed things
    # BUG FIX: compare materialized lists; on Python 3 bare zip() objects
    # always compare unequal (by identity), making this assertion vacuous
    assert_not_equal(list(zip(ds.targets)), list(zip(pds.targets)))
    # in each chunk we should have unique remappings
    for c in ds.UC:
        chunk_idx = ds.C == c
        otargets, ptargets = ds.targets[chunk_idx], pds.sa.targets[chunk_idx]
        # we still have the same targets
        assert_equal(set(ptargets), set(otargets))
        # we have only 1-to-1 mappings
        # BUG FIX: was assert_true(x, y), which treated y as a failure
        # *message* and passed for any non-empty set -- assert equality
        assert_equal(len(set(zip(otargets, ptargets))), len(set(otargets)))

    ds.sa['odds'] = ds.sa.ids % 2
    # test combinations
    permutation = AttributePermutator(['targets', 'odds'], limit='chunks',
                                      strategy='uattrs', assure=True)
    pds = permutation(ds)
    # Due to assure above -- we should have changed things
    # BUG FIX: materialize zips for a meaningful comparison (see above)
    assert_not_equal(list(zip(ds.targets, ds.sa.odds)),
                     list(zip(pds.targets, pds.sa.odds)))
    # In each chunk we should have unique remappings
    for c in ds.UC:
        chunk_idx = ds.C == c
        otargets, ptargets = ds.targets[chunk_idx], pds.sa.targets[chunk_idx]
        oodds, podds = ds.sa.odds[chunk_idx], pds.sa.odds[chunk_idx]
        # we still have the same targets
        assert_equal(set(ptargets), set(otargets))
        assert_equal(set(oodds), set(podds))
        # at the end we have the same mapping
        assert_equal(set(zip(otargets, oodds)), set(zip(ptargets, podds)))