def test_per_sample_sequences_complex(self):
    """Subsample with a cap of two per bin and a one-item buffer.

    Records are binned on their SequenceID with the trailing ``_<n>``
    suffix removed; the observed (bin, record) pairs must match the
    hard-coded expectation once both are ordered by bin.
    """
    def bin_f(record):
        return record['SequenceID'].rsplit('_', 1)[0]

    def by_bin(pair):
        # Only the bin label takes part in the ordering; the sort is
        # stable, so the order inside each bin is left untouched.
        return pair[0]

    per_bin_cap = 2

    expected = [
        ('a', {'SequenceID': 'a_2', 'Sequence': 'AATTGGCC-a2'}),
        ('a', {'SequenceID': 'a_3', 'Sequence': 'AATTGGCC-a3'}),
        ('b', {'SequenceID': 'b_2', 'Sequence': 'AATTGGCC-b2'}),
        ('b', {'SequenceID': 'b_1', 'Sequence': 'AATTGGCC-b1'}),
        ('c', {'SequenceID': 'c_1', 'Sequence': 'AATTGGCC-c1'}),
        ('c', {'SequenceID': 'c_2', 'Sequence': 'AATTGGCC-c2'}),
    ]
    expected.sort(key=by_bin)

    records = self.mock_sequence_iter(self.sequences)
    observed = isubsample(records, per_bin_cap, bin_f=bin_f, buf_size=1)

    self.assertEqual(sorted(observed, key=by_bin), expected)
def subsample_dm(distmat, mapping_file, max, category, output):
    """Subsample the distmat to max samples per category value.

    Parameters
    ----------
    distmat
        Source handed to ``read(..., into=DistanceMatrix)``.
    mapping_file
        Tab-separated mapping file containing a ``#SampleID`` column.
    max
        Maximum number of samples to keep per category value. The name
        shadows the builtin but is kept so keyword callers keep working.
    category
        Name of the mapping-file column used to bin the samples.
    output
        Destination handed to ``DistanceMatrix.to_file``.

    Raises
    ------
    KeyError
        If ``#SampleID`` or ``category`` is not a column of the mapping
        file, or if a distance-matrix ID has no row in the mapping file.
    """
    # ``sep`` must be given by keyword: pandas >= 2.0 rejects positional
    # arguments after the path.  ``dtype=str`` keeps every column textual so
    # numeric-looking sample IDs are not turned into integers (assumes the
    # distance-matrix IDs are strings -- confirm against the reader).
    mf = pd.read_csv(mapping_file, sep='\t', converters=defaultdict(str),
                     dtype=str)
    # Index after reading so the string dtype also covers the ID column.
    mf = mf.set_index('#SampleID')
    id_to_cat = dict(mf[category])

    def bin_f(sample_id):
        # Strict lookup: an ID missing from the mapping file is an error.
        return id_to_cat[sample_id]

    dm = read(distmat, into=DistanceMatrix)
    # isubsample yields (bin, item) pairs; only the sample IDs are needed.
    keep = [sample_id
            for _, sample_id in isubsample(dm.ids, max, bin_f=bin_f)]
    dm = dm.filter(keep)
    dm.to_file(output)
def test_per_sample_sequences_complex(self):
    """Check per-bin subsampling with maximum=2 and buf_size=1.

    Each record's bin is its SequenceID without the trailing ``_<n>``
    suffix.  Observed and expected (bin, record) pairs are both ordered
    by bin only before being compared.
    """
    maximum = 2

    # A named function instead of a lambda bound to a name (PEP 8), in line
    # with the other isubsample tests.
    def bin_f(x):
        return x['SequenceID'].rsplit('_', 1)[0]

    exp = sorted(
        [('a', {'SequenceID': 'a_2', 'Sequence': 'AATTGGCC-a2'}),
         ('a', {'SequenceID': 'a_3', 'Sequence': 'AATTGGCC-a3'}),
         ('b', {'SequenceID': 'b_2', 'Sequence': 'AATTGGCC-b2'}),
         ('b', {'SequenceID': 'b_1', 'Sequence': 'AATTGGCC-b1'}),
         ('c', {'SequenceID': 'c_1', 'Sequence': 'AATTGGCC-c1'}),
         ('c', {'SequenceID': 'c_2', 'Sequence': 'AATTGGCC-c2'})],
        key=lambda x: x[0])

    obs = isubsample(self.mock_sequence_iter(self.sequences), maximum,
                     bin_f=bin_f, buf_size=1)

    self.assertEqual(sorted(obs, key=lambda x: x[0]), exp)
def subsample_dm(distmat, mapping_file, max, category, output):
    """Subsample the distmat to max samples per category value.

    Parameters
    ----------
    distmat
        Source handed to ``read(..., into=DistanceMatrix)``.
    mapping_file
        Tab-separated mapping file containing a ``#SampleID`` column; every
        column is read as text.
    max
        Maximum number of samples to keep per category value. The name
        shadows the builtin but is kept so keyword callers keep working.
    category
        Name of the mapping-file column used to bin the samples.
    output
        Destination handed to ``DistanceMatrix.to_file``.

    Raises
    ------
    KeyError
        If ``#SampleID`` or ``category`` is not a column of the mapping
        file.
    """
    # ``sep`` must be given by keyword: pandas >= 2.0 rejects positional
    # arguments after the path.
    mf = pd.read_csv(mapping_file, sep='\t', converters=defaultdict(str),
                     dtype=str)
    mf = mf.set_index('#SampleID')
    id_to_cat = dict(mf[category])

    def bin_f(sample_id):
        # IDs absent from the mapping file share the ``None`` bin, because
        # ``dict.get`` returns None for a missing key.
        return id_to_cat.get(sample_id)

    dm = read(distmat, into=DistanceMatrix)
    # isubsample yields (bin, item) pairs; only the sample IDs are needed.
    keep = [sample_id
            for _, sample_id in isubsample(dm.ids, max, bin_f=bin_f)]
    dm = dm.filter(keep)
    dm.to_file(output)
def test_isubsample_simple(self):
    """Subsample with a cap of ten per bin and the default buffer size.

    Records are binned on their SequenceID with the trailing ``_<n>``
    suffix removed; the observed (bin, record) pairs must match the
    hard-coded expectation once both are ordered by bin.
    """
    def bin_f(record):
        return record['SequenceID'].rsplit('_', 1)[0]

    def by_bin(pair):
        return pair[0]

    per_bin_cap = 10

    # Both sides are ordered by bin label only.  The sort is stable, so the
    # order inside each bin is spelled out here exactly as isubsample is
    # expected to yield it (presumably heap order over the random values it
    # assigns to each sequence -- the implementation is not visible here).
    expected = [
        ('a', {'SequenceID': 'a_5', 'Sequence': 'AATTGGCC-a5'}),
        ('a', {'SequenceID': 'a_1', 'Sequence': 'AATTGGCC-a1'}),
        ('a', {'SequenceID': 'a_4', 'Sequence': 'AATTGGCC-a4'}),
        ('a', {'SequenceID': 'a_3', 'Sequence': 'AATTGGCC-a3'}),
        ('a', {'SequenceID': 'a_2', 'Sequence': 'AATTGGCC-a2'}),
        ('b', {'SequenceID': 'b_2', 'Sequence': 'AATTGGCC-b2'}),
        ('b', {'SequenceID': 'b_1', 'Sequence': 'AATTGGCC-b1'}),
        ('c', {'SequenceID': 'c_3', 'Sequence': 'AATTGGCC-c3'}),
        ('c', {'SequenceID': 'c_2', 'Sequence': 'AATTGGCC-c2'}),
        ('c', {'SequenceID': 'c_1', 'Sequence': 'AATTGGCC-c1'}),
    ]
    expected.sort(key=by_bin)

    records = self.mock_sequence_iter(self.sequences)
    observed = isubsample(records, per_bin_cap, bin_f=bin_f)

    self.assertEqual(sorted(observed, key=by_bin), expected)
def test_per_sample_sequences_min_seqs(self):
    """Check subsampling with maximum=10 and minimum=3.

    Each record's bin is its SequenceID without the trailing ``_<n>``
    suffix.  The expectation contains no 'b' records (presumably because
    that bin holds fewer than ``minimum`` sequences in the fixture --
    confirm against ``self.sequences``).
    """
    maximum = 10
    minimum = 3

    # A named function instead of a lambda bound to a name (PEP 8), in line
    # with the other isubsample tests.
    def bin_f(x):
        return x['SequenceID'].rsplit('_', 1)[0]

    # Note: both sides are sorted by bin (the first tuple element) only.
    # The sort is stable, so the order inside each bin is written out as
    # isubsample is expected to yield it (presumably heap order over the
    # random values associated with each sequence).
    exp = sorted(
        [('a', {'SequenceID': 'a_5', 'Sequence': 'AATTGGCC-a5'}),
         ('a', {'SequenceID': 'a_1', 'Sequence': 'AATTGGCC-a1'}),
         ('a', {'SequenceID': 'a_4', 'Sequence': 'AATTGGCC-a4'}),
         ('a', {'SequenceID': 'a_3', 'Sequence': 'AATTGGCC-a3'}),
         ('a', {'SequenceID': 'a_2', 'Sequence': 'AATTGGCC-a2'}),
         ('c', {'SequenceID': 'c_3', 'Sequence': 'AATTGGCC-c3'}),
         ('c', {'SequenceID': 'c_2', 'Sequence': 'AATTGGCC-c2'}),
         ('c', {'SequenceID': 'c_1', 'Sequence': 'AATTGGCC-c1'})],
        key=lambda x: x[0])

    obs = isubsample(self.mock_sequence_iter(self.sequences), maximum,
                     minimum, bin_f=bin_f)

    self.assertEqual(sorted(obs, key=lambda x: x[0]), exp)
def test_binf_is_none(self):
    """With no bin_f given, each item should come back paired with True."""
    values = [1, 2]
    cap = 2

    expected = [(True, 1), (True, 2)]
    observed = list(isubsample(values, cap))

    self.assertEqual(observed, expected)
def test_max_lt_zero(self):
    """A negative maximum must raise ValueError on the first next()."""
    # The call sits outside the assertion on purpose: building the sampler
    # must not raise by itself, only advancing it.
    sampler = isubsample([1, 2, 3], maximum=-10)

    self.assertRaises(ValueError, next, sampler)
def test_min_gt_max(self):
    """A minimum above the maximum must raise ValueError on first next()."""
    # The call sits outside the assertion on purpose: building the sampler
    # must not raise by itself, only advancing it.
    sampler = isubsample([1, 2, 3], maximum=2, minimum=10)

    self.assertRaises(ValueError, next, sampler)