def setup(self):
    """Build the fixtures shared by the timing methods.

    Sets ``self.sequences`` (the loaded test sequences multiplied by 10),
    ``self.mh`` (an empty sketch) and ``self.populated_mh`` (a sketch that
    has been fed every entry of ``self.sequences``). Both sketches are
    created with ``track_abundance=False``.
    """
    self.sequences = load_sequences(get_test_data('ecoli.genes.fna')) * 10
    self.mh = MinHash(500, 21, track_abundance=False)

    # Fill a second sketch up front so read-style benchmarks have content.
    populated = MinHash(500, 21, track_abundance=False)
    for sequence in self.sequences:
        populated.add_sequence(sequence)
    self.populated_mh = populated
def test_no_downsample_scaled_if_n(track_abundance):
    """A sketch built with a size limit of 2 must reject downsample_scaled.

    Expects ``downsample_scaled`` to raise ``ValueError`` with a message
    containing 'cannot downsample a standard MinHash'.
    """
    # make sure you can't set max_n and then downsample scaled
    mh = MinHash(2, 4, track_abundance=track_abundance)

    with pytest.raises(ValueError) as excinfo:
        mh.downsample_scaled(100000000)

    # Inspect the raised exception itself: str() of pytest's ExceptionInfo
    # wrapper is a representation of the wrapper, not the error message.
    assert 'cannot downsample a standard MinHash' in str(excinfo.value)
def setup(self):
    """Build fixtures, then swap in abundance-tracking sketches.

    Runs ``TimeMinHashSuite.setup`` first (which provides
    ``self.sequences``), then replaces ``self.mh`` and
    ``self.populated_mh`` with sketches created with
    ``track_abundance=True``.
    """
    TimeMinHashSuite.setup(self)
    self.mh = MinHash(500, 21, track_abundance=True)

    # Re-populate from the same sequences, this time tracking abundance.
    populated = MinHash(500, 21, track_abundance=True)
    for sequence in self.sequences:
        populated.add_sequence(sequence)
    self.populated_mh = populated
def test_set_abundance():
    """set_abundances raises RuntimeError when track_abundance is False."""
    flat = MinHash(20, 10, track_abundance=False)
    abundances = {1: 3, 2: 4}

    with pytest.raises(RuntimeError) as excinfo:
        flat.set_abundances(abundances)

    # The error text should point the caller at the constructor flag.
    message = excinfo.value.args[0]
    assert "track_abundance=True when constructing" in message
def test_basic_dna_bad(track_abundance):
    """Adding a sequence with a non-ACGT character raises ValueError.

    The error message is expected to name the offending k-mer ('ATGR').
    """
    # test behavior on bad DNA
    mh = MinHash(1, 4, track_abundance=track_abundance)

    with pytest.raises(ValueError) as e:
        mh.add_sequence('ATGR')

    print(e)
    # Match against the exception's own message rather than the string
    # form of pytest's ExceptionInfo wrapper.
    assert 'invalid DNA character in input k-mer: ATGR' in str(e.value)
def test_size_limit_none(track_abundance):
    """With a size limit of 0 (=> no size limit) every hash is retained."""
    sketch = MinHash(0, 4, track_abundance=track_abundance)
    for value in (10, 20, 30):
        sketch.add_hash(value)
    assert sketch.get_mins() == [10, 20, 30]

    # A smaller hash joins the others; nothing is dropped without a limit.
    sketch.add_hash(5)
    assert sketch.get_mins() == [5, 10, 20, 30]
def test_size_limit(track_abundance):
    """With a size limit of 3 only the three smallest hashes are kept."""
    sketch = MinHash(3, 4, track_abundance=track_abundance)
    for value in (10, 20, 30):
        sketch.add_hash(value)
    assert sketch.get_mins() == [10, 20, 30]

    # A smaller hash arrives: the largest one (30) should fall off the end.
    sketch.add_hash(5)
    assert sketch.get_mins() == [5, 10, 20]
def test_abundance_simple():
    """Re-adding the same 5-mer bumps its abundance, not the hash list."""
    aaaaa_hash = 2110480117637990133
    sketch = MinHash(20, 5, False, track_abundance=True)

    # After the first and second insertion the abundance is 1, then 2.
    for expected_count in (1, 2):
        sketch.add_sequence('AAAAA')
        assert sketch.get_mins() == [aaaaa_hash]
        assert sketch.get_mins(with_abundance=True) == {
            aaaaa_hash: expected_count
        }
def test_reviving_minhash():
    """Re-adding stored hashes to a fresh sketch must not raise.

    Simulates reading a MinHash from disk: a sketch is created with
    explicit ``max_hash`` and ``seed`` and the saved hash values are
    added one by one. There is no assertion; the test passes if every
    ``add_hash`` call completes.
    """
    stored_hashes = (
        28945103950853965,
        74690756200987412,
        82962372765557409,
        93503551367950366,
        106923350319729608,
        135116761470196737,
        160165359281648267,
        162390811417732001,
        177939655451276972,
    )
    revived = MinHash(0, 21, max_hash=184467440737095520, seed=42,
                      track_abundance=False)
    for value in stored_hashes:
        revived.add_hash(value)
def test_mh_count_common(track_abundance):
    """count_common is symmetric and counts the shared hashes."""
    evens = MinHash(20, 10, track_abundance=track_abundance)
    for value in range(0, 40, 2):
        evens.add_hash(value)

    fours = MinHash(20, 10, track_abundance=track_abundance)
    for value in range(0, 80, 4):
        fours.add_hash(value)

    # Shared values are the multiples of 4 below 40: ten of them.
    assert evens.count_common(fours) == 10
    assert fours.count_common(evens) == 10
class TimeMinHashSuite:
    """Timing routines for MinHash operations without abundance tracking.

    ``setup`` prepares an empty sketch (``self.mh``), the input sequences
    (``self.sequences``) and a pre-filled sketch (``self.populated_mh``);
    each ``time_*`` method then repeats a single operation in a loop.
    NOTE(review): the setup/time_* naming looks like it targets a benchmark
    runner -- confirm which one.
    """

    def setup(self):
        """Create the empty sketch, the sequences and the filled sketch."""
        self.mh = MinHash(500, 21, track_abundance=False)
        self.sequences = load_sequences(get_test_data('ecoli.genes.fna')) * 10

        populated = MinHash(500, 21, track_abundance=False)
        for sequence in self.sequences:
            populated.add_sequence(sequence)
        self.populated_mh = populated

    def time_add_sequence(self):
        """Feed every prepared sequence into the initially empty sketch."""
        sketch = self.mh
        for sequence in self.sequences:
            sketch.add_sequence(sequence)

    def time_get_mins(self):
        """Read the hash list of the filled sketch 500 times."""
        filled = self.populated_mh
        for _ in range(500):
            filled.get_mins()

    def time_add_hash(self):
        """Add the integers 0..9999 as raw hashes."""
        sketch = self.mh
        for value in range(10000):
            sketch.add_hash(value)

    def time_compare(self):
        """Compare the empty sketch against the filled one 500 times."""
        sketch, filled = self.mh, self.populated_mh
        for _ in range(500):
            sketch.compare(filled)

    def time_count_common(self):
        """Count hashes shared with the filled sketch 500 times."""
        sketch, filled = self.mh, self.populated_mh
        for _ in range(500):
            sketch.count_common(filled)

    def time_merge(self):
        """Merge the filled sketch into ``self.mh`` 500 times."""
        sketch, filled = self.mh, self.populated_mh
        for _ in range(500):
            sketch.merge(filled)

    def time_copy(self):
        """Copy the filled sketch 500 times via ``__copy__``."""
        filled = self.populated_mh
        for _ in range(500):
            filled.__copy__()

    def time_concat(self):
        """Apply ``+=`` with the filled sketch 500 times."""
        sketch, filled = self.mh, self.populated_mh
        for _ in range(500):
            sketch += filled
def test_minhash_abund_merge_flat_2():
    """Merging a flat sketch into an abundance-tracking one must not crash.

    This targets a segfault caused by trying to merge a signature with
    abundance and a signature without abundance. There is no assertion;
    the test passes if ``merge`` completes.
    """
    a = MinHash(0, 10, track_abundance=True, max_hash=5000)
    b = MinHash(0, 10, max_hash=5000)

    for i in range(0, 10, 2):
        a.add_hash(i)

    # Use this loop's own variable: the previous code added the stale `i`
    # from the loop above, so `b` only ever received the single hash 8.
    for j in range(0, 10, 3):
        b.add_hash(j)

    a.merge(b)
def test_mh_copy_and_clear_with_max_hash(track_abundance):
    """copy_and_clear keeps the configuration (incl. max_hash), no hashes."""
    source = MinHash(20, 10, track_abundance=track_abundance, max_hash=20)
    for value in range(0, 40, 2):
        source.add_hash(value)

    emptied = source.copy_and_clear()

    # Every configuration parameter carries over from the source ...
    assert source.ksize == emptied.ksize
    assert emptied.num == source.num
    assert emptied.max_hash == 20
    assert not emptied.is_protein
    assert emptied.track_abundance == track_abundance
    assert emptied.seed == source.seed
    # ... while the hash values themselves do not.
    assert len(emptied.get_mins()) == 0
def test_mh_merge_check_length(track_abundance):
    """A merge of two size-20 sketches still holds exactly 20 hashes."""
    evens = MinHash(20, 10, track_abundance=track_abundance)
    for value in range(0, 40, 2):
        evens.add_hash(value)

    fours = MinHash(20, 10, track_abundance=track_abundance)
    for value in range(0, 80, 4):
        fours.add_hash(value)

    merged = evens.merge(fours)
    merged_hashes = merged.get_mins()
    assert len(merged_hashes) == 20
def test_mh_subtract(track_abundance):
    """subtract_mins returns the hashes of one sketch absent from another."""
    evens = MinHash(20, 10, track_abundance=track_abundance)
    for value in range(0, 40, 2):
        evens.add_hash(value)

    fours = MinHash(20, 10, track_abundance=track_abundance)
    for value in range(0, 80, 4):
        fours.add_hash(value)

    # Even numbers below 40 that are not multiples of 4: 2, 6, 10, ...
    only_in_evens = set(range(2, 40, 4))
    assert evens.subtract_mins(fours) == only_in_evens
def test_scaled_property(track_abundance):
    """``scaled`` reports the factor that max_hash was derived from."""
    scaled = 10000
    # max_hash is derived from the scaled factor as 2**64 / scaled, rounded.
    derived_max_hash = round(2**64 / scaled)
    sketch = MinHash(0, 10, track_abundance=track_abundance,
                     max_hash=derived_max_hash)
    assert sketch.scaled == scaled
def test_mh_merge(track_abundance):
    """Merging two identically configured sketches is order-independent."""
    evens = MinHash(20, 10, track_abundance=track_abundance)
    for value in range(0, 40, 2):
        evens.add_hash(value)

    fours = MinHash(20, 10, track_abundance=track_abundance)
    for value in range(0, 80, 4):
        fours.add_hash(value)

    forward = evens.merge(fours)
    backward = fours.merge(evens)

    # Both merge directions must produce the same sketch.
    assert len(forward) == len(backward)
    assert forward.get_mins() == backward.get_mins()
    assert forward.compare(backward) == 1.0
    assert backward.compare(forward) == 1.0
def test_max_hash_and_scaled_error(track_abundance):
    """Supplying both ``max_hash`` and ``scaled`` raises ValueError."""
    # test behavior when supplying both max_hash and scaled
    with pytest.raises(ValueError):
        # The constructor call itself is under test; its result is never
        # used, so it is not bound to a name.
        MinHash(0, 4, track_abundance=track_abundance, max_hash=35,
                scaled=5)
def test_pickle_max_hash(track_abundance):
    """A pickle round trip preserves configuration, max_hash and hashes."""
    original = MinHash(0, 10, track_abundance=track_abundance, max_hash=20)
    for value in range(0, 40, 2):
        original.add_hash(value)

    payload = pickle.dumps(original)
    restored = pickle.loads(payload)

    # Configuration survives the round trip.
    assert original.ksize == restored.ksize
    assert restored.num == original.num
    assert restored.max_hash == original.max_hash
    assert restored.max_hash == 20
    assert not restored.is_protein
    assert restored.track_abundance == track_abundance
    assert restored.seed == original.seed
    # Contents survive too: the even values 0..20, eleven in total.
    assert len(restored.get_mins()) == len(original.get_mins())
    assert len(restored.get_mins()) == 11
    assert original.scaled == restored.scaled
    assert restored.scaled != 0
def test_bytes_protein(track_abundance):
    """add_protein accepts str, unicode and bytes input alike."""
    # verify that we can hash protein/aa sequences
    # NOTE(review): the third positional argument (True) is presumably the
    # is_protein flag -- confirm against the MinHash constructor.
    sketch = MinHash(10, 6, True, track_abundance=track_abundance)

    for peptide in ('AGYYG', u'AGYYG', b'AGYYG'):
        sketch.add_protein(peptide)

    assert len(sketch.get_mins()) == 4
class TimeMinAbundanceSuite(TimeMinHashSuite):
    """Timing routines re-run with ``track_abundance=True`` sketches.

    Inherits every ``time_*`` method of ``TimeMinHashSuite`` and adds two
    that exercise the abundance-specific calls.
    """

    def setup(self):
        """Run the base setup, then swap in abundance-tracking sketches."""
        TimeMinHashSuite.setup(self)
        self.mh = MinHash(500, 21, track_abundance=True)

        populated = MinHash(500, 21, track_abundance=True)
        for sequence in self.sequences:
            populated.add_sequence(sequence)
        self.populated_mh = populated

    def time_get_mins_abundance(self):
        """Read hashes with abundance from the filled sketch 500 times."""
        filled = self.populated_mh
        for _ in range(500):
            filled.get_mins(with_abundance=True)

    def time_set_abundances(self):
        """Write the filled sketch's abundances into ``self.mh`` 500 times."""
        sketch = self.mh
        # Fetched once, outside the loop, so only set_abundances repeats.
        abundances = self.populated_mh.get_mins(with_abundance=True)
        for _ in range(500):
            sketch.set_abundances(abundances)
def test_mh_inplace_concat(track_abundance):
    """``+=`` on copies of two sketches is order-independent."""
    evens = MinHash(20, 10, track_abundance=track_abundance)
    for value in range(0, 40, 2):
        evens.add_hash(value)

    fours = MinHash(20, 10, track_abundance=track_abundance)
    for value in range(0, 80, 4):
        fours.add_hash(value)

    # Concatenate onto copies, once in each direction.
    forward = evens.__copy__()
    forward += fours
    backward = fours.__copy__()
    backward += evens

    assert len(forward) == len(backward)
    assert forward.get_mins() == backward.get_mins()
    assert forward.compare(backward) == 1.0
    assert backward.compare(forward) == 1.0
def test_basic_dna(track_abundance):
    """A size-1 sketch stays at one hash and behaves as a bottom sketch."""
    sketch = MinHash(1, 4, track_abundance=track_abundance)

    sketch.add_sequence('ATGC')
    before = sketch.get_mins()

    # GCAT is not expected to be added: its hash is larger than ATGC's.
    sketch.add_sequence('GCAT')
    after = sketch.get_mins()

    print(before, after)
    assert before == after
    assert len(after) == 1
def test_div_zero_contained(track_abundance):
    """contained_by yields 0, not a division error, with an empty sketch."""
    filled = MinHash(1, 4, track_abundance=track_abundance)
    # The cleared copy is taken before any sequence is added.
    empty = filled.copy_and_clear()
    filled.add_sequence('ATGC')

    assert filled.contained_by(empty) == 0
    assert empty.contained_by(filled) == 0
def test_bytes_dna(track_abundance):
    """add_sequence treats str, bytes and unicode input the same way."""
    sketch = MinHash(1, 4, track_abundance=track_abundance)

    for sequence in ('ATGC', b'ATGC', u'ATGC'):
        sketch.add_sequence(sequence)
    before = sketch.get_mins()

    # None of these should be added: the GCAT hash is larger than ATGC's.
    for sequence in ('GCAT', b'GCAT', u'GCAT'):
        sketch.add_sequence(sequence)
    after = sketch.get_mins()

    print(before, after)
    assert before == after
    assert len(after) == 1
def test_abundance_compare():
    """compare works between abundance-tracking and flat sketches."""
    shared_seq = 'TGCCGCCCAGCACCGGGTGACTAGGTTGAGCCATGATTAACCTGCAATGA'
    tracked = MinHash(20, 10, track_abundance=True)
    flat = MinHash(20, 10, track_abundance=False)
    all_pairs = ((tracked, flat), (flat, flat),
                 (flat, tracked), (tracked, tracked))

    tracked.add_sequence(shared_seq)
    flat.add_sequence(shared_seq)
    for left, right in all_pairs:
        assert left.compare(right) == 1.0

    # add same sequence again
    flat.add_sequence(shared_seq)
    for left, right in all_pairs:
        assert left.compare(right) == 1.0

    # A second, different sequence in the flat sketch lowers the
    # cross-comparison, but it should stay at or above 0.3.
    flat.add_sequence('GATTGGTGCACACTTAACTGGGTGCCGCGCTGGTGCTGATCCATGAAGTT')
    for left, right in ((tracked, flat), (flat, tracked)):
        x = left.compare(right)
        assert x >= 0.3, x

    assert tracked.compare(tracked) == 1.0
    assert flat.compare(flat) == 1.0
def test_abundance_count_common():
    """count_common agrees between abundance-tracking and flat sketches."""
    aaaaa_hash = 2110480117637990133
    tracked = MinHash(20, 5, False, track_abundance=True)
    flat = MinHash(20, 5, False, track_abundance=False)

    # The same 5-mer twice: one hash, abundance 2.
    for _ in range(2):
        tracked.add_sequence('AAAAA')
    assert tracked.get_mins() == [aaaaa_hash]
    assert tracked.get_mins(with_abundance=True) == {aaaaa_hash: 2}

    for sequence in ('AAAAA', 'GGGGG'):
        flat.add_sequence(sequence)
    assert tracked.count_common(flat) == 1
    assert tracked.count_common(flat) == flat.count_common(tracked)
    # The flat sketch returns a plain list even when abundance is requested.
    assert flat.get_mins(with_abundance=True) == [
        aaaaa_hash,
        10798773792509008305,
    ]