def test_mh_asymmetric_merge(track_abundance):
    """Merge two MinHashes of different ``num`` and compare after downsampling.

    The assertions expect a merged sketch to keep the length of the sketch
    that ``merge`` was called on, and ``compare`` to reject mismatched sizes.
    """
    big = MinHash(20, 10, track_abundance=track_abundance)
    for hashval in range(0, 40, 2):
        big.add_hash(hashval)

    # different size: 10
    small = MinHash(10, 10, track_abundance=track_abundance)
    for hashval in range(0, 80, 4):
        small.add_hash(hashval)

    merged_into_big = big.merge(small)
    merged_into_small = small.merge(big)

    assert len(big) == 20
    assert len(small) == 10
    assert len(merged_into_big) == len(big)
    assert len(merged_into_small) == len(small)

    # can't compare different sizes without downsampling
    with pytest.raises(TypeError):
        merged_into_small.compare(big)

    big = big.downsample_n(merged_into_small.num)
    print(big.get_mins())
    print(merged_into_small.get_mins())
    assert merged_into_small.compare(big) == 1.0

    merged_into_big = merged_into_big.downsample_n(small.num)
    assert merged_into_big.compare(small) == 1.0
def test_mh_inplace_concat_asymmetric(track_abundance):
    """In-place concatenate (``+=``) two MinHashes of different ``num``.

    Checks that each result keeps the length of the sketch it was copied
    from, that comparing mismatched sizes raises ``TypeError``, and the
    comparison values obtained after downsampling.
    """
    a = MinHash(20, 10, track_abundance=track_abundance)
    for i in range(0, 40, 2):
        a.add_hash(i)

    # different size: 10
    b = MinHash(10, 10, track_abundance=track_abundance)
    for i in range(0, 80, 4):
        b.add_hash(i)

    c = a.__copy__()
    c += b
    d = b.__copy__()
    d += a

    assert len(a) == 20
    assert len(b) == 10
    assert len(c) == len(a)
    assert len(d) == len(b)

    # Comparing different sizes must fail. pytest.raises makes the test fail
    # when no exception is raised; a bare try/except would pass silently.
    with pytest.raises(TypeError) as exc_info:
        d.compare(a)
    assert 'must have same num' in str(exc_info.value)

    a = a.downsample_n(d.num)
    assert d.compare(a) == 1.0  # see: d += a, above.

    c = c.downsample_n(b.num)
    assert c.compare(b) == 0.5
def test_clear_2():
    """``clear`` empties a MinHash created with ``track_abundance=False``."""
    sketch = MinHash(20, 5, False, track_abundance=False)

    sketch.add_hash(10)
    assert sketch.get_mins() == [10]

    sketch.clear()
    assert sketch.get_mins() == []
def test_clear():
    """``clear`` empties a MinHash created with ``track_abundance=True``."""
    sketch = MinHash(20, 5, False, track_abundance=True)

    sketch.add_hash(10)
    assert sketch.get_mins(with_abundance=True) == {10: 1}

    sketch.clear()
    assert sketch.get_mins(with_abundance=True) == {}
def test_set_abundance_clear_3():
    """``set_abundances(..., clear=False)`` keeps the hashes already present."""
    sketch = MinHash(20, 5, False, track_abundance=True)

    sketch.add_hash(10)
    assert sketch.get_mins(with_abundance=True) == {10: 1}

    # with clear=False the new abundances are added on top of the old one
    sketch.set_abundances({20: 1, 30: 4}, clear=False)
    assert sketch.get_mins(with_abundance=True) == {10: 1, 20: 1, 30: 4}
def test_set_abundance_clear_2():
    """``set_abundances`` without ``clear`` replaces the existing hashes."""
    # default should be clear=True
    sketch = MinHash(20, 5, False, track_abundance=True)

    sketch.add_hash(10)
    assert sketch.get_mins(with_abundance=True) == {10: 1}

    sketch.set_abundances({20: 2})
    assert sketch.get_mins(with_abundance=True) == {20: 2}
def test_reviving_minhash():
    """Smoke test: re-add a fixed set of hash values to a fresh MinHash.

    There are no assertions; the test passes if no call raises.
    """
    # simulate reading a MinHash from disk
    revived = MinHash(0, 21, max_hash=184467440737095520, seed=42,
                      track_abundance=False)
    stored_hashes = (
        28945103950853965,
        74690756200987412,
        82962372765557409,
        93503551367950366,
        106923350319729608,
        135116761470196737,
        160165359281648267,
        162390811417732001,
        177939655451276972,
    )

    for hashval in stored_hashes:
        revived.add_hash(hashval)
def test_minhash_abund_capacity_increase():
    """Regression smoke test for bug #319 (no assertions; must not crash).

    The bug was a segfault caused by invalidation of std::vector iterators
    upon vector resizing.
    """
    # this should set capacity to 1000 - see KmerMinHash constructor call
    # to 'reserve' when n > 0 for specific parameter.
    sketch = MinHash(0, 10, track_abundance=True, max_hash=5000)

    # 1001 is dependent on the value passed to reserve (currently 1000).
    # Hashes are added in descending order: 1001, 1000, ..., 1.
    for hashval in reversed(range(1, 1002)):
        sketch.add_hash(hashval)
def test_mh_count_common(track_abundance):
    """``count_common`` gives the same count in both directions."""
    evens = MinHash(20, 10, track_abundance=track_abundance)
    for hashval in range(0, 40, 2):
        evens.add_hash(hashval)

    fours = MinHash(20, 10, track_abundance=track_abundance)
    for hashval in range(0, 80, 4):
        fours.add_hash(hashval)

    assert evens.count_common(fours) == 10
    assert fours.count_common(evens) == 10
def test_mh_merge_check_length(track_abundance):
    """Merging two ``num=20`` MinHashes yields 20 minimum hashes."""
    left = MinHash(20, 10, track_abundance=track_abundance)
    for hashval in range(0, 40, 2):
        left.add_hash(hashval)

    right = MinHash(20, 10, track_abundance=track_abundance)
    for hashval in range(0, 80, 4):
        right.add_hash(hashval)

    merged = left.merge(right)
    merged_mins = merged.get_mins()
    assert len(merged_mins) == 20
def test_mh_subtract(track_abundance):
    """``subtract_mins`` returns the hashes of ``a`` that are absent from ``b``."""
    # test subtracting two identically configured minhashes
    evens = MinHash(20, 10, track_abundance=track_abundance)
    for hashval in range(0, 40, 2):
        evens.add_hash(hashval)

    fours = MinHash(20, 10, track_abundance=track_abundance)
    for hashval in range(0, 80, 4):
        fours.add_hash(hashval)

    # the even numbers below 40 that are not multiples of four
    only_in_evens = set(range(2, 40, 4))
    assert evens.subtract_mins(fours) == only_in_evens
def test_mh_merge_check_length(track_abundance):
    """Merging two ``num=20`` MinHashes yields 20 minimum hashes."""
    first = MinHash(20, 10, track_abundance=track_abundance)
    for value in range(0, 40, 2):
        first.add_hash(value)

    second = MinHash(20, 10, track_abundance=track_abundance)
    for value in range(0, 80, 4):
        second.add_hash(value)

    combined = first.merge(second)
    combined_mins = combined.get_mins()
    assert len(combined_mins) == 20
def test_minhash_abund_add():
    """Adding hashes in descending order grows the sketch one entry at a time.

    This targets part of bug #319, a segfault caused by invalidation of
    std::vector iterators upon vector resizing - in this case, there was
    also a bug in inserting into the middle of mins when scaled was set.
    """
    sketch = MinHash(0, 10, track_abundance=True, max_hash=5000)

    # expected_len counts how many hashes have been added so far
    for expected_len, hashval in enumerate(range(10, 0, -1), start=1):
        sketch.add_hash(hashval)
        assert len(sketch.get_mins()) == expected_len
        print(len(sketch.get_mins()))
def test_minhash_abund_merge_flat_2():
    """Merging a flat MinHash into an abundance-tracking one must not crash.

    This targets a segfault caused by trying to merge a signature with
    abundance and a signature without abundance. There are no assertions;
    the test passes if ``merge`` completes.
    """
    a = MinHash(0, 10, track_abundance=True, max_hash=5000)
    b = MinHash(0, 10, max_hash=5000)

    for i in range(0, 10, 2):
        a.add_hash(i)

    # Add this loop's own values. The loop used to add the stale ``i`` left
    # over from the loop above, so ``b`` only ever received a single hash.
    for j in range(0, 10, 3):
        b.add_hash(j)

    a.merge(b)
def test_add_many(track_abundance):
    """``add_many`` gives the same sketch as adding each hash with ``add_hash``."""
    bulk = MinHash(0, 10, track_abundance=track_abundance, max_hash=5000)
    single = MinHash(0, 10, track_abundance=track_abundance, max_hash=5000)

    # add the same batch of even hashes twice
    for _ in range(2):
        bulk.add_many(list(range(0, 100, 2)))

    assert len(bulk) == 50
    assert all(hashval % 2 == 0 for hashval in bulk.get_mins())

    # mirror the double insertion one hash at a time
    for hashval in range(0, 100, 2):
        single.add_hash(hashval)
        single.add_hash(hashval)

    assert len(single) == 50
    assert bulk == single
def test_mh_copy_and_clear_with_max_hash(track_abundance):
    """``copy_and_clear`` keeps the configuration but drops the hashes."""
    # test basic creation of new, empty MinHash w/max_hash param set
    original = MinHash(0, 10, track_abundance=track_abundance, max_hash=20)
    for hashval in range(0, 40, 2):
        original.add_hash(hashval)

    emptied = original.copy_and_clear()

    assert original.ksize == emptied.ksize
    assert emptied.num == original.num
    assert emptied.max_hash == 20
    assert not emptied.is_protein
    assert emptied.track_abundance == track_abundance
    assert emptied.seed == original.seed
    assert len(emptied.get_mins()) == 0
    assert original.scaled == emptied.scaled
    assert emptied.scaled != 0
def test_mh_merge_empty_num(track_abundance):
    """Merging with an empty ``num`` MinHash gives the same result either way."""
    # test merging two identically configured minhashes, one empty
    empty = MinHash(20, 10, track_abundance=track_abundance)

    filled = MinHash(20, 10, track_abundance=track_abundance)
    for hashval in range(0, 80, 4):
        filled.add_hash(hashval)

    empty_first = empty.merge(filled)
    filled_first = filled.merge(empty)

    assert len(empty_first)
    assert len(empty_first) == len(filled_first)
    assert empty_first.get_mins() == filled_first.get_mins()
    assert round(empty_first.similarity(filled_first), 3) == 1.0
    assert round(filled_first.similarity(empty_first), 3) == 1.0
def test_mh_merge_empty_scaled(track_abundance):
    """Merging with an empty ``scaled`` MinHash gives the same result either way."""
    # test merging two identically configured minhashes, one empty
    empty = MinHash(0, 10, scaled=1, track_abundance=track_abundance)

    filled = MinHash(0, 10, scaled=1, track_abundance=track_abundance)
    for hashval in range(0, 80, 4):
        filled.add_hash(hashval)

    empty_first = empty.merge(filled)
    filled_first = filled.merge(empty)

    assert len(empty_first)
    assert len(empty_first) == len(filled_first)
    assert empty_first.get_mins() == filled_first.get_mins()
    assert empty_first.compare(filled_first) == 1.0
    assert filled_first.compare(empty_first) == 1.0
def test_mh_merge(track_abundance):
    """``merge`` yields the same hashes regardless of operand order."""
    # test merging two identically configured minhashes
    evens = MinHash(20, 10, track_abundance=track_abundance)
    for hashval in range(0, 40, 2):
        evens.add_hash(hashval)

    fours = MinHash(20, 10, track_abundance=track_abundance)
    for hashval in range(0, 80, 4):
        fours.add_hash(hashval)

    evens_first = evens.merge(fours)
    fours_first = fours.merge(evens)

    assert len(evens_first) == len(fours_first)
    assert evens_first.get_mins() == fours_first.get_mins()
    assert evens_first.compare(fours_first) == 1.0
    assert fours_first.compare(evens_first) == 1.0
def test_minhash_abund_merge_flat():
    """Similarity of an abundance-tracking and a flat MinHash must not crash.

    This targets a segfault caused by trying to compute similarity of a
    signature with abundance and a signature without abundance. The correct
    behavior for now is to calculate simple Jaccard, i.e. 'flatten' both of
    them.
    """
    with_abund = MinHash(0, 10, track_abundance=True, max_hash=5000)
    flat = MinHash(0, 10, max_hash=5000)

    for i in range(0, 10, 2):
        with_abund.add_hash(i)

    # NOTE(review): the second loop adds the value of ``i`` left over from
    # the loop above (8) on every pass instead of its own index, so ``flat``
    # only ever receives hash 8. The 0.2 expected below presumably reflects
    # that (one shared hash out of five) - confirm before changing the loop.
    leftover = i
    for _ in range(0, 10, 3):
        flat.add_hash(leftover)

    # these crashed, previously.
    assert with_abund.similarity(flat) == 0.2
    assert flat.similarity(with_abund) == 0.2
def test_pickle_max_hash(track_abundance):
    """A ``max_hash`` MinHash keeps its settings and hashes through pickle."""
    original = MinHash(0, 10, track_abundance=track_abundance, max_hash=20)
    for hashval in range(0, 40, 2):
        original.add_hash(hashval)

    serialized = pickle.dumps(original)
    restored = pickle.loads(serialized)

    assert original.ksize == restored.ksize
    assert restored.num == original.num
    assert restored.max_hash == original.max_hash
    assert restored.max_hash == 20
    assert not restored.is_protein
    assert restored.track_abundance == track_abundance
    assert restored.seed == original.seed
    assert len(restored.get_mins()) == len(original.get_mins())
    assert len(restored.get_mins()) == 11
    assert original.scaled == restored.scaled
    assert restored.scaled != 0
def test_pickle_scaled(track_abundance):
    """A ``scaled`` MinHash keeps its settings and hashes through pickle.

    The assertions expect this ``scaled`` value to give ``max_hash == 20``.
    """
    original = MinHash(0, 10, track_abundance=track_abundance,
                       scaled=922337203685477632)
    for hashval in range(0, 40, 2):
        original.add_hash(hashval)

    serialized = pickle.dumps(original)
    restored = pickle.loads(serialized)

    assert original.ksize == restored.ksize
    assert restored.num == original.num
    assert restored.max_hash == original.max_hash
    assert restored.max_hash == 20
    assert not restored.is_protein
    assert restored.track_abundance == track_abundance
    assert restored.seed == original.seed
    assert len(restored.get_mins()) == len(original.get_mins())
    assert len(restored.get_mins()) == 11
    assert original.scaled == restored.scaled
    assert restored.scaled != 0
def test_max_hash(track_abundance):
    """Hashes above ``max_hash`` are not retained."""
    # test behavior with max_hash
    sketch = MinHash(0, 4, track_abundance=track_abundance, max_hash=35)

    for hashval in (10, 20, 30):
        sketch.add_hash(hashval)
    assert sketch.get_mins() == [10, 20, 30]

    # 40 is above the limit of 35, so the mins must be unchanged
    sketch.add_hash(40)
    assert sketch.get_mins() == [10, 20, 30]

    # 36 is just above the limit and must be ignored as well
    sketch.add_hash(36)
    assert sketch.get_mins() == [10, 20, 30]
def test_mh_asymmetric(track_abundance):
    """``compare`` rejects MinHashes of different ``num`` until downsampled."""
    big = MinHash(20, 10, track_abundance=track_abundance)
    for hashval in range(0, 40, 2):
        big.add_hash(hashval)

    # different size: 10
    small = MinHash(10, 10, track_abundance=track_abundance)
    for hashval in range(0, 80, 4):
        small.add_hash(hashval)

    assert big.count_common(small) == 10
    assert small.count_common(big) == 10

    with pytest.raises(TypeError):
        big.compare(small)

    big = big.downsample_n(10)
    assert big.compare(small) == 0.5
    assert small.compare(big) == 0.5
def test_mh_inplace_concat(track_abundance):
    """In-place concatenation (``+=``) yields the same hashes in either order."""
    # test merging two identically configured minhashes
    evens = MinHash(20, 10, track_abundance=track_abundance)
    for hashval in range(0, 40, 2):
        evens.add_hash(hashval)

    fours = MinHash(20, 10, track_abundance=track_abundance)
    for hashval in range(0, 80, 4):
        fours.add_hash(hashval)

    # work on copies so that the inputs stay untouched
    evens_plus_fours = evens.__copy__()
    evens_plus_fours += fours
    fours_plus_evens = fours.__copy__()
    fours_plus_evens += evens

    assert len(evens_plus_fours) == len(fours_plus_evens)
    assert evens_plus_fours.get_mins() == fours_plus_evens.get_mins()
    assert evens_plus_fours.compare(fours_plus_evens) == 1.0
    assert fours_plus_evens.compare(evens_plus_fours) == 1.0
def test_mh_jaccard_asymmetric_num(track_abundance):
    """``jaccard`` rejects MinHashes of different ``num`` until downsampled."""
    big = MinHash(20, 10, track_abundance=track_abundance)
    for hashval in range(0, 40, 2):
        big.add_hash(hashval)

    # different size: 10
    small = MinHash(10, 10, track_abundance=track_abundance)
    for hashval in range(0, 80, 4):
        small.add_hash(hashval)

    assert big.count_common(small) == 10
    assert small.count_common(big) == 10

    # with 'jaccard', this will raise an error b/c different num
    with pytest.raises(TypeError):
        big.jaccard(small)

    big = big.downsample_n(10)
    # CTB note: this used to be 'compare', is now 'jaccard'
    assert big.jaccard(small) == 0.5
    assert small.jaccard(big) == 0.5
def test_mh_merge_check_length2(track_abundance):
    """Merging sketches that hold fewer than ``num`` hashes stays short."""
    # merged MH doesn't have full number of elements
    left = MinHash(4, 10, track_abundance=track_abundance)
    for hashval in (3, 1, 4):
        left.add_hash(hashval)

    right = MinHash(4, 10, track_abundance=track_abundance)
    for hashval in (3, 1, 4):
        right.add_hash(hashval)

    merged = left.merge(right)
    merged_mins = merged.get_mins()
    assert len(merged_mins) == 3
def test_mh_merge_check_length2(track_abundance):
    """Merging sketches that hold fewer than ``num`` hashes stays short."""
    # merged MH doesn't have full number of elements
    same_hashes = (3, 1, 4)

    first = MinHash(4, 10, track_abundance=track_abundance)
    for value in same_hashes:
        first.add_hash(value)

    second = MinHash(4, 10, track_abundance=track_abundance)
    for value in same_hashes:
        second.add_hash(value)

    combined = first.merge(second)
    assert len(combined.get_mins()) == 3
def test_mh_merge(track_abundance):
    """``merge`` yields the same hashes regardless of operand order.

    The expected rounded similarity between the two merge results is 0.91
    when abundance is tracked and 1.0 otherwise.
    """
    # test merging two identically configured minhashes
    evens = MinHash(20, 10, track_abundance=track_abundance)
    for hashval in range(0, 40, 2):
        evens.add_hash(hashval)

    fours = MinHash(20, 10, track_abundance=track_abundance)
    for hashval in range(0, 80, 4):
        fours.add_hash(hashval)

    evens_first = evens.merge(fours)
    fours_first = fours.merge(evens)

    assert len(evens_first) == len(fours_first)
    assert evens_first.get_mins() == fours_first.get_mins()

    expected_similarity = 0.91 if track_abundance else 1.0
    assert round(evens_first.similarity(fours_first), 3) == expected_similarity
    assert round(fours_first.similarity(evens_first), 3) == expected_similarity
def test_scaled(track_abundance):
    """A MinHash built from ``scaled`` enforces the matching ``max_hash``."""
    # test behavior with scaled (alt to max_hash)
    scaled = get_scaled_for_max_hash(35)
    print('XX', scaled, get_max_hash_for_scaled(scaled))

    sketch = MinHash(0, 4, track_abundance=track_abundance, scaled=scaled)
    assert sketch.max_hash == 35

    for hashval in (10, 20, 30):
        sketch.add_hash(hashval)
    assert sketch.get_mins() == [10, 20, 30]

    # 40 is above the limit of 35, so the mins must be unchanged
    sketch.add_hash(40)
    assert sketch.get_mins() == [10, 20, 30]

    # 36 is just above the limit and must be ignored as well
    sketch.add_hash(36)
    assert sketch.get_mins() == [10, 20, 30]
def test_mh_asymmetric_merge(track_abundance):
    """Merge two MinHashes of different ``num`` and check similarity.

    The assertions expect a merged sketch to keep the length of the sketch
    that ``merge`` was called on, and ``jaccard`` to reject mismatched sizes.
    """
    # test merging two asymmetric (different size) MHs
    big = MinHash(20, 10, track_abundance=track_abundance)
    for hashval in range(0, 40, 2):
        big.add_hash(hashval)

    # different size: 10
    small = MinHash(10, 10, track_abundance=track_abundance)
    for hashval in range(0, 80, 4):
        small.add_hash(hashval)

    merged_into_big = big.merge(small)
    merged_into_small = small.merge(big)

    assert len(big) == 20
    assert len(small) == 10
    assert len(merged_into_big) == len(big)
    assert len(merged_into_small) == len(small)

    # can't use jaccard on different nums without downsampling
    with pytest.raises(TypeError):
        merged_into_small.jaccard(big)

    big = big.downsample_n(merged_into_small.num)
    print(big.get_mins())
    print(merged_into_small.get_mins())

    expected_rounded = 0.91 if track_abundance else 1.0
    assert round(merged_into_small.similarity(big), 3) == expected_rounded

    merged_into_big = merged_into_big.downsample_n(small.num)

    # without abundance the similarity is checked exactly, not rounded
    if not track_abundance:
        assert merged_into_big.similarity(small) == 1.0
    else:
        assert round(merged_into_big.similarity(small), 3) == 0.91
def test_size_limit(track_abundance):
    """A ``num=3`` MinHash keeps only the three smallest hashes."""
    # test behavior with size limit of 3
    sketch = MinHash(3, 4, track_abundance=track_abundance)

    for hashval in (10, 20, 30):
        sketch.add_hash(hashval)
    assert sketch.get_mins() == [10, 20, 30]

    sketch.add_hash(5)  # -> should push 30 off end
    assert sketch.get_mins() == [5, 10, 20]
def test_mh_len(track_abundance):
    """Twenty hashes added to a ``num=20`` MinHash are all returned, in order."""
    sketch = MinHash(20, 10, track_abundance=track_abundance)

    for hashval in range(0, 40, 2):
        sketch.add_hash(hashval)

    expected_mins = list(range(0, 40, 2))
    assert sketch.get_mins() == expected_mins
def test_mh_unsigned_long_long(track_abundance):
    """A hash value too big for a C long int is stored and returned intact."""
    huge_hash = 9227159859419181011  # too big for a C long int.

    sketch = MinHash(20, 10, track_abundance=track_abundance)
    sketch.add_hash(huge_hash)

    assert huge_hash in sketch.get_mins()