def test_mh_asymmetric_merge(track_abundance):
    """Merge two sketches of different sizes (20 and 10) in both directions.

    Each merge result must have the same length as the sketch ``merge`` was
    called on.  Comparing sketches of different sizes must raise
    ``TypeError``; after downsampling to the smaller size the comparison
    must be exactly 1.0.
    """
    larger = MinHash(20, 10, track_abundance=track_abundance)
    for hashval in range(0, 40, 2):
        larger.add_hash(hashval)

    # different size: 10
    smaller = MinHash(10, 10, track_abundance=track_abundance)
    for hashval in range(0, 80, 4):
        smaller.add_hash(hashval)

    # Keep this call order: larger.merge(smaller) first, then the reverse.
    from_larger = larger.merge(smaller)
    from_smaller = smaller.merge(larger)

    assert len(larger) == 20
    assert len(smaller) == 10
    assert len(from_larger) == len(larger)
    assert len(from_smaller) == len(smaller)

    # can't compare different sizes without downsampling
    with pytest.raises(TypeError):
        from_smaller.compare(larger)

    larger_down = larger.downsample_n(from_smaller.num)
    print(larger_down.get_mins())
    print(from_smaller.get_mins())
    assert from_smaller.compare(larger_down) == 1.0

    from_larger_down = from_larger.downsample_n(smaller.num)
    assert from_larger_down.compare(smaller) == 1.0
def test_minhash_abund_merge_flat_2():
    """Merge a sketch without abundance into one that tracks abundance.

    This targets a segfault caused by trying to merge a signature with
    abundance and a signature without abundance.  There is no assertion:
    the test passes when ``merge`` returns without raising or crashing.
    """
    a = MinHash(0, 10, track_abundance=True, max_hash=5000)
    b = MinHash(0, 10, max_hash=5000)

    for i in range(0, 10, 2):
        a.add_hash(i)

    # Add this loop's own value.  Using ``i`` here would re-add the last
    # value left over from the loop above (8) on every iteration, so ``b``
    # would never receive 0, 3, 6 and 9.
    for j in range(0, 10, 3):
        b.add_hash(j)

    a.merge(b)
def test_mh_merge_empty_num(track_abundance):
    """Merge two identically configured sketches, one of which starts empty.

    Both merge directions must give a non-empty result with equal length
    and equal mins, and a similarity of 1.0 when rounded to three places.
    """
    blank = MinHash(20, 10, track_abundance=track_abundance)
    populated = MinHash(20, 10, track_abundance=track_abundance)
    for hashval in range(0, 80, 4):
        populated.add_hash(hashval)

    blank_first = blank.merge(populated)
    populated_first = populated.merge(blank)

    assert len(blank_first) > 0
    assert len(blank_first) == len(populated_first)
    assert blank_first.get_mins() == populated_first.get_mins()
    assert round(blank_first.similarity(populated_first), 3) == 1.0
    assert round(populated_first.similarity(blank_first), 3) == 1.0
def test_mh_merge_empty_scaled(track_abundance):
    """Merge two identically configured ``scaled=1`` sketches, one starting empty.

    Both merge directions must give a non-empty result with equal length
    and equal mins, and ``compare`` must return exactly 1.0 either way.
    """
    blank = MinHash(0, 10, scaled=1, track_abundance=track_abundance)
    populated = MinHash(0, 10, scaled=1, track_abundance=track_abundance)
    for hashval in range(0, 80, 4):
        populated.add_hash(hashval)

    blank_first = blank.merge(populated)
    populated_first = populated.merge(blank)

    assert len(blank_first) > 0
    assert len(blank_first) == len(populated_first)
    assert blank_first.get_mins() == populated_first.get_mins()
    assert blank_first.compare(populated_first) == 1.0
    assert populated_first.compare(blank_first) == 1.0
def test_mh_merge(track_abundance):
    """Merge two identically configured sketches in both directions.

    The two results must have equal length and equal mins, and ``compare``
    must return exactly 1.0 in either direction.
    """
    evens = MinHash(20, 10, track_abundance=track_abundance)
    for hashval in range(0, 40, 2):
        evens.add_hash(hashval)

    fours = MinHash(20, 10, track_abundance=track_abundance)
    for hashval in range(0, 80, 4):
        fours.add_hash(hashval)

    evens_first = evens.merge(fours)
    fours_first = fours.merge(evens)

    assert len(evens_first) == len(fours_first)
    assert evens_first.get_mins() == fours_first.get_mins()
    assert evens_first.compare(fours_first) == 1.0
    assert fours_first.compare(evens_first) == 1.0
def test_mh_merge_check_length(track_abundance):
    """The merge of two 20-hash sketches exposes exactly 20 mins."""
    left = MinHash(20, 10, track_abundance=track_abundance)
    for hashval in range(0, 40, 2):
        left.add_hash(hashval)

    right = MinHash(20, 10, track_abundance=track_abundance)
    for hashval in range(0, 80, 4):
        right.add_hash(hashval)

    merged = left.merge(right)
    merged_mins = merged.get_mins()
    assert len(merged_mins) == 20
def test_mh_merge_check_length(track_abundance):
    """Check that merging two filled sketches yields 20 mins.

    One sketch receives the 20 values of ``range(0, 40, 2)``, the other the
    20 values of ``range(0, 80, 4)``.
    """
    first = MinHash(20, 10, track_abundance=track_abundance)
    for value in range(0, 40, 2):
        first.add_hash(value)

    second = MinHash(20, 10, track_abundance=track_abundance)
    for value in range(0, 80, 4):
        second.add_hash(value)

    combined = first.merge(second)
    n_mins = len(combined.get_mins())
    assert n_mins == 20
def test_mh_merge(track_abundance):
    """Merge two identically configured sketches in both directions.

    The two results must have equal length and equal mins.  Their
    similarity, rounded to three places, must be 0.91 when abundance is
    tracked and 1.0 otherwise.
    """
    evens = MinHash(20, 10, track_abundance=track_abundance)
    for hashval in range(0, 40, 2):
        evens.add_hash(hashval)

    fours = MinHash(20, 10, track_abundance=track_abundance)
    for hashval in range(0, 80, 4):
        fours.add_hash(hashval)

    evens_first = evens.merge(fours)
    fours_first = fours.merge(evens)

    assert len(evens_first) == len(fours_first)
    assert evens_first.get_mins() == fours_first.get_mins()

    # The expected value depends on whether abundance is tracked.
    expected = 0.91 if track_abundance else 1.0
    assert round(evens_first.similarity(fours_first), 3) == expected
    assert round(fours_first.similarity(evens_first), 3) == expected
def test_mh_merge_check_length2(track_abundance):
    """Merged sketch holds fewer mins than its configured size.

    Both inputs are built with 4 as the first argument but receive the same
    three hashes (3, 1, 4), so the merge must expose exactly three mins.
    """
    shared_hashes = (3, 1, 4)

    left = MinHash(4, 10, track_abundance=track_abundance)
    for hashval in shared_hashes:
        left.add_hash(hashval)

    right = MinHash(4, 10, track_abundance=track_abundance)
    for hashval in shared_hashes:
        right.add_hash(hashval)

    merged = left.merge(right)
    assert len(merged.get_mins()) == 3
def test_mh_merge_check_length2(track_abundance):
    """Check the merged min count when the inputs are not full.

    merged MH doesn't have full number of elements: each input gets only
    the hashes 3, 1 and 4, and the merge must report three mins.
    """
    first = MinHash(4, 10, track_abundance=track_abundance)
    for value in (3, 1, 4):
        first.add_hash(value)

    second = MinHash(4, 10, track_abundance=track_abundance)
    for value in (3, 1, 4):
        second.add_hash(value)

    combined = first.merge(second)
    n_mins = len(combined.get_mins())
    assert n_mins == 3
def test_mh_asymmetric_merge(track_abundance):
    """Merge two sketches of different sizes (20 and 10) in both directions.

    Each merge result must have the same length as the sketch ``merge`` was
    called on.  ``jaccard`` between different sizes must raise
    ``TypeError``.  After downsampling to the smaller size, similarity is
    0.91 (rounded to three places) with abundance tracking and 1.0 without.
    """
    larger = MinHash(20, 10, track_abundance=track_abundance)
    for hashval in range(0, 40, 2):
        larger.add_hash(hashval)

    # different size: 10
    smaller = MinHash(10, 10, track_abundance=track_abundance)
    for hashval in range(0, 80, 4):
        smaller.add_hash(hashval)

    # Keep this call order: larger.merge(smaller) first, then the reverse.
    from_larger = larger.merge(smaller)
    from_smaller = smaller.merge(larger)

    assert len(larger) == 20
    assert len(smaller) == 10
    assert len(from_larger) == len(larger)
    assert len(from_smaller) == len(smaller)

    # can't use jaccard on different nums without downsampling
    with pytest.raises(TypeError):
        from_smaller.jaccard(larger)

    larger_down = larger.downsample_n(from_smaller.num)
    print(larger_down.get_mins())
    print(from_smaller.get_mins())

    expected = 0.91 if track_abundance else 1.0
    assert round(from_smaller.similarity(larger_down), 3) == expected

    from_larger_down = from_larger.downsample_n(smaller.num)
    similarity = from_larger_down.similarity(smaller)
    if track_abundance:
        assert round(similarity, 3) == 0.91
    else:
        # Without abundance the value is checked exactly, not rounded.
        assert similarity == 1.0
def test_mh_merge_diff_ksize(track_abundance):
    """``merge`` must raise ``ValueError`` when the sketches differ in ksize.

    The two sketches differ only in their second constructor argument
    (5 vs. 6), which the test name identifies as the ksize.
    """
    ksize_five = MinHash(20, 5, track_abundance=track_abundance)
    ksize_six = MinHash(20, 6, track_abundance=track_abundance)

    with pytest.raises(ValueError):
        ksize_five.merge(ksize_six)
def test_mh_merge_diff_protein(track_abundance):
    """``merge`` must raise ``ValueError`` when the protein flags differ.

    The two sketches differ only in their third positional constructor
    argument (False vs. True), presumably the protein flag per the test name.
    """
    flag_off = MinHash(20, 5, False, track_abundance=track_abundance)
    flag_on = MinHash(20, 5, True, track_abundance=track_abundance)

    with pytest.raises(ValueError):
        flag_off.merge(flag_on)
def test_mh_merge_typeerror(track_abundance):
    """``merge`` must raise ``TypeError`` when given a plain ``set``."""
    sketch = MinHash(20, 10, track_abundance=track_abundance)
    not_a_sketch = set()

    with pytest.raises(TypeError):
        sketch.merge(not_a_sketch)