def setup(self):
    """Layer abundance-tracking sketches on top of the base suite's setup.

    Reuses TimeMinHashSuite.setup() for the shared fixtures, then builds
    track_abundance=True sketches and pre-populates one with every
    loaded sequence.
    """
    TimeMinHashSuite.setup(self)
    self.mh = MinHash(500, 21, track_abundance=True)
    self.populated_mh = MinHash(500, 21, track_abundance=True)
    for record in self.sequences:
        self.populated_mh.add_sequence(record)
def setup(self):
    """Build flat (no-abundance) sketches from the E. coli genes test file.

    The sequence list is repeated 10x to enlarge the benchmark workload;
    populated_mh is pre-filled with all of them.
    """
    self.mh = MinHash(500, 21, track_abundance=False)
    self.sequences = load_sequences(get_test_data('ecoli.genes.fna')) * 10
    self.populated_mh = MinHash(500, 21, track_abundance=False)
    for record in self.sequences:
        self.populated_mh.add_sequence(record)
def test_mh_similarity_downsample_errors(track_abundance):
    """similarity() with the default downsample=False must raise on mismatched scaled.

    Checks every direction / ignore_abundance combination: all four raise the
    same ValueError about the scaled mismatch.
    """
    a = MinHash(0, 20, max_hash=50, track_abundance=track_abundance)   # max_hash = 50
    b = MinHash(0, 20, max_hash=100, track_abundance=track_abundance)  # max_hash = 100

    a_values = {1: 5, 3: 3, 5: 2, 8: 2}
    b_values = {1: 3, 3: 2, 5: 1, 6: 1, 8: 1, 10: 1}

    if track_abundance:
        a.set_abundances(a_values)
        b.set_abundances(b_values)
    else:
        a.add_many(a_values.keys())
        b.add_many(b_values.keys())

    # error, incompatible max hash — regardless of direction or abundance flag
    for first, second in ((a, b), (b, a)):
        for ignore in (True, False):
            with pytest.raises(ValueError) as e:
                first.similarity(second, ignore_abundance=ignore)  # downsample=False
            assert 'mismatch in scaled; comparison fail' in str(e.value)
def setup(self):
    """Create DNA and protein abundance sketches plus the sequence fixtures."""
    self.mh = MinHash(500, 21, track_abundance=True)
    self.protein_mh = MinHash(500, 21, is_protein=True,
                              track_abundance=True)
    self.sequences = load_sequences()
def test_mh_asymmetric_merge(track_abundance):
    """Merging MinHashes of different num keeps each result at its target size.

    The merged sketch inherits the num of the sketch merge() was called on;
    sketches of different num cannot be compared until downsampled to a
    common size.

    Fix: removed leftover debug print() calls that cluttered test output.
    """
    # test merging two asymmetric (different size) MHs
    a = MinHash(20, 10, track_abundance=track_abundance)
    for i in range(0, 40, 2):
        a.add_hash(i)

    # different size: 10
    b = MinHash(10, 10, track_abundance=track_abundance)
    for i in range(0, 80, 4):
        b.add_hash(i)

    c = a.merge(b)
    d = b.merge(a)

    assert len(a) == 20
    assert len(b) == 10
    assert len(c) == len(a)
    assert len(d) == len(b)

    # can't compare different sizes without downsampling
    with pytest.raises(TypeError):
        d.compare(a)

    a = a.downsample_n(d.num)
    assert d.compare(a) == 1.0

    c = c.downsample_n(b.num)
    assert c.compare(b) == 1.0
def setup(self):
    """Build flat sketches and pre-populate one with every loaded sequence."""
    self.mh = MinHash(500, 21, track_abundance=False)
    self.sequences = load_sequences()
    self.populated_mh = MinHash(500, 21, track_abundance=False)
    for record in self.sequences:
        self.populated_mh.add_sequence(record)
def test_mh_inplace_concat_asymmetric(track_abundance):
    """In-place concatenation (+=) of different-sized MinHashes.

    The result of += keeps the num of the left-hand sketch; comparing
    different-sized sketches raises TypeError until downsampled.

    Fix: the original try/except would silently pass if d.compare(a)
    raised nothing — the assert only executed inside the except branch.
    pytest.raises makes the expected TypeError mandatory.
    """
    # test merging two asymmetric (different size) MHs
    a = MinHash(20, 10, track_abundance=track_abundance)
    for i in range(0, 40, 2):
        a.add_hash(i)

    # different size: 10
    b = MinHash(10, 10, track_abundance=track_abundance)
    for i in range(0, 80, 4):
        b.add_hash(i)

    c = a.__copy__()
    c += b

    d = b.__copy__()
    d += a

    assert len(a) == 20
    assert len(b) == 10
    assert len(c) == len(a)
    assert len(d) == len(b)

    with pytest.raises(TypeError) as exc:
        d.compare(a)
    assert 'must have same num' in str(exc.value)

    a = a.downsample_n(d.num)
    assert d.compare(a) == 1.0  # see: d += a, above.

    c = c.downsample_n(b.num)
    assert c.compare(b) == 0.5
def test_similarity_1(track_abundance):
    """similarity() is symmetric and 1.0 for identical content.

    Re-adding the same sequence leaves similarity at 1.0; adding a second,
    different sequence to one sketch lowers it, but self-similarity stays 1.0.
    """
    a = MinHash(20, 10, track_abundance=track_abundance)
    b = MinHash(20, 10, track_abundance=track_abundance)

    a.add_sequence('TGCCGCCCAGCACCGGGTGACTAGGTTGAGCCATGATTAACCTGCAATGA')
    b.add_sequence('TGCCGCCCAGCACCGGGTGACTAGGTTGAGCCATGATTAACCTGCAATGA')

    for first, second in ((a, b), (b, b), (b, a), (a, a)):
        assert round(first.similarity(second), 3) == 1.0

    # add same sequence again — similarity must remain 1.0 in every direction
    b.add_sequence('TGCCGCCCAGCACCGGGTGACTAGGTTGAGCCATGATTAACCTGCAATGA')
    for first, second in ((a, b), (b, b), (b, a), (a, a)):
        assert round(first.similarity(second), 3) == 1.0

    b.add_sequence('GATTGGTGCACACTTAACTGGGTGCCGCGCTGGTGCTGATCCATGAAGTT')

    x = a.similarity(b)
    assert x >= 0.3, x
    x = b.similarity(a)
    assert x >= 0.3, x

    assert round(a.similarity(a), 3) == 1.0
    assert round(b.similarity(b), 3) == 1.0
def test_mh_similarity_downsample_true(track_abundance):
    """With downsample=True, similarity is symmetric across mismatched scaled.

    Verifies sim(a, b) == sim(b, a), both with and without ignore_abundance,
    and that no mismatch error is raised.
    """
    a = MinHash(0, 20, max_hash=50, track_abundance=track_abundance)   # max_hash = 50
    b = MinHash(0, 20, max_hash=100, track_abundance=track_abundance)  # max_hash = 100

    a_values = {1: 5, 3: 3, 5: 2, 8: 2}
    b_values = {1: 3, 3: 2, 5: 1, 6: 1, 8: 1, 10: 1}

    if track_abundance:
        a.set_abundances(a_values)
        b.set_abundances(b_values)
    else:
        a.add_many(a_values.keys())
        b.add_many(b_values.keys())

    # downsample=True => no error; values must match in both directions
    for ignore in (True, False):
        forward = a.similarity(b, ignore_abundance=ignore, downsample=True)
        backward = b.similarity(a, ignore_abundance=ignore, downsample=True)
        assert forward == backward
def test_abundance_compare():
    """compare() works across abundance and flat sketches with the same content.

    An abundance-tracking sketch and a flat sketch built from the same
    sequence compare as 1.0 in every direction; adding a different sequence
    to one lowers the comparison but not self-comparisons.
    """
    a = MinHash(20, 10, track_abundance=True)
    b = MinHash(20, 10, track_abundance=False)

    a.add_sequence('TGCCGCCCAGCACCGGGTGACTAGGTTGAGCCATGATTAACCTGCAATGA')
    b.add_sequence('TGCCGCCCAGCACCGGGTGACTAGGTTGAGCCATGATTAACCTGCAATGA')

    for first, second in ((a, b), (b, b), (b, a), (a, a)):
        assert first.compare(second) == 1.0

    # add same sequence again — hash content is unchanged
    b.add_sequence('TGCCGCCCAGCACCGGGTGACTAGGTTGAGCCATGATTAACCTGCAATGA')
    for first, second in ((a, b), (b, b), (b, a), (a, a)):
        assert first.compare(second) == 1.0

    b.add_sequence('GATTGGTGCACACTTAACTGGGTGCCGCGCTGGTGCTGATCCATGAAGTT')

    x = a.compare(b)
    assert x >= 0.3, x
    x = b.compare(a)
    assert x >= 0.3, x

    assert a.compare(a) == 1.0
    assert b.compare(b) == 1.0
def test_mh_jaccard_similarity():
    """Exact Jaccard for a non-trivial case: |{1,3,5,8} n union| = 4/6."""
    a = MinHash(0, 20, max_hash=50, track_abundance=False)
    b = MinHash(0, 20, max_hash=50, track_abundance=False)

    a.add_many((1, 3, 5, 8))
    b.add_many((1, 3, 5, 6, 8, 10))

    assert a.similarity(b) == 4. / 6.
def test_set_abundance_clear():
    """On an empty minhash, set_abundances is identical with clear=True/False.

    Fix: the original only compared hash values; since `clear` governs how
    abundances are combined, the abundance dicts are now compared as well.
    """
    a = MinHash(20, 5, False, track_abundance=True)
    b = MinHash(20, 5, False, track_abundance=True)

    a.set_abundances({1: 3, 2: 4}, clear=True)
    b.set_abundances({1: 3, 2: 4}, clear=False)

    assert a.get_mins() == b.get_mins()
    # abundances, not just hashes, must agree
    assert a.get_mins(with_abundance=True) == b.get_mins(with_abundance=True)
def test_mh_merge_check_length(track_abundance):
    """A merge of two full num=20 sketches is capped at 20 hashes."""
    a = MinHash(20, 10, track_abundance=track_abundance)
    for hashval in range(0, 40, 2):
        a.add_hash(hashval)

    b = MinHash(20, 10, track_abundance=track_abundance)
    for hashval in range(0, 80, 4):
        b.add_hash(hashval)

    merged = a.merge(b)
    assert len(merged.get_mins()) == 20
def test_mh_angular_similarity_2():
    """Angular similarity for a second non-trivial abundance vector pair.

    With ignore_abundance=True the result collapses to plain Jaccard.
    """
    a = MinHash(0, 20, max_hash=100, track_abundance=True)
    b = MinHash(0, 20, max_hash=100, track_abundance=True)

    a.set_abundances({1: 5, 3: 3, 5: 2, 8: 2, 70: 70})
    b.set_abundances({1: 3, 3: 2, 5: 1, 6: 1, 8: 1, 10: 1, 70: 70})

    assert round(a.similarity(b), 4) == 0.9728

    # ignore_abundance => jaccard
    assert a.similarity(b, ignore_abundance=True) == 5. / 7.
def test_consume_lowercase_compare(track_abundance):
    """compare() treats lower-case and upper-case DNA identically.

    Fix: renamed from test_consume_lowercase — a later test in this file
    reuses that exact name, so this definition was shadowed and pytest
    never collected it.
    """
    a = MinHash(20, 10, track_abundance=track_abundance)
    b = MinHash(20, 10, track_abundance=track_abundance)

    a.add_sequence('TGCCGCCCAGCACCGGGTGACTAGGTTGAGCCATGATTAACCTGCAATGA'.lower())
    b.add_sequence('TGCCGCCCAGCACCGGGTGACTAGGTTGAGCCATGATTAACCTGCAATGA')

    assert a.compare(b) == 1.0
    assert b.compare(b) == 1.0
    assert b.compare(a) == 1.0
    assert a.compare(a) == 1.0
def test_consume_lowercase(track_abundance):
    """similarity() treats lower-case and upper-case DNA identically."""
    a = MinHash(20, 10, track_abundance=track_abundance)
    b = MinHash(20, 10, track_abundance=track_abundance)

    a.add_sequence('TGCCGCCCAGCACCGGGTGACTAGGTTGAGCCATGATTAACCTGCAATGA'.lower())
    b.add_sequence('TGCCGCCCAGCACCGGGTGACTAGGTTGAGCCATGATTAACCTGCAATGA')

    for first, second in ((a, b), (b, b), (b, a), (a, a)):
        assert round(first.similarity(second), 3) == 1.0
def test_mh_count_common(track_abundance):
    """count_common is symmetric; the two ranges share 10 hash values."""
    a = MinHash(20, 10, track_abundance=track_abundance)
    for hashval in range(0, 40, 2):
        a.add_hash(hashval)

    b = MinHash(20, 10, track_abundance=track_abundance)
    for hashval in range(0, 80, 4):
        b.add_hash(hashval)

    # multiples of 4 below 40: 0, 4, ..., 36 -> 10 shared hashes
    assert a.count_common(b) == 10
    assert b.count_common(a) == 10
def test_mh_subtract(track_abundance):
    """subtract_mins returns the hashes present in a but absent from b."""
    # test subtracting two identically configured minhashes
    a = MinHash(20, 10, track_abundance=track_abundance)
    for hashval in range(0, 40, 2):
        a.add_hash(hashval)

    b = MinHash(20, 10, track_abundance=track_abundance)
    for hashval in range(0, 80, 4):
        b.add_hash(hashval)

    # evens that are not multiples of four survive the subtraction
    assert a.subtract_mins(b) == set(range(2, 40, 4))
def test_mh_similarity_downsample_jaccard_value():
    """Jaccard after downsampling: hash 70 is truncated, leaving 4/6."""
    a = MinHash(0, 20, max_hash=50, track_abundance=False)   # max_hash = 50
    b = MinHash(0, 20, max_hash=100, track_abundance=False)  # max_hash = 100

    a.add_many((1, 3, 5, 8, 70))
    b.add_many((1, 3, 5, 6, 8, 10, 70))

    # the hash=70 will be truncated by downsampling
    assert a.similarity(b, downsample=True) == 4. / 6.
def test_dayhoff(track_abundance):
    """Dayhoff-encoded protein hashes differ from plain protein hashes."""
    # verify that we can hash dayhoff-encoded protein/aa sequences
    mh_dayhoff = MinHash(10, 6, is_protein=True,
                         dayhoff=True, hp=False,
                         track_abundance=track_abundance)
    mh_dayhoff.add_sequence('ACTGAC')
    assert len(mh_dayhoff.get_mins()) == 2

    # verify that dayhoff-encoded hashes are different from protein/aa hashes
    mh_protein = MinHash(10, 6, is_protein=True,
                         track_abundance=track_abundance)
    mh_protein.add_sequence('ACTGAC')
    assert len(mh_protein.get_mins()) == 2
    assert mh_protein.get_mins() != mh_dayhoff.get_mins()
def test_abundance_simple_2():
    """Abundance increments when the same k-mer is added a second time."""
    a = MinHash(20, 5, False, track_abundance=True)
    b = MinHash(20, 5, False, track_abundance=True)

    expected = 2110480117637990133  # hash of the single 5-mer 'AAAAA'

    a.add_sequence('AAAAA')
    assert a.get_mins() == [expected]
    assert a.get_mins(with_abundance=True) == {expected: 1}

    a.add_sequence('AAAAA')
    assert a.get_mins() == [expected]
    assert a.get_mins(with_abundance=True) == {expected: 2}

    b.add_sequence('AAAAA')
    assert a.count_common(b) == 1
def test_mh_merge_check_length2(track_abundance):
    """A merged MinHash may hold fewer than num hashes when inputs are small."""
    a = MinHash(4, 10, track_abundance=track_abundance)
    b = MinHash(4, 10, track_abundance=track_abundance)

    for hashval in (3, 1, 4):
        a.add_hash(hashval)
        b.add_hash(hashval)

    merged = a.merge(b)
    # only 3 distinct hashes exist, so the merge holds 3 < num=4
    assert len(merged.get_mins()) == 3
def test_minhash_abund_merge_flat_2():
    """Regression: merging an abundance sketch with a flat one must not segfault.

    Fix: the second loop previously called b.add_hash(i), reusing the stale
    loop variable from the first loop, so b only ever received one repeated
    hash value; the loop variable j was never used. It now adds j as intended.
    """
    a = MinHash(0, 10, track_abundance=True, max_hash=5000)
    b = MinHash(0, 10, max_hash=5000)

    for i in range(0, 10, 2):
        a.add_hash(i)

    for j in range(0, 10, 3):
        b.add_hash(j)

    a.merge(b)
def test_mh_angular_similarity():
    """Check angular similarity against a hand-computed cosine value.

    Example taken from:
    https://www.sciencedirect.com/topics/computer-science/cosine-similarity

    Angular similarity is 1 - 2*acos(sim)/pi when all elements are
    non-negative (https://en.wikipedia.org/wiki/Cosine_similarity).
    """
    a = MinHash(0, 20, max_hash=50, track_abundance=True)
    b = MinHash(0, 20, max_hash=50, track_abundance=True)

    a.set_abundances({1: 5, 3: 3, 5: 2, 8: 2})
    b.set_abundances({1: 3, 3: 2, 5: 1, 6: 1, 8: 1, 10: 1})

    cos_sim = 0.9356
    expected_angular = 1 - 2 * math.acos(cos_sim) / math.pi
    assert round(expected_angular, 4) == 0.7703

    assert round(a.similarity(b), 4) == round(expected_angular, 4)
def test_hp(track_abundance):
    """hp-encoded protein hashes differ from plain protein hashes."""
    # verify that we can hash hp-encoded protein/aa sequences
    mh_hp = MinHash(10, 6, is_protein=True,
                    dayhoff=False, hp=True,
                    track_abundance=track_abundance)
    assert mh_hp.moltype == 'hp'

    mh_hp.add_sequence('ACTGAC')
    assert len(mh_hp.get_mins()) == 2

    # verify that hp-encoded hashes are different from protein/aa hashes
    mh_protein = MinHash(10, 6, is_protein=True,
                         track_abundance=track_abundance)
    mh_protein.add_sequence('ACTGAC')
    assert len(mh_protein.get_mins()) == 2
    assert mh_protein.get_mins() != mh_hp.get_mins()
def test_set_abundance():
    """set_abundances on a flat MinHash raises a helpful RuntimeError."""
    flat = MinHash(20, 10, track_abundance=False)

    with pytest.raises(RuntimeError) as e:
        flat.set_abundances({1: 3, 2: 4})

    assert "track_abundance=True when constructing" in e.value.args[0]
def test_scaled_property(track_abundance):
    """The scaled property round-trips through the max_hash computation."""
    scaled = 10000
    sketch = MinHash(0, 10, track_abundance=track_abundance,
                     max_hash=round(2**64 / scaled))
    assert sketch.scaled == scaled
def test_mh_merge_empty_scaled(track_abundance):
    """Merging with an empty scaled MinHash is symmetric and lossless."""
    # test merging two identically configured minhashes, one empty
    empty = MinHash(0, 10, scaled=1, track_abundance=track_abundance)
    full = MinHash(0, 10, scaled=1, track_abundance=track_abundance)
    for hashval in range(0, 80, 4):
        full.add_hash(hashval)

    c = empty.merge(full)
    d = full.merge(empty)

    assert len(c)
    assert len(c) == len(d)
    assert c.get_mins() == d.get_mins()
    assert c.compare(d) == 1.0
    assert d.compare(c) == 1.0
def test_add_many(track_abundance):
    """add_many behaves exactly like repeated add_hash calls, duplicates included."""
    a = MinHash(0, 10, track_abundance=track_abundance, max_hash=5000)
    b = MinHash(0, 10, track_abundance=track_abundance, max_hash=5000)

    evens = list(range(0, 100, 2))
    a.add_many(evens)
    a.add_many(evens)

    assert len(a) == 50
    assert all(h % 2 == 0 for h in a.get_mins())

    for h in evens:
        b.add_hash(h)
        b.add_hash(h)

    assert len(b) == 50
    assert a == b
def test_no_downsample_scaled_if_n(track_abundance):
    """downsample_scaled on a num-based MinHash must raise ValueError."""
    # make sure you can't set max_n and then downsample scaled
    mh = MinHash(2, 4, track_abundance=track_abundance)

    with pytest.raises(ValueError) as excinfo:
        mh.downsample_scaled(100000000)

    assert 'cannot downsample a standard MinHash' in str(excinfo.value)