def test_pickle(track_abundance):
    """Check that a MinHash survives a pickle dump/load round trip.

    The sketch is serialized into an in-memory buffer, read back from a
    fresh buffer holding the same bytes, and the restored object is
    compared with the original on its mins and on its scalar attributes.
    """
    import pickle
    from io import BytesIO

    original = MinHash(n=5, ksize=6, is_protein=False,
                       track_abundance=track_abundance)
    dna = 'ATGGCAGTGACGATGCCG'
    # The same sequence is added twice.
    for _ in range(2):
        original.add_sequence(dna)

    sink = BytesIO()
    pickle.dump(original, sink)
    restored = pickle.load(BytesIO(sink.getvalue()))

    mins_before = original.get_mins(with_abundance=track_abundance)
    mins_after = restored.get_mins(with_abundance=track_abundance)
    assert mins_before == mins_after

    for attr in ('num', 'ksize', 'is_protein', 'max_hash', 'seed'):
        assert getattr(original, attr) == getattr(restored, attr)
def build_query_mh_for_seed(self, seed, scaled):
    """Build a signature from every record in ``self.filename``.

    A MinHash is created with ``self.ksize`` and the given ``seed`` and
    ``scaled`` values, each record's sequence read via ``screed`` is added
    to it, and the sketch is wrapped in a ``SourmashSignature`` carrying
    ``self.name`` and ``self.filename``.
    """
    sketch = MinHash(0, self.ksize, scaled=scaled, seed=seed)
    records = screed.open(self.filename)
    for rec in records:
        # NOTE(review): second positional argument is presumably a
        # "force" flag for add_sequence -- confirm against MinHash.
        sketch.add_sequence(rec.sequence, True)
    return sourmash_lib.SourmashSignature(
        sketch, name=self.name, filename=self.filename)
def test_diff_seed(track_abundance):
    """Expect ``count_common`` to raise ValueError when seeds differ."""
    left = MinHash(n=5, ksize=20, track_abundance=track_abundance, seed=1)
    right = MinHash(n=5, ksize=20, track_abundance=track_abundance, seed=2)

    for hashval in range(1, 6):
        left.add_hash(hashval)
    for hashval in (1, 2, 3, 4, 6):
        right.add_hash(hashval)

    with pytest.raises(ValueError):
        left.count_common(right)
def test_common_1(track_abundance):
    """Check ``count_common`` in both directions for two overlapping sketches.

    The sketches share the hashes 1-4 and differ in one hash each, so the
    expected count is 4 regardless of which sketch is the receiver.
    """
    left = MinHash(n=5, ksize=20, track_abundance=track_abundance)
    right = MinHash(n=5, ksize=20, track_abundance=track_abundance)

    for hashval in range(1, 6):
        left.add_hash(hashval)
    for hashval in (1, 2, 3, 4, 6):
        right.add_hash(hashval)

    shared = 4
    assert left.count_common(right) == shared
    assert right.count_common(left) == shared
def build_query_mh_for_seed(seed, ksize, scaled, query_seq_file):
    """Build a signature from every record in ``query_seq_file``.

    All sequences read via ``screed`` are added to a single MinHash built
    with the given ``ksize``, ``scaled`` and ``seed``. The signature name
    is the first truthy record name encountered, or ``None`` when no
    record supplies one.
    """
    sketch = MinHash(0, ksize, scaled=scaled, seed=seed)
    sig_name = None
    for rec in screed.open(query_seq_file):
        # Keep the name once a truthy one has been seen.
        sig_name = sig_name or rec.name
        # NOTE(review): second positional argument is presumably a
        # "force" flag for add_sequence -- confirm against MinHash.
        sketch.add_sequence(rec.sequence, True)
    return sourmash_lib.SourmashSignature(
        sketch, name=sig_name, filename=query_seq_file)
def test_abund_similarity():
    """Check ``similarity`` on abundance-tracking sketches.

    One sketch holds hash 1, the other holds hashes 1 and 2. The test
    expects a self-similarity of 1.0 and a cross-similarity of 0.5, both
    with and without ``ignore_abundance=True``.
    """
    single = MinHash(n=5, ksize=20, track_abundance=True)
    double = MinHash(n=5, ksize=20, track_abundance=True)

    single.add_hash(1)
    for hashval in (1, 2):
        double.add_hash(hashval)

    assert round(single.similarity(single)) == 1.0
    assert round(single.similarity(double), 2) == 0.5

    assert round(single.similarity(single, ignore_abundance=True)) == 1.0
    assert round(single.similarity(double, ignore_abundance=True), 2) == 0.5
def test_jaccard_1(track_abundance):
    """Check ``jaccard`` in both directions for two overlapping sketches."""
    left = MinHash(n=5, ksize=20, track_abundance=track_abundance)
    right = MinHash(n=5, ksize=20, track_abundance=track_abundance)

    for hashval in range(1, 6):
        left.add_hash(hashval)
    for hashval in (1, 2, 3, 4, 6):
        right.add_hash(hashval)

    # Expected value is 4/5: the union is taken to be [1, 2, 3, 4, 5]
    # (presumably the 5 smallest hashes, since n=5 -- confirm) and the
    # intersection is [1, 2, 3, 4].
    expected = round(4 / 5.0, 2)
    assert round(left.jaccard(right), 2) == expected
    assert round(right.jaccard(left), 2) == expected
def test_bad_construct_1(track_abundance):
    """Expect a TypeError when MinHash is constructed without ``n``."""
    try:
        MinHash(ksize=6, is_protein=False, track_abundance=track_abundance)
    except TypeError:
        # Expected outcome: the constructor rejected the missing argument.
        pass
    else:
        assert 0, "require n in constructor"
def test_abund_similarity_zero():
    """Expect a similarity of 0.0 against a sketch with no hashes added."""
    populated = MinHash(n=5, ksize=20, track_abundance=True)
    untouched = MinHash(n=5, ksize=20, track_abundance=True)

    populated.add_hash(1)

    assert populated.similarity(untouched) == 0.0
def test_protein_mh(track_abundance):
    """Compare ``add_sequence`` with per-k-mer ``add`` on protein sketches.

    One sketch receives the whole sequence; the other receives every
    6-character window of it. Both are expected to report the same mins,
    including one known hash value.
    """
    ksize = 6
    whole = MinHash(n=5, ksize=ksize, is_protein=True,
                    track_abundance=track_abundance)
    piecewise = MinHash(n=5, ksize=ksize, is_protein=True,
                        track_abundance=track_abundance)

    dna = 'ATGGCAGTGACGATGCCG'
    whole.add_sequence(dna)

    # Feed every window of length ksize individually.
    for start in range(len(dna) - ksize + 1):
        piecewise.add(dna[start:start + ksize])

    assert whole.get_mins() == piecewise.get_mins()
    assert 901193879228338100 in whole.get_mins()
def test_dna_mh(track_abundance):
    """Compare ``add_sequence`` with per-k-mer ``add`` on DNA sketches.

    One sketch receives the whole sequence; the other receives every
    4-character window of it. Both are expected to report the same mins,
    including two known hash values.
    """
    ksize = 4
    whole = MinHash(n=5, ksize=ksize, track_abundance=track_abundance)
    piecewise = MinHash(n=5, ksize=ksize, track_abundance=track_abundance)

    dna = 'ATGGCAGTGACGATGCCAG'
    whole.add_sequence(dna)

    # Feed every window of length ksize individually.
    for start in range(len(dna) - ksize + 1):
        piecewise.add(dna[start:start + ksize])

    assert whole.get_mins() == piecewise.get_mins()

    print(whole.get_mins())
    for known_hash in (726311917625663847, 3697418565283905118):
        assert known_hash in whole.get_mins()
def test_jaccard_2_difflen(track_abundance):
    """Check ``jaccard`` when the two sketches hold different hash counts.

    One sketch holds hashes 1-5 and the other holds 1-4; the test expects
    4/5 in both directions after rounding to two decimals.
    """
    longer = MinHash(n=5, ksize=20, track_abundance=track_abundance)
    shorter = MinHash(n=5, ksize=20, track_abundance=track_abundance)

    for hashval in range(1, 6):
        longer.add_hash(hashval)
    for hashval in range(1, 5):
        shorter.add_hash(hashval)

    print(longer.jaccard(shorter))

    expected = 4 / 5.0
    assert round(longer.jaccard(shorter), 2) == expected
    assert round(shorter.jaccard(longer), 2) == expected