Esempio n. 1
0
class TimeMinAbundanceSuite(TimeMinHashSuite):
    """Timing benchmarks run against MinHash objects built with track_abundance=True."""

    def setup(self):
        """Run the parent setup, then replace both sketches with abundance-tracking ones."""
        TimeMinHashSuite.setup(self)
        self.mh = MinHash(500, 21, track_abundance=True)

        self.populated_mh = populated = MinHash(500, 21, track_abundance=True)
        for sequence in self.sequences:
            populated.add_sequence(sequence)

    def time_get_mins_abundance(self):
        """Call get_mins(with_abundance=True) 500 times on the populated sketch."""
        populated = self.populated_mh
        for _ in range(500):
            populated.get_mins(with_abundance=True)

    def time_set_abundances(self):
        """Call set_abundances 500 times with the populated sketch's mins."""
        target = self.mh
        abundances = self.populated_mh.get_mins(with_abundance=True)
        for _ in range(500):
            target.set_abundances(abundances)

    def time_set_abundances_noclear(self):
        """Same as time_set_abundances, but passing clear=False on every call."""
        target = self.mh
        abundances = self.populated_mh.get_mins(with_abundance=True)
        for _ in range(500):
            target.set_abundances(abundances, clear=False)
Esempio n. 2
0
def test_div_zero_contained(track_abundance):
    """Empty MHs must not yield divide-by-zero errors for contained_by."""
    populated = MinHash(1, 4, track_abundance=track_abundance)
    empty = populated.copy_and_clear()

    populated.add_sequence('ATGC')

    # Check both directions: populated-in-empty and empty-in-populated.
    assert populated.contained_by(empty) == 0
    assert empty.contained_by(populated) == 0
Esempio n. 3
0
def test_div_zero_contained(track_abundance):
    """Empty MHs must not yield divide-by-zero errors for contained_by."""
    filled = MinHash(1, 4, track_abundance=track_abundance)
    cleared = filled.copy_and_clear()

    filled.add_sequence('ATGC')

    # Neither direction may raise; both are expected to report 0.
    assert filled.contained_by(cleared) == 0
    assert cleared.contained_by(filled) == 0
Esempio n. 4
0
def test_set_abundance_initialized():
    """Setting track_abundance=True on a MinHash that already has a sequence raises RuntimeError."""
    sketch = MinHash(1, 4, track_abundance=False)
    sketch.add_sequence('ATGC')

    expected_message = "Can only set track_abundance=True if the MinHash is empty"
    with pytest.raises(RuntimeError) as excinfo:
        sketch.track_abundance = True

    assert expected_message in excinfo.value.args[0]
Esempio n. 5
0
def test_reset_abundance_initialized():
    """Dropping abundance tracking on a MinHash that already has a sequence must not fail."""
    sketch = MinHash(1, 4, track_abundance=True)
    sketch.add_sequence('ATGC')

    # Convert from an abundance-tracking MinHash to a regular one.
    sketch.track_abundance = False

    # With tracking off, a plain list is expected even when abundances are requested.
    expected_mins = [12415348535738636339]
    assert sketch.get_mins(with_abundance=True) == expected_mins
Esempio n. 6
0
def test_basic_dna_bad(track_abundance):
    """Adding a sequence with an invalid DNA character must raise ValueError."""
    mh = MinHash(1, 4, track_abundance=track_abundance)

    with pytest.raises(ValueError) as e:
        mh.add_sequence('ATGR')
    print(e)

    # `e` is pytest's ExceptionInfo wrapper, not the exception itself; its
    # str() is not guaranteed to contain the message, so inspect the raised
    # exception through `e.value`.
    assert 'invalid DNA character in input k-mer: ATGR' in str(e.value)
Esempio n. 7
0
def test_basic_dna_bad(track_abundance):
    """Check behavior on bad DNA: an invalid character raises ValueError."""
    sketch = MinHash(1, 4, track_abundance=track_abundance)

    expected_message = 'invalid DNA character in input k-mer: ATGR'
    with pytest.raises(ValueError) as excinfo:
        sketch.add_sequence('ATGR')
    print(excinfo)

    raised_text = str(excinfo.value)
    assert expected_message in raised_text
Esempio n. 8
0
def test_abundance_simple():
    """Adding the same k-mer twice keeps one hash and raises its abundance to 2."""
    aaaaa_hash = 2110480117637990133
    sketch = MinHash(20, 5, False, track_abundance=True)

    # After each add, the hash list is unchanged and the abundance equals
    # the number of times the sequence has been added so far.
    for expected_count in (1, 2):
        sketch.add_sequence('AAAAA')
        assert sketch.get_mins() == [aaaaa_hash]
        assert sketch.get_mins(with_abundance=True) == {aaaaa_hash: expected_count}
Esempio n. 9
0
def test_abundance_simple():
    """Abundance of a repeated k-mer goes 1 -> 2 while the hash list stays the same."""
    expected_hash = 2110480117637990133
    mh = MinHash(20, 5, False, track_abundance=True)

    # First insertion: one hash, seen once.
    mh.add_sequence('AAAAA')
    hashes = mh.get_mins()
    with_counts = mh.get_mins(with_abundance=True)
    assert hashes == [expected_hash]
    assert with_counts == {expected_hash: 1}

    # Second insertion: still one hash, now seen twice.
    mh.add_sequence('AAAAA')
    hashes = mh.get_mins()
    with_counts = mh.get_mins(with_abundance=True)
    assert hashes == [expected_hash]
    assert with_counts == {expected_hash: 2}
Esempio n. 10
0
def test_consume_lowercase(track_abundance):
    """A lowercased sequence compares as identical (1.0) to its uppercase form."""
    sequence = 'TGCCGCCCAGCACCGGGTGACTAGGTTGAGCCATGATTAACCTGCAATGA'
    a = MinHash(20, 10, track_abundance=track_abundance)
    b = MinHash(20, 10, track_abundance=track_abundance)

    a.add_sequence(sequence.lower())
    b.add_sequence(sequence)

    # Every ordered pairing, including each sketch with itself, must be 1.0.
    for left, right in ((a, b), (b, b), (b, a), (a, a)):
        assert left.compare(right) == 1.0
Esempio n. 11
0
def test_consume_lowercase(track_abundance):
    """A lowercased sequence has similarity 1.0 (to 3 places) with its uppercase form."""
    sequence = 'TGCCGCCCAGCACCGGGTGACTAGGTTGAGCCATGATTAACCTGCAATGA'
    a = MinHash(20, 10, track_abundance=track_abundance)
    b = MinHash(20, 10, track_abundance=track_abundance)

    a.add_sequence(sequence.lower())
    b.add_sequence(sequence)

    # Every ordered pairing, including each sketch with itself, rounds to 1.0.
    for left, right in ((a, b), (b, b), (b, a), (a, a)):
        assert round(left.similarity(right), 3) == 1.0
Esempio n. 12
0
class TimeMinHashSuite:
    """Timing benchmarks for basic MinHash operations (track_abundance=False)."""

    def setup(self):
        """Create an empty sketch, load the sequences, and build a populated sketch."""
        self.mh = MinHash(500, 21, track_abundance=False)
        self.sequences = load_sequences(get_test_data('ecoli.genes.fna')) * 10

        self.populated_mh = populated = MinHash(500, 21, track_abundance=False)
        for sequence in self.sequences:
            populated.add_sequence(sequence)

    def time_add_sequence(self):
        """Add every loaded sequence to the initially empty sketch."""
        sketch = self.mh
        for sequence in self.sequences:
            sketch.add_sequence(sequence)

    def time_get_mins(self):
        """Call get_mins() 500 times on the populated sketch."""
        populated = self.populated_mh
        for _ in range(500):
            populated.get_mins()

    def time_add_hash(self):
        """Add the integers 0..9999 as hashes, one call each."""
        sketch = self.mh
        for value in range(10000):
            sketch.add_hash(value)

    def time_compare(self):
        """Call compare() against the populated sketch 500 times."""
        sketch = self.mh
        populated = self.populated_mh
        for _ in range(500):
            sketch.compare(populated)

    def time_count_common(self):
        """Call count_common() against the populated sketch 500 times."""
        sketch = self.mh
        populated = self.populated_mh
        for _ in range(500):
            sketch.count_common(populated)

    def time_merge(self):
        """Call merge() with the populated sketch 500 times."""
        sketch = self.mh
        populated = self.populated_mh
        for _ in range(500):
            sketch.merge(populated)

    def time_copy(self):
        """Call __copy__() on the populated sketch 500 times."""
        populated = self.populated_mh
        for _ in range(500):
            populated.__copy__()

    def time_concat(self):
        """Apply `+=` with the populated sketch 500 times."""
        sketch = self.mh
        populated = self.populated_mh
        for _ in range(500):
            sketch += populated
Esempio n. 13
0
def test_basic_dna(track_abundance):
    """MHs of size 1 stay size 1 and act properly as bottom sketches."""
    sketch = MinHash(1, 4, track_abundance=track_abundance)
    sketch.add_sequence('ATGC')
    before = sketch.get_mins()

    # This will not get added; hash > ATGC.
    sketch.add_sequence('GCAT')
    after = sketch.get_mins()

    print(before, after)
    assert before == after
    assert len(after) == 1
Esempio n. 14
0
def test_dayhoff(track_abundance):
    """We can hash to dayhoff-encoded protein/aa sequences, and the hashes differ from plain protein."""
    sequence = 'ACTGAC'

    dayhoff_sketch = MinHash(10, 6, is_protein=True, dayhoff=True, hp=False,
                             track_abundance=track_abundance)
    dayhoff_sketch.add_sequence(sequence)
    assert len(dayhoff_sketch.get_mins()) == 2

    # Dayhoff-encoded hashes must be different from protein/aa hashes.
    protein_sketch = MinHash(10, 6, is_protein=True,
                             track_abundance=track_abundance)
    protein_sketch.add_sequence(sequence)
    assert len(protein_sketch.get_mins()) == 2

    assert protein_sketch.get_mins() != dayhoff_sketch.get_mins()
Esempio n. 15
0
def test_similarity_1(track_abundance):
    """Similarity is 1.0 for identical content and stays >= 0.3 after one sketch diverges."""
    shared_seq = 'TGCCGCCCAGCACCGGGTGACTAGGTTGAGCCATGATTAACCTGCAATGA'
    extra_seq = 'GATTGGTGCACACTTAACTGGGTGCCGCGCTGGTGCTGATCCATGAAGTT'

    a = MinHash(20, 10, track_abundance=track_abundance)
    b = MinHash(20, 10, track_abundance=track_abundance)
    all_pairs = ((a, b), (b, b), (b, a), (a, a))

    a.add_sequence(shared_seq)
    b.add_sequence(shared_seq)
    for left, right in all_pairs:
        assert round(left.similarity(right), 3) == 1.0

    # Add the same sequence again.
    b.add_sequence(shared_seq)
    for left, right in all_pairs:
        assert round(left.similarity(right), 3) == 1.0

    # Give b an additional, different sequence.
    b.add_sequence(extra_seq)
    forward = a.similarity(b)
    assert forward >= 0.3, forward

    backward = b.similarity(a)
    assert backward >= 0.3, backward

    # Self-similarity is unaffected.
    for sketch in (a, b):
        assert round(sketch.similarity(sketch), 3) == 1.0
Esempio n. 16
0
def test_abundance_compare():
    """compare() between an abundance-tracking and a plain MinHash gives the expected values."""
    shared_seq = 'TGCCGCCCAGCACCGGGTGACTAGGTTGAGCCATGATTAACCTGCAATGA'
    extra_seq = 'GATTGGTGCACACTTAACTGGGTGCCGCGCTGGTGCTGATCCATGAAGTT'

    a = MinHash(20, 10, track_abundance=True)
    b = MinHash(20, 10, track_abundance=False)
    all_pairs = ((a, b), (b, b), (b, a), (a, a))

    a.add_sequence(shared_seq)
    b.add_sequence(shared_seq)
    for left, right in all_pairs:
        assert left.compare(right) == 1.0

    # Add the same sequence again.
    b.add_sequence(shared_seq)
    for left, right in all_pairs:
        assert left.compare(right) == 1.0

    # Give b an additional, different sequence.
    b.add_sequence(extra_seq)
    forward = a.compare(b)
    assert forward >= 0.3, forward

    backward = b.compare(a)
    assert backward >= 0.3, backward

    # Comparing a sketch with itself is unaffected.
    for sketch in (a, b):
        assert sketch.compare(sketch) == 1.0
Esempio n. 17
0
def test_abundance_compare():
    """Mixed track_abundance sketches compare as 1.0 when equal and >= 0.3 once b diverges."""
    base_seq = 'TGCCGCCCAGCACCGGGTGACTAGGTTGAGCCATGATTAACCTGCAATGA'
    other_seq = 'GATTGGTGCACACTTAACTGGGTGCCGCGCTGGTGCTGATCCATGAAGTT'

    a = MinHash(20, 10, track_abundance=True)
    b = MinHash(20, 10, track_abundance=False)

    def assert_all_identical():
        # Same call order as checking a-b, b-b, b-a, a-a one by one.
        for left, right in ((a, b), (b, b), (b, a), (a, a)):
            assert left.compare(right) == 1.0

    a.add_sequence(base_seq)
    b.add_sequence(base_seq)
    assert_all_identical()

    # Add the same sequence again.
    b.add_sequence(base_seq)
    assert_all_identical()

    b.add_sequence(other_seq)
    a_vs_b = a.compare(b)
    assert a_vs_b >= 0.3, a_vs_b

    b_vs_a = b.compare(a)
    assert b_vs_a >= 0.3, b_vs_a
    assert a.compare(a) == 1.0
    assert b.compare(b) == 1.0
Esempio n. 18
0
def test_hp(track_abundance):
    """We can hash to hp-encoded protein/aa sequences, and the hashes differ from plain protein."""
    sequence = 'ACTGAC'

    hp_sketch = MinHash(10, 6, is_protein=True, dayhoff=False, hp=True,
                        track_abundance=track_abundance)
    assert hp_sketch.moltype == 'hp'

    hp_sketch.add_sequence(sequence)
    assert len(hp_sketch.get_mins()) == 2

    # hp-encoded hashes must be different from protein/aa hashes.
    protein_sketch = MinHash(10, 6, is_protein=True,
                             track_abundance=track_abundance)
    protein_sketch.add_sequence(sequence)
    assert len(protein_sketch.get_mins()) == 2

    assert protein_sketch.get_mins() != hp_sketch.get_mins()
Esempio n. 19
0
def test_abundance_count_common():
    """count_common works, and is symmetric, between abundance-tracking and plain sketches."""
    aaaaa_hash = 2110480117637990133
    ggggg_hash = 10798773792509008305

    a = MinHash(20, 5, False, track_abundance=True)
    b = MinHash(20, 5, False, track_abundance=False)

    # The same k-mer twice: one hash with abundance 2.
    for _ in range(2):
        a.add_sequence('AAAAA')
    assert a.get_mins() == [aaaaa_hash]
    assert a.get_mins(with_abundance=True) == {aaaaa_hash: 2}

    for sequence in ('AAAAA', 'GGGGG'):
        b.add_sequence(sequence)
    assert a.count_common(b) == 1
    assert a.count_common(b) == b.count_common(a)

    # b does not track abundance, so a plain list is expected here.
    assert b.get_mins(with_abundance=True) == [aaaaa_hash, ggggg_hash]
Esempio n. 20
0
class TimeMinAbundanceSuite(TimeMinHashSuite):
    """Timing benchmarks run against MinHash objects built with track_abundance=True."""

    def setup(self):
        """Run the parent setup, then replace both sketches with abundance-tracking ones."""
        TimeMinHashSuite.setup(self)
        self.mh = MinHash(500, 21, track_abundance=True)

        self.populated_mh = populated = MinHash(500, 21, track_abundance=True)
        for sequence in self.sequences:
            populated.add_sequence(sequence)

    def time_get_mins_abundance(self):
        """Call get_mins(with_abundance=True) 500 times on the populated sketch."""
        populated = self.populated_mh
        for _ in range(500):
            populated.get_mins(with_abundance=True)

    def time_set_abundances(self):
        """Call set_abundances 500 times with the populated sketch's mins."""
        target = self.mh
        abundances = self.populated_mh.get_mins(with_abundance=True)
        for _ in range(500):
            target.set_abundances(abundances)
Esempio n. 21
0
def test_intersection_errors(track_abundance):
    """Check intersection() with in_common=False and the cases where it raises TypeError."""
    a = MinHash(20, 10, track_abundance=track_abundance)
    b = MinHash(20, 10, track_abundance=track_abundance)
    c = MinHash(30, 10, track_abundance=track_abundance)

    a.add_sequence("TGCCGCCCAGCA")
    b.add_sequence("TGCCGCCCAGCA")

    combined_size = 3

    # With in_common=False the returned set is expected to be empty while
    # the size is still reported.
    intersection, size = a.intersection(b, in_common=False)
    assert intersection == set()
    assert combined_size == size

    # An argument that is not a MinHash is rejected.
    with pytest.raises(TypeError):
        a.intersection(set())

    # c was built with a different first constructor argument (30 vs 20),
    # which is also rejected.
    with pytest.raises(TypeError):
        a.intersection(c)
Esempio n. 22
0
def test_bytes_dna(track_abundance):
    """add_sequence accepts str, bytes and u-prefixed literals and yields the same single hash."""
    sketch = MinHash(1, 4, track_abundance=track_abundance)
    for sequence in ('ATGC', b'ATGC', u'ATGC'):
        sketch.add_sequence(sequence)
    before = sketch.get_mins()

    # None of these will get added; hash > ATGC.
    for sequence in ('GCAT', b'GCAT', u'GCAT'):
        sketch.add_sequence(sequence)
    after = sketch.get_mins()

    print(before, after)
    assert before == after
    assert len(after) == 1
Esempio n. 23
0
def test_basic_dna_bad_force(track_abundance):
    """Check behavior on bad DNA when True is passed as add_sequence's second argument."""
    # Use 100 so multiple hashes get added.
    sketch = MinHash(100, 4, track_abundance=track_abundance)

    def hash_count():
        return len(sketch.get_mins())

    assert hash_count() == 0

    # Ambiguous kmer skipped.
    sketch.add_sequence('ATGN', True)
    assert hash_count() == 0

    # But good k-mers still used.
    sketch.add_sequence('AATGN', True)
    assert hash_count() == 1

    # Checking that the right kmer was added (only 1 hash <- this is a dup).
    sketch.add_sequence('AATG', True)
    assert hash_count() == 1
Esempio n. 24
0
def test_basic_dna_bad_force(track_abundance):
    """Bad DNA with True as the second add_sequence argument: ambiguous k-mers are skipped."""
    # Use 100 so multiple hashes get added.
    mh = MinHash(100, 4, track_abundance=track_abundance)
    assert len(mh.get_mins()) == 0

    # Each step: (sequence to add, number of hashes expected afterwards).
    steps = (
        ('ATGN', 0),   # ambiguous kmer skipped.
        ('AATGN', 1),  # but good k-mers still used.
        ('AATG', 1),   # right kmer was added (only 1 hash <- this is a dup)
    )
    for sequence, expected in steps:
        mh.add_sequence(sequence, True)
        assert len(mh.get_mins()) == expected
Esempio n. 25
0
def test_basic_dna_bad_force_2(track_abundance):
    """Check behavior on bad DNA when an ambiguous base sits in the middle of the sequence."""
    sketch = MinHash(100, 4, track_abundance=track_abundance)

    def hash_count():
        return len(sketch.get_mins())

    assert hash_count() == 0

    # Ambiguous kmers skipped.
    sketch.add_sequence('AAGNCGG', True)
    assert hash_count() == 0

    # Ambiguous kmers skipped.
    sketch.add_sequence('AATGNGCGG', True)
    assert hash_count() == 2

    # Checking that the right kmers were added (only 2 hashes should be there).
    sketch.add_sequence('AATG', True)
    sketch.add_sequence('GCGG', True)
    assert hash_count() == 2
Esempio n. 26
0
def test_basic_dna_bad_force_2(track_abundance):
    """Bad DNA with True as the second add_sequence argument: only unambiguous k-mers are kept."""
    mh = MinHash(100, 4, track_abundance=track_abundance)
    assert len(mh.get_mins()) == 0

    # Ambiguous kmers skipped.
    for sequence, expected in (('AAGNCGG', 0), ('AATGNGCGG', 2)):
        mh.add_sequence(sequence, True)
        assert len(mh.get_mins()) == expected

    # Checking that the right kmers were added.
    for sequence in ('AATG', 'GCGG'):
        mh.add_sequence(sequence, True)
    # Only 2 hashes should be there.
    assert len(mh.get_mins()) == 2
Esempio n. 27
0
def test_mh_copy(track_abundance):
    """A __copy__() of a MinHash has similarity 1.0 (to 3 places) with the original."""
    original = MinHash(20, 10, track_abundance=track_abundance)
    original.add_sequence('TGCCGCCCAGCACCGGGTGACTAGGTTGAGCCATGATTAACCTGCAATGA')

    duplicate = original.__copy__()

    score = duplicate.similarity(original)
    assert round(score, 3) == 1.0
Esempio n. 28
0
def test_short_sequence(track_abundance):
    """Adding a short sequence should fail silently and leave no hashes."""
    sketch = MinHash(20, 5, track_abundance=track_abundance)

    # 'GGGG' has 4 characters; the sketch was built with 5 as its second argument.
    sketch.add_sequence('GGGG')

    hashes = sketch.get_mins()
    assert len(hashes) == 0
Esempio n. 29
0
def test_mh_len(track_abundance):
    """len() of a MinHash is 20 both before and after adding a sequence."""
    sketch = MinHash(20, 10, track_abundance=track_abundance)
    size_when_empty = len(sketch)
    assert size_when_empty == 20

    sketch.add_sequence('TGCCGCCCAGCACCGGGTGACTAGGTTGAGCCATGATTAACCTGCAATGA')
    size_after_add = len(sketch)
    assert size_after_add == 20
Esempio n. 30
0
def test_mh_copy(track_abundance):
    """A __copy__() of a MinHash compares as 1.0 with the original."""
    original = MinHash(20, 10, track_abundance=track_abundance)
    original.add_sequence('TGCCGCCCAGCACCGGGTGACTAGGTTGAGCCATGATTAACCTGCAATGA')

    duplicate = original.__copy__()

    score = duplicate.compare(original)
    assert score == 1.0
Esempio n. 31
0
def test_intersection_1(track_abundance):
    """intersection(..., in_common=True) returns the shared hashes and the combined size."""
    a = MinHash(20, 10, track_abundance=track_abundance)
    b = MinHash(20, 10, track_abundance=track_abundance)
    # Same order as checking a-b, b-b, b-a, a-a one after another.
    all_pairs = ((a, b), (b, b), (b, a), (a, a))

    a.add_sequence('TGCCGCCCAGCA')
    b.add_sequence('TGCCGCCCAGCA')

    common = set(a.get_mins())
    combined_size = 3

    for left, right in all_pairs:
        intersection, size = left.intersection(right, in_common=True)
        assert intersection == common
        assert combined_size == size

    # Add the same sequence again; nothing should change.
    b.add_sequence('TGCCGCCCAGCA')

    for left, right in all_pairs:
        intersection, size = left.intersection(right, in_common=True)
        assert intersection == common
        assert combined_size == size

    # Now make the two sketches diverge.
    a.add_sequence('GTCCGCCCAGTGA')
    b.add_sequence('GTCCGCCCAGTGG')

    new_in_common = set(a.get_mins()).intersection(set(b.get_mins()))
    new_combined_size = 8

    for left, right in ((a, b), (b, a)):
        intersection, size = left.intersection(right, in_common=True)
        assert intersection == new_in_common
        assert size == new_combined_size

    # Intersecting a sketch with itself yields all of its own hashes.
    for sketch in (a, b):
        intersection, size = sketch.intersection(sketch, in_common=True)
        assert intersection == set(sketch.get_mins())
Esempio n. 32
0
class TimeMinHashSuite:
    """Timing benchmarks for basic MinHash operations (track_abundance=False)."""

    def setup(self):
        """Create empty DNA and protein sketches, load sequences, and build a populated sketch."""
        self.mh = MinHash(500, 21, track_abundance=False)
        self.protein_mh = MinHash(
            500, 21, is_protein=True, track_abundance=False
        )
        self.sequences = load_sequences()

        self.populated_mh = populated = MinHash(500, 21, track_abundance=False)
        for sequence in self.sequences:
            populated.add_sequence(sequence)

    def time_add_sequence(self):
        """Add every loaded sequence to the initially empty sketch."""
        sketch = self.mh
        for sequence in self.sequences:
            sketch.add_sequence(sequence)

    def time_add_protein(self):
        """Pass every loaded sequence to add_protein on the protein sketch."""
        sketch = self.protein_mh
        for sequence in self.sequences:
            sketch.add_protein(sequence)

    def time_get_mins(self):
        """Call get_mins() 500 times on the populated sketch."""
        populated = self.populated_mh
        for _ in range(500):
            populated.get_mins()

    def time_add_hash(self):
        """Add the integers 0..9999 as hashes, one call each."""
        sketch = self.mh
        for value in range(10000):
            sketch.add_hash(value)

    def time_add_many(self):
        """Add the integers 0..999 in a single add_many call."""
        sketch = self.mh
        hashes = list(range(1000))
        sketch.add_many(hashes)

    def time_compare(self):
        """Call compare() against the populated sketch 500 times."""
        sketch = self.mh
        populated = self.populated_mh
        for _ in range(500):
            sketch.compare(populated)

    def time_count_common(self):
        """Call count_common() against the populated sketch 500 times."""
        sketch = self.mh
        populated = self.populated_mh
        for _ in range(500):
            sketch.count_common(populated)

    def time_merge(self):
        """Call merge() with the populated sketch 500 times."""
        sketch = self.mh
        populated = self.populated_mh
        for _ in range(500):
            sketch.merge(populated)

    def time_copy(self):
        """Call __copy__() on the populated sketch 500 times."""
        populated = self.populated_mh
        for _ in range(500):
            populated.__copy__()

    def time_concat(self):
        """Apply `+=` with the populated sketch 500 times."""
        sketch = self.mh
        populated = self.populated_mh
        for _ in range(500):
            sketch += populated
Esempio n. 33
0
def test_mh_copy(track_abundance):
    """The result of __copy__() compares as 1.0 against the MinHash it came from."""
    sequence = 'TGCCGCCCAGCACCGGGTGACTAGGTTGAGCCATGATTAACCTGCAATGA'
    source = MinHash(20, 10, track_abundance=track_abundance)
    source.add_sequence(sequence)

    clone = source.__copy__()
    assert clone.compare(source) == 1.0
Esempio n. 34
0
def test_intersection_1(track_abundance):
    """intersection() returns the shared hashes together with the combined size."""
    a = MinHash(20, 10, track_abundance=track_abundance)
    b = MinHash(20, 10, track_abundance=track_abundance)
    # Same order as checking a-b, b-b, b-a, a-a one after another.
    all_pairs = ((a, b), (b, b), (b, a), (a, a))

    a.add_sequence('TGCCGCCCAGCA')
    b.add_sequence('TGCCGCCCAGCA')

    common = set(a.get_mins())
    combined_size = 3

    for left, right in all_pairs:
        intersection, size = left.intersection(right)
        assert intersection == common
        assert combined_size == size

    # Add the same sequence again; nothing should change.
    b.add_sequence('TGCCGCCCAGCA')

    for left, right in all_pairs:
        intersection, size = left.intersection(right)
        assert intersection == common
        assert combined_size == size

    # Now make the two sketches diverge.
    a.add_sequence('GTCCGCCCAGTGA')
    b.add_sequence('GTCCGCCCAGTGG')

    new_in_common = set(a.get_mins()).intersection(set(b.get_mins()))
    new_combined_size = 8

    for left, right in ((a, b), (b, a)):
        intersection, size = left.intersection(right)
        assert intersection == new_in_common
        assert size == new_combined_size

    # Intersecting a sketch with itself yields all of its own hashes.
    for sketch in (a, b):
        intersection, size = sketch.intersection(sketch)
        assert intersection == set(sketch.get_mins())
Esempio n. 35
0
def test_short_sequence(track_abundance):
    """A short sequence is ignored silently: no error and no hashes stored."""
    mh = MinHash(20, 5, track_abundance=track_abundance)

    # Adding a short sequence should fail silently.
    mh.add_sequence('GGGG')

    stored = mh.get_mins()
    assert len(stored) == 0
Esempio n. 36
0
def test_mh_len(track_abundance):
    """len() of a MinHash stays at 20 regardless of whether a sequence has been added."""
    sequence = 'TGCCGCCCAGCACCGGGTGACTAGGTTGAGCCATGATTAACCTGCAATGA'
    mh = MinHash(20, 10, track_abundance=track_abundance)

    # Before adding anything.
    assert len(mh) == 20

    # After adding one sequence.
    mh.add_sequence(sequence)
    assert len(mh) == 20