Esempio n. 1
0
def test_size_limit(track_abundance):
    """A sketch limited to three hashes keeps only the three smallest."""
    sketch = MinHash(3, 4, track_abundance=track_abundance)
    for hashval in (10, 20, 30):
        sketch.add_hash(hashval)
    assert sketch.get_mins() == [10, 20, 30]

    # 5 sorts ahead of everything stored, so 30 must be pushed off the end.
    sketch.add_hash(5)
    assert sketch.get_mins() == [5, 10, 20]
Esempio n. 2
0
def test_abundance_simple():
    """Adding the same k-mer twice raises its abundance, not the hash count."""
    kmer_hash = 2110480117637990133
    sketch = MinHash(20, 5, False, track_abundance=True)

    for expected_abundance in (1, 2):
        sketch.add_sequence('AAAAA')
        assert sketch.get_mins() == [kmer_hash]
        assert sketch.get_mins(with_abundance=True) == {
            kmer_hash: expected_abundance
        }
Esempio n. 3
0
def test_basic_dna_bad_force(track_abundance):
    """Forced adds drop ambiguous k-mers but still store the valid ones.

    The sketch size is 100 so that several hashes could be stored if
    they were added by mistake.
    """
    sketch = MinHash(100, 4, track_abundance=track_abundance)
    assert len(sketch.get_mins()) == 0

    # Each step: (sequence to add, number of hashes expected afterwards).
    steps = (
        ('ATGN', 0),   # ambiguous k-mer is skipped
        ('AATGN', 1),  # the good k-mer is still used
        ('AATG', 1),   # same k-mer again: a duplicate, so still one hash
    )
    for sequence, expected_count in steps:
        sketch.add_sequence(sequence, True)
        assert len(sketch.get_mins()) == expected_count
Esempio n. 4
0
def test_abundance_simple():
    """Adding the same k-mer twice raises its abundance, not the hash count."""
    kmer_hash = 2110480117637990133
    sketch = MinHash(20, 5, False, track_abundance=True)

    for expected_abundance in (1, 2):
        sketch.add_sequence('AAAAA')
        assert sketch.get_mins() == [kmer_hash]
        assert sketch.get_mins(with_abundance=True) == {
            kmer_hash: expected_abundance
        }
Esempio n. 5
0
def test_basic_dna_bad_force(track_abundance):
    """Forced adds drop ambiguous k-mers but still store the valid ones.

    The sketch size is 100 so that several hashes could be stored if
    they were added by mistake.
    """
    sketch = MinHash(100, 4, track_abundance=track_abundance)
    assert len(sketch.get_mins()) == 0

    # Each step: (sequence to add, number of hashes expected afterwards).
    steps = (
        ('ATGN', 0),   # ambiguous k-mer is skipped
        ('AATGN', 1),  # the good k-mer is still used
        ('AATG', 1),   # same k-mer again: a duplicate, so still one hash
    )
    for sequence, expected_count in steps:
        sketch.add_sequence(sequence, True)
        assert len(sketch.get_mins()) == expected_count
Esempio n. 6
0
def test_basic_dna_bad_force_2(track_abundance):
    """Forced adds on bad DNA keep only the k-mers free of ambiguous bases."""
    sketch = MinHash(100, 4, track_abundance=track_abundance)
    assert len(sketch.get_mins()) == 0

    # Every 4-mer of this sequence contains the N, so nothing is stored.
    sketch.add_sequence('AAGNCGG', True)
    assert len(sketch.get_mins()) == 0

    # Only the 4-mers at either end avoid the N.
    sketch.add_sequence('AATGNGCGG', True)
    assert len(sketch.get_mins()) == 2

    # Re-adding those two k-mers directly must not create new hashes.
    for kmer in ('AATG', 'GCGG'):
        sketch.add_sequence(kmer, True)
    assert len(sketch.get_mins()) == 2
Esempio n. 7
0
def test_add_hash_with_abundance():
    """add_hash_with_abundance() accumulates counts per hash value."""
    sketch = MinHash(20, 5, False, track_abundance=True)

    # Each step: ((hash, abundance) to add, full contents expected afterwards).
    steps = (
        ((10, 1), {10: 1}),
        ((20, 2), {10: 1, 20: 2}),
        ((10, 2), {10: 3, 20: 2}),  # second add of 10 sums with the first
    )
    for (hashval, abundance), expected in steps:
        sketch.add_hash_with_abundance(hashval, abundance)
        assert sketch.get_mins(with_abundance=True) == expected
Esempio n. 8
0
def test_max_hash(track_abundance):
    """With max_hash=35, hashes above the bound are not stored."""
    sketch = MinHash(0, 4, track_abundance=track_abundance, max_hash=35)

    for hashval in (10, 20, 30):
        sketch.add_hash(hashval)
    assert sketch.get_mins() == [10, 20, 30]

    # Both values exceed max_hash; the contents must stay unchanged.
    for too_large in (40, 36):
        sketch.add_hash(too_large)
        assert sketch.get_mins() == [10, 20, 30]
Esempio n. 9
0
def test_max_hash(track_abundance):
    """With max_hash=35, hashes above the bound are not stored."""
    sketch = MinHash(0, 4, track_abundance=track_abundance, max_hash=35)

    for hashval in (10, 20, 30):
        sketch.add_hash(hashval)
    assert sketch.get_mins() == [10, 20, 30]

    # Both values exceed max_hash; the contents must stay unchanged.
    for too_large in (40, 36):
        sketch.add_hash(too_large)
        assert sketch.get_mins() == [10, 20, 30]
Esempio n. 10
0
def test_basic_dna_bad_force_2(track_abundance):
    """Forced adds on bad DNA keep only the k-mers free of ambiguous bases."""
    sketch = MinHash(100, 4, track_abundance=track_abundance)
    assert len(sketch.get_mins()) == 0

    # Every 4-mer of this sequence contains the N, so nothing is stored.
    sketch.add_sequence('AAGNCGG', True)
    assert len(sketch.get_mins()) == 0

    # Only the 4-mers at either end avoid the N.
    sketch.add_sequence('AATGNGCGG', True)
    assert len(sketch.get_mins()) == 2

    # Re-adding those two k-mers directly must not create new hashes.
    for kmer in ('AATG', 'GCGG'):
        sketch.add_sequence(kmer, True)
    assert len(sketch.get_mins()) == 2
Esempio n. 11
0
def test_basic_dna(track_abundance):
    """A size-1 sketch stays at size 1 and behaves as a bottom sketch."""
    sketch = MinHash(1, 4, track_abundance=track_abundance)

    sketch.add_sequence('ATGC')
    before = sketch.get_mins()

    # Expected not to be stored: its hash is larger than that of ATGC.
    sketch.add_sequence('GCAT')
    after = sketch.get_mins()

    print(before, after)
    assert before == after
    assert len(after) == 1
Esempio n. 12
0
def test_minhash_abund_add():
    """Regression test for part of bug #319.

    The bug was a segfault caused by std::vector iterators being
    invalidated when the vector resized; inserting into the middle of
    mins with scaled set was also broken. Hashes are added in descending
    order so that each one is inserted ahead of those already stored.
    """
    sketch = MinHash(0, 10, track_abundance=True, max_hash=5000)

    for expected_len, hashval in enumerate(range(10, 0, -1), start=1):
        sketch.add_hash(hashval)
        assert len(sketch.get_mins()) == expected_len
        print(len(sketch.get_mins()))
Esempio n. 13
0
def test_set_abundance_clear_4():
    """set_abundances(clear=False) adds to abundances that are already set."""
    sketch = MinHash(20, 5, False, track_abundance=True)

    # Hashes are passed out of order; the stored result should be sorted.
    first_batch = {20: 2, 10: 1}
    sketch.set_abundances(first_batch, clear=False)
    assert sketch.get_mins(with_abundance=True) == {10: 1, 20: 2}

    # Same hashes again: the new counts are summed with the existing ones.
    second_batch = {20: 1, 10: 2}
    sketch.set_abundances(second_batch, clear=False)
    assert sketch.get_mins(with_abundance=True) == {10: 3, 20: 3}
Esempio n. 14
0
def test_minhash_abund_add():
    """Regression test for part of bug #319.

    The bug was a segfault caused by std::vector iterators being
    invalidated when the vector resized; inserting into the middle of
    mins with scaled set was also broken. Hashes are added in descending
    order so that each one is inserted ahead of those already stored.
    """
    sketch = MinHash(0, 10, track_abundance=True, max_hash=5000)

    for expected_len, hashval in enumerate(range(10, 0, -1), start=1):
        sketch.add_hash(hashval)
        assert len(sketch.get_mins()) == expected_len
        print(len(sketch.get_mins()))
Esempio n. 15
0
def test_dayhoff(track_abundance):
    """Dayhoff-encoded sketches hash differently from plain protein ones."""
    sequence = 'ACTGAC'

    dayhoff_sketch = MinHash(
        10, 6, is_protein=True, dayhoff=True, hp=False,
        track_abundance=track_abundance,
    )
    dayhoff_sketch.add_sequence(sequence)
    assert len(dayhoff_sketch.get_mins()) == 2

    # Same input through a plain protein sketch, for comparison.
    protein_sketch = MinHash(
        10, 6, is_protein=True, track_abundance=track_abundance,
    )
    protein_sketch.add_sequence(sequence)
    assert len(protein_sketch.get_mins()) == 2

    assert protein_sketch.get_mins() != dayhoff_sketch.get_mins()
Esempio n. 16
0
def test_protein_hp(track_abundance, hp):
    """Protein hashing yields 1 hash with hp encoding and 4 without it."""
    sketch = MinHash(
        10, 6, True, dayhoff=False, hp=hp, track_abundance=track_abundance,
    )
    sketch.add_protein('AGYYG')

    expected_count = 1 if hp else 4
    assert len(sketch.get_mins()) == expected_count
Esempio n. 17
0
def test_scaled(track_abundance):
    """A sketch built from `scaled` behaves like one built with max_hash=35."""
    scaled = get_scaled_for_max_hash(35)
    print('XX', scaled, get_max_hash_for_scaled(scaled))

    sketch = MinHash(0, 4, track_abundance=track_abundance, scaled=scaled)
    assert sketch.max_hash == 35

    for hashval in (10, 20, 30):
        sketch.add_hash(hashval)
    assert sketch.get_mins() == [10, 20, 30]

    # Both values exceed max_hash; the contents must stay unchanged.
    for too_large in (40, 36):
        sketch.add_hash(too_large)
        assert sketch.get_mins() == [10, 20, 30]
Esempio n. 18
0
def test_hp(track_abundance):
    """hp-encoded sketches hash differently from plain protein ones."""
    sequence = 'ACTGAC'

    hp_sketch = MinHash(
        10, 6, is_protein=True, dayhoff=False, hp=True,
        track_abundance=track_abundance,
    )
    assert hp_sketch.moltype == 'hp'

    hp_sketch.add_sequence(sequence)
    assert len(hp_sketch.get_mins()) == 2

    # Same input through a plain protein sketch, for comparison.
    protein_sketch = MinHash(
        10, 6, is_protein=True, track_abundance=track_abundance,
    )
    protein_sketch.add_sequence(sequence)
    assert len(protein_sketch.get_mins()) == 2

    assert protein_sketch.get_mins() != hp_sketch.get_mins()
Esempio n. 19
0
def test_bytes_dna(track_abundance):
    """add_sequence() accepts str, bytes and u-prefixed str alike."""
    sketch = MinHash(1, 4, track_abundance=track_abundance)

    for sequence in ('ATGC', b'ATGC', u'ATGC'):
        sketch.add_sequence(sequence)
    before = sketch.get_mins()

    # None of these should be stored: the hash of GCAT is larger than ATGC's.
    for sequence in ('GCAT', b'GCAT', u'GCAT'):
        sketch.add_sequence(sequence)
    after = sketch.get_mins()

    print(before, after)
    assert before == after
    assert len(after) == 1
Esempio n. 20
0
def test_bytes_dna(track_abundance):
    """add_sequence() accepts both str and bytes input."""
    sketch = MinHash(1, 4, track_abundance=track_abundance)

    for sequence in ('ATGC', b'ATGC', 'ATGC'):
        sketch.add_sequence(sequence)
    before = sketch.get_mins()

    # None of these should be stored: the hash of GCAT is larger than ATGC's.
    for sequence in ('GCAT', b'GCAT', 'GCAT'):
        sketch.add_sequence(sequence)
    after = sketch.get_mins()

    print(before, after)
    assert before == after
    assert len(after) == 1
Esempio n. 21
0
def test_scaled(track_abundance):
    """A sketch built from `scaled` behaves like one built with max_hash=35."""
    scaled = get_scaled_for_max_hash(35)
    print('XX', scaled, get_max_hash_for_scaled(scaled))

    sketch = MinHash(0, 4, track_abundance=track_abundance, scaled=scaled)
    assert sketch.max_hash == 35

    for hashval in (10, 20, 30):
        sketch.add_hash(hashval)
    assert sketch.get_mins() == [10, 20, 30]

    # Both values exceed max_hash; the contents must stay unchanged.
    for too_large in (40, 36):
        sketch.add_hash(too_large)
        assert sketch.get_mins() == [10, 20, 30]
Esempio n. 22
0
def test_bytes_protein_hp(track_abundance, hp):
    """add_protein() accepts str and bytes, with or without hp encoding."""
    sketch = MinHash(
        10, 6, True, dayhoff=False, hp=hp, track_abundance=track_abundance,
    )

    expected_moltype = 'hp' if hp else 'protein'
    assert sketch.moltype == expected_moltype

    # The same peptide given as str, u-prefixed str and bytes.
    for peptide in ('AGYYG', u'AGYYG', b'AGYYG'):
        sketch.add_protein(peptide)

    expected_count = 1 if hp else 4
    assert len(sketch.get_mins()) == expected_count
Esempio n. 23
0
def test_abundance_count_common():
    """count_common() works between abundance and non-abundance sketches."""
    poly_a_hash = 2110480117637990133
    with_abund = MinHash(20, 5, False, track_abundance=True)
    without_abund = MinHash(20, 5, False, track_abundance=False)

    for _ in range(2):
        with_abund.add_sequence('AAAAA')
    assert with_abund.get_mins() == [poly_a_hash]
    assert with_abund.get_mins(with_abundance=True) == {poly_a_hash: 2}

    for sequence in ('AAAAA', 'GGGGG'):
        without_abund.add_sequence(sequence)
    assert with_abund.count_common(without_abund) == 1
    assert (with_abund.count_common(without_abund)
            == without_abund.count_common(with_abund))

    # Without abundance tracking, with_abundance=True still yields a list.
    assert without_abund.get_mins(with_abundance=True) == [
        poly_a_hash,
        10798773792509008305,
    ]
Esempio n. 24
0
def test_mh_asymmetric_merge(track_abundance):
    """Merging sketches of different sizes keeps each receiver's size."""
    larger = MinHash(20, 10, track_abundance=track_abundance)
    for hashval in range(0, 40, 2):
        larger.add_hash(hashval)

    # Half the size of `larger`.
    smaller = MinHash(10, 10, track_abundance=track_abundance)
    for hashval in range(0, 80, 4):
        smaller.add_hash(hashval)

    merged_into_larger = larger.merge(smaller)
    merged_into_smaller = smaller.merge(larger)

    assert len(larger) == 20
    assert len(smaller) == 10
    assert len(merged_into_larger) == len(larger)
    assert len(merged_into_smaller) == len(smaller)

    # Sketches of different sizes cannot be compared without downsampling.
    with pytest.raises(TypeError):
        merged_into_smaller.compare(larger)

    larger_down = larger.downsample_n(merged_into_smaller.num)
    print(larger_down.get_mins())
    print(merged_into_smaller.get_mins())
    assert merged_into_smaller.compare(larger_down) == 1.0

    merged_down = merged_into_larger.downsample_n(smaller.num)
    assert merged_down.compare(smaller) == 1.0
Esempio n. 25
0
def test_mh_asymmetric_merge(track_abundance):
    """Merging sketches of different sizes keeps each receiver's size."""
    larger = MinHash(20, 10, track_abundance=track_abundance)
    for hashval in range(0, 40, 2):
        larger.add_hash(hashval)

    # Half the size of `larger`.
    smaller = MinHash(10, 10, track_abundance=track_abundance)
    for hashval in range(0, 80, 4):
        smaller.add_hash(hashval)

    merged_into_larger = larger.merge(smaller)
    merged_into_smaller = smaller.merge(larger)

    assert len(larger) == 20
    assert len(smaller) == 10
    assert len(merged_into_larger) == len(larger)
    assert len(merged_into_smaller) == len(smaller)

    # Sketches of different sizes cannot be compared without downsampling.
    with pytest.raises(TypeError):
        merged_into_smaller.compare(larger)

    larger_down = larger.downsample_n(merged_into_smaller.num)
    print(larger_down.get_mins())
    print(merged_into_smaller.get_mins())
    assert merged_into_smaller.compare(larger_down) == 1.0

    merged_down = merged_into_larger.downsample_n(smaller.num)
    assert merged_down.compare(smaller) == 1.0
Esempio n. 26
0
def test_bytes_protein(track_abundance):
    """add_protein() accepts str, u-prefixed str and bytes input."""
    sketch = MinHash(10, 6, True, track_abundance=track_abundance)

    for peptide in ('AGYYG', u'AGYYG', b'AGYYG'):
        sketch.add_protein(peptide)

    assert len(sketch.get_mins()) == 4
Esempio n. 27
0
def test_reset_abundance_initialized():
    """Turning off track_abundance on a populated sketch must not fail."""
    sketch = MinHash(1, 4, track_abundance=True)
    sketch.add_sequence('ATGC')

    # Convert from an abundance sketch to a regular one, dropping abundances.
    sketch.track_abundance = False

    mins = sketch.get_mins(with_abundance=True)
    assert mins == [12415348535738636339]
Esempio n. 28
0
def test_protein_dayhoff(track_abundance, dayhoff):
    """Protein hashing yields four hashes with or without dayhoff encoding."""
    sketch = MinHash(
        10, 6, True, dayhoff=dayhoff, hp=False,
        track_abundance=track_abundance,
    )
    sketch.add_protein('AGYYG')

    hashes = sketch.get_mins()
    assert len(hashes) == 4
Esempio n. 29
0
def test_bytes_protein_dayhoff(track_abundance, dayhoff):
    """add_protein() accepts str and bytes, with or without dayhoff."""
    sketch = MinHash(
        10, 6, True, dayhoff=dayhoff, hp=False,
        track_abundance=track_abundance,
    )

    expected_moltype = 'dayhoff' if dayhoff else 'protein'
    assert sketch.moltype == expected_moltype

    # The same peptide three times: twice as str, once as bytes.
    for peptide in ('AGYYG', 'AGYYG', b'AGYYG'):
        sketch.add_protein(peptide)

    assert len(sketch.get_mins()) == 4
Esempio n. 30
0
def test_add_many(track_abundance):
    """add_many() gives the same sketch as calling add_hash() per value."""
    bulk = MinHash(0, 10, track_abundance=track_abundance, max_hash=5000)
    one_by_one = MinHash(0, 10, track_abundance=track_abundance, max_hash=5000)

    # Feed the even numbers below 100 twice, via a fresh list each time.
    for _ in range(2):
        bulk.add_many(list(range(0, 100, 2)))

    assert len(bulk) == 50
    assert all(hashval % 2 == 0 for hashval in bulk.get_mins())

    # Mirror the double insertion with individual add_hash() calls.
    for hashval in range(0, 100, 2):
        for _ in range(2):
            one_by_one.add_hash(hashval)

    assert len(one_by_one) == 50
    assert bulk == one_by_one
Esempio n. 31
0
def test_add_many(track_abundance):
    """add_many() gives the same sketch as calling add_hash() per value."""
    bulk = MinHash(0, 10, track_abundance=track_abundance, max_hash=5000)
    one_by_one = MinHash(0, 10, track_abundance=track_abundance, max_hash=5000)

    # Feed the even numbers below 100 twice, via a fresh list each time.
    for _ in range(2):
        bulk.add_many(list(range(0, 100, 2)))

    assert len(bulk) == 50
    assert all(hashval % 2 == 0 for hashval in bulk.get_mins())

    # Mirror the double insertion with individual add_hash() calls.
    for hashval in range(0, 100, 2):
        for _ in range(2):
            one_by_one.add_hash(hashval)

    assert len(one_by_one) == 50
    assert bulk == one_by_one
Esempio n. 32
0
def test_pickle_scaled(track_abundance):
    """A sketch created with `scaled` survives a pickle round trip."""
    original = MinHash(
        0, 10, track_abundance=track_abundance, scaled=922337203685477632,
    )
    for hashval in range(0, 40, 2):
        original.add_hash(hashval)

    payload = pickle.dumps(original)
    restored = pickle.loads(payload)

    # Construction parameters are preserved.
    assert original.ksize == restored.ksize
    assert restored.num == original.num
    assert restored.max_hash == original.max_hash
    assert restored.max_hash == 20
    assert not restored.is_protein
    assert restored.track_abundance == track_abundance
    assert restored.seed == original.seed

    # Contents are preserved: the even hashes 0..20 give 11 values.
    assert len(restored.get_mins()) == len(original.get_mins())
    assert len(restored.get_mins()) == 11

    assert original.scaled == restored.scaled
    assert restored.scaled != 0
Esempio n. 33
0
def test_pickle_max_hash(track_abundance):
    """A sketch created with `max_hash` survives a pickle round trip."""
    original = MinHash(0, 10, track_abundance=track_abundance, max_hash=20)
    for hashval in range(0, 40, 2):
        original.add_hash(hashval)

    payload = pickle.dumps(original)
    restored = pickle.loads(payload)

    # Construction parameters are preserved.
    assert original.ksize == restored.ksize
    assert restored.num == original.num
    assert restored.max_hash == original.max_hash
    assert restored.max_hash == 20
    assert not restored.is_protein
    assert restored.track_abundance == track_abundance
    assert restored.seed == original.seed

    # Contents are preserved: the even hashes 0..20 give 11 values.
    assert len(restored.get_mins()) == len(original.get_mins())
    assert len(restored.get_mins()) == 11

    assert original.scaled == restored.scaled
    assert restored.scaled != 0
Esempio n. 34
0
def test_remove_many(track_abundance):
    """remove_many() drops the given hashes and changes the signature md5."""
    sketch = MinHash(0, 10, track_abundance=track_abundance, max_hash=5000)
    sketch.add_many(list(range(0, 100, 2)))

    md5_before = signature.SourmashSignature(sketch).md5sum()

    # Only the multiples of 3 that are also even (multiples of 6) are present.
    sketch.remove_many(list(range(0, 100, 3)))
    md5_after = signature.SourmashSignature(sketch).md5sum()

    assert md5_before == "f1cc295157374f5c07cfca5f867188a1"
    assert md5_after == "dd93fa319ef57f4a019c59ee1a8c73e2"
    assert md5_before != md5_after

    assert len(sketch) == 33
    assert all(hashval % 6 != 0 for hashval in sketch.get_mins())
Esempio n. 35
0
def test_remove_many(track_abundance):
    """remove_many() drops the given hashes and changes the signature md5."""
    sketch = MinHash(0, 10, track_abundance=track_abundance, max_hash=5000)
    sketch.add_many(list(range(0, 100, 2)))

    md5_before = signature.SourmashSignature(sketch).md5sum()

    # Only the multiples of 3 that are also even (multiples of 6) are present.
    sketch.remove_many(list(range(0, 100, 3)))
    md5_after = signature.SourmashSignature(sketch).md5sum()

    assert md5_before == "f1cc295157374f5c07cfca5f867188a1"
    assert md5_after == "dd93fa319ef57f4a019c59ee1a8c73e2"
    assert md5_before != md5_after

    assert len(sketch) == 33
    assert all(hashval % 6 != 0 for hashval in sketch.get_mins())
Esempio n. 36
0
class TimeMinAbundanceSuite(TimeMinHashSuite):
    """Timing suite for sketches created with track_abundance=True.

    `self.sequences` is not set here; it is expected to be available
    after the base suite's setup has run.
    """

    def setup(self):
        """Run the base setup, then build an empty and a populated sketch."""
        TimeMinHashSuite.setup(self)
        self.mh = MinHash(500, 21, track_abundance=True)

        populated = MinHash(500, 21, track_abundance=True)
        self.populated_mh = populated
        for sequence in self.sequences:
            populated.add_sequence(sequence)

    def time_get_mins_abundance(self):
        """Time 500 abundance-aware get_mins() calls on the populated sketch."""
        sketch = self.populated_mh
        for _ in range(500):
            sketch.get_mins(with_abundance=True)

    def time_set_abundances(self):
        """Time 500 set_abundances() calls using the populated sketch's data."""
        target = self.mh
        abundances = self.populated_mh.get_mins(with_abundance=True)
        for _ in range(500):
            target.set_abundances(abundances)
Esempio n. 37
0
def test_intersection_errors(track_abundance):
    """Check intersection() with in_common=False and with bad arguments.

    Three behaviors are asserted:
    * with in_common=False the returned set is empty, while the reported
      size is still 3;
    * passing something that is not a MinHash raises TypeError;
    * passing a MinHash built with a different num raises TypeError.
    """
    a = MinHash(20, 10, track_abundance=track_abundance)
    b = MinHash(20, 10, track_abundance=track_abundance)
    # Same ksize as `a` but a different num (30 vs 20).
    c = MinHash(30, 10, track_abundance=track_abundance)

    a.add_sequence("TGCCGCCCAGCA")
    b.add_sequence("TGCCGCCCAGCA")

    combined_size = 3

    intersection, size = a.intersection(b, in_common=False)
    assert intersection == set()
    assert combined_size == size

    # A plain set is not a MinHash.
    with pytest.raises(TypeError):
        a.intersection(set())

    # Sketches with mismatched num cannot be intersected.
    with pytest.raises(TypeError):
        a.intersection(c)
Esempio n. 38
0
def test_mh_asymmetric_merge(track_abundance):
    """Merging sketches of different sizes keeps each receiver's size.

    After downsampling to a common num, the expected similarity is 1.0
    without abundance tracking and 0.91 (rounded to 3 places) with it.
    """
    larger = MinHash(20, 10, track_abundance=track_abundance)
    for hashval in range(0, 40, 2):
        larger.add_hash(hashval)

    # Half the size of `larger`.
    smaller = MinHash(10, 10, track_abundance=track_abundance)
    for hashval in range(0, 80, 4):
        smaller.add_hash(hashval)

    merged_into_larger = larger.merge(smaller)
    merged_into_smaller = smaller.merge(larger)

    assert len(larger) == 20
    assert len(smaller) == 10
    assert len(merged_into_larger) == len(larger)
    assert len(merged_into_smaller) == len(smaller)

    # jaccard() refuses sketches with different nums until one is downsampled.
    with pytest.raises(TypeError):
        merged_into_smaller.jaccard(larger)

    larger_down = larger.downsample_n(merged_into_smaller.num)
    print(larger_down.get_mins())
    print(merged_into_smaller.get_mins())

    if not track_abundance:
        assert round(merged_into_smaller.similarity(larger_down), 3) == 1.0
    else:
        assert round(merged_into_smaller.similarity(larger_down), 3) == 0.91

    merged_down = merged_into_larger.downsample_n(smaller.num)
    if not track_abundance:
        assert merged_down.similarity(smaller) == 1.0
    else:
        assert round(merged_down.similarity(smaller), 3) == 0.91
Esempio n. 39
0
def test_short_sequence(track_abundance):
    """A sequence shorter than ksize is ignored without raising."""
    sketch = MinHash(20, 5, track_abundance=track_abundance)

    # Four bases against ksize=5: nothing is added and nothing is raised.
    sketch.add_sequence('GGGG')

    hashes = sketch.get_mins()
    assert len(hashes) == 0
Esempio n. 40
0
def test_mh_unsigned_long_long(track_abundance):
    """Hash values above 2**63 - 1 can be stored and read back."""
    big_hash = 9227159859419181011  # too big for a C long int
    sketch = MinHash(20, 10, track_abundance=track_abundance)
    sketch.add_hash(big_hash)
    assert big_hash in sketch.get_mins()
Esempio n. 41
0
def test_mh_len(track_abundance):
    """Twenty hashes added to a size-20 sketch are all returned, in order."""
    sketch = MinHash(20, 10, track_abundance=track_abundance)
    for hashval in range(0, 40, 2):
        sketch.add_hash(hashval)

    expected = list(range(0, 40, 2))
    assert sketch.get_mins() == expected
Esempio n. 42
0
def test_intersection_1(track_abundance):
    """intersection(in_common=True) is symmetric and ignores duplicate adds."""
    first = MinHash(20, 10, track_abundance=track_abundance)
    second = MinHash(20, 10, track_abundance=track_abundance)

    first.add_sequence('TGCCGCCCAGCA')
    second.add_sequence('TGCCGCCCAGCA')

    common = set(first.get_mins())
    combined_size = 3

    # Every ordering of the two sketches, including each against itself.
    pairings = (
        (first, second),
        (second, second),
        (second, first),
        (first, first),
    )

    for left, right in pairings:
        intersection, size = left.intersection(right, in_common=True)
        assert intersection == common
        assert combined_size == size

    # Adding the same sequence again must not change any result.
    second.add_sequence('TGCCGCCCAGCA')

    for left, right in pairings:
        intersection, size = left.intersection(right, in_common=True)
        assert intersection == common
        assert combined_size == size

    # Now make the two sketches diverge.
    first.add_sequence('GTCCGCCCAGTGA')
    second.add_sequence('GTCCGCCCAGTGG')

    new_in_common = set(first.get_mins()) & set(second.get_mins())
    new_combined_size = 8

    for left, right in ((first, second), (second, first)):
        intersection, size = left.intersection(right, in_common=True)
        assert intersection == new_in_common
        assert size == new_combined_size

    # A sketch intersected with itself yields all of its own hashes.
    for sketch in (first, second):
        intersection, _ = sketch.intersection(sketch, in_common=True)
        assert intersection == set(sketch.get_mins())
Esempio n. 43
0
def test_mh_unsigned_long_long(track_abundance):
    """Hash values above 2**63 - 1 can be stored and read back."""
    big_hash = 9227159859419181011  # too big for a C long int
    sketch = MinHash(20, 10, track_abundance=track_abundance)
    sketch.add_hash(big_hash)
    assert big_hash in sketch.get_mins()
Esempio n. 44
0
def test_intersection_1(track_abundance):
    """intersection() is symmetric and unaffected by duplicate adds."""
    first = MinHash(20, 10, track_abundance=track_abundance)
    second = MinHash(20, 10, track_abundance=track_abundance)

    first.add_sequence('TGCCGCCCAGCA')
    second.add_sequence('TGCCGCCCAGCA')

    common = set(first.get_mins())
    combined_size = 3

    # Every ordering of the two sketches, including each against itself.
    pairings = (
        (first, second),
        (second, second),
        (second, first),
        (first, first),
    )

    for left, right in pairings:
        intersection, size = left.intersection(right)
        assert intersection == common
        assert combined_size == size

    # Adding the same sequence again must not change any result.
    second.add_sequence('TGCCGCCCAGCA')

    for left, right in pairings:
        intersection, size = left.intersection(right)
        assert intersection == common
        assert combined_size == size

    # Now make the two sketches diverge.
    first.add_sequence('GTCCGCCCAGTGA')
    second.add_sequence('GTCCGCCCAGTGG')

    new_in_common = set(first.get_mins()) & set(second.get_mins())
    new_combined_size = 8

    for left, right in ((first, second), (second, first)):
        intersection, size = left.intersection(right)
        assert intersection == new_in_common
        assert size == new_combined_size

    # A sketch intersected with itself yields all of its own hashes.
    for sketch in (first, second):
        intersection, _ = sketch.intersection(sketch)
        assert intersection == set(sketch.get_mins())
Esempio n. 45
0
def test_short_sequence(track_abundance):
    """A sequence shorter than ksize is ignored without raising."""
    sketch = MinHash(20, 5, track_abundance=track_abundance)

    # Four bases against ksize=5: nothing is added and nothing is raised.
    sketch.add_sequence('GGGG')

    hashes = sketch.get_mins()
    assert len(hashes) == 0
Esempio n. 46
0
def test_mh_len(track_abundance):
    """Twenty hashes added to a size-20 sketch are all returned, in order."""
    sketch = MinHash(20, 10, track_abundance=track_abundance)
    for hashval in range(0, 40, 2):
        sketch.add_hash(hashval)

    expected = list(range(0, 40, 2))
    assert sketch.get_mins() == expected
Esempio n. 47
0
def test_protein_short(track_abundance):
    """A two-residue protein added to a ksize-9 sketch produces no hashes."""
    sketch = MinHash(10, 9, True, track_abundance=track_abundance)
    sketch.add_protein('AG')

    # On failure, the assertion message shows whatever hashes were stored.
    assert len(sketch.get_mins()) == 0, sketch.get_mins()