Beispiel #1
0
    def setup(self):
        """Prepare an empty and a pre-filled abundance-tracking MinHash."""
        # Run TimeMinHashSuite's setup first; self.sequences is read below.
        TimeMinHashSuite.setup(self)
        self.mh = MinHash(500, 21, track_abundance=True)

        populated = MinHash(500, 21, track_abundance=True)
        self.populated_mh = populated
        for sequence in self.sequences:
            populated.add_sequence(sequence)
Beispiel #2
0
    def setup(self):
        """Build fixtures: the sequences plus an empty and a filled MinHash."""
        self.mh = MinHash(500, 21, track_abundance=False)

        genes_path = get_test_data('ecoli.genes.fna')
        # the loaded sequence list is repeated ten times
        self.sequences = load_sequences(genes_path) * 10

        populated = MinHash(500, 21, track_abundance=False)
        self.populated_mh = populated
        for sequence in self.sequences:
            populated.add_sequence(sequence)
Beispiel #3
0
def test_mh_similarity_downsample_errors(track_abundance):
    """similarity() without downsample must reject mismatched max_hash."""
    expected_msg = 'mismatch in scaled; comparison fail'
    a_values = {1: 5, 3: 3, 5: 2, 8: 2}
    b_values = {1: 3, 3: 2, 5: 1, 6: 1, 8: 1, 10: 1}

    # the two sketches differ only in max_hash: 50 vs. 100
    a = MinHash(0, 20, max_hash=50, track_abundance=track_abundance)
    b = MinHash(0, 20, max_hash=100, track_abundance=track_abundance)

    if not track_abundance:
        a.add_many(a_values.keys())
        b.add_many(b_values.keys())
    else:
        a.set_abundances(a_values)
        b.set_abundances(b_values)

    # downsample is never passed here, so each direction and each
    # ignore_abundance setting is expected to raise
    for left, right in ((a, b), (b, a)):
        for ignore in (True, False):
            with pytest.raises(ValueError) as e:
                left.similarity(right, ignore_abundance=ignore)
            assert expected_msg in str(e.value)
Beispiel #4
0
 def setup(self):
     """Create a default and a protein MinHash, then load the sequences."""
     self.mh = MinHash(500, 21, track_abundance=True)
     protein_sketch = MinHash(500, 21, is_protein=True, track_abundance=True)
     self.protein_mh = protein_sketch
     self.sequences = load_sequences()
Beispiel #5
0
def test_mh_asymmetric_merge(track_abundance):
    """Merging sketches of different num keeps the receiver's length."""
    # larger sketch: num=20, fed the even numbers below 40
    wide = MinHash(20, 10, track_abundance=track_abundance)
    for hashval in range(0, 40, 2):
        wide.add_hash(hashval)

    # smaller sketch: num=10, fed the multiples of 4 below 80
    narrow = MinHash(10, 10, track_abundance=track_abundance)
    for hashval in range(0, 80, 4):
        narrow.add_hash(hashval)

    wide_merged = wide.merge(narrow)
    narrow_merged = narrow.merge(wide)

    assert len(wide) == 20
    assert len(narrow) == 10
    assert len(wide_merged) == len(wide)
    assert len(narrow_merged) == len(narrow)

    # sketches of different size cannot be compared until downsampled
    with pytest.raises(TypeError):
        narrow_merged.compare(wide)

    wide = wide.downsample_n(narrow_merged.num)
    print(wide.get_mins())
    print(narrow_merged.get_mins())
    assert narrow_merged.compare(wide) == 1.0

    wide_merged = wide_merged.downsample_n(narrow.num)
    assert wide_merged.compare(narrow) == 1.0
Beispiel #6
0
    def setup(self):
        """Load the sequences and build an empty and a filled MinHash."""
        self.mh = MinHash(500, 21, track_abundance=False)
        self.sequences = load_sequences()

        populated = MinHash(500, 21, track_abundance=False)
        self.populated_mh = populated
        for sequence in self.sequences:
            populated.add_sequence(sequence)
Beispiel #7
0
def test_mh_inplace_concat_asymmetric(track_abundance):
    """In-place ``+=`` of different-size MinHashes keeps the left side's length.

    Also checks that comparing sketches with different ``num`` raises
    ``TypeError`` until one of them has been downsampled.
    """
    # test in-place concatenation of two asymmetric (different size) MHs
    a = MinHash(20, 10, track_abundance=track_abundance)
    for i in range(0, 40, 2):
        a.add_hash(i)

    # different size: 10
    b = MinHash(10, 10, track_abundance=track_abundance)
    for i in range(0, 80, 4):
        b.add_hash(i)

    c = a.__copy__()
    c += b

    d = b.__copy__()
    d += a

    assert len(a) == 20
    assert len(b) == 10
    assert len(c) == len(a)
    assert len(d) == len(b)

    # pytest.raises (rather than try/except) makes the test fail when the
    # comparison unexpectedly succeeds instead of passing silently
    with pytest.raises(TypeError) as exc:
        d.compare(a)
    assert 'must have same num' in str(exc.value)

    a = a.downsample_n(d.num)
    assert d.compare(a) == 1.0  # see: d += a, above.

    c = c.downsample_n(b.num)
    assert c.compare(b) == 0.5
Beispiel #8
0
def test_similarity_1(track_abundance):
    """similarity() is 1.0 for identical input and >= 0.3 after b diverges."""
    shared_seq = 'TGCCGCCCAGCACCGGGTGACTAGGTTGAGCCATGATTAACCTGCAATGA'
    extra_seq = 'GATTGGTGCACACTTAACTGGGTGCCGCGCTGGTGCTGATCCATGAAGTT'
    all_pairs = ('ab', 'bb', 'ba', 'aa')

    a = MinHash(20, 10, track_abundance=track_abundance)
    b = MinHash(20, 10, track_abundance=track_abundance)
    sketches = {'a': a, 'b': b}

    a.add_sequence(shared_seq)
    b.add_sequence(shared_seq)
    for left, right in all_pairs:
        assert round(sketches[left].similarity(sketches[right]), 3) == 1.0

    # feeding b the same sequence a second time must not change anything
    b.add_sequence(shared_seq)
    for left, right in all_pairs:
        assert round(sketches[left].similarity(sketches[right]), 3) == 1.0

    # a different sequence in b lowers the cross similarity
    b.add_sequence(extra_seq)
    forward = a.similarity(b)
    assert forward >= 0.3, forward

    backward = b.similarity(a)
    assert backward >= 0.3, backward
    assert round(a.similarity(a), 3) == 1.0
    assert round(b.similarity(b), 3) == 1.0
Beispiel #9
0
def test_mh_similarity_downsample_true(track_abundance):
    """With downsample=True, sim(a, b) == sim(b, a) for both abundance modes."""
    a_values = {1: 5, 3: 3, 5: 2, 8: 2}
    b_values = {1: 3, 3: 2, 5: 1, 6: 1, 8: 1, 10: 1}

    # the two sketches differ only in max_hash: 50 vs. 100
    a = MinHash(0, 20, max_hash=50, track_abundance=track_abundance)
    b = MinHash(0, 20, max_hash=100, track_abundance=track_abundance)

    if not track_abundance:
        a.add_many(a_values.keys())
        b.add_many(b_values.keys())
    else:
        a.set_abundances(a_values)
        b.set_abundances(b_values)

    # no error is expected, and the result must not depend on direction
    for ignore in (True, False):
        forward = a.similarity(b, ignore_abundance=ignore, downsample=True)
        backward = b.similarity(a, ignore_abundance=ignore, downsample=True)
        assert forward == backward
Beispiel #10
0
def test_abundance_compare():
    """compare() works between an abundance-tracking and a flat MinHash."""
    shared_seq = 'TGCCGCCCAGCACCGGGTGACTAGGTTGAGCCATGATTAACCTGCAATGA'
    extra_seq = 'GATTGGTGCACACTTAACTGGGTGCCGCGCTGGTGCTGATCCATGAAGTT'
    all_pairs = ('ab', 'bb', 'ba', 'aa')

    a = MinHash(20, 10, track_abundance=True)
    b = MinHash(20, 10, track_abundance=False)
    sketches = {'a': a, 'b': b}

    a.add_sequence(shared_seq)
    b.add_sequence(shared_seq)
    for left, right in all_pairs:
        assert sketches[left].compare(sketches[right]) == 1.0

    # feeding b the same sequence a second time must not change anything
    b.add_sequence(shared_seq)
    for left, right in all_pairs:
        assert sketches[left].compare(sketches[right]) == 1.0

    b.add_sequence(extra_seq)
    forward = a.compare(b)
    assert forward >= 0.3, forward

    backward = b.compare(a)
    assert backward >= 0.3, backward
    assert a.compare(a) == 1.0
    assert b.compare(b) == 1.0
Beispiel #11
0
def test_mh_jaccard_similarity():
    """Check the actual Jaccard value for a small non-trivial pair."""
    smaller = MinHash(0, 20, max_hash=50, track_abundance=False)
    larger = MinHash(0, 20, max_hash=50, track_abundance=False)
    smaller.add_many([1, 3, 5, 8])
    larger.add_many([1, 3, 5, 6, 8, 10])

    # 4 shared hashes out of 6 distinct hashes overall
    expected = 4.0 / 6.0
    assert smaller.similarity(larger) == expected
Beispiel #12
0
def test_set_abundance_clear():
    """On an empty MinHash, set_abundances(clear=...) should not matter."""
    abundances = {1: 3, 2: 4}
    with_clear = MinHash(20, 5, False, track_abundance=True)
    without_clear = MinHash(20, 5, False, track_abundance=True)

    # each call gets its own copy of the mapping
    with_clear.set_abundances(dict(abundances), clear=True)
    without_clear.set_abundances(dict(abundances), clear=False)

    assert with_clear.get_mins() == without_clear.get_mins()
Beispiel #13
0
def test_mh_merge_check_length(track_abundance):
    """Merging two num=20 sketches yields 20 mins."""
    evens = MinHash(20, 10, track_abundance=track_abundance)
    for hashval in range(0, 40, 2):
        evens.add_hash(hashval)

    fours = MinHash(20, 10, track_abundance=track_abundance)
    for hashval in range(0, 80, 4):
        fours.add_hash(hashval)

    merged = evens.merge(fours)
    merged_mins = merged.get_mins()
    assert len(merged_mins) == 20
Beispiel #14
0
def test_mh_angular_similarity_2():
    """Check the angular similarity value for a second non-trivial case."""
    first = MinHash(0, 20, max_hash=100, track_abundance=True)
    second = MinHash(0, 20, max_hash=100, track_abundance=True)
    first.set_abundances({1: 5, 3: 3, 5: 2, 8: 2, 70: 70})
    second.set_abundances({1: 3, 3: 2, 5: 1, 6: 1, 8: 1, 10: 1, 70: 70})

    with_abundance = first.similarity(second)
    assert round(with_abundance, 4) == 0.9728

    # with ignore_abundance=True the Jaccard value is expected instead
    flat = first.similarity(second, ignore_abundance=True)
    assert flat == 5.0 / 7.0
Beispiel #15
0
def test_consume_lowercase(track_abundance):
    """A lowercased sequence compares as identical to its uppercase form."""
    upper_seq = 'TGCCGCCCAGCACCGGGTGACTAGGTTGAGCCATGATTAACCTGCAATGA'

    a = MinHash(20, 10, track_abundance=track_abundance)
    b = MinHash(20, 10, track_abundance=track_abundance)
    sketches = {'a': a, 'b': b}

    a.add_sequence(upper_seq.lower())
    b.add_sequence(upper_seq)

    for left, right in ('ab', 'bb', 'ba', 'aa'):
        assert sketches[left].compare(sketches[right]) == 1.0
Beispiel #16
0
def test_consume_lowercase(track_abundance):
    """A lowercased sequence has similarity 1.0 with its uppercase form."""
    upper_seq = 'TGCCGCCCAGCACCGGGTGACTAGGTTGAGCCATGATTAACCTGCAATGA'

    a = MinHash(20, 10, track_abundance=track_abundance)
    b = MinHash(20, 10, track_abundance=track_abundance)
    sketches = {'a': a, 'b': b}

    a.add_sequence(upper_seq.lower())
    b.add_sequence(upper_seq)

    for left, right in ('ab', 'bb', 'ba', 'aa'):
        assert round(sketches[left].similarity(sketches[right]), 3) == 1.0
Beispiel #17
0
def test_mh_count_common(track_abundance):
    """count_common() gives the same count in both directions."""
    evens = MinHash(20, 10, track_abundance=track_abundance)
    for hashval in range(0, 40, 2):
        evens.add_hash(hashval)

    fours = MinHash(20, 10, track_abundance=track_abundance)
    for hashval in range(0, 80, 4):
        fours.add_hash(hashval)

    expected_shared = 10
    assert evens.count_common(fours) == expected_shared
    assert fours.count_common(evens) == expected_shared
Beispiel #18
0
def test_mh_subtract(track_abundance):
    """subtract_mins() on two identically configured MinHashes."""
    evens = MinHash(20, 10, track_abundance=track_abundance)
    for hashval in range(0, 40, 2):
        evens.add_hash(hashval)

    fours = MinHash(20, 10, track_abundance=track_abundance)
    for hashval in range(0, 80, 4):
        fours.add_hash(hashval)

    # expected result: 2, 6, 10, ..., 38
    expected = set(range(2, 40, 4))
    assert evens.subtract_mins(fours) == expected
Beispiel #19
0
def test_mh_similarity_downsample_jaccard_value():
    """Check the Jaccard value obtained after downsampling."""
    low_cap = MinHash(0, 20, max_hash=50, track_abundance=False)
    high_cap = MinHash(0, 20, max_hash=100, track_abundance=False)

    low_cap.add_many([1, 3, 5, 8, 70])
    high_cap.add_many([1, 3, 5, 6, 8, 10, 70])

    # hash 70 is expected to be dropped by the downsampling
    expected = 4.0 / 6.0
    assert low_cap.similarity(high_cap, downsample=True) == expected
Beispiel #20
0
def test_dayhoff(track_abundance):
    """Dayhoff-encoded hashes of a sequence differ from plain protein hashes."""
    sequence = 'ACTGAC'

    # hash the sequence with dayhoff encoding switched on
    dayhoff_sketch = MinHash(10, 6, is_protein=True, dayhoff=True, hp=False,
                             track_abundance=track_abundance)
    dayhoff_sketch.add_sequence(sequence)
    assert len(dayhoff_sketch.get_mins()) == 2

    # hash the same sequence as plain protein
    protein_sketch = MinHash(10, 6, is_protein=True,
                             track_abundance=track_abundance)
    protein_sketch.add_sequence(sequence)
    assert len(protein_sketch.get_mins()) == 2

    assert protein_sketch.get_mins() != dayhoff_sketch.get_mins()
Beispiel #21
0
def test_abundance_simple_2():
    """Re-adding a sequence raises its abundance but not the number of mins."""
    aaaaa_hash = 2110480117637990133
    a = MinHash(20, 5, False, track_abundance=True)
    b = MinHash(20, 5, False, track_abundance=True)

    # the same 5-mer is added twice; only the abundance should change
    for expected_abundance in (1, 2):
        a.add_sequence('AAAAA')
        assert a.get_mins() == [aaaaa_hash]
        assert a.get_mins(with_abundance=True) == {
            aaaaa_hash: expected_abundance,
        }

    b.add_sequence('AAAAA')
    assert a.count_common(b) == 1
Beispiel #22
0
def test_mh_merge_check_length2(track_abundance):
    """A merged MinHash may hold fewer than its full number of elements."""
    hashes = (3, 1, 4)

    first = MinHash(4, 10, track_abundance=track_abundance)
    for hashval in hashes:
        first.add_hash(hashval)

    second = MinHash(4, 10, track_abundance=track_abundance)
    for hashval in hashes:
        second.add_hash(hashval)

    merged = first.merge(second)
    assert len(merged.get_mins()) == 3
Beispiel #23
0
def test_minhash_abund_merge_flat_2():
    """Merge a flat MinHash into an abundance-tracking one.

    Targets a segfault caused by trying to merge a signature with
    abundance and a signature without abundance; the test passes when
    the merge call returns.
    """
    a = MinHash(0, 10, track_abundance=True, max_hash=5000)
    b = MinHash(0, 10, max_hash=5000)

    for i in range(0, 10, 2):
        a.add_hash(i)

    # use this loop's own variable: the previous code added the stale `i`
    # (always 8) instead of 0, 3, 6, 9
    for j in range(0, 10, 3):
        b.add_hash(j)

    a.merge(b)
Beispiel #24
0
def test_mh_angular_similarity():
    """Check the angular similarity value for a non-trivial case.

    The example is taken from
    https://www.sciencedirect.com/topics/computer-science/cosine-similarity
    Angular similarity is 1 - 2*(acos(sim) / pi) when elements are always
    positive (https://en.wikipedia.org/wiki/Cosine_similarity).
    """
    first = MinHash(0, 20, max_hash=50, track_abundance=True)
    second = MinHash(0, 20, max_hash=50, track_abundance=True)
    first.set_abundances({1: 5, 3: 3, 5: 2, 8: 2})
    second.set_abundances({1: 3, 3: 2, 5: 1, 6: 1, 8: 1, 10: 1})

    cos_sim = 0.9356
    angular_sim = 1 - 2 * math.acos(cos_sim) / math.pi
    expected = round(angular_sim, 4)
    assert expected == 0.7703

    assert round(first.similarity(second), 4) == expected
Beispiel #25
0
def test_hp(track_abundance):
    """hp-encoded hashes of a sequence differ from plain protein hashes."""
    sequence = 'ACTGAC'

    # hash the sequence with hp encoding switched on
    hp_sketch = MinHash(10, 6, is_protein=True, dayhoff=False, hp=True,
                        track_abundance=track_abundance)
    assert hp_sketch.moltype == 'hp'

    hp_sketch.add_sequence(sequence)
    assert len(hp_sketch.get_mins()) == 2

    # hash the same sequence as plain protein
    protein_sketch = MinHash(10, 6, is_protein=True,
                             track_abundance=track_abundance)
    protein_sketch.add_sequence(sequence)
    assert len(protein_sketch.get_mins()) == 2

    assert protein_sketch.get_mins() != hp_sketch.get_mins()
Beispiel #26
0
def test_set_abundance():
    """set_abundances() on a MinHash without abundance tracking must fail."""
    flat = MinHash(20, 10, track_abundance=False)

    with pytest.raises(RuntimeError) as excinfo:
        flat.set_abundances({1: 3, 2: 4})

    message = excinfo.value.args[0]
    assert "track_abundance=True when constructing" in message
Beispiel #27
0
def test_scaled_property(track_abundance):
    """The scaled property matches the value max_hash was derived from."""
    scaled = 10000
    max_hash = round(2**64 / scaled)
    sketch = MinHash(0, 10, track_abundance=track_abundance,
                     max_hash=max_hash)
    assert sketch.scaled == scaled
Beispiel #28
0
def test_mh_merge_empty_scaled(track_abundance):
    """Merging an empty scaled MinHash with a filled one works both ways."""
    empty = MinHash(0, 10, scaled=1, track_abundance=track_abundance)

    filled = MinHash(0, 10, scaled=1, track_abundance=track_abundance)
    for hashval in range(0, 80, 4):
        filled.add_hash(hashval)

    empty_first = empty.merge(filled)
    filled_first = filled.merge(empty)

    # the merged sketch is non-empty and independent of merge order
    assert len(empty_first)
    assert len(empty_first) == len(filled_first)
    assert empty_first.get_mins() == filled_first.get_mins()
    assert empty_first.compare(filled_first) == 1.0
    assert filled_first.compare(empty_first) == 1.0
Beispiel #29
0
def test_add_many(track_abundance):
    """add_many() called twice equals adding every hash twice via add_hash()."""
    bulk = MinHash(0, 10, track_abundance=track_abundance, max_hash=5000)
    single = MinHash(0, 10, track_abundance=track_abundance, max_hash=5000)

    # the same batch of even numbers goes in twice
    for _ in range(2):
        bulk.add_many(list(range(0, 100, 2)))

    assert len(bulk) == 50
    assert all(hashval % 2 == 0 for hashval in bulk.get_mins())

    # the same hashes, each added twice, one call at a time
    for hashval in range(0, 100, 2):
        for _ in range(2):
            single.add_hash(hashval)

    assert len(single) == 50
    assert bulk == single
Beispiel #30
0
def test_no_downsample_scaled_if_n(track_abundance):
    """A MinHash built with a num cannot be downsampled by scaled."""
    num_sketch = MinHash(2, 4, track_abundance=track_abundance)

    with pytest.raises(ValueError) as excinfo:
        num_sketch.downsample_scaled(100000000)

    message = str(excinfo.value)
    assert 'cannot downsample a standard MinHash' in message