Example #1
0
def test_similarity_1(track_abundance):
    # Sketches built from the same sequence must compare as 1.0 in every
    # direction (including against themselves); after one sketch receives an
    # additional, different sequence the cross-similarity must stay >= 0.3.
    # track_abundance is supplied by the caller (presumably a pytest
    # fixture -- not visible in this file).
    seq_shared = 'TGCCGCCCAGCACCGGGTGACTAGGTTGAGCCATGATTAACCTGCAATGA'
    seq_extra = 'GATTGGTGCACACTTAACTGGGTGCCGCGCTGGTGCTGATCCATGAAGTT'

    a = MinHash(20, 10, track_abundance=track_abundance)
    b = MinHash(20, 10, track_abundance=track_abundance)
    a.add_sequence(seq_shared)
    b.add_sequence(seq_shared)

    # every ordered pairing, listed in the order the comparisons are made
    all_pairs = ((a, b), (b, b), (b, a), (a, a))

    for left, right in all_pairs:
        assert round(left.similarity(right), 3) == 1.0

    # feeding the same sequence a second time must not change any comparison
    b.add_sequence(seq_shared)
    for left, right in all_pairs:
        assert round(left.similarity(right), 3) == 1.0

    # b now also holds a second sequence that a does not have
    b.add_sequence(seq_extra)
    for left, right in ((a, b), (b, a)):
        score = left.similarity(right)
        assert score >= 0.3, score

    # self-similarity is unaffected
    for sketch in (a, b):
        assert round(sketch.similarity(sketch), 3) == 1.0
Example #2
0
def test_mh_similarity_downsample_true(track_abundance):
    # With downsample=True, comparing sketches built with different max_hash
    # values (50 vs 100) must not raise, and the result must be symmetric:
    # sim(a, b) == sim(b, a), both with and without ignore_abundance.
    abund_small = {1: 5, 3: 3, 5: 2, 8: 2}
    abund_large = {1: 3, 3: 2, 5: 1, 6: 1, 8: 1, 10: 1}

    a = MinHash(0, 20, max_hash=50, track_abundance=track_abundance)
    b = MinHash(0, 20, max_hash=100, track_abundance=track_abundance)

    if not track_abundance:
        # no abundances tracked: only the hash values themselves are added
        a.add_many(abund_small.keys())
        b.add_many(abund_large.keys())
    else:
        a.set_abundances(abund_small)
        b.set_abundances(abund_large)

    # first ignoring abundances, then taking them into account
    for ignore in (True, False):
        forward = a.similarity(b, ignore_abundance=ignore, downsample=True)
        backward = b.similarity(a, ignore_abundance=ignore, downsample=True)
        assert forward == backward
Example #3
0
def test_mh_angular_similarity_2():
    # Second non-trivial check of the abundance-aware similarity value, plus
    # the value obtained when abundances are ignored.
    abund_a = {1: 5, 3: 3, 5: 2, 8: 2, 70: 70}
    abund_b = {1: 3, 3: 2, 5: 1, 6: 1, 8: 1, 10: 1, 70: 70}

    a = MinHash(0, 20, max_hash=100, track_abundance=True)
    b = MinHash(0, 20, max_hash=100, track_abundance=True)
    a.set_abundances(abund_a)
    b.set_abundances(abund_b)

    with_abundance = a.similarity(b)
    assert round(with_abundance, 4) == 0.9728

    # ignore_abundance => jaccard: 5 shared hashes out of 7 distinct ones
    flattened = a.similarity(b, ignore_abundance=True)
    assert flattened == 5. / 7.
Example #4
0
def test_consume_lowercase(track_abundance):
    # A lower-cased sequence must produce a sketch that compares as 1.0
    # against the sketch of the same sequence in upper case.
    seq = 'TGCCGCCCAGCACCGGGTGACTAGGTTGAGCCATGATTAACCTGCAATGA'

    from_lower = MinHash(20, 10, track_abundance=track_abundance)
    from_upper = MinHash(20, 10, track_abundance=track_abundance)
    from_lower.add_sequence(seq.lower())
    from_upper.add_sequence(seq)

    # every ordered pairing, including each sketch against itself
    pairings = (
        (from_lower, from_upper),
        (from_upper, from_upper),
        (from_upper, from_lower),
        (from_lower, from_lower),
    )
    for left, right in pairings:
        assert round(left.similarity(right), 3) == 1.0
Example #5
0
def test_mh_similarity_downsample_errors(track_abundance):
    # When downsample is not passed to MinHash.similarity, comparing sketches
    # with different max_hash values (50 vs 100) must raise ValueError with a
    # 'mismatch in scaled' message -- in both directions and for both
    # ignore_abundance settings.
    abund_small = {1: 5, 3: 3, 5: 2, 8: 2}
    abund_large = {1: 3, 3: 2, 5: 1, 6: 1, 8: 1, 10: 1}

    a = MinHash(0, 20, max_hash=50, track_abundance=track_abundance)
    b = MinHash(0, 20, max_hash=100, track_abundance=track_abundance)

    if not track_abundance:
        # no abundances tracked: only the hash values themselves are added
        a.add_many(abund_small.keys())
        b.add_many(abund_large.keys())
    else:
        a.set_abundances(abund_small)
        b.set_abundances(abund_large)

    expected_msg = 'mismatch in scaled; comparison fail'

    # (a, b) with and without abundance, then (b, a) likewise
    for left, right in ((a, b), (b, a)):
        for ignore in (True, False):
            with pytest.raises(ValueError) as e:
                left.similarity(right, ignore_abundance=ignore)
            assert expected_msg in str(e.value)
Example #6
0
def test_div_zero(track_abundance):
    # Comparing against an empty sketch must yield a similarity of 0, not a
    # divide-by-zero error, whichever side the empty sketch is on.
    filled = MinHash(1, 4, track_abundance=track_abundance)
    # taken before any sequence is added to `filled`
    empty = filled.copy_and_clear()
    filled.add_sequence('ATGC')

    for left, right in ((filled, empty), (empty, filled)):
        assert left.similarity(right) == 0
Example #7
0
def test_minhash_abund_merge_flat():
    # this targets a segfault caused by trying to compute similarity
    # of a signature with abundance and a signature without abundance.
    # the correct behavior for now is to calculate simple Jaccard,
    # i.e. 'flatten' both of them.
    a = MinHash(0, 10, track_abundance=True, max_hash=5000)
    b = MinHash(0, 10, max_hash=5000)

    # a receives the hashes 0, 2, 4, 6, 8
    for i in range(0, 10, 2):
        a.add_hash(i)

    # b receives the hashes 0, 3, 6, 9.  The loop previously added `i` (the
    # value left over from the loop above, 8) on every pass instead of `j`,
    # so b only ever held the single hash 8.
    for j in range(0, 10, 3):
        b.add_hash(j)

    # these crashed, previously.
    # flattened Jaccard: shared {0, 6} over union {0, 2, 3, 4, 6, 8, 9} = 2/7
    assert a.similarity(b) == 2. / 7.
    assert b.similarity(a) == 2. / 7.
Example #8
0
def test_div_zero(track_abundance):
    # An empty sketch compared with a non-empty one (in either order) must
    # give similarity 0 instead of raising a divide-by-zero error.
    populated = MinHash(1, 4, track_abundance=track_abundance)
    # the cleared copy is made before `populated` receives its sequence
    blank = populated.copy_and_clear()
    populated.add_sequence('ATGC')

    forward = populated.similarity(blank)
    assert forward == 0
    backward = blank.similarity(populated)
    assert backward == 0
Example #9
0
def test_minhash_abund_merge_flat():
    # this targets a segfault caused by trying to compute similarity
    # of a signature with abundance and a signature without abundance.
    # the correct behavior for now is to calculate simple Jaccard,
    # i.e. 'flatten' both of them.
    a = MinHash(0, 10, track_abundance=True, max_hash=5000)
    b = MinHash(0, 10, max_hash=5000)

    # a receives the hashes 0, 2, 4, 6, 8
    for i in range(0, 10, 2):
        a.add_hash(i)

    # b receives the hashes 0, 3, 6, 9.  The loop previously added `i` (the
    # value left over from the loop above, 8) on every pass instead of `j`,
    # so b only ever held the single hash 8.
    for j in range(0, 10, 3):
        b.add_hash(j)

    # these crashed, previously.
    # flattened Jaccard: shared {0, 6} over union {0, 2, 3, 4, 6, 8, 9} = 2/7
    assert a.similarity(b) == 2. / 7.
    assert b.similarity(a) == 2. / 7.
Example #10
0
def test_mh_jaccard_similarity():
    # Non-trivial Jaccard check without abundance tracking: the two hash
    # lists share 4 values out of 6 distinct ones, so 4/6 is expected.
    hashes_a = [1, 3, 5, 8]
    hashes_b = [1, 3, 5, 6, 8, 10]

    a = MinHash(0, 20, max_hash=50, track_abundance=False)
    b = MinHash(0, 20, max_hash=50, track_abundance=False)
    a.add_many(hashes_a)
    b.add_many(hashes_b)

    observed = a.similarity(b)
    assert observed == 4. / 6.
Example #11
0
def test_mh_similarity_downsample_angular_value():
    # Checks the values MinHash.similarity returns with downsample=True when
    # the two abundance-tracking sketches were built with different max_hash
    # values (50 vs 100).
    abund_a = {1: 5, 3: 3, 5: 2, 8: 2, 70: 70}
    abund_b = {1: 3, 3: 2, 5: 1, 6: 1, 8: 1, 10: 1, 70: 70}

    a = MinHash(0, 20, max_hash=50, track_abundance=True)
    b = MinHash(0, 20, max_hash=100, track_abundance=True)
    a.set_abundances(abund_a)
    b.set_abundances(abund_b)

    # the hash=70 will be truncated by downsampling
    with_abundance = a.similarity(b, downsample=True)
    assert round(with_abundance, 4) == 0.7703

    # with ignore_abundance, will be equal to jaccard
    flattened = a.similarity(b, downsample=True, ignore_abundance=True)
    assert flattened == 4. / 6.
Example #12
0
def test_mh_similarity_downsample_jaccard_value():
    # Checks the Jaccard value returned with downsample=True for two
    # non-abundance sketches built with different max_hash values (50 vs 100).
    hashes_a = [1, 3, 5, 8, 70]
    hashes_b = [1, 3, 5, 6, 8, 10, 70]

    a = MinHash(0, 20, max_hash=50, track_abundance=False)
    b = MinHash(0, 20, max_hash=100, track_abundance=False)
    a.add_many(hashes_a)
    b.add_many(hashes_b)

    # the hash=70 will be truncated by downsampling
    observed = a.similarity(b, downsample=True)
    assert observed == 4. / 6.
Example #13
0
def test_mh_angular_similarity():
    # Checks the abundance-aware similarity against a worked example taken from:
    # https://www.sciencedirect.com/topics/computer-science/cosine-similarity
    # note: angular similarity is 1 - 2*(acos(sim) / pi), when elements
    # are always positive (https://en.wikipedia.org/wiki/Cosine_similarity)
    abund_a = {1: 5, 3: 3, 5: 2, 8: 2}
    abund_b = {1: 3, 3: 2, 5: 1, 6: 1, 8: 1, 10: 1}

    a = MinHash(0, 20, max_hash=50, track_abundance=True)
    b = MinHash(0, 20, max_hash=50, track_abundance=True)
    a.set_abundances(abund_a)
    b.set_abundances(abund_b)

    # convert the reference cosine similarity into the expected angular value
    cosine = 0.9356
    expected = round(1 - 2 * math.acos(cosine) / math.pi, 4)
    assert expected == 0.7703

    observed = round(a.similarity(b), 4)
    assert observed == expected
Example #14
0
def test_mh_similarity_diff_protein(track_abundance):
    # Sketches that differ only in the third positional MinHash argument
    # (False vs True -- presumably the protein flag, going by the test name)
    # must refuse to be compared, raising ValueError.
    flag_off = MinHash(20, 5, False, track_abundance=track_abundance)
    flag_on = MinHash(20, 5, True, track_abundance=track_abundance)

    with pytest.raises(ValueError):
        flag_off.similarity(flag_on)
Example #15
0
def test_mh_similarity_diff_ksize(track_abundance):
    # Sketches that differ only in the second positional MinHash argument
    # (5 vs 6 -- presumably the k-mer size, going by the test name) must
    # refuse to be compared, raising ValueError.
    smaller_k = MinHash(20, 5, track_abundance=track_abundance)
    larger_k = MinHash(20, 6, track_abundance=track_abundance)

    with pytest.raises(ValueError):
        smaller_k.similarity(larger_k)
Example #16
0
def test_mh_similarity_diff_seed(track_abundance):
    # Sketches created with different seed values (1 vs 2) must refuse to be
    # compared, raising ValueError.
    seeded_one = MinHash(20, 5, track_abundance=track_abundance, seed=1)
    seeded_two = MinHash(20, 5, track_abundance=track_abundance, seed=2)

    with pytest.raises(ValueError):
        seeded_one.similarity(seeded_two)
Example #17
0
def test_mh_similarity_diff_max_hash(track_abundance):
    # Sketches created with different max_hash values (5 vs 10) must raise
    # ValueError when compared without downsampling being requested.
    low_cap = MinHash(0, 5, track_abundance=track_abundance, max_hash=5)
    high_cap = MinHash(0, 5, track_abundance=track_abundance, max_hash=10)

    with pytest.raises(ValueError):
        low_cap.similarity(high_cap)