Ejemplo n.º 1
0
def test_mh_similarity_downsample_errors(track_abundance):
    """Check that similarity() rejects mismatched max_hash by default.

    Two sketches are built with max_hash values of 50 and 100 and compared
    in both directions, with and without abundances.  Since downsample is
    not passed (the default is False), every comparison is expected to
    raise ValueError with the mismatch message.
    """
    small = MinHash(0, 20, max_hash=50, track_abundance=track_abundance)
    large = MinHash(0, 20, max_hash=100, track_abundance=track_abundance)

    small_values = {1: 5, 3: 3, 5: 2, 8: 2}
    large_values = {1: 3, 3: 2, 5: 1, 6: 1, 8: 1, 10: 1}
    if not track_abundance:
        # only the hash values matter when abundances are not tracked
        small.add_many(small_values.keys())
        large.add_many(large_values.keys())
    else:
        small.set_abundances(small_values)
        large.set_abundances(large_values)

    expected_msg = 'mismatch in scaled; comparison fail'

    # both directions, with and without abundance: all must fail
    for lhs, rhs in ((small, large), (large, small)):
        for ignore in (True, False):
            with pytest.raises(ValueError) as excinfo:
                lhs.similarity(rhs, ignore_abundance=ignore)
            assert expected_msg in str(excinfo.value)
Ejemplo n.º 2
0
def test_mh_similarity_downsample_true(track_abundance):
    """Check that similarity() is symmetric when downsample=True.

    Sketches with max_hash of 50 and 100 are compared in both directions;
    with downsampling enabled no error is expected and sim(a, b) must equal
    sim(b, a), both when ignoring and when using abundances.
    """
    small = MinHash(0, 20, max_hash=50, track_abundance=track_abundance)
    large = MinHash(0, 20, max_hash=100, track_abundance=track_abundance)

    small_values = {1: 5, 3: 3, 5: 2, 8: 2}
    large_values = {1: 3, 3: 2, 5: 1, 6: 1, 8: 1, 10: 1}
    if not track_abundance:
        small.add_many(small_values.keys())
        large.add_many(large_values.keys())
    else:
        small.set_abundances(small_values)
        large.set_abundances(large_values)

    # first ignoring abundance, then taking it into account
    for ignore in (True, False):
        forward = small.similarity(large, ignore_abundance=ignore,
                                   downsample=True)
        backward = large.similarity(small, ignore_abundance=ignore,
                                    downsample=True)
        assert forward == backward
Ejemplo n.º 3
0
def test_mh_jaccard_similarity():
    """Check the Jaccard value computed for two overlapping sketches."""
    left_hashes = [1, 3, 5, 8]
    right_hashes = [1, 3, 5, 6, 8, 10]

    left = MinHash(0, 20, max_hash=50, track_abundance=False)
    right = MinHash(0, 20, max_hash=50, track_abundance=False)
    left.add_many(left_hashes)
    right.add_many(right_hashes)

    # 4 hashes are shared out of 6 distinct hashes overall
    expected = 4. / 6.
    assert left.similarity(right) == expected
Ejemplo n.º 4
0
def test_mh_similarity_downsample_jaccard_value():
    """Check the Jaccard value obtained when comparing with downsample=True.

    The two sketches use max_hash of 50 and 100 respectively and both
    contain the hash 70, which is expected to be dropped by downsampling.
    """
    small_hashes = [1, 3, 5, 8, 70]
    large_hashes = [1, 3, 5, 6, 8, 10, 70]

    small = MinHash(0, 20, max_hash=50, track_abundance=False)
    large = MinHash(0, 20, max_hash=100, track_abundance=False)
    small.add_many(small_hashes)
    large.add_many(large_hashes)

    # with 70 truncated, 4 hashes are shared out of 6 distinct ones
    expected = 4. / 6.
    observed = small.similarity(large, downsample=True)
    assert observed == expected
Ejemplo n.º 5
0
def test_add_many(track_abundance):
    """Check that add_many() matches adding the same hashes via add_hash().

    The even numbers below 100 are inserted twice into each sketch, once in
    bulk and once hash by hash; both sketches must hold 50 hashes and
    compare equal.
    """
    bulk = MinHash(0, 10, track_abundance=track_abundance, max_hash=5000)
    single = MinHash(0, 10, track_abundance=track_abundance, max_hash=5000)

    # bulk insertion, done twice with the same values
    for _ in range(2):
        bulk.add_many(list(range(0, 100, 2)))

    assert len(bulk) == 50
    assert all(value % 2 == 0 for value in bulk.get_mins())

    # one-at-a-time insertion, each hash added twice
    for value in range(0, 100, 2):
        for _ in range(2):
            single.add_hash(value)

    assert len(single) == 50
    assert bulk == single
Ejemplo n.º 6
0
def test_add_many(track_abundance):
    """Compare bulk insertion (add_many) against per-hash insertion (add_hash).

    Each sketch receives every even number below 100 twice; the resulting
    sketches must each contain 50 hashes and be equal to one another.
    """
    even_hashes = range(0, 100, 2)

    via_add_many = MinHash(0, 10, track_abundance=track_abundance,
                           max_hash=5000)
    via_add_hash = MinHash(0, 10, track_abundance=track_abundance,
                           max_hash=5000)

    via_add_many.add_many(list(even_hashes))
    via_add_many.add_many(list(even_hashes))

    assert len(via_add_many) == 50
    # no stored hash may be odd
    assert not any(stored % 2 != 0 for stored in via_add_many.get_mins())

    for hashval in even_hashes:
        via_add_hash.add_hash(hashval)
        via_add_hash.add_hash(hashval)

    assert len(via_add_hash) == 50
    assert via_add_many == via_add_hash
Ejemplo n.º 7
0
def test_remove_many(track_abundance):
    """Check remove_many() by hash content and by signature md5sum.

    The sketch starts with the even numbers below 100; the multiples of 3
    below 100 are then removed, which should leave 33 hashes, none of them
    a multiple of 6.  The md5sum of the wrapping signature is pinned both
    before and after the removal.
    """
    mh = MinHash(0, 10, track_abundance=track_abundance, max_hash=5000)
    mh.add_many(list(range(0, 100, 2)))

    md5_before = signature.SourmashSignature(mh).md5sum()

    mh.remove_many(list(range(0, 100, 3)))
    md5_after = signature.SourmashSignature(mh).md5sum()

    assert md5_before == "f1cc295157374f5c07cfca5f867188a1"
    assert md5_after == "dd93fa319ef57f4a019c59ee1a8c73e2"
    assert md5_before != md5_after

    assert len(mh) == 33
    assert all(remaining % 6 != 0 for remaining in mh.get_mins())
Ejemplo n.º 8
0
def test_remove_many(track_abundance):
    """Verify that remove_many() changes both the sketch and its md5sum.

    Even numbers below 100 are added, then multiples of 3 below 100 are
    removed.  The signature md5sum is checked against fixed values before
    and after, and the remaining 33 hashes must not include any multiple
    of 6.
    """
    def current_md5(minhash):
        # wrap the sketch in a fresh signature and return its md5sum
        return signature.SourmashSignature(minhash).md5sum()

    sketch = MinHash(0, 10, track_abundance=track_abundance, max_hash=5000)

    evens = list(range(0, 100, 2))
    sketch.add_many(evens)
    original_digest = current_md5(sketch)

    multiples_of_three = list(range(0, 100, 3))
    sketch.remove_many(multiples_of_three)
    updated_digest = current_md5(sketch)

    # the digests are pinned to known values and must differ
    assert original_digest == "f1cc295157374f5c07cfca5f867188a1"
    assert updated_digest == "dd93fa319ef57f4a019c59ee1a8c73e2"
    assert original_digest != updated_digest

    assert len(sketch) == 33
    assert not any(kept % 6 == 0 for kept in sketch.get_mins())