Example #1
0
def test_similarity_downsample(track_abundance):
    """Comparing signatures with different max_hash needs downsample=True."""
    shared = dict(n=0, ksize=20, track_abundance=track_abundance)
    wide = sourmash_lib.MinHash(max_hash=2**63, **shared)
    narrow = sourmash_lib.MinHash(max_hash=2**2, **shared)

    for hashval in (1, 5):
        wide.add_hash(hashval)
    assert len(wide.get_mins()) == 2

    # 5 should be discarded by the narrow sketch due to its max_hash
    for hashval in (1, 5):
        narrow.add_hash(hashval)
    assert len(narrow.get_mins()) == 1

    wide_sig = SourmashSignature(wide)
    narrow_sig = SourmashSignature(narrow)

    # the max_hash mismatch is rejected unless downsampling is requested
    with pytest.raises(ValueError):
        wide_sig.similarity(narrow_sig)

    score = wide_sig.similarity(narrow_sig, downsample=True)
    assert round(score, 1) == 1.0
Example #2
0
def test_compare(track_abundance):
    """Signatures built from the same content and name compare equal.

    Both the underlying sketches and the signatures wrapping them are
    checked, so the signature objects are actually exercised.
    """
    # same content, same name -> equal
    e = sourmash_lib.MinHash(n=1, ksize=20, track_abundance=track_abundance)
    e.add("AT" * 10)
    sig1 = SourmashSignature(e, name='foo')

    f = sourmash_lib.MinHash(n=1, ksize=20, track_abundance=track_abundance)
    f.add("AT" * 10)
    sig2 = SourmashSignature(f, name='foo')

    assert e == f
    # the comparison this test is named for: the signatures themselves
    assert sig1 == sig2
Example #3
0
def test_load_one_fail_multisig(track_abundance):
    """load_one_signature raises ValueError when given two saved signatures."""
    sigs = [
        SourmashSignature(
            sourmash_lib.MinHash(n=1, ksize=20,
                                 track_abundance=track_abundance))
        for _ in range(2)
    ]
    serialized = save_signatures(sigs)

    with pytest.raises(ValueError):
        load_one_signature(serialized)
Example #4
0
def test_compare_ne(track_abundance):
    """Same content under different names yields unequal signatures."""
    def named_sig(label):
        # every signature gets an identically built sketch; only the name varies
        mh = sourmash_lib.MinHash(n=1, ksize=20,
                                  track_abundance=track_abundance)
        mh.add("AT" * 10)
        return SourmashSignature(mh, name=label)

    first = named_sig('foo')
    second = named_sig('bar')

    assert first != second
Example #5
0
def test_compare_ne2_reverse(track_abundance):
    """A named signature and a filename-only one differ, in either order."""
    def make_sketch():
        # identical content for both signatures
        mh = sourmash_lib.MinHash(n=1, ksize=20,
                                  track_abundance=track_abundance)
        mh.add("AT" * 10)
        return mh

    named = SourmashSignature(make_sketch(), name='foo')
    from_file = SourmashSignature(make_sketch(), filename='b')

    assert from_file != named
    assert named != from_file
Example #6
0
def test_save_minified(track_abundance):
    """Saved output contains no newline and both names survive reloading."""
    foo_sketch = sourmash_lib.MinHash(n=1, ksize=20,
                                      track_abundance=track_abundance)
    foo_sig = SourmashSignature(foo_sketch, name="foo")

    bar_sketch = sourmash_lib.MinHash(n=1, ksize=25,
                                      track_abundance=track_abundance)
    bar_sig = SourmashSignature(bar_sketch, name="bar baz")

    serialized = save_signatures([foo_sig, bar_sig])
    assert '\n' not in serialized
    assert len(serialized.split('\n')) == 1

    reloaded = list(load_signatures(serialized))
    assert len(reloaded) == 2

    reloaded_names = [sig.name() for sig in reloaded]
    assert 'foo' in reloaded_names
    assert 'bar baz' in reloaded_names
Example #7
0
def test_save_load_multisig(track_abundance):
    """Two signatures saved together are both present after reloading."""
    originals = [
        SourmashSignature(
            sourmash_lib.MinHash(n=1, ksize=ksize,
                                 track_abundance=track_abundance))
        for ksize in (20, 25)
    ]

    serialized = save_signatures(originals)
    reloaded = list(load_signatures(serialized))

    print(serialized)

    assert len(reloaded) == 2
    # order not guaranteed, so check membership rather than position
    for sig in originals:
        assert sig in reloaded

    first, second = originals
    assert first != second
Example #8
0
def test_save_load_multisig_json():
    """Two signatures survive a save/load cycle through the JSON helpers."""
    first_sketch = sourmash_lib.MinHash(n=1, ksize=20)
    first = SourmashSignature('*****@*****.**', first_sketch)

    second_sketch = sourmash_lib.MinHash(n=1, ksize=20)
    second = SourmashSignature('*****@*****.**', second_sketch)

    serialized = save_signatures_json([first, second])
    reloaded = list(load_signatures_json(serialized))

    print(serialized)

    assert len(reloaded) == 2
    # order not guaranteed, so check membership rather than position
    for sig in (first, second):
        assert sig in reloaded
    assert first != second
Example #9
0
def test_name_3(track_abundance):
    """name() returns the explicit name when a filename is also supplied."""
    estimator = sourmash_lib.Estimators(n=1, ksize=20,
                                        track_abundance=track_abundance)
    labels = {'name': 'foo', 'filename': 'foo.txt'}
    sig = SourmashSignature('*****@*****.**', estimator, **labels)

    assert sig.name() == 'foo'
Example #10
0
def test_load_one_succeed(track_abundance):
    """A single saved signature loads back equal via load_one_signature."""
    sketch = sourmash_lib.MinHash(n=1, ksize=20,
                                  track_abundance=track_abundance)
    original = SourmashSignature(sketch)

    serialized = save_signatures([original])
    reloaded = load_one_signature(serialized)

    assert original == reloaded
Example #11
0
def test_hashable(track_abundance):
    """Signatures can be placed in a set (i.e. they are hashable)."""
    # check: can we use signatures as keys in dictionaries and sets?
    sketch = sourmash_lib.MinHash(n=1, ksize=20,
                                  track_abundance=track_abundance)
    sketch.add("AT" * 10)
    sig = SourmashSignature(sketch)

    # building the set hashes the signature; an unhashable object raises here
    members = {sig}
Example #12
0
def sbt_index(client, db, cell, query, ksize, nsketch, key, file):
    '''Create a sequence Bloom tree from a cell/ database cursor.
    1. select seqs for tree
    2. assign common id (field derivative.minhash.sbt.ids)
    3. minhash seqs, name == UUID, md5? (think about SBT reuse)
    4. query a different collection/ metagenome against this

    --index {raw, minhash}
    input: all of cell or cursor

    \b
    $ zoo sbt_index --db ref --cell ref --ksize 16 --nsketch 1000 \
    reference
    Initialize SBT.
    Compute minhash signatures for selected documents.
    k-mer size: 16, sketch size: 1000
    \\ 9158 Elapsed Time: 0:01:45
    Save SBT.
    Done.

    \b
    $ sourmash sbt_search --ksize 16 reference survey.fa.sig
    # running sourmash subcommand: sbt_search
    loaded query: survey.fa... (k=16, DNA)
    0.11 0ef85591-d464-4953-915f-f673907b7e8e (Zika reference genome)

    TODO: add query
    TODO: --key arg not working?
    '''
    # Parameter notes are kept out of the docstring because its ``\b`` markers
    # suggest it doubles as CLI help text (presumably click -- confirm), and
    # that text should stay as it is:
    #   client, db, cell -- MongoClient argument, then database and
    #                       collection used to index into the client
    #   query            -- currently unused (see TODO in the docstring)
    #   ksize            -- k-mer size for both the graph factory and sketches
    #   nsketch          -- sketch size, passed to Estimators as ``n``
    #   key              -- passed to deep_get() to label each document
    #   file             -- passed to tree.save()
    collection = MongoClient(client)[db][cell]

    print('Initialize SBT.')
    # init SBT
    factory = GraphFactory(ksize=ksize, starting_size=1e5, n_tables=4)
    # 4 .. nt?
    tree = SBT(factory, d=2)  # d .. see "n-ary " in notebook

    print('Compute minhash signatures for selected documents.')
    print('{}{}{}{}'.format(
        'k-mer size: ', ksize, ', sketch size: ', nsketch
        ))
    # total document count is not known up front, so the bar only counts up
    bar = ProgressBar(max_value=UnknownLength)
    for counter, doc in enumerate(collection.find(), start=1):
        estimator = Estimators(ksize=ksize, n=nsketch)
        estimator.add_sequence(doc['sequence'], force=True)
        # the same label names the signature and tags the tree leaf
        label = deep_get(doc, key)
        sig = SourmashSignature(email='', estimator=estimator, name=label)
        tree.add_node(node=SigLeaf(metadata=label, data=sig))
        bar.update(counter)
    print('\nSave SBT.')
    tree.save(file)
    print('Done.')
Example #13
0
def test_save_load_multisig(track_abundance):
    """Two Estimators-backed signatures are both present after reloading."""
    def make_estimator():
        return sourmash_lib.Estimators(n=1, ksize=20,
                                       track_abundance=track_abundance)

    first = SourmashSignature('*****@*****.**', make_estimator())
    second = SourmashSignature('*****@*****.**', make_estimator())

    serialized = save_signatures([first, second])
    reloaded = list(load_signatures(serialized))

    print(serialized)

    assert len(reloaded) == 2
    # order not guaranteed, so check membership rather than position
    for sig in (first, second):
        assert sig in reloaded
    assert first != second
Example #14
0
def test_roundtrip(track_abundance):
    """A saved and reloaded signature has similarity 1.0 with the original."""
    estimator = sourmash_lib.Estimators(n=1, ksize=20,
                                        track_abundance=track_abundance)
    estimator.add("AT" * 10)
    original = SourmashSignature('*****@*****.**', estimator)

    serialized = save_signatures([original])
    reloaded = list(load_signatures(serialized))[0]
    # looked up but not otherwise used
    reloaded_estimator = reloaded.estimator

    for left, right in ((original, reloaded), (reloaded, original)):
        assert left.similarity(right) == 1.0
Example #15
0
def test_roundtrip(track_abundance):
    """A saved and reloaded signature has similarity 1.0 with the original."""
    sketch = sourmash_lib.MinHash(n=1, ksize=20,
                                  track_abundance=track_abundance)
    sketch.add("AT" * 10)
    original = SourmashSignature(sketch)

    serialized = save_signatures([original])
    reloaded = list(load_signatures(serialized))[0]
    # looked up but not otherwise used
    reloaded_sketch = reloaded.minhash

    for left, right in ((original, reloaded), (reloaded, original)):
        assert left.similarity(right) == 1.0
Example #16
0
def test_roundtrip_empty(track_abundance):
    """An empty sketch round-trips and reports similarity 0 both ways."""
    # edge case, but: empty minhash? :)
    sketch = sourmash_lib.MinHash(n=1, ksize=20,
                                  track_abundance=track_abundance)
    original = SourmashSignature(sketch)

    serialized = save_signatures([original])
    reloaded = list(load_signatures(serialized))[0]
    # looked up but not otherwise used
    reloaded_sketch = reloaded.minhash

    for left, right in ((original, reloaded), (reloaded, original)):
        assert left.similarity(right) == 0
Example #17
0
def test_roundtrip_empty(track_abundance):
    """An empty estimator round-trips and reports similarity 0 both ways."""
    # edge case, but: empty estimator? :)
    estimator = sourmash_lib.Estimators(n=1, ksize=20,
                                        track_abundance=track_abundance)
    original = SourmashSignature('*****@*****.**', estimator)

    serialized = save_signatures([original])
    reloaded = list(load_signatures(serialized))[0]
    # looked up but not otherwise used
    reloaded_estimator = reloaded.estimator

    for left, right in ((original, reloaded), (reloaded, original)):
        assert left.similarity(right) == 0
Example #18
0
def test_str(track_abundance):
    """str() and repr() agree, with and without a name set on the signature."""
    # signatures should be printable
    sketch = sourmash_lib.MinHash(n=1, ksize=20,
                                  track_abundance=track_abundance)
    sketch.add("AT" * 10)
    sig = SourmashSignature(sketch)

    print(sig)
    unnamed_text = 'SourmashSignature(59502a74)'
    assert str(sig) == unnamed_text
    assert repr(sig) == unnamed_text

    # once a name is stored, it is shown ahead of the short hash
    sig.d['name'] = 'fizbar'
    named_text = 'SourmashSignature(\'fizbar\', 59502a74)'
    assert str(sig) == named_text
    assert repr(sig) == named_text
Example #19
0
def test_roundtrip_max_hash(track_abundance):
    """max_hash is preserved through a save/load cycle."""
    sketch = sourmash_lib.MinHash(
        n=0, ksize=20, track_abundance=track_abundance, max_hash=10)
    sketch.add_hash(5)
    original = SourmashSignature(sketch)

    serialized = save_signatures([original])
    reloaded = list(load_signatures(serialized))[0]
    reloaded_sketch = reloaded.minhash

    assert sketch.max_hash == reloaded_sketch.max_hash

    for left, right in ((original, reloaded), (reloaded, original)):
        assert left.similarity(right) == 1.0
Example #20
0
def test_roundtrip_seed(track_abundance):
    """The seed is preserved through a save/load cycle."""
    estimator = sourmash_lib.Estimators(
        n=1, ksize=20, track_abundance=track_abundance, seed=10)
    estimator.mh.add_hash(5)
    original = SourmashSignature('*****@*****.**', estimator)

    serialized = save_signatures([original])
    reloaded = list(load_signatures(serialized))[0]
    reloaded_estimator = reloaded.estimator

    assert estimator.seed == reloaded_estimator.seed

    for left, right in ((original, reloaded), (reloaded, original)):
        assert left.similarity(right) == 1.0
Example #21
0
def minhash(client, db, cell, query, ksize, nsketch, key, file):
    '''Minhash a cell/ database cursor.
    just plain old sigs for collection
    '''
    # Docstring text is left as-is in case it is surfaced as CLI help.
    # ``query`` is accepted but not used anywhere in this body.
    collection = MongoClient(client)[db][cell]

    # total document count is not known up front, so the bar only counts up
    bar = ProgressBar(max_value=UnknownLength)
    sigs = []
    print('Compute minhash signatures for selected documents.')
    print('{}{}{}{}'.format(
        'k-mer size: ', ksize, ', sketch size: ', nsketch
        ))
    for counter, doc in enumerate(collection.find(), start=1):
        estimator = Estimators(ksize=ksize, n=nsketch)
        estimator.add_sequence(doc['sequence'], force=True)
        sigs.append(
            SourmashSignature(email='', estimator=estimator,
                              name=deep_get(doc, key)))
        bar.update(counter)

    print('\nSave signatures.')
    signature.save_signatures(sigs, fp=file)
    print('Done.')
Example #22
0
def test_md5(track_abundance):
    """md5sum() of a signature holding the single hash 5 matches a pinned value."""
    estimator = sourmash_lib.Estimators(n=1, ksize=20,
                                        track_abundance=track_abundance)
    estimator.mh.add_hash(5)
    sig = SourmashSignature('*****@*****.**', estimator)

    print(sig._save())
    expected_digest = 'eae27d77ca20db309e056e3d2dcd7d69'
    # on failure the actual digest is reported as the assertion message
    assert sig.md5sum() == expected_digest, sig.md5sum()
Example #23
0
def test_md5(track_abundance):
    """md5sum() of a signature holding the single hash 5 matches a pinned value."""
    sketch = sourmash_lib.MinHash(n=1, ksize=20,
                                  track_abundance=track_abundance)
    sketch.add_hash(5)
    sig = SourmashSignature(sketch)

    print(sig._save())
    expected_digest = 'eae27d77ca20db309e056e3d2dcd7d69'
    # on failure the actual digest is reported as the assertion message
    assert sig.md5sum() == expected_digest, sig.md5sum()
Example #24
0
def test_name_4(track_abundance):
    """With no name or filename given, name() is the first 8 md5sum characters."""
    estimator = sourmash_lib.Estimators(n=1, ksize=20,
                                        track_abundance=track_abundance)
    sig = SourmashSignature('*****@*****.**', estimator)

    displayed = sig.name()
    assert displayed == sig.md5sum()[:8]
Example #25
0
def test_name_3(track_abundance):
    """name() returns the explicit name when a filename is also supplied."""
    sketch = sourmash_lib.MinHash(n=1, ksize=20,
                                  track_abundance=track_abundance)
    labels = {'name': 'foo', 'filename': 'foo.txt'}
    sig = SourmashSignature(sketch, **labels)

    assert sig.name() == 'foo'
Example #26
0
def test_name_4(track_abundance):
    """With no name or filename given, name() is the first 8 md5sum characters."""
    sketch = sourmash_lib.MinHash(n=1, ksize=20,
                                  track_abundance=track_abundance)
    sig = SourmashSignature(sketch)

    displayed = sig.name()
    assert displayed == sig.md5sum()[:8]
Example #27
0
# Sketch size; passed to every Estimators instance below as ``n``.
N = 1000

# init SBT
# NOTE(review): KSIZE is not defined in this span -- presumably set earlier
# in the file; confirm.
factory = GraphFactory(ksize=KSIZE, starting_size=1e5, n_tables=4)
# 4 .. nt?
tree = SBT(factory, d=2)  # d .. see "n-ary " in notebook

# The number of documents is not known up front, hence UnknownLength; the bar
# is advanced with the running count ``c``.
bar = progressbar.ProgressBar(max_value=progressbar.UnknownLength)
cursor = db.ref.find()
c = 0  # documents added to the tree so far
for i in cursor:
    # The value found under 'metadata.alt_id.gb' is used both as the
    # signature name and as the leaf metadata.
    key = deep_get(i, 'metadata.alt_id.gb')
    seq = i['sequence']  # db.ref.find_one()['sequence']  # 'ACTG...'
    e = Estimators(ksize=KSIZE, n=N)
    e.add_sequence(seq, force=True)  # e.get_hashes()
    s = SourmashSignature(email='', estimator=e, name=key)

    # One leaf per document, holding that document's signature.
    leaf = SigLeaf(metadata=key, data=s)
    tree.add_node(node=leaf)
    c += 1
    bar.update(c)
# \ 9158 Elapsed Time: 0:01:49

# search the last fasta entry against the SBT (">0.95")
# filtered = tree.find(search_minhashes, s, 0.1)
# matches = [(str(i.metadata), i.data.similarity(s)) for i in filtered]
# [('0.95', 1.0)]  # fasta header, similarity

# Save the populated tree under the name 'ref'.
tree.save('ref')
'''
sourmash sbt_search -k 16 ref ~/repos/zoo/zoo/data/zika/survey.sig