def test_tree_save_load(n_children):
    """Round-trip an SBT through save/load and compare search results.

    A tree with ``d=n_children`` is filled with one leaf per entry of
    ``utils.SIG_FILES``.  The last leaf added is used as the query; the
    set of matches found before saving must equal the set found in the
    tree loaded back from disk.
    """
    tree = SBT(GraphFactory(31, 1e5, 4), d=n_children)

    for sig_file in utils.SIG_FILES:
        loaded = next(signature.load_signatures(utils.get_test_data(sig_file)))
        leaf = SigLeaf(os.path.basename(sig_file), loaded)
        tree.add_node(leaf)
        # the query ends up being the last leaf inserted
        to_search = leaf

    banner = '*' * 60

    print(banner)
    print("{}:".format(to_search.metadata))
    old_result = set(map(str, tree.find(search_minhashes,
                                        to_search.data, 0.1)))
    print(*old_result, sep='\n')

    with utils.TempDirectory() as location:
        tree_path = os.path.join(location, 'demo')
        tree.save(tree_path)
        tree = SBT.load(tree_path, leaf_loader=SigLeaf.load)

        print(banner)
        print("{}:".format(to_search.metadata))
        new_result = set(map(str, tree.find(search_minhashes,
                                            to_search.data, 0.1)))
        print(*new_result, sep='\n')

        assert old_result == new_result
def sbt_index(args):
    """Build an SBT from signature files and save it under a given name.

    ``args`` is the argument list for this subcommand and is parsed here
    with argparse.  Every signature matching the selected k-mer size and
    molecule type is added to the tree as a ``SigLeaf`` keyed by its md5sum.

    The process exits with status -1 when more than one k-mer size or
    molecule type has been loaded, or when no signature was loaded at all.
    """
    from sourmash_lib.sbt import SBT, GraphFactory
    from sourmash_lib.sbtmh import search_minhashes, SigLeaf

    parser = argparse.ArgumentParser()
    parser.add_argument('sbt_name', help='name to save SBT into')
    parser.add_argument('signatures', nargs='+',
                        help='signatures to load into SBT')
    parser.add_argument('-k', '--ksize', type=int, default=None)
    parser.add_argument('--traverse-directory', action='store_true')
    parser.add_argument('-x', '--bf-size', type=float, default=1e5)
    sourmash_args.add_moltype_args(parser)
    args = parser.parse_args(args)

    moltype = sourmash_args.calculate_moltype(args)

    tree = SBT(GraphFactory(1, args.bf_size, 4))

    # either walk the given locations for signatures or take them verbatim
    sources = (sourmash_args.traverse_find_sigs(args.signatures)
               if args.traverse_directory else args.signatures)
    sig_paths = list(sources)

    notify('loading {} files into SBT', len(sig_paths))

    n_loaded = 0
    seen_ksizes = set()
    seen_moltypes = set()
    for path in sig_paths:
        # load all matching signatures in this file
        for loaded in sig.load_signatures(path, select_ksize=args.ksize,
                                          select_moltype=moltype):
            seen_ksizes.add(loaded.estimator.ksize)
            seen_moltypes.add(sourmash_args.get_moltype(loaded))

            tree.add_node(SigLeaf(loaded.md5sum(), loaded))
            n_loaded += 1

        # incompatible signatures are detected after each file
        if len(seen_ksizes) > 1 or len(seen_moltypes) > 1:
            error('multiple k-mer sizes or molecule types present; fail.')
            error('specify --dna/--protein and --ksize as necessary')
            error('ksizes: {}; moltypes: {}',
                  ", ".join(map(str, seen_ksizes)), ", ".join(seen_moltypes))
            sys.exit(-1)

    # nothing matched the selection at all
    if n_loaded == 0:
        error('no signatures found to load into tree!? failing.')
        sys.exit(-1)

    notify('loaded {} sigs; saving SBT under "{}"', n_loaded, args.sbt_name)
    tree.save(args.sbt_name)
def sbt_index(client, db, cell, query, ksize, nsketch, key, file):
    '''Create a sequence Bloom tree from a cell/ database cursor.

    1. select seqs for tree
    2. assign common id (field derivative.minhash.sbt.ids)
    3. minhash seqs, name == UUID, md5? (think about SBT reuse)
    4. query a different collection/ metagenome against this

    --index {raw, minhash}
    input: all of cell or cursor

    \b
    $ zoo sbt_index --db ref --cell ref --ksize 16 --nsketch 1000 \
    reference
    Initialize SBT.
    Compute minhash signatures for selected documents.
    k-mer size: 16, sketch size: 1000
    \\ 9158 Elapsed Time: 0:01:45
    Save SBT.
    Done.

    \b
    $ sourmash sbt_search --ksize 16 reference survey.fa.sig
    # running sourmash subcommand: sbt_search
    loaded query: survey.fa... (k=16, DNA)
    0.11 0ef85591-d464-4953-915f-f673907b7e8e (Zika reference genome)

    TODO: add query
    TODO: --key arg not working?
    '''
    # Parameters, as used below:
    #   client  -- passed straight to MongoClient()
    #   db      -- database name looked up on the client
    #   cell    -- collection name looked up on the database
    #   query   -- currently unused (see the TODO in the docstring)
    #   ksize   -- k-mer size for both the SBT factory and the estimators
    #   nsketch -- value given to Estimators(n=...)
    #   key     -- path handed to deep_get() to name each signature/leaf
    #   file    -- destination passed to tree.save()
    #
    # NOTE(review): the lone "\b" lines in the docstring are kept as real
    # backspace escapes -- presumably click's "do not rewrap" markers; confirm
    # against the decorators on this command.  The progress-spinner backslash
    # is now written as "\\" because "\ " is not a valid escape sequence.
    collection = MongoClient(client)[db][cell]

    print('Initialize SBT.')
    # init SBT
    factory = GraphFactory(ksize=ksize, starting_size=1e5, n_tables=4)  # 4 .. nt?
    tree = SBT(factory, d=2)  # d .. see "n-ary " in notebook

    print('Compute minhash signatures for selected documents.')
    print('k-mer size: {}, sketch size: {}'.format(ksize, nsketch))

    bar = ProgressBar(max_value=UnknownLength)
    # enumerate from 1 so the progress bar shows the number of documents done
    for counter, doc in enumerate(collection.find(), 1):
        label = deep_get(doc, key)
        e = Estimators(ksize=ksize, n=nsketch)
        e.add_sequence(doc['sequence'], force=True)
        s = SourmashSignature(email='', estimator=e, name=label)
        tree.add_node(node=SigLeaf(metadata=label, data=s))
        bar.update(counter)

    print('\nSave SBT.')
    tree.save(file)
    print('Done.')
def sbt_index(self, args):
    """Index signature files into an SBT and save it as ``sbt_name``.

    ``args`` is the subcommand's argument list, parsed here with argparse.
    ``--protein`` selects protein signatures, otherwise DNA is used; asking
    for both raises ``Exception``.  With ``--traverse-directory`` the given
    locations are walked and every file ending in ``.sig`` is loaded.
    """
    from sourmash_lib.sbt import SBT, GraphFactory
    from sourmash_lib.sbtmh import search_minhashes, SigLeaf

    parser = argparse.ArgumentParser()
    parser.add_argument('sbt_name')
    parser.add_argument('signatures', nargs='+')
    parser.add_argument('-k', '--ksize', type=int, default=DEFAULT_K)
    parser.add_argument('--traverse-directory', action='store_true')
    parser.add_argument('-x', '--bf-size', type=float, default=1e5)
    sourmash_args.add_moltype_args(parser)
    args = parser.parse_args(args)

    # settle on exactly one molecule type
    if not args.protein:
        args.dna = True
        moltype = 'dna'
    else:
        if args.dna is True:
            raise Exception('cannot specify both --dna and --protein!')
        args.dna = False
        moltype = 'protein'

    tree = SBT(GraphFactory(1, args.bf_size, 4))

    if args.traverse_directory:
        # collect every *.sig file below each given directory
        inp_files = []
        for dirname in args.signatures:
            for root, _dirs, files in os.walk(dirname):
                inp_files.extend(os.path.join(root, name)
                                 for name in files
                                 if name.endswith('.sig'))
    else:
        inp_files = list(args.signatures)

    print('loading {} files into SBT'.format(len(inp_files)))

    n_loaded = 0
    for path in inp_files:
        matching = sig.load_signatures(path, select_ksize=args.ksize,
                                       select_moltype=moltype)
        for loaded in matching:
            tree.add_node(SigLeaf(loaded.md5sum(), loaded))
            n_loaded += 1

    print('loaded {} sigs; saving SBT under "{}".'.format(n_loaded,
                                                          args.sbt_name))
    tree.save(args.sbt_name)
def test_save_sparseness(n_children):
    """Save an SBT with ``sparseness=1.0`` and check the reloaded tree.

    After loading, none of the values in ``tree_loaded.nodes`` may be a
    ``Node`` instance, a search with the last inserted leaf must return
    the same matches as before saving, and the tree structure must hold:
    every non-root position has a ``Node`` parent and leaves have no
    children.
    """
    tree = SBT(GraphFactory(31, 1e5, 4), d=n_children)

    for sig_file in utils.SIG_FILES:
        loaded = next(signature.load_signatures(utils.get_test_data(sig_file)))
        leaf = SigLeaf(os.path.basename(sig_file), loaded)
        tree.add_node(leaf)
        to_search = leaf

    banner = '*' * 60

    print(banner)
    print("{}:".format(to_search.metadata))
    old_result = set(map(str, tree.find(search_minhashes,
                                        to_search.data, 0.1)))
    print(*old_result, sep='\n')

    with utils.TempDirectory() as location:
        tree_path = os.path.join(location, 'demo')
        tree.save(tree_path, sparseness=1.0)
        tree_loaded = SBT.load(tree_path, leaf_loader=SigLeaf.load)

        assert not any(isinstance(entry, Node)
                       for entry in tree_loaded.nodes.values())

        print(banner)
        print("{}:".format(to_search.metadata))
        new_result = set(map(str, tree_loaded.find(search_minhashes,
                                                   to_search.data, 0.1)))
        print(*new_result, sep='\n')

        assert old_result == new_result

        # iterate over a snapshot of the (position, node) pairs
        for pos, node in list(tree_loaded.nodes.items()):
            # Every parent of a node must be an internal node (and not a
            # leaf), except for node 0 (the root), whose parent is None.
            is_root = pos == 0
            if not is_root:
                assert isinstance(tree_loaded.parent(pos).node, Node)

            # Leaf nodes can't have children
            if isinstance(node, Leaf):
                assert all(child.node is None
                           for child in tree_loaded.children(pos))
def test_sbt_ipfsstorage():
    """Round-trip an SBT through ``IPFSStorage`` and compare search results.

    Skipped when ``ipfsapi`` cannot be imported; marked xfail when saving
    raises ``ipfsapi.exceptions.ConnectionError``.
    """
    ipfsapi = pytest.importorskip('ipfsapi')

    factory = GraphFactory(31, 1e5, 4)
    with utils.TempDirectory() as location:
        tree = SBT(factory)

        for sig_file in utils.SIG_FILES:
            loaded = next(
                signature.load_signatures(utils.get_test_data(sig_file)))
            leaf = SigLeaf(os.path.basename(sig_file), loaded)
            tree.add_node(leaf)
            to_search = leaf

        banner = '*' * 60

        print(banner)
        print("{}:".format(to_search.metadata))
        old_result = set(map(str, tree.find(search_minhashes,
                                            to_search.data, 0.1)))
        print(*old_result, sep='\n')

        try:
            with IPFSStorage() as storage:
                tree.save(os.path.join(location, 'tree'), storage=storage)
        except ipfsapi.exceptions.ConnectionError:
            pytest.xfail("ipfs not installed/functioning probably")

        with IPFSStorage() as storage:
            tree = SBT.load(os.path.join(location, 'tree'),
                            leaf_loader=SigLeaf.load,
                            storage=storage)

            print(banner)
            print("{}:".format(to_search.metadata))
            new_result = set(map(str, tree.find(search_minhashes,
                                                to_search.data, 0.1)))
            print(*new_result, sep='\n')

            assert old_result == new_result
def test_sbt_fsstorage():
    """Round-trip an SBT through ``FSStorage`` and compare search results.

    The tree is saved with an ``FSStorage`` rooted at ``.fstree`` inside a
    temporary directory, then loaded without passing a storage.  Besides
    equal search results, both the loaded tree's storage path and the
    ``.fstree`` directory must exist under the temporary location.
    """
    factory = GraphFactory(31, 1e5, 4)
    with utils.TempDirectory() as location:
        tree = SBT(factory)

        for sig_file in utils.SIG_FILES:
            loaded = next(
                signature.load_signatures(utils.get_test_data(sig_file)))
            leaf = SigLeaf(os.path.basename(sig_file), loaded)
            tree.add_node(leaf)
            to_search = leaf

        banner = '*' * 60

        print(banner)
        print("{}:".format(to_search.metadata))
        old_result = set(map(str, tree.find(search_minhashes,
                                            to_search.data, 0.1)))
        print(*old_result, sep='\n')

        with FSStorage(os.path.join(location, '.fstree')) as storage:
            tree.save(os.path.join(location, 'tree'), storage=storage)

        tree = SBT.load(os.path.join(location, 'tree'),
                        leaf_loader=SigLeaf.load)

        print(banner)
        print("{}:".format(to_search.metadata))
        new_result = set(map(str, tree.find(search_minhashes,
                                            to_search.data, 0.1)))
        print(*new_result, sep='\n')

        assert old_result == new_result

        for expected in (tree.storage.path, '.fstree'):
            assert os.path.exists(os.path.join(location, expected))
# Script fragment: sketch one record's sequence, add it to the SBT as a leaf,
# then save the tree as 'ref' and look up a record's description.
# NOTE(review): the statements down to `bar.update(c)` read `i`, `tree`, `bar`
# and `c`, none of which are defined in this fragment -- presumably they form
# the body of a loop over database records whose header is outside this view.
key = deep_get(i, 'metadata.alt_id.gb')
seq = i['sequence']  # db.ref.find_one()['sequence']  # 'ACTG...'
e = Estimators(ksize=KSIZE, n=N)
e.add_sequence(seq, force=True)  # e.get_hashes()
s = SourmashSignature(email='', estimator=e, name=key)
leaf = SigLeaf(metadata=key, data=s)
tree.add_node(node=leaf)
c += 1
bar.update(c)
# \ 9158 Elapsed Time: 0:01:49

# search the last fasta entry against the SBT (">0.95")
# filtered = tree.find(search_minhashes, s, 0.1)
# matches = [(str(i.metadata), i.data.similarity(s)) for i in filtered]
# [('0.95', 1.0)]  # fasta header, similarity

tree.save('ref')

# The bare string below is a no-op expression kept as an inline record of a
# shell session (sourmash sbt_search against the saved tree).
''' sourmash sbt_search -k 16 ref ~/repos/zoo/zoo/data/zika/survey.sig # running sourmash subcommand: sbt_search loaded query: survey... (k=16, DNA) 0.11 NC_012532 '''

# Look up the record with the accession reported by the search above.
record = next(db.ref.find({'metadata.alt_id.gb': 'NC_012532'}))
deep_get(record, 'metadata.description')  # 'Zika virus, complete genome'
# TODO: do the search internally, i.e. not via the command line
s = SourmashSignature(email='', estimator=e, name=key) # s.estimator.get_hashes() # s.name() leaf = SigLeaf(metadata=key, data=s) # SigLeaf(metadata, data, name=None) tree.add_node(node=leaf) # tree.print() # ignore pylint # search the last fasta entry against the SBT (">0.95") filtered = tree.find(search_minhashes, s, 0.1) matches = [(str(i.metadata), i.data.similarity(s)) for i in filtered] # [('0.95', 1.0)] # fasta header, similarity tree.save('mock_flu') '''shell head -n2 mock_flu.fa | sourmash compute -k 16 -n 200 -o virion.json - sourmash sbt_search -k 16 mock_flu.sbt.json virion.json # header: similarity, fasta header ("key" above) # 1.00 0.00 # 0.44 0.05 # 0.34 0.10 # 0.14 0.15 # 0.10 0.20 '''