def sbt_combine(args):
    """Merge several saved SBTs into one new SBT (CLI entry point)."""
    from sourmash_lib.sbt import SBT, GraphFactory
    from sourmash_lib.sbtmh import SigLeaf

    parser = argparse.ArgumentParser()
    parser.add_argument('sbt_name', help='name to save SBT into')
    parser.add_argument('sbts', nargs='+',
                        help='SBTs to combine to a new SBT')
    parser.add_argument('-x', '--bf-size', type=float, default=1e5)
    sourmash_args.add_moltype_args(parser)

    args = parser.parse_args(args)
    moltype = sourmash_args.calculate_moltype(args)

    inp_files = list(args.sbts)
    notify('combining {} SBTs', len(inp_files))

    # seed the result with the first tree, then fold the rest into it.
    tree = SBT.load(inp_files.pop(0), leaf_loader=SigLeaf.load)
    for f in inp_files:
        new_tree = SBT.load(f, leaf_loader=SigLeaf.load)
        # TODO: check if parameters are the same for both trees!
        tree.combine(new_tree)

    notify('saving SBT under "{}".', args.sbt_name)
    tree.save(args.sbt_name)
def test_binary_nary_tree():
    """Search results must be identical regardless of tree arity (d)."""
    factory = GraphFactory(31, 1e5, 4)
    trees = {}
    trees[2] = SBT(factory)
    trees[5] = SBT(factory, d=5)
    trees[10] = SBT(factory, d=10)

    n_leaves = 0
    for f in utils.SIG_FILES:
        sig = next(signature.load_signatures(utils.get_test_data(f)))
        leaf = SigLeaf(os.path.basename(f), sig)
        for tree in trees.values():
            tree.add_node(leaf)
        to_search = leaf
        n_leaves += 1

    # every tree must hold the same number of leaves.
    assert all([len(t.leaves()) == n_leaves for t in trees.values()])

    results = {}
    print('*' * 60)
    print("{}:".format(to_search.metadata))
    for d, tree in trees.items():
        results[d] = {str(s)
                      for s in tree.find(search_minhashes,
                                         to_search.data, 0.1)}
    print(*results[2], sep='\n')

    assert results[2] == results[5]
    assert results[5] == results[10]
def test_tree_save_load(n_children):
    """A save/load round-trip must preserve search results."""
    factory = GraphFactory(31, 1e5, 4)
    tree = SBT(factory, d=n_children)

    for f in utils.SIG_FILES:
        sig = next(signature.load_signatures(utils.get_test_data(f)))
        leaf = SigLeaf(os.path.basename(f), sig)
        tree.add_node(leaf)
        to_search = leaf

    print('*' * 60)
    print("{}:".format(to_search.metadata))
    old_result = {str(s) for s in tree.find(search_minhashes,
                                            to_search.data, 0.1)}
    print(*old_result, sep='\n')

    with utils.TempDirectory() as location:
        tree.save(os.path.join(location, 'demo'))
        tree = SBT.load(os.path.join(location, 'demo'),
                        leaf_loader=SigLeaf.load)

        print('*' * 60)
        print("{}:".format(to_search.metadata))
        new_result = {str(s) for s in tree.find(search_minhashes,
                                                to_search.data, 0.1)}
        print(*new_result, sep='\n')

        assert old_result == new_result
def test_sbt_fsstorage():
    """Saving through FSStorage and reloading must preserve results."""
    factory = GraphFactory(31, 1e5, 4)
    with utils.TempDirectory() as location:
        tree = SBT(factory)

        for f in utils.SIG_FILES:
            sig = next(signature.load_signatures(utils.get_test_data(f)))
            leaf = SigLeaf(os.path.basename(f), sig)
            tree.add_node(leaf)
            to_search = leaf

        print('*' * 60)
        print("{}:".format(to_search.metadata))
        old_result = {str(s) for s in tree.find(search_minhashes,
                                                to_search.data, 0.1)}
        print(*old_result, sep='\n')

        with FSStorage(os.path.join(location, '.fstree')) as storage:
            tree.save(os.path.join(location, 'tree'), storage=storage)

        tree = SBT.load(os.path.join(location, 'tree'),
                        leaf_loader=SigLeaf.load)

        print('*' * 60)
        print("{}:".format(to_search.metadata))
        new_result = {str(s) for s in tree.find(search_minhashes,
                                                to_search.data, 0.1)}
        print(*new_result, sep='\n')
        assert old_result == new_result

        # the on-disk layout must exist where the storage put it.
        assert os.path.exists(os.path.join(location, tree.storage.path))
        assert os.path.exists(os.path.join(location, '.fstree'))
def load_sbts_and_sigs(filenames, query_ksize, query_moltype):
    """Load each filename as an SBT, falling back to a .sig file.

    Returns a list of (database, filename, is_sbt) triples; exits with
    an error on ksize mismatch or missing file.
    """
    databases = []
    for sbt_or_sigfile in filenames:
        try:
            tree = SBT.load(sbt_or_sigfile, leaf_loader=SigLeaf.load)
            ksize = get_ksize(tree)
            if ksize != query_ksize:
                error("ksize on tree '{}' is {};", sbt_or_sigfile, ksize)
                error('this is different from query ksize of {}.',
                      query_ksize)
                sys.exit(-1)

            databases.append((tree, sbt_or_sigfile, True))
            notify('loaded SBT {}', sbt_or_sigfile)
        except (ValueError, EnvironmentError):
            # not an SBT - try as a .sig
            try:
                siglist = sig.load_signatures(sbt_or_sigfile,
                                              select_ksize=query_ksize,
                                              select_moltype=query_moltype)
                siglist = list(siglist)
                databases.append((list(siglist), sbt_or_sigfile, False))
                notify('loaded {} signatures from {}', len(siglist),
                       sbt_or_sigfile)
            except EnvironmentError:
                error("file '{}' does not exist", sbt_or_sigfile)
                sys.exit(-1)

    return databases
def test_tree_repair():
    """A repaired tree must search identically to the current-format tree."""
    tree_repair = SBT.load(utils.get_test_data('leaves.sbt.json'),
                           leaf_loader=SigLeaf.load)
    tree_cur = SBT.load(utils.get_test_data('v3.sbt.json'),
                        leaf_loader=SigLeaf.load)

    testdata1 = utils.get_test_data(utils.SIG_FILES[0])
    to_search = next(signature.load_signatures(testdata1))

    results_repair = {str(s) for s in tree_repair.find(search_minhashes,
                                                       to_search, 0.1)}
    results_cur = {str(s) for s in tree_cur.find(search_minhashes,
                                                 to_search, 0.1)}

    assert results_repair == results_cur
    assert len(results_repair) == 4
def load_sbts_and_sigs(filenames, query_ksize, query_moltype, traverse=False):
    """Load search databases: each filename may be an SBT, a .sig file,
    or (with traverse=True) a directory to scan for .sig files.

    Returns a list of (database, filename, is_sbt) triples; exits on a
    fatal load error.
    """
    n_signatures = 0
    n_databases = 0
    databases = []
    for sbt_or_sigfile in filenames:
        if traverse and os.path.isdir(sbt_or_sigfile):
            for sigfile in traverse_find_sigs([sbt_or_sigfile]):
                try:
                    siglist = sig.load_signatures(sigfile,
                                                  ksize=query_ksize,
                                                  select_moltype=query_moltype)
                    siglist = list(siglist)
                    # NOTE(review): this records the directory name, not
                    # `sigfile` — confirm the individual file wasn't intended.
                    databases.append((list(siglist), sbt_or_sigfile, False))
                    notify('loaded {} signatures from {}', len(siglist),
                           sigfile, end='\r')
                    n_signatures += len(siglist)
                except Exception:
                    # ignore errors with traverse.  BUGFIX: was a bare
                    # 'except:', which also swallowed KeyboardInterrupt
                    # and SystemExit.
                    continue
            continue

        try:
            tree = SBT.load(sbt_or_sigfile, leaf_loader=SigLeaf.load)
            ksize = get_ksize(tree)
            if ksize != query_ksize:
                error("ksize on tree '{}' is {};", sbt_or_sigfile, ksize)
                error('this is different from query ksize of {}.',
                      query_ksize)
                sys.exit(-1)

            databases.append((tree, sbt_or_sigfile, True))
            notify('loaded SBT {}', sbt_or_sigfile, end='\r')
            n_databases += 1
        except (ValueError, EnvironmentError):
            # not an SBT - try as a .sig
            try:
                siglist = sig.load_signatures(sbt_or_sigfile,
                                              ksize=query_ksize,
                                              select_moltype=query_moltype)
                siglist = list(siglist)
                databases.append((list(siglist), sbt_or_sigfile, False))
                notify('loaded {} signatures from {}', len(siglist),
                       sbt_or_sigfile, end='\r')
                n_signatures += len(siglist)
            except EnvironmentError:
                error("\nfile '{}' does not exist", sbt_or_sigfile)
                sys.exit(-1)

    notify(' ' * 79, end='\r')
    notify('loaded {} signatures and {} databases total.'.format(
        n_signatures, n_databases))

    if databases:
        print('')

    return databases
def test_tree_v1_load():
    """v1 and v2 on-disk formats must yield identical search results."""
    tree_v1 = SBT.load(utils.get_test_data('v1.sbt.json'),
                       leaf_loader=SigLeaf.load)
    tree_v2 = SBT.load(utils.get_test_data('v2.sbt.json'),
                       leaf_loader=SigLeaf.load)

    testdata1 = utils.get_test_data(utils.SIG_FILES[0])
    to_search = next(signature.load_signatures(testdata1))

    results_v1 = {str(s) for s in tree_v1.find(search_minhashes,
                                               to_search, 0.1)}
    results_v2 = {str(s) for s in tree_v2.find(search_minhashes,
                                               to_search, 0.1)}

    assert results_v1 == results_v2
    assert len(results_v1) == 4
def sbt_index(args):
    """Build an SBT from a set of signature files (CLI entry point)."""
    from sourmash_lib.sbt import SBT, GraphFactory
    from sourmash_lib.sbtmh import search_minhashes, SigLeaf

    parser = argparse.ArgumentParser()
    parser.add_argument('sbt_name', help='name to save SBT into')
    parser.add_argument('signatures', nargs='+',
                        help='signatures to load into SBT')
    parser.add_argument('-k', '--ksize', type=int, default=None)
    parser.add_argument('--traverse-directory', action='store_true')
    parser.add_argument('-x', '--bf-size', type=float, default=1e5)
    sourmash_args.add_moltype_args(parser)

    args = parser.parse_args(args)
    moltype = sourmash_args.calculate_moltype(args)

    factory = GraphFactory(1, args.bf_size, 4)
    tree = SBT(factory)

    if args.traverse_directory:
        inp_files = list(sourmash_args.traverse_find_sigs(args.signatures))
    else:
        inp_files = list(args.signatures)

    notify('loading {} files into SBT', len(inp_files))

    n = 0
    ksizes = set()
    moltypes = set()
    for f in inp_files:
        siglist = sig.load_signatures(f, select_ksize=args.ksize,
                                      select_moltype=moltype)

        # load all matching signatures in this file
        for ss in siglist:
            ksizes.add(ss.estimator.ksize)
            moltypes.add(sourmash_args.get_moltype(ss))

            leaf = SigLeaf(ss.md5sum(), ss)
            tree.add_node(leaf)
            n += 1

    # check to make sure we aren't loading incompatible signatures
    if len(ksizes) > 1 or len(moltypes) > 1:
        error('multiple k-mer sizes or molecule types present; fail.')
        error('specify --dna/--protein and --ksize as necessary')
        error('ksizes: {}; moltypes: {}',
              ", ".join(map(str, ksizes)), ", ".join(moltypes))
        sys.exit(-1)

    # did we load any!?
    if n == 0:
        error('no signatures found to load into tree!? failing.')
        sys.exit(-1)

    notify('loaded {} sigs; saving SBT under "{}"', n, args.sbt_name)
    tree.save(args.sbt_name)
def test_sbt_ipfsstorage():
    """Saving through IPFSStorage and reloading must preserve results.

    Skips (xfail) when no IPFS daemon is reachable.
    """
    ipfsapi = pytest.importorskip('ipfsapi')

    factory = GraphFactory(31, 1e5, 4)
    with utils.TempDirectory() as location:
        tree = SBT(factory)

        for f in utils.SIG_FILES:
            sig = next(signature.load_signatures(utils.get_test_data(f)))
            leaf = SigLeaf(os.path.basename(f), sig)
            tree.add_node(leaf)
            to_search = leaf

        print('*' * 60)
        print("{}:".format(to_search.metadata))
        old_result = {str(s) for s in tree.find(search_minhashes,
                                                to_search.data, 0.1)}
        print(*old_result, sep='\n')

        try:
            with IPFSStorage() as storage:
                tree.save(os.path.join(location, 'tree'), storage=storage)
        except ipfsapi.exceptions.ConnectionError:
            # BUGFIX: message typo — was "functioning probably".
            pytest.xfail("ipfs not installed/functioning properly")

        with IPFSStorage() as storage:
            tree = SBT.load(os.path.join(location, 'tree'),
                            leaf_loader=SigLeaf.load,
                            storage=storage)

            print('*' * 60)
            print("{}:".format(to_search.metadata))
            new_result = {str(s) for s in tree.find(search_minhashes,
                                                    to_search.data, 0.1)}
            print(*new_result, sep='\n')
            assert old_result == new_result
def sbt_index(client, db, cell, query, ksize, nsketch, key, file):
    '''Create a sequence Bloom tree from a cell/ database cursor.

    1. select seqs for tree
    2. assign common id (field derivative.minhash.sbt.ids)
    3. minhash seqs, name == UUID, md5? (think about SBT reuse)
    4. query a different collection/ metagenome against this

    --index {raw, minhash}
    input: all of cell or cursor

    \b
    $ zoo sbt_index --db ref --cell ref --ksize 16 --nsketch 1000 \
    reference
    Initialize SBT.
    Compute minhash signatures for selected documents.
    k-mer size: 16, sketch size: 1000
    \ 9158 Elapsed Time: 0:01:45
    Save SBT.
    Done.

    \b
    $ sourmash sbt_search --ksize 16 reference survey.fa.sig
    # running sourmash subcommand: sbt_search
    loaded query: survey.fa... (k=16, DNA)
    0.11 0ef85591-d464-4953-915f-f673907b7e8e (Zika reference genome)

    TODO: add query
    TODO: --key arg not working?
    '''
    c = MongoClient(client)[db][cell]

    print('Initialize SBT.')
    # init SBT
    factory = GraphFactory(ksize=ksize, starting_size=1e5, n_tables=4)  # 4 .. nt?
    tree = SBT(factory, d=2)  # d .. see "n-ary " in notebook

    print('Compute minhash signatures for selected documents.')
    print('{}{}{}{}'.format(
        'k-mer size: ', ksize, ', sketch size: ', nsketch))

    bar = ProgressBar(max_value=UnknownLength)
    counter = 0
    for d in c.find():
        counter += 1
        # one signature per document, named via the --key field.
        e = Estimators(ksize=ksize, n=nsketch)
        e.add_sequence(d['sequence'], force=True)
        s = SourmashSignature(email='', estimator=e, name=deep_get(d, key))
        leaf = SigLeaf(metadata=deep_get(d, key), data=s)
        tree.add_node(node=leaf)
        bar.update(counter)

    print('\nSave SBT.')
    tree.save(file)
    print('Done.')
def test_save_sparseness(n_children):
    """Saving with sparseness=1.0 must drop internal nodes yet keep
    search results and a structurally valid tree."""
    factory = GraphFactory(31, 1e5, 4)
    tree = SBT(factory, d=n_children)

    for f in utils.SIG_FILES:
        sig = next(signature.load_signatures(utils.get_test_data(f)))
        leaf = SigLeaf(os.path.basename(f), sig)
        tree.add_node(leaf)
        to_search = leaf

    print('*' * 60)
    print("{}:".format(to_search.metadata))
    old_result = {str(s) for s in tree.find(search_minhashes,
                                            to_search.data, 0.1)}
    print(*old_result, sep='\n')

    with utils.TempDirectory() as location:
        tree.save(os.path.join(location, 'demo'), sparseness=1.0)
        tree_loaded = SBT.load(os.path.join(location, 'demo'),
                               leaf_loader=SigLeaf.load)
        # full sparseness: no internal Node objects should be materialized.
        assert all(not isinstance(n, Node)
                   for n in tree_loaded.nodes.values())

        print('*' * 60)
        print("{}:".format(to_search.metadata))
        new_result = {str(s) for s in tree_loaded.find(search_minhashes,
                                                       to_search.data, 0.1)}
        print(*new_result, sep='\n')
        assert old_result == new_result

        for pos, node in list(tree_loaded.nodes.items()):
            # Every parent of a node must be an internal node (and not a leaf),
            # except for node 0 (the root), whose parent is None.
            if pos != 0:
                assert isinstance(tree_loaded.parent(pos).node, Node)

            # Leaf nodes can't have children
            if isinstance(node, Leaf):
                assert all(c.node is None
                           for c in tree_loaded.children(pos))
def sbt_index(self, args):
    """Build an SBT from signature files (method-style CLI entry point)."""
    from sourmash_lib.sbt import SBT, GraphFactory
    from sourmash_lib.sbtmh import search_minhashes, SigLeaf

    parser = argparse.ArgumentParser()
    parser.add_argument('sbt_name')
    parser.add_argument('signatures', nargs='+')
    parser.add_argument('-k', '--ksize', type=int, default=DEFAULT_K)
    parser.add_argument('--traverse-directory', action='store_true')
    parser.add_argument('-x', '--bf-size', type=float, default=1e5)
    sourmash_args.add_moltype_args(parser)
    args = parser.parse_args(args)

    # resolve the molecule type; --dna and --protein are mutually exclusive.
    if args.protein:
        if args.dna is True:
            raise Exception('cannot specify both --dna and --protein!')
        args.dna = False
        moltype = 'protein'
    else:
        args.dna = True
        moltype = 'dna'

    factory = GraphFactory(1, args.bf_size, 4)
    tree = SBT(factory)

    if args.traverse_directory:
        # recursively collect every .sig file beneath each directory.
        inp_files = [os.path.join(root, name)
                     for dirname in args.signatures
                     for root, dirs, files in os.walk(dirname)
                     for name in files
                     if name.endswith('.sig')]
    else:
        inp_files = list(args.signatures)

    print('loading {} files into SBT'.format(len(inp_files)))

    n = 0
    for f in inp_files:
        s = sig.load_signatures(f, select_ksize=args.ksize,
                                select_moltype=moltype)
        for ss in s:
            leaf = SigLeaf(ss.md5sum(), ss)
            tree.add_node(leaf)
            n += 1

    print('loaded {} sigs; saving SBT under "{}".'.format(n, args.sbt_name))
    tree.save(args.sbt_name)
def test_tree_repair_add_node():
    """Adding leaves to a repaired tree must keep the structure valid."""
    tree_repair = SBT.load(utils.get_test_data('leaves.sbt.json'),
                           leaf_loader=SigLeaf.load)

    for f in utils.SIG_FILES:
        sig = next(signature.load_signatures(utils.get_test_data(f)))
        leaf = SigLeaf(os.path.basename(f), sig)
        tree_repair.add_node(leaf)

    for pos, node in list(tree_repair.nodes.items()):
        # Every parent of a node must be an internal node (and not a leaf),
        # except for node 0 (the root), whose parent is None.
        if pos != 0:
            assert isinstance(tree_repair.parent(pos).node, Node)

        # Leaf nodes can't have children
        if isinstance(node, Leaf):
            assert all(c.node is None for c in tree_repair.children(pos))
def sbt_search(args):
    """Search an SBT for signatures similar to a query (CLI entry point)."""
    from sourmash_lib.sbt import SBT, GraphFactory
    from sourmash_lib.sbtmh import search_minhashes, SigLeaf
    from sourmash_lib.sbtmh import SearchMinHashesFindBest

    parser = argparse.ArgumentParser()
    parser.add_argument('sbt_name', help='name of SBT to load')
    parser.add_argument('query', help='signature to query')
    parser.add_argument('-k', '--ksize', type=int, default=DEFAULT_K)
    parser.add_argument('--threshold', default=0.08, type=float)
    parser.add_argument('--save-matches', type=argparse.FileType('wt'))
    parser.add_argument('--best-only', action='store_true')
    sourmash_args.add_moltype_args(parser)

    args = parser.parse_args(args)
    moltype = sourmash_args.calculate_moltype(args)

    # --best-only swaps in a search function that tracks the top hit.
    search_fn = (SearchMinHashesFindBest().search if args.best_only
                 else search_minhashes)

    tree = SBT.load(args.sbt_name, leaf_loader=SigLeaf.load)
    query = sourmash_args.load_query_signature(args.query,
                                               select_ksize=args.ksize,
                                               select_moltype=moltype)
    query_moltype = sourmash_args.get_moltype(query)
    query_ksize = query.estimator.ksize
    notify('loaded query: {}... (k={}, {})', query.name()[:30],
           query_ksize, query_moltype)

    # collect (similarity, match) pairs, best first.
    results = [(query.similarity(leaf.data), leaf.data)
               for leaf in tree.find(search_fn, query, args.threshold)]
    results.sort(key=lambda x: -x[0])

    for similarity, match in results:
        print('{:.2f} {}'.format(similarity, match.name()))

    if args.save_matches:
        outname = args.save_matches.name
        notify('saving all matches to "{}"', outname)
        sig.save_signatures([m for (sim, m) in results], args.save_matches)
def test_search_minhashes():
    """Every hit from search_minhashes must meet the similarity threshold."""
    factory = GraphFactory(31, 1e5, 4)
    tree = SBT(factory)

    # Removed unused local `n_leaves` (was initialized but never updated).
    for f in utils.SIG_FILES:
        sig = next(signature.load_signatures(utils.get_test_data(f)))
        leaf = SigLeaf(os.path.basename(f), sig)
        tree.add_node(leaf)

    to_search = next(iter(tree.leaves()))

    # this fails if 'search_minhashes' is calc containment and not similarity.
    results = tree.find(search_minhashes, to_search.data, 0.08)
    for leaf in results:
        assert to_search.data.similarity(leaf.data) >= 0.08

    print(results)
def test_simple_index(n_children):
    """SBT search must agree with a LinearIndex over the same leaves."""
    factory = GraphFactory(5, 100, 3)
    root = SBT(factory, d=n_children)

    # five small Bloom-filter leaves with overlapping 5-mers.
    leaf_kmers = [
        ("a", ("AAAAA", "AAAAT", "AAAAC")),
        ("b", ("AAAAA", "AAAAT", "AAAAG")),
        ("c", ("AAAAA", "AAAAT", "CAAAA")),
        ("d", ("AAAAA", "CAAAA", "GAAAA")),
        ("e", ("AAAAA", "AAAAT", "GAAAA")),
    ]
    leaves = []
    for name, kms in leaf_kmers:
        leaf = Leaf(name, factory())
        for km in kms:
            leaf.data.count(km)
        leaves.append(leaf)
        root.add_node(leaf)

    def search_kmer(obj, seq):
        return obj.data.get(seq)

    kmers = ["AAAAA", "AAAAT", "AAAAG", "CAAAA", "GAAAA"]

    linear = LinearIndex()
    for leaf in leaves:
        linear.insert(leaf)

    for kmer in kmers:
        assert set(root.find(search_kmer, kmer)) == \
            set(linear.find(search_kmer, kmer))

    print("-----")
    print([x.metadata for x in root.find(search_kmer, "AAAAA")])
    print([x.metadata for x in root.find(search_kmer, "AAAAT")])
    print([x.metadata for x in root.find(search_kmer, "AAAAG")])
    print([x.metadata for x in root.find(search_kmer, "CAAAA")])
    print([x.metadata for x in root.find(search_kmer, "GAAAA")])
def load_sbts_and_sigs(filenames, query, is_similarity_query, traverse=False):
    """Load search databases for a query signature.

    Each filename may be an SBT, a .sig file, or (with traverse=True) a
    directory to scan for .sig files.  Returns a list of
    (database, filename, is_sbt) triples; exits on a fatal load error.
    """
    query_ksize = query.minhash.ksize
    query_moltype = get_moltype(query)

    n_signatures = 0
    n_databases = 0
    databases = []
    for sbt_or_sigfile in filenames:
        if traverse and os.path.isdir(sbt_or_sigfile):
            for sigfile in traverse_find_sigs([sbt_or_sigfile]):
                try:
                    siglist = sig.load_signatures(sigfile,
                                                  ksize=query_ksize,
                                                  select_moltype=query_moltype)
                    siglist = filter_compatible_signatures(query, siglist, 1)
                    siglist = list(siglist)
                    databases.append((siglist, sbt_or_sigfile, False))
                    notify('loaded {} signatures from {}', len(siglist),
                           sigfile, end='\r')
                    n_signatures += len(siglist)
                except Exception:
                    # ignore errors with traverse.  BUGFIX: was a bare
                    # 'except:', which also swallowed KeyboardInterrupt
                    # and SystemExit.
                    pass

            # done! jump to beginning of main 'for' loop
            continue

        # no traverse? try loading as an SBT.
        try:
            tree = SBT.load(sbt_or_sigfile, leaf_loader=SigLeaf.load)

            if not check_tree_is_compatible(sbt_or_sigfile, tree, query,
                                            is_similarity_query):
                sys.exit(-1)

            databases.append((tree, sbt_or_sigfile, True))
            notify('loaded SBT {}', sbt_or_sigfile, end='\r')
            n_databases += 1

            # done! jump to beginning of main 'for' loop
            continue
        except (ValueError, EnvironmentError):
            # not an SBT - try as a .sig
            pass

        # not a tree? try loading as a signature.
        try:
            siglist = sig.load_signatures(sbt_or_sigfile,
                                          ksize=query_ksize,
                                          select_moltype=query_moltype)
            siglist = list(siglist)
            if len(siglist) == 0:         # file not found, or parse error?
                raise ValueError

            siglist = filter_compatible_signatures(query, siglist, False)
            siglist = list(siglist)

            databases.append((siglist, sbt_or_sigfile, False))
            notify('loaded {} signatures from {}', len(siglist),
                   sbt_or_sigfile, end='\r')
            n_signatures += len(siglist)
        except (EnvironmentError, ValueError):
            error("\nCannot open file '{}'", sbt_or_sigfile)
            sys.exit(-1)

    notify(' ' * 79, end='\r')
    if n_signatures and n_databases:
        notify('loaded {} signatures and {} databases total.',
               n_signatures, n_databases)
    elif n_signatures:
        notify('loaded {} signatures.', n_signatures)
    elif n_databases:
        notify('loaded {} databases.', n_databases)
    else:
        sys.exit(-1)

    if databases:
        print('')

    return databases
def test_longer_search(n_children):
    """Threshold-based transcript search must honor k-mer presence counts."""
    ksize = 5
    factory = GraphFactory(ksize, 100, 3)
    root = SBT(factory, d=n_children)

    # five small Bloom-filter leaves with overlapping 5-mers.
    leaf_kmers = [
        ("a", ('AAAAA', 'AAAAT', 'AAAAC')),
        ("b", ('AAAAA', 'AAAAT', 'AAAAG')),
        ("c", ('AAAAA', 'AAAAT', 'CAAAA')),
        ("d", ('AAAAA', 'CAAAA', 'GAAAA')),
        ("e", ('AAAAA', 'AAAAT', 'GAAAA')),
    ]
    for name, kms in leaf_kmers:
        leaf = Leaf(name, factory())
        for km in kms:
            leaf.data.count(km)
        root.add_node(leaf)

    def kmers(k, seq):
        for start in range(len(seq) - k + 1):
            yield seq[start:start + k]

    def search_transcript(node, seq, threshold):
        # match if at least `threshold` of the sequence's k-mers are present.
        presence = [node.data.get(kmer) for kmer in kmers(ksize, seq)]
        if sum(presence) >= int(threshold * (len(seq) - ksize + 1)):
            return 1
        return 0

    try1 = [x.metadata for x in root.find(search_transcript, "AAAAT", 1.0)]
    assert set(try1) == set(['a', 'b', 'c', 'e']), try1    # no 'd'

    try2 = [x.metadata for x in root.find(search_transcript, "GAAAAAT", 0.6)]
    assert set(try2) == set(['a', 'b', 'c', 'd', 'e'])

    try3 = [x.metadata for x in root.find(search_transcript, "GAAAA", 1.0)]
    assert set(try3) == set(['d', 'e']), try3
def test_sbt_combine(n_children):
    """Combining two partial trees must equal one tree built from all leaves."""
    factory = GraphFactory(31, 1e5, 4)
    tree = SBT(factory, d=n_children)
    tree_1 = SBT(factory, d=n_children)
    tree_2 = SBT(factory, d=n_children)

    # split the leaves between tree_1 and tree_2; `tree` gets everything.
    n_leaves = 0
    for f in utils.SIG_FILES:
        sig = next(signature.load_signatures(utils.get_test_data(f)))
        leaf = SigLeaf(os.path.basename(f), sig)
        tree.add_node(leaf)
        if n_leaves < 4:
            tree_1.add_node(leaf)
        else:
            tree_2.add_node(leaf)
        n_leaves += 1

    tree_1.combine(tree_2)

    t1_leaves = {str(l) for l in tree_1.leaves()}
    t_leaves = {str(l) for l in tree.leaves()}

    assert len(t1_leaves) == n_leaves
    assert len(t_leaves) == len(t1_leaves)
    assert t1_leaves == t_leaves

    to_search = next(signature.load_signatures(
        utils.get_test_data(utils.SIG_FILES[0])))
    t1_result = {str(s) for s in tree_1.find(search_minhashes,
                                             to_search, 0.1)}
    tree_result = {str(s) for s in tree.find(search_minhashes,
                                             to_search, 0.1)}
    assert t1_result == tree_result

    # TODO: save and load both trees

    # check if adding a new node will use the next empty position
    next_empty = 0
    for n, d in tree_1.nodes.items():
        if d is None:
            next_empty = n
            break
    if not next_empty:
        next_empty = n + 1

    tree_1.add_node(leaf)
    assert tree_1.max_node == next_empty
# Dump the 'ref' collection, then build an SBT over its sequences.
with open('ref.json', 'w+') as outjson:
    # NOTE(review): db.ref.find() returns a cursor, which json's dumps()
    # cannot serialize directly — confirm whether list(...) was intended.
    outjson.write(dumps(db.ref.find(), indent=4))

from sourmash_lib import Estimators
from sourmash_lib.sbt import SBT, GraphFactory
from sourmash_lib.sbtmh import SigLeaf, search_minhashes
from sourmash_lib.signature import SourmashSignature

KSIZE = 16
N = 1000

# init SBT
factory = GraphFactory(ksize=KSIZE, starting_size=1e5, n_tables=4)  # 4 .. nt?
tree = SBT(factory, d=2)  # d .. see "n-ary " in notebook

bar = progressbar.ProgressBar(max_value=progressbar.UnknownLength)
cursor = db.ref.find()
c = 0
for i in cursor:
    key = deep_get(i, 'metadata.alt_id.gb')
    seq = i['sequence']  # db.ref.find_one()['sequence'] # 'ACTG...'

    # one minhash signature per document, named by its GenBank id.
    e = Estimators(ksize=KSIZE, n=N)
    e.add_sequence(seq, force=True)  # e.get_hashes()
    s = SourmashSignature(email='', estimator=e, name=key)
    leaf = SigLeaf(metadata=key, data=s)
    tree.add_node(node=leaf)

    c += 1
    bar.update(c)
def test_simple(n_children):
    """SBT k-mer search must agree with a brute-force scan of the leaves."""
    factory = GraphFactory(5, 100, 3)
    root = SBT(factory, d=n_children)

    # five small Bloom-filter leaves with overlapping 5-mers.
    leaf_kmers = [
        ("a", ('AAAAA', 'AAAAT', 'AAAAC')),
        ("b", ('AAAAA', 'AAAAT', 'AAAAG')),
        ("c", ('AAAAA', 'AAAAT', 'CAAAA')),
        ("d", ('AAAAA', 'CAAAA', 'GAAAA')),
        ("e", ('AAAAA', 'AAAAT', 'GAAAA')),
    ]
    leaves = []
    for name, kms in leaf_kmers:
        leaf = Leaf(name, factory())
        for km in kms:
            leaf.data.count(km)
        leaves.append(leaf)
        root.add_node(leaf)

    def search_kmer(obj, seq):
        return obj.data.get(seq)

    kmers = ["AAAAA", "AAAAT", "AAAAG", "CAAAA", "GAAAA"]

    def search_kmer_in_list(kmer):
        # brute-force reference: scan every leaf directly.
        return {l for l in leaves if l.data.get(kmer)}

    for kmer in kmers:
        assert set(root.find(search_kmer, kmer)) == search_kmer_in_list(kmer)

    print('-----')
    print([x.metadata for x in root.find(search_kmer, "AAAAA")])
    print([x.metadata for x in root.find(search_kmer, "AAAAT")])
    print([x.metadata for x in root.find(search_kmer, "AAAAG")])
    print([x.metadata for x in root.find(search_kmer, "CAAAA")])
    print([x.metadata for x in root.find(search_kmer, "GAAAA")])
def sbt_gather(args):
    """Greedily decompose a '--scaled' query against an SBT (CLI entry point).

    Repeatedly finds the best remaining match in the tree, records the
    covered fraction, subtracts the matched hashes from the query, and
    continues until no significant match remains.
    """
    from sourmash_lib.sbt import SBT, GraphFactory
    from sourmash_lib.sbtmh import search_minhashes, SigLeaf
    from sourmash_lib.sbtmh import SearchMinHashesFindBestIgnoreMaxHash

    parser = argparse.ArgumentParser()
    parser.add_argument('sbt_name', help='name of SBT to search')
    parser.add_argument('query', help='query signature')
    parser.add_argument('-k', '--ksize', type=int, default=DEFAULT_K)
    parser.add_argument('--threshold', default=0.05, type=float)
    parser.add_argument('-o', '--output', type=argparse.FileType('wt'))
    parser.add_argument('--csv', type=argparse.FileType('wt'))
    parser.add_argument('--save-matches', type=argparse.FileType('wt'))
    sourmash_args.add_moltype_args(parser)

    args = parser.parse_args(args)
    moltype = sourmash_args.calculate_moltype(args)

    tree = SBT.load(args.sbt_name, leaf_loader=SigLeaf.load)
    query = sourmash_args.load_query_signature(args.query,
                                               select_ksize=args.ksize,
                                               select_moltype=moltype)
    query_moltype = sourmash_args.get_moltype(query)
    query_ksize = query.estimator.ksize
    notify('loaded query: {}... (k={}, {})', query.name()[:30],
           query_ksize, query_moltype)

    if query.estimator.max_hash == 0:
        error('query signature needs to be created with --scaled')
        error('or using --with-cardinality.')
        sys.exit(-1)

    notify('query signature has max_hash: {}', query.estimator.max_hash)
    orig_query = query
    R_metagenome = 2**64 / float(orig_query.estimator.max_hash)

    # rebuild the query signature so its hash set can be mutated below.
    new_mins = query.estimator.get_hashes()
    e = sourmash_lib.Estimators(ksize=args.ksize, n=len(new_mins))
    e.update(query.estimator)
    query = sig.SourmashSignature('', e)

    sum_found = 0.
    found = []
    while 1:
        search_fn = SearchMinHashesFindBestIgnoreMaxHash().search

        results = []
        # use super low threshold for this part of the search
        for leaf in tree.find(search_fn, query, 0.00001):
            results.append((query.estimator.similarity_ignore_maxhash(
                leaf.data.estimator), leaf.data))

        if not len(results):          # no matches at all!
            break

        # take the best result
        results.sort(key=lambda x: -x[0])   # reverse sort on similarity
        best_sim, best_ss = results[0]

        # subtract found hashes from search hashes, construct new search
        new_mins = set(query.estimator.get_hashes())
        found_mins = best_ss.estimator.get_hashes()

        # estimate the match's sampling rate R from max_hash or HLL.
        if best_ss.estimator.max_hash:
            R_genome = 2**64 / float(best_ss.estimator.max_hash)
        elif best_ss.estimator.hll:
            genome_size = best_ss.estimator.hll.estimate_cardinality()
            genome_max_hash = max(found_mins)
            R_genome = float(genome_size) / float(genome_max_hash)
        else:
            error('Best hash match in sbt_gather has no cardinality')
            error('Please prepare database of sequences with --scaled')
            error('...or with --with-cardinality')
            sys.exit(-1)

        # compare at the coarser of the two sampling rates.
        R_comparison = max(R_metagenome, R_genome)
        new_max_hash = 2**64 / float(R_comparison)
        new_mins = set([i for i in new_mins if i < new_max_hash])
        found_mins = set([i for i in found_mins if i < new_max_hash])

        # intersection:
        intersect_mins = new_mins.intersection(found_mins)

        if len(intersect_mins) < 5:   # hard cutoff for now
            notify('found only {} hashes in common.', len(intersect_mins))
            notify('this is below a sane threshold => exiting.')
            break

        # first denominator - genome size
        genome_n_mins = len(found_mins)
        f_genome = len(intersect_mins) / float(genome_n_mins)

        # second denominator - metagenome size
        query_n_mins = len(orig_query.estimator.get_hashes())
        f_query = len(intersect_mins) / float(query_n_mins)

        # print interim & save
        notify('found: {:.2f} {:.2f} {}', f_genome, f_query, best_ss.name())
        found.append((f_genome, best_ss))
        # BUGFIX: accumulate the total fraction found; previously
        # sum_found was never updated, so the summary below always
        # reported a total fraction of 0.000.
        sum_found += f_genome

        new_mins -= set(found_mins)
        e = sourmash_lib.Estimators(ksize=args.ksize, n=len(new_mins))
        e.add_many(new_mins)
        query = sig.SourmashSignature('', e)

    notify('found {}, total fraction {:.3f}', len(found), sum_found)
    notify('')

    if not found:
        sys.exit(0)

    found.sort(key=lambda x: x[0])
    found.reverse()

    notify('Composition:')
    for (frac, leaf_sketch) in found:
        notify('{:.2f} {}', frac, leaf_sketch.name())

    if args.output:
        print('Composition:', file=args.output)
        for (frac, leaf_sketch) in found:
            print('{:.2f} {}'.format(frac, leaf_sketch.name()),
                  file=args.output)

    if args.csv:
        fieldnames = ['fraction', 'name', 'sketch_kmers']
        w = csv.DictWriter(args.csv, fieldnames=fieldnames)
        w.writeheader()
        for (frac, leaf_sketch) in found:
            cardinality = leaf_sketch.estimator.hll.estimate_cardinality()
            w.writerow(dict(fraction=frac, name=leaf_sketch.name(),
                            sketch_kmers=cardinality))

    if args.save_matches:
        outname = args.save_matches.name
        notify('saving all matches to "{}"', outname)
        sig.save_signatures([ss for (f, ss) in found], args.save_matches)
def sbt_search(self, args):
    """Search an SBT for signatures similar to a query (method-style CLI)."""
    from sourmash_lib.sbt import SBT, GraphFactory
    from sourmash_lib.sbtmh import search_minhashes, SigLeaf
    from sourmash_lib.sbtmh import SearchMinHashesFindBest

    parser = argparse.ArgumentParser()
    parser.add_argument('sbt_name')
    parser.add_argument('query')
    parser.add_argument('-k', '--ksize', type=int, default=DEFAULT_K)
    parser.add_argument('--threshold', default=0.08, type=float)
    parser.add_argument('--save-matches', type=argparse.FileType('wt'))
    parser.add_argument('--best-only', action='store_true')
    sourmash_args.add_moltype_args(parser)
    args = parser.parse_args(args)

    # --dna and --protein are mutually exclusive.
    if args.protein:
        if args.dna is True:
            raise Exception('cannot specify both --dna and --protein!')
        args.dna = False

    moltype = None
    if args.protein:
        moltype = 'protein'
    elif args.dna:
        moltype = 'dna'

    search_fn = search_minhashes
    if args.best_only:
        search_fn = SearchMinHashesFindBest().search

    tree = SBT.load(args.sbt_name, leaf_loader=SigLeaf.load)

    # exactly one query signature must match ksize/moltype.
    sl = sig.load_signatures(args.query, select_ksize=args.ksize,
                             select_moltype=moltype)
    sl = list(sl)
    if len(sl) != 1:
        print('When loading query from "{}",'.format(args.query),
              file=sys.stderr)
        print('{} query signatures matching ksize and molecule type; need exactly one.'.format(len(sl)))
        sys.exit(-1)

    query = sl[0]
    query_moltype = 'UNKNOWN'
    if query.estimator.is_molecule_type('dna'):
        query_moltype = 'DNA'
    elif query.estimator.is_molecule_type('protein'):
        query_moltype = 'protein'
    query_ksize = query.estimator.ksize
    print('loaded query: {}... (k={}, {})'.format(query.name()[:30],
                                                  query_ksize,
                                                  query_moltype))

    results = []
    for leaf in tree.find(search_fn, query, args.threshold):
        results.append((query.similarity(leaf.data), leaf.data))
    results.sort(key=lambda x: -x[0])   # reverse sort on similarity

    for (similarity, match) in results:
        print('{:.2f} {}'.format(similarity, match.name()))

    if args.save_matches:
        outname = args.save_matches.name
        print('saving all matches to "{}"'.format(outname))
        sig.save_signatures([m for (sim, m) in results], args.save_matches)
def categorize(args):
    """Categorize query signatures against an SBT: report each query's best match.

    Queries already recorded in ``--load-csv`` (first column) are skipped;
    results are optionally appended to ``--csv`` as
    (queryfile, best_match_name, similarity) rows.
    """
    from sourmash_lib.sbt import SBT, GraphFactory
    from sourmash_lib.sbtmh import search_minhashes, SigLeaf
    from sourmash_lib.sbtmh import SearchMinHashesFindBest

    parser = argparse.ArgumentParser()
    parser.add_argument('sbt_name', help='name of SBT to load')
    parser.add_argument('queries', nargs='+',
                        help='list of signatures to categorize')
    parser.add_argument('-k', '--ksize', type=int, default=None)
    parser.add_argument('--threshold', default=0.08, type=float)
    parser.add_argument('--traverse-directory', action="store_true")
    sourmash_args.add_moltype_args(parser)
    parser.add_argument('--csv', type=argparse.FileType('at'))
    parser.add_argument('--load-csv', default=None)

    args = parser.parse_args(args)
    moltype = sourmash_args.calculate_moltype(args)

    # names already categorized in a previous run (first CSV column)
    already_names = set()
    if args.load_csv:
        with open(args.load_csv, 'rt') as fp:
            r = csv.reader(fp)
            for row in r:
                already_names.add(row[0])

    tree = SBT.load(args.sbt_name, leaf_loader=SigLeaf.load)

    if args.traverse_directory:
        inp_files = set(sourmash_args.traverse_find_sigs(args.queries))
    else:
        inp_files = set(args.queries)
    # FIX: subtract already-processed names exactly once, for both branches
    # (the non-traverse branch previously subtracted them twice).
    inp_files -= already_names

    notify('found {} files to query', len(inp_files))

    loader = sourmash_args.LoadSingleSignatures(inp_files, args.ksize, moltype)

    for queryfile, query, query_moltype, query_ksize in loader:
        notify('loaded query: {}... (k={}, {})', query.name()[:30],
               query_ksize, query_moltype)

        results = []
        search_fn = SearchMinHashesFindBest().search

        for leaf in tree.find(search_fn, query, args.threshold):
            if leaf.data.md5sum() != query.md5sum():  # ignore self.
                results.append((query.similarity(leaf.data), leaf.data))

        best_hit_sim = 0.0
        best_hit_query_name = ""
        if results:
            results.sort(key=lambda x: -x[0])   # reverse sort on similarity
            best_hit_sim, best_hit_query = results[0]
            notify('for {}, found: {:.2f} {}', query.name(),
                   best_hit_sim, best_hit_query.name())
            best_hit_query_name = best_hit_query.name()
        else:
            notify('for {}, no match found', query.name())

        if args.csv:
            w = csv.writer(args.csv)
            w.writerow([queryfile, best_hit_query_name, best_hit_sim])

    if loader.skipped_ignore:
        notify('skipped/ignore: {}', loader.skipped_ignore)
    if loader.skipped_nosig:
        notify('skipped/nosig: {}', loader.skipped_nosig)
def sbt_gather(self, args):
    """Greedily decompose a query against an SBT.

    Repeatedly finds the best remaining match, reports the fraction of the
    original query it accounts for, subtracts the matched hashes from the
    query, and repeats until the best match falls below ``--threshold``.
    Results can be written to ``--output`` (text) and/or ``--csv``.
    """
    from sourmash_lib.sbt import SBT, GraphFactory
    from sourmash_lib.sbtmh import search_minhashes, SigLeaf
    from sourmash_lib.sbtmh import SearchMinHashesFindBest

    parser = argparse.ArgumentParser()
    parser.add_argument('sbt_name')
    parser.add_argument('query')
    parser.add_argument('-k', '--ksize', type=int, default=DEFAULT_K)
    parser.add_argument('--threshold', default=0.05, type=float)
    parser.add_argument('-o', '--output', type=argparse.FileType('wt'))
    parser.add_argument('--csv', type=argparse.FileType('wt'))
    sourmash_args.add_moltype_args(parser)

    args = parser.parse_args(args)

    if args.protein:
        if args.dna is True:
            raise Exception('cannot specify both --dna and --protein!')
        args.dna = False

    moltype = None
    if args.protein:
        moltype = 'protein'
    elif args.dna:
        moltype = 'dna'

    # FIX: load the tree exactly once (it was previously loaded twice).
    tree = SBT.load(args.sbt_name, leaf_loader=SigLeaf.load)

    sl = sig.load_signatures(args.query, select_ksize=args.ksize,
                             select_moltype=moltype)
    sl = list(sl)
    if len(sl) != 1:
        print('When loading query from "{}",'.format(args.query),
              file=sys.stderr)
        # FIX: this diagnostic belongs on stderr too, like the line above.
        print('{} query signatures matching ksize and molecule type;'
              ' need exactly one.'.format(len(sl)), file=sys.stderr)
        sys.exit(-1)

    query = sl[0]

    query_moltype = 'UNKNOWN'
    if query.estimator.is_molecule_type('dna'):
        query_moltype = 'DNA'
    elif query.estimator.is_molecule_type('protein'):
        query_moltype = 'protein'
    query_ksize = query.estimator.ksize
    print('loaded query: {}... (k={}, {})'.format(query.name()[:30],
                                                  query_ksize,
                                                  query_moltype))

    orig_query = query
    sum_found = 0.
    found = []
    while 1:
        search_fn = SearchMinHashesFindBest().search

        results = []
        # use super low threshold for this part of the search
        for leaf in tree.find(search_fn, query, 0.00001):
            results.append((query.similarity(leaf.data), leaf.data))

        if not len(results):          # no matches at all!
            break

        # take the best result
        results.sort(key=lambda x: -x[0])   # reverse sort on similarity
        best_sim, best_ss = results[0]
        sim = best_ss.similarity(orig_query)

        # adjust by size of leaf (kmer cardinality of original genome)
        if best_ss.estimator.hll:
            leaf_kmers = best_ss.estimator.hll.estimate_cardinality()
            query_kmers = orig_query.estimator.hll.estimate_cardinality()
            f_of_total = leaf_kmers / query_kmers * sim
        else:
            f_of_total = 0

        if not found and sim < args.threshold:
            print('best match: {}'.format(best_ss.name()))
            print('similarity is {:.5f} of db signature;'.format(sim))
            print('this is below specified threshold => exiting.')
            break

        # subtract found hashes from search hashes, construct new search
        new_mins = set(query.estimator.mh.get_mins())
        found_mins = best_ss.estimator.mh.get_mins()

        # print interim & save
        print('found: {:.2f} {} {}'.format(f_of_total, len(new_mins),
                                           best_ss.name()))
        found.append((f_of_total, best_ss, sim))
        sum_found += f_of_total

        new_mins -= set(found_mins)
        e = sourmash_lib.Estimators(ksize=args.ksize, n=len(new_mins))
        for m in new_mins:
            e.mh.add_hash(m)
        new_ss = sig.SourmashSignature('foo', e)
        query = new_ss

    print('found {}, total fraction {:.3f}'.format(len(found), sum_found))
    print('')

    if not found:
        sys.exit(0)

    found.sort()
    found.reverse()

    print('Composition:')
    for (frac, leaf_sketch, sim) in found:
        print('{:.2f} {}'.format(frac, leaf_sketch.name()))

    if args.output:
        print('Composition:', file=args.output)
        for (frac, leaf_sketch, sim) in found:
            print('{:.2f} {}'.format(frac, leaf_sketch.name()),
                  file=args.output)

    if args.csv:
        fieldnames = ['fraction', 'name', 'similarity', 'sketch_kmers']
        w = csv.DictWriter(args.csv, fieldnames=fieldnames)
        w.writeheader()
        for (frac, leaf_sketch, sim) in found:
            cardinality = leaf_sketch.estimator.hll.estimate_cardinality()
            w.writerow(dict(fraction=frac, name=leaf_sketch.name(),
                            similarity=sim, sketch_kmers=cardinality))
def categorize(self, args):
    """Find the best SBT match for each of a set of query signatures.

    Each query's best hit (ignoring self-matches) is printed; with --csv,
    one (queryfile, best_match_name, similarity) row per query is appended.
    Queries listed in --load-csv are excluded from directory traversal.
    """
    from sourmash_lib.sbt import SBT, GraphFactory
    from sourmash_lib.sbtmh import search_minhashes, SigLeaf
    from sourmash_lib.sbtmh import SearchMinHashesFindBest

    parser = argparse.ArgumentParser()
    parser.add_argument('sbt_name')
    parser.add_argument('queries', nargs='+')
    parser.add_argument('-k', '--ksize', type=int, default=DEFAULT_K)
    parser.add_argument('--threshold', default=0.08, type=float)
    parser.add_argument('--traverse-directory', action="store_true")
    sourmash_args.add_moltype_args(parser)
    parser.add_argument('--csv', type=argparse.FileType('at'))
    parser.add_argument('--load-csv', default=None)
    args = parser.parse_args(args)

    # --dna/--protein are mutually exclusive; protein turns off DNA.
    if args.protein:
        if args.dna is True:
            raise Exception('cannot specify both --dna and --protein!')
        args.dna = False

    moltype = None
    if args.protein:
        moltype = 'protein'
    elif args.dna:
        moltype = 'dna'

    # queries already categorized on an earlier run (first CSV column)
    already_names = set()
    if args.load_csv:
        with open(args.load_csv, 'rt') as fp:
            for row in csv.reader(fp):
                already_names.add(row[0])

    tree = SBT.load(args.sbt_name, leaf_loader=SigLeaf.load)

    if args.traverse_directory:
        inp_files = []
        for dirname in args.queries:
            for root, dirs, files in os.walk(dirname):
                for name in files:
                    if not name.endswith('.sig'):
                        continue
                    fullname = os.path.join(root, name)
                    if fullname not in already_names:
                        inp_files.append(fullname)
    else:
        inp_files = args.queries

    print('found {} files to query'.format(len(inp_files)))

    loader = sourmash_args.LoadSingleSignatures(inp_files, args.ksize, moltype)

    for queryfile, query, query_moltype, query_ksize in loader:
        print('loaded query: {}... (k={}, {})'.format(query.name()[:30],
                                                      query_ksize,
                                                      query_moltype))

        best_fn = SearchMinHashesFindBest().search
        hits = []
        for leaf in tree.find(best_fn, query, args.threshold):
            # ignore self
            if leaf.data.md5sum() != query.md5sum():
                hits.append((query.similarity(leaf.data), leaf.data))

        best_hit_sim = 0.0
        best_hit_query_name = ""
        if hits:
            hits.sort(key=lambda x: -x[0])  # highest similarity first
            best_hit_sim, best_hit_query = hits[0]
            print('for {}, found: {:.2f} {}'.format(query.name(),
                                                    best_hit_sim,
                                                    best_hit_query.name()))
            best_hit_query_name = best_hit_query.name()
        else:
            print('for {}, no match found'.format(query.name()))

        if args.csv:
            csv.writer(args.csv).writerow([queryfile, best_hit_query_name,
                                           best_hit_sim])

    if loader.skipped_ignore:
        print('skipped/ignore: {}'.format(loader.skipped_ignore))
    if loader.skipped_nosig:
        print('skipped/nosig: {}'.format(loader.skipped_nosig))
def main():
    """Compare two signature collections (with their SBTs) in both directions
    and print per-direction match / containment / identity summary stats.
    """
    p = argparse.ArgumentParser()
    p.add_argument('dir1')
    p.add_argument('sbt1')
    p.add_argument('dir2')
    p.add_argument('sbt2')
    p.add_argument('-k', '--ksize', type=int, default=31)
    args = p.parse_args()

    print('loading all signatures:', args.dir1)
    sigdict1 = load_all_signatures(args.dir1, args.ksize)
    tree1 = SBT.load(args.sbt1, leaf_loader=SigLeaf.load)
    print('...loaded {} signatures at k={}'.format(len(sigdict1), args.ksize))

    print('loading all signatures:', args.dir2)
    sigdict2 = load_all_signatures(args.dir2, args.ksize)
    tree2 = SBT.load(args.sbt2, leaf_loader=SigLeaf.load)
    print('...loaded {} signatures at k={}'.format(len(sigdict2), args.ksize))

    # first, find all matches in 2 for 1, and 1 for 2
    THRESHOLD = 0.05
    matches_1_in_2 = make_all_matches(sigdict1, tree2, THRESHOLD)
    matches_2_in_1 = make_all_matches(sigdict2, tree1, THRESHOLD)

    # now, do containment
    contained_1_in_2 = containment(matches_1_in_2, sigdict1, sigdict2)
    contained_2_in_1 = containment(matches_2_in_1, sigdict2, sigdict1)

    # summary stats
    CONTAIN_THRESHOLD = 0.95
    IDENT_THRESHOLD = 0.80

    print('thresholds:')
    print('min Jaccard similarity for any match:', THRESHOLD)
    print('to score as identical, similarity must be >=', IDENT_THRESHOLD)
    print('to score as contained, containment must be >=', CONTAIN_THRESHOLD)

    def summarize(dir_a, dir_b, sigdict, matches, contained):
        # One direction of the comparison; this logic was previously
        # duplicated inline for both directions.
        c_ident = 0
        c_match = 0
        c_contain = 0
        c_no_match = 0
        c_no_contain = 0
        identical_names = []

        for query_name in sigdict:
            best_match = None
            similarity = 0.0
            cont = 0.0
            if query_name in matches:
                (best_match, similarity) = matches[query_name]
            if query_name in contained:
                cont = contained[query_name]

            if not best_match:
                c_no_match += 1
            else:
                c_match += 1
                # NOTE(review): containment/identity tallies are applied
                # only to matched queries here — confirm against original
                # intent (unmatched queries have cont == 0.0 anyway).
                if cont < CONTAIN_THRESHOLD:
                    c_no_contain += 1
                else:
                    c_contain += 1
                if similarity > IDENT_THRESHOLD:
                    identical_names.append((query_name, best_match))
                    c_ident += 1

        print('----')
        print('{} vs {}: {} signatures'.format(dir_a, dir_b, len(sigdict)))
        print('identical count:', c_ident)
        print('containment count:', c_contain)
        print('matches:', c_match)
        print('no match:', c_no_match)
        print('no contain:', c_no_contain)
        print('identical:')
        for (k, v) in identical_names:
            print("{} = {}".format(k, v))

    # 1 in 2
    summarize(args.dir1, args.dir2, sigdict1, matches_1_in_2, contained_1_in_2)

    # 2 in 1
    summarize(args.dir2, args.dir1, sigdict2, matches_2_in_1, contained_2_in_1)
def watch(self, args):
    "Build a signature from raw FASTA/FASTQ coming in on stdin, search."
    from sourmash_lib.sbt import SBT, GraphFactory
    from sourmash_lib.sbtmh import search_minhashes, SigLeaf
    from sourmash_lib.sbtmh import SearchMinHashesFindBest

    parser = argparse.ArgumentParser()
    parser.add_argument('sbt_name')
    parser.add_argument('-o', '--output', type=argparse.FileType('wt'))
    parser.add_argument('-k', '--ksize', type=int, default=DEFAULT_K)
    parser.add_argument('--threshold', default=0.05, type=float)
    parser.add_argument('--input-is-protein', action='store_true')
    sourmash_args.add_moltype_args(parser, default_dna=True)
    parser.add_argument('-n', '--num-hashes', type=int, default=DEFAULT_N,
                        help='number of hashes to use in each sketch (default: %(default)i)')
    parser.add_argument('--name', type=str, default='stdin')
    args = parser.parse_args(args)

    if args.input_is_protein and args.dna:
        print('WARNING: input is protein, turning off DNA hash computing.',
              file=sys.stderr)
        args.dna = False
        args.protein = True

    if args.dna and args.protein:
        notify('ERROR: cannot use "watch" with both DNA and protein.')
        # BUGFIX: previously fell through after the error and kept running
        # with an inconsistent molecule type.
        sys.exit(-1)

    if args.dna:
        moltype = 'DNA'
        is_protein = False
    else:
        moltype = 'protein'
        is_protein = True

    # accumulate hashes into E as sequences stream in; streamsig wraps it
    E = sourmash_lib.Estimators(ksize=args.ksize, n=args.num_hashes,
                                protein=is_protein)
    streamsig = sig.SourmashSignature('', E, filename='stdin', name=args.name)

    notify('Computing signature for k={}, {} from stdin', args.ksize, moltype)

    tree = SBT.load(args.sbt_name, leaf_loader=SigLeaf.load)

    def do_search():
        # search the tree with the current (growing) stream signature
        search_fn = SearchMinHashesFindBest().search
        results = []
        for leaf in tree.find(search_fn, streamsig, args.threshold):
            results.append((streamsig.similarity(leaf.data), leaf.data))

        return results

    notify('reading sequences from stdin')
    screed_iter = screed.open('/dev/stdin')
    watermark = WATERMARK_SIZE

    # iterate over input records
    n = 0
    for n, record in enumerate(screed_iter):
        # at each watermark, print status & check cardinality
        if n >= watermark:
            notify('... read {} sequences', n)
            watermark += WATERMARK_SIZE

            if do_search():
                break

        if args.input_is_protein:
            E.mh.add_protein(record.sequence)
        else:
            E.add_sequence(record.sequence, False)

    results = do_search()
    if not results:
        notify('... read {} sequences, no matches found.', n)
    else:
        results.sort(key=lambda x: -x[0])   # take best
        similarity, found_sig = results[0]
        notify('FOUND: {}, at {:.3f}', found_sig.name(), similarity)

    if args.output:
        sig.save_signatures([streamsig], args.output)
def watch(args):
    "Build a signature from raw FASTA/FASTQ coming in on stdin, search."
    from sourmash_lib.sbt import SBT, GraphFactory
    from sourmash_lib.sbtmh import search_minhashes, SigLeaf
    from sourmash_lib.sbtmh import SearchMinHashesFindBest

    parser = argparse.ArgumentParser()
    parser.add_argument('sbt_name', help='name of SBT to search')
    parser.add_argument('inp_file', nargs='?', default='/dev/stdin')
    parser.add_argument('-q', '--quiet', action='store_true',
                        help='suppress non-error output')
    parser.add_argument('-o', '--output', type=argparse.FileType('wt'),
                        help='save signature generated from data here')
    parser.add_argument('--threshold', default=0.05, type=float,
                        help='minimum threshold for matches')
    parser.add_argument('--input-is-protein', action='store_true',
                        help='Consume protein sequences - no translation needed')
    sourmash_args.add_construct_moltype_args(parser)
    parser.add_argument('-n', '--num-hashes', type=int, default=DEFAULT_N,
                        help='number of hashes to use in each sketch (default: %(default)i)')
    parser.add_argument('--name', type=str, default='stdin',
                        help='name to use for generated signature')
    sourmash_args.add_ksize_arg(parser, DEFAULT_LOAD_K)
    args = parser.parse_args(args)
    set_quiet(args.quiet)

    if args.input_is_protein and args.dna:
        notify('WARNING: input is protein, turning off DNA hashing.')
        args.dna = False
        args.protein = True

    if args.dna and args.protein:
        notify('ERROR: cannot use "watch" with both DNA and protein.')
        # BUGFIX: abort on this fatal configuration error instead of
        # continuing with an inconsistent molecule type.
        sys.exit(-1)

    if args.dna:
        moltype = 'DNA'
        is_protein = False
    else:
        moltype = 'protein'
        is_protein = True

    tree = SBT.load(args.sbt_name, leaf_loader=SigLeaf.load)

    def get_ksize(tree):
        """Walk nodes in `tree` to find out ksize"""
        for node in tree.nodes.values():
            if isinstance(node, sourmash_lib.sbtmh.SigLeaf):
                return node.data.minhash.ksize

    # deduce ksize from the SBT we are loading
    ksize = args.ksize
    if ksize is None:
        ksize = get_ksize(tree)

    # accumulate hashes into E as sequences stream in; streamsig wraps it
    E = sourmash_lib.MinHash(ksize=ksize, n=args.num_hashes,
                             is_protein=is_protein)
    streamsig = sig.SourmashSignature('', E, filename='stdin', name=args.name)

    notify('Computing signature for k={}, {} from stdin', ksize, moltype)

    def do_search():
        # search against the current accumulated stream signature
        search_fn = SearchMinHashesFindBest().search
        results = []
        for leaf in tree.find(search_fn, streamsig, args.threshold):
            results.append((streamsig.similarity(leaf.data), leaf.data))

        return results

    notify('reading sequences from stdin')
    screed_iter = screed.open(args.inp_file)
    watermark = WATERMARK_SIZE

    # iterate over input records
    n = 0
    for n, record in enumerate(screed_iter):
        # at each watermark, print status & check cardinality
        if n >= watermark:
            notify('\r... read {} sequences', n, end='')
            watermark += WATERMARK_SIZE

            if do_search():
                break

        if args.input_is_protein:
            E.add_protein(record.sequence)
        else:
            E.add_sequence(record.sequence, False)

    results = do_search()
    if not results:
        notify('... read {} sequences, no matches found.', n)
    else:
        results.sort(key=lambda x: -x[0])   # take best
        similarity, found_sig = results[0]
        print_results('FOUND: {}, at {:.3f}', found_sig.name(), similarity)

    if args.output:
        notify('saving signature to {}', args.output.name)
        sig.save_signatures([streamsig], args.output)