def grow_sbt(args):
    """Build (or extend) an SBT index from the input files described by *args*,
    then save it to ``args.sbt``."""
    # Resolve inputs: either an explicit file list, or a directory scan
    # (optionally restricted by a subset CSV / column name).
    input_files = args.input_files
    if args.input_is_directory:
        input_files = collect_input_files_from_dir(
            input_files[0], args.subset_csv, args.subset_info_colname)

    alphabet = args.alphabet
    ksize = args.ksize
    # Protein-space alphabets hash translated codons, so scale k by 3.
    if alphabet in ("protein", "dayhoff", "hp"):
        ksize *= 3

    # Start a fresh tree, or resume from an existing one on disk.
    sbt = create_sbt_or_load_existing(args.sbt, args.load_existing_sbt)

    for n, filename in enumerate(input_files):
        # Handy progress reporting, borrowed from titus.
        # NOTE(review): the literal "(unknown)" below looks like a garbled
        # placeholder (perhaps meant to be the filename) — confirm intent.
        if n % 100 == 0:
            sys.stderr.write(
                f"... loading (unknown) file {n} of {len(input_files)}\n")

        # Build a signature from the file, or load a precomputed one.
        sig = load_or_generate_sig_from_file(filename, alphabet, ksize,
                                             args.scaled,
                                             args.ignore_abundance,
                                             args.translate)

        # Only signatures that actually carry a minhash are added.
        if sig and sig.minhash:
            sbt.add_node(SigLeaf(sig.md5sum(), sig))

    # Persist the finished tree.
    sbt.save(args.sbt)
def test_binary_nary_tree():
    """Trees of different arity (d=2, 5, 10) must give identical search results."""
    factory = GraphFactory(31, 1e5, 4)
    trees = {2: SBT(factory), 5: SBT(factory, d=5), 10: SBT(factory, d=10)}

    n_leaves = 0
    for fname in utils.SIG_FILES:
        ss = next(signature.load_signatures(utils.get_test_data(fname)))
        leaf = SigLeaf(os.path.basename(fname), ss)
        for tree in trees.values():
            tree.add_node(leaf)
        to_search = leaf
        n_leaves += 1

    # every tree must hold the same number of leaves
    assert all(len(list(t.leaves())) == n_leaves for t in trees.values())

    print('*' * 60)
    print("{}:".format(to_search.metadata))

    results = {}
    for d, tree in trees.items():
        results[d] = {
            str(s) for s in tree.find(search_minhashes, to_search.data, 0.1)
        }
    print(*results[2], sep='\n')

    # arity must not change what is found
    assert results[2] == results[5]
    assert results[5] == results[10]
def test_tree_save_load(n_children):
    """Round-tripping an SBT through save/load must not change search hits."""

    def _hits(sbt, query):
        # collect search results as a comparable set of strings
        return {str(found) for found in sbt.find(search_minhashes, query, 0.1)}

    factory = GraphFactory(31, 1e5, 4)
    tree = SBT(factory, d=n_children)

    for fname in utils.SIG_FILES:
        ss = next(signature.load_signatures(utils.get_test_data(fname)))
        leaf = SigLeaf(os.path.basename(fname), ss)
        tree.add_node(leaf)
        to_search = leaf

    print('*' * 60)
    print("{}:".format(to_search.metadata))
    old_result = _hits(tree, to_search.data)
    print(*old_result, sep='\n')

    with utils.TempDirectory() as location:
        path = os.path.join(location, 'demo')
        tree.save(path)
        tree = SBT.load(path, leaf_loader=SigLeaf.load)

        print('*' * 60)
        print("{}:".format(to_search.metadata))
        new_result = _hits(tree, to_search.data)
        print(*new_result, sep='\n')

        assert old_result == new_result
def test_sbt_combine(n_children):
    """Combining two partial trees must equal building one complete tree."""
    factory = GraphFactory(31, 1e5, 4)
    tree = SBT(factory, d=n_children)
    tree_1 = SBT(factory, d=n_children)
    tree_2 = SBT(factory, d=n_children)

    # Split the leaves: the first four go into tree_1, the rest into
    # tree_2; every leaf also goes into the reference tree.
    n_leaves = 0
    for fname in utils.SIG_FILES:
        ss = load_one_signature(utils.get_test_data(fname))
        leaf = SigLeaf(os.path.basename(fname), ss)
        tree.add_node(leaf)
        target = tree_1 if n_leaves < 4 else tree_2
        target.add_node(leaf)
        n_leaves += 1

    tree_1.combine(tree_2)

    t1_leaves = {str(l) for l in tree_1.leaves()}
    t_leaves = {str(l) for l in tree.leaves()}

    assert len(t1_leaves) == n_leaves
    assert len(t_leaves) == len(t1_leaves)
    assert t1_leaves == t_leaves

    # Searching the combined tree must match searching the reference tree.
    to_search = load_one_signature(utils.get_test_data(utils.SIG_FILES[0]))
    t1_result = {str(s)
                 for s in tree_1.find(search_minhashes, to_search, 0.1)}
    tree_result = {str(s)
                   for s in tree.find(search_minhashes, to_search, 0.1)}
    assert t1_result == tree_result

    # TODO: save and load both trees

    # check if adding a new node will use the next empty position
    next_empty = 0
    for idx, (pos, _) in enumerate(tree_1):
        if idx != pos:
            next_empty = idx
            break
    if not next_empty:
        # no gap found: the next free slot is one past the last position
        next_empty = idx + 1

    tree_1.add_node(SigLeaf(to_search.name(), to_search))
    assert tree_1.next_node == next_empty
def add_singleton_sigs(sbt, input_file, ksize, scaled, alphabet,
                       ignore_abundance=False, translate=False):
    """Add one signature per record (or per stored sig) to *sbt*.

    ``.sig`` inputs are loaded directly and each signature with a minhash
    is added; any other input is treated as a FASTA file, and each record
    gets its own fresh minhash and signature. Returns the (mutated) tree.
    """
    if input_file.endswith(".sig"):
        # precomputed signatures: load and add each one that has a minhash
        sigs = sourmash.signature.load_signatures(input_file, ksize=ksize,
                                                  select_moltype=alphabet)
        for sig in sigs:
            if sig.minhash:
                sbt.add_node(SigLeaf(sig.md5sum(), sig))
    else:
        # sequence file: build one signature per record
        records = try_reading_fasta_file(input_file)
        if records:
            for n, record in enumerate(records):
                # name up to the first tab is the signature name
                signame = record.name.rsplit("\t", 1)[0]
                if n % 10000 == 0:
                    sys.stderr.write(f"... building {n}th sig, {signame}\n")
                # start with a fresh minhash for every record
                mh = determine_appropriate_fresh_minhash(
                    alphabet, ksize, scaled, ignore_abundance)
                if alphabet == "nucleotide" or translate:
                    mh.add_sequence(record.sequence, force=True)
                else:
                    mh.add_protein(record.sequence)
                # minhash --> signature
                sig = sourmash.SourmashSignature(mh, name=signame)
                if sig.minhash:
                    sbt.add_node(SigLeaf(sig.md5sum(), sig))
    return sbt
def test_tree_repair_add_node():
    """Adding leaves to a loaded tree must preserve parent/child invariants."""
    tree_repair = SBT.load(utils.get_test_data('leaves.sbt.json'),
                           leaf_loader=SigLeaf.load)

    for fname in utils.SIG_FILES:
        ss = next(signature.load_signatures(utils.get_test_data(fname)))
        tree_repair.add_node(SigLeaf(os.path.basename(fname), ss))

    for pos, node in list(tree_repair.nodes.items()):
        # Every parent of a node must be an internal node (and not a leaf),
        # except for node 0 (the root), whose parent is None.
        if pos != 0:
            assert isinstance(tree_repair.parent(pos).node, Node)

        # Leaf nodes can't have children
        if isinstance(node, Leaf):
            assert all(c.node is None for c in tree_repair.children(pos))
def test_search_minhashes():
    """search_minhashes must score by similarity, not containment.

    Fixes: dropped the unused ``n_leaves`` counter, and renamed the results
    loop variable so it no longer shadows the ``leaf`` built in the loading
    loop (the shadowing made the final state confusing to read).
    """
    factory = GraphFactory(31, 1e5, 4)
    tree = SBT(factory)

    for f in utils.SIG_FILES:
        sig = next(signature.load_signatures(utils.get_test_data(f)))
        leaf = SigLeaf(os.path.basename(f), sig)
        tree.add_node(leaf)

    to_search = next(iter(tree.leaves()))

    # this fails if 'search_minhashes' is calc containment and not similarity.
    results = tree.find(search_minhashes, to_search.data, 0.08)
    for found in results:
        assert to_search.data.similarity(found.data) >= 0.08

    print(results)
def test_save_sparseness(n_children):
    """A tree saved with sparseness=1.0 reloads with no internal-node data,
    yet still gives the same search results and keeps structural invariants."""

    def _hits(sbt, query):
        # collect search results as a comparable set of strings
        return {str(found) for found in sbt.find(search_minhashes, query, 0.1)}

    factory = GraphFactory(31, 1e5, 4)
    tree = SBT(factory, d=n_children)

    for fname in utils.SIG_FILES:
        ss = next(signature.load_signatures(utils.get_test_data(fname)))
        leaf = SigLeaf(os.path.basename(fname), ss)
        tree.add_node(leaf)
        to_search = leaf

    print('*' * 60)
    print("{}:".format(to_search.metadata))
    old_result = _hits(tree, to_search.data)
    print(*old_result, sep='\n')

    with utils.TempDirectory() as location:
        tree.save(os.path.join(location, 'demo'), sparseness=1.0)
        tree_loaded = SBT.load(os.path.join(location, 'demo'),
                               leaf_loader=SigLeaf.load)
        # with full sparseness, no internal Node payloads are written out
        assert all(not isinstance(n, Node)
                   for n in tree_loaded.nodes.values())

        print('*' * 60)
        print("{}:".format(to_search.metadata))
        new_result = _hits(tree_loaded, to_search.data)
        print(*new_result, sep='\n')

        assert old_result == new_result

        for pos, node in list(tree_loaded.nodes.items()):
            # Every parent of a node must be an internal node (and not a leaf),
            # except for node 0 (the root), whose parent is None.
            if pos != 0:
                assert isinstance(tree_loaded.parent(pos).node, Node)

            # Leaf nodes can't have children
            if isinstance(node, Leaf):
                assert all(c.node is None
                           for c in tree_loaded.children(pos))
def test_sbt_ipfsstorage():
    """An SBT saved into IPFS storage must reload with identical search results.

    Fix: corrected the typo in the xfail message ("functioning probably" ->
    "functioning properly").
    """
    ipfshttpclient = pytest.importorskip('ipfshttpclient')

    factory = GraphFactory(31, 1e5, 4)
    with utils.TempDirectory() as location:
        tree = SBT(factory)

        for f in utils.SIG_FILES:
            sig = load_one_signature(utils.get_test_data(f))
            leaf = SigLeaf(os.path.basename(f), sig)
            tree.add_node(leaf)
            to_search = leaf

        print('*' * 60)
        print("{}:".format(to_search.metadata))
        old_result = {
            str(s) for s in tree.find(search_minhashes, to_search.data, 0.1)
        }
        print(*old_result, sep='\n')

        try:
            with IPFSStorage() as storage:
                tree.save(os.path.join(location, 'tree'), storage=storage)
        except ipfshttpclient.exceptions.ConnectionError:
            # no reachable IPFS daemon: expected failure, not an error
            pytest.xfail("ipfs not installed/functioning properly")

        with IPFSStorage() as storage:
            tree = SBT.load(os.path.join(location, 'tree'),
                            leaf_loader=SigLeaf.load,
                            storage=storage)

            print('*' * 60)
            print("{}:".format(to_search.metadata))
            new_result = {
                str(s) for s in tree.find(search_minhashes,
                                          to_search.data, 0.1)
            }
            print(*new_result, sep='\n')

            assert old_result == new_result
def test_sbt_redisstorage():
    """An SBT saved into Redis storage must reload with identical search hits."""
    redis = pytest.importorskip('redis')

    def _hits(sbt, query):
        # collect search results as a comparable set of strings
        return {str(found) for found in sbt.find(search_minhashes, query, 0.1)}

    factory = GraphFactory(31, 1e5, 4)
    with utils.TempDirectory() as location:
        tree = SBT(factory)

        for fname in utils.SIG_FILES:
            ss = next(signature.load_signatures(utils.get_test_data(fname)))
            leaf = SigLeaf(os.path.basename(fname), ss)
            tree.add_node(leaf)
            to_search = leaf

        print('*' * 60)
        print("{}:".format(to_search.metadata))
        old_result = _hits(tree, to_search.data)
        print(*old_result, sep='\n')

        try:
            with RedisStorage() as storage:
                tree.save(os.path.join(location, 'tree'), storage=storage)
        except redis.exceptions.ConnectionError:
            # no reachable Redis server: expected failure, not an error
            pytest.xfail("Couldn't connect to redis server")

        with RedisStorage() as storage:
            tree = SBT.load(os.path.join(location, 'tree'),
                            leaf_loader=SigLeaf.load,
                            storage=storage)

            print('*' * 60)
            print("{}:".format(to_search.metadata))
            new_result = _hits(tree, to_search.data)
            print(*new_result, sep='\n')

            assert old_result == new_result
def test_sbt_zipstorage(tmpdir):
    """Create a tree, save it into a zip archive, then reload and re-search."""

    def _hits(sbt, query):
        # collect search results as a comparable set of strings
        return {str(found) for found in sbt.find(search_minhashes, query, 0.1)}

    factory = GraphFactory(31, 1e5, 4)
    tree = SBT(factory)

    for fname in utils.SIG_FILES:
        ss = next(load_signatures(utils.get_test_data(fname)))
        leaf = SigLeaf(os.path.basename(fname), ss)
        tree.add_node(leaf)
        to_search = leaf

    print('*' * 60)
    print("{}:".format(to_search.metadata))
    old_result = _hits(tree, to_search.data)
    print(*old_result, sep='\n')

    zippath = str(tmpdir.join("tree.sbt.zip"))
    with ZipStorage(zippath) as storage:
        tree.save(str(tmpdir.join("tree")), storage=storage)

    with ZipStorage(zippath) as storage:
        tree = SBT.load(str(tmpdir.join("tree")),
                        leaf_loader=SigLeaf.load,
                        storage=storage)

        print('*' * 60)
        print("{}:".format(to_search.metadata))
        new_result = _hits(tree, to_search.data)
        print(*new_result, sep='\n')

        assert old_result == new_result
def test_sbt_tarstorage():
    """An SBT saved into a tar archive must reload with identical search hits."""

    def _hits(sbt, query):
        # collect search results as a comparable set of strings
        return {str(found) for found in sbt.find(search_minhashes, query, 0.1)}

    factory = GraphFactory(31, 1e5, 4)
    with utils.TempDirectory() as location:
        tree = SBT(factory)

        for fname in utils.SIG_FILES:
            ss = load_one_signature(utils.get_test_data(fname))
            leaf = SigLeaf(os.path.basename(fname), ss)
            tree.add_node(leaf)
            to_search = leaf

        print('*' * 60)
        print("{}:".format(to_search.metadata))
        old_result = _hits(tree, to_search.data)
        print(*old_result, sep='\n')

        tarpath = os.path.join(location, 'tree.tar.gz')
        with TarStorage(tarpath) as storage:
            tree.save(os.path.join(location, 'tree'), storage=storage)

        with TarStorage(tarpath) as storage:
            tree = SBT.load(os.path.join(location, 'tree'),
                            leaf_loader=SigLeaf.load,
                            storage=storage)

            print('*' * 60)
            print("{}:".format(to_search.metadata))
            new_result = _hits(tree, to_search.data)
            print(*new_result, sep='\n')

            assert old_result == new_result