def test_sbt_dayhoff_command_index(c):
    """Build an SBT database from two dayhoff protein signatures via the
    command line, then verify leaf reconstruction, search, and gather."""
    sig_paths = [
        utils.get_test_data(
            'prot/dayhoff/GCA_001593925.1_ASM159392v1_protein.faa.gz.sig'),
        utils.get_test_data(
            'prot/dayhoff/GCA_001593935.1_ASM159393v1_protein.faa.gz.sig'),
    ]
    db_out = c.output('dayhoff.sbt.zip')

    c.run_sourmash('index', db_out, sig_paths[0], sig_paths[1],
                   '--scaled', '100', '-k', '57', '--dayhoff')

    db2 = sourmash.load_sbt_index(db_out)

    sig1 = sourmash.load_one_signature(sig_paths[0])
    sig2 = sourmash.load_one_signature(sig_paths[1])

    # check reconstruction -- every input minhash must round-trip
    mh_list = [leaf.minhash for leaf in db2.signatures()]
    assert len(mh_list) == 2
    assert sig1.minhash in mh_list
    assert sig2.minhash in mh_list

    # and search, gather
    results = db2.search(sig1, threshold=0.0, ignore_abundance=True,
                         do_containment=False, best_only=False)
    assert len(results) == 2

    results = db2.gather(sig2)
    assert results[0][0] == 1.0
def load_sbt_file(tree_file):
    """Load an SBT index from ``tree_file`` and return it.

    Returns None when ``tree_file`` does not exist. If the file exists
    but cannot be loaded as an SBT, report to stderr and exit.
    """
    if not os.path.exists(tree_file):
        # preserved behavior: silently return None for a missing path
        return None
    try:
        sbt = sourmash.load_sbt_index(tree_file)
    except Exception:
        # was a bare `except:`, which also swallowed KeyboardInterrupt
        # and SystemExit; narrowed to Exception.
        sys.stderr.write(f"cannot load sbt file at {tree_file}\n")
        # NOTE(review): sys.exit() exits with status 0 even on this error
        # path -- confirm whether a nonzero code is wanted.
        sys.exit()
    sys.stderr.write(f"loaded sbt file at {tree_file}\n")
    return sbt
def create_sbt_or_load_existing(tree_file, load_existing=False):
    """Return an SBT index: loaded from ``tree_file`` when requested,
    otherwise a brand-new empty index.

    If loading fails, falls back to creating a fresh index (previously
    this path left ``sbt`` unbound and raised NameError at the return).

    hmm.. adding and overwriting seems complicated but managed? see
    sourmash/sbt_storage.py
    """
    if load_existing:
        try:
            return sourmash.load_sbt_index(tree_file)
        except Exception:
            # was a bare `except:`; narrowed to Exception
            sys.stderr.write(f"\ncannot load sbt file at {tree_file}\n")
            # fall through to create a new index instead of crashing
    return sourmash.create_sbt_index()
def main():
    """Self-check an SBT: every leaf's signature must find itself via
    db.find() at the configured threshold; print and abort otherwise."""
    parser = argparse.ArgumentParser()
    parser.add_argument('sbt')
    args = parser.parse_args()

    db = sourmash.load_sbt_index(args.sbt)
    threshold = THRESHOLD

    for leaf in db.leaves():
        query = leaf.data
        found = [match.data
                 for match in db.find(search_minhashes, query, threshold)]
        if query not in found:
            print(query)
            assert 0
def main():
    """Verify each leaf of an SBT is recoverable by searching for itself.

    Prints any signature that does not match itself, then asserts out.
    """
    argp = argparse.ArgumentParser()
    argp.add_argument('sbt')
    opts = argp.parse_args()

    tree = sourmash.load_sbt_index(opts.sbt)
    cutoff = THRESHOLD

    for node in tree.leaves():
        sig = node.data
        hits = tree.find(search_minhashes, sig, cutoff)
        hit_data = [h.data for h in hits]
        if sig not in hit_data:
            print(sig)
            assert 0
def load_index(filename):
    """Load ``filename`` as an SBT index or an LCA database.

    Tries the SBT loader first, then the LCA loader; returns the first
    index that loads successfully, or None when neither does.

    Fix: the previous version assigned ``index`` before calling
    ``.load()`` on the LCA database, so a failed load still returned the
    empty, unloaded LCA_Database object instead of None.
    """
    import sourmash

    # first, try loading as an SBT
    try:
        return sourmash.load_sbt_index(filename)
    except (ValueError, EnvironmentError):
        pass

    # next, try loading as an LCA database; only hand it back once
    # .load() has actually succeeded.
    try:
        lca_db = sourmash.lca.lca_utils.LCA_Database()
        lca_db.load(filename)
        return lca_db
    except (ValueError, EnvironmentError, TypeError):
        pass

    # TODO: raise error
    return None
# Script: subtract, from a query signature's hashes, every hash present in
# one or more SBT databases -- leaving only the query's "unexplained" hashes.
import argparse

parser = argparse.ArgumentParser()
parser.add_argument("-k", "--ksize", type=int, default=51)
parser.add_argument("-o", "--output", type=str, default=None)
parser.add_argument("query")
parser.add_argument("sbt", nargs="+")  # one or more SBT index files
args = parser.parse_args()

query = sourmash.load_one_signature(args.query, ksize=args.ksize)
query_mins = set(query.minhash.get_mins())

for index in args.sbt:
    sbt = sourmash.load_sbt_index(index)
    for i, dataset in enumerate(sbt.leaves()):
        dataset_mins = dataset.data.minhash.get_mins()
        # drop the leaf's cached signature data (presumably to bound
        # memory while walking a large tree -- TODO confirm)
        del dataset._data
        query_mins -= set(dataset_mins)
        if not query_mins:
            # nothing left to explain; stop scanning this database
            break
        if i % 100 == 0:
            print(
                f"Progress: {i} sigs processed, query has {len(query_mins)} hashes left"
            )

new_mh = query.minhash.copy_and_clear()
if new_mh.track_abundance:
    # NOTE(review): SOURCE is truncated mid-expression here -- the
    # set_abundances(...) call continues beyond the visible chunk.
    new_mh.set_abundances({
for k in load_routing_keys(args.tags): channel.queue_bind(exchange=args.exchange, queue=queue_name, routing_key=k) # channel.queue_bind( # exchange=args.exchange, queue=queue_name, routing_key='phiweger2.#') # channel.queue_bind( # exchange=args.exchange, queue=queue_name, routing_key='*.*.found.#') # # exchange=args.exchange, queue=queue_name, routing_key='hello.#') # Pass user data to callbacks using partial() # https://github.com/pika/pika/issues/158 # on_message_callback=callback(download=True) try: db = sourmash.load_sbt_index(args.db) # 'genomes/context.sbt.json' except (FileNotFoundError, TypeError): db = None channel.basic_consume(queue=queue_name, on_message_callback=partial(callback, download=True, db=db, threshold=args.threshold, outfile=args.log, outdir=args.outdir)) # outfile can be e.g. ".log" or "-" # eprint('Waiting for messages. To exit press CTRL+C') channel.start_consuming()
def _load_database(filename, traverse, traverse_yield_all):
    """Load file as a database - list of signatures, LCA, SBT, etc.

    Return (db, dbtype), where dbtype is a DatabaseType enum.

    This is an internal function used by other functions in sourmash_args.

    Loaders are attempted in order: stdin, signature directory, single
    signature file, SBT index, LCA database. The first to succeed wins.

    Raises OSError when nothing loads; the message distinguishes files
    that screed can parse as FASTA/FASTQ sequence data.
    """
    loaded = False
    dbtype = None

    # special case stdin
    if not loaded and filename == '-':
        db = sourmash.load_signatures(sys.stdin, quiet=True, do_raise=True)
        db = list(db)
        loaded = True
        dbtype = DatabaseType.SIGLIST

    # load signatures from directory
    if not loaded and os.path.isdir(filename) and traverse:
        all_sigs = []
        for thisfile in traverse_find_sigs([filename], traverse_yield_all):
            try:
                with open(thisfile, 'rt') as fp:
                    x = sourmash.load_signatures(fp, quiet=True, do_raise=True)
                    siglist = list(x)
                    all_sigs.extend(siglist)
            except (IOError, sourmash.exceptions.SourmashError):
                if traverse_yield_all:
                    continue
                else:
                    raise
        loaded = True
        db = all_sigs
        dbtype = DatabaseType.SIGLIST

    # load signatures from single file. Previously this attempt was NOT
    # guarded by `if not loaded:` and so re-opened `filename` even after
    # the stdin/directory paths had already succeeded.
    if not loaded:
        try:
            # CTB: could make this a generator, with some trickery; but for
            # now, just force into list.
            with open(filename, 'rt') as fp:
                db = sourmash.load_signatures(fp, quiet=True, do_raise=True)
                db = list(db)
            loaded = True
            dbtype = DatabaseType.SIGLIST
        except Exception:
            # deliberate best-effort: fall through to the next loader
            pass

    if not loaded:                  # try load as SBT
        try:
            db = load_sbt_index(filename)
            loaded = True
            dbtype = DatabaseType.SBT
        except Exception:           # narrowed from a bare `except:`
            pass

    if not loaded:                  # try load as LCA
        try:
            db, _, _ = load_single_database(filename)
            loaded = True
            dbtype = DatabaseType.LCA
        except Exception:           # narrowed from a bare `except:`
            pass

    if not loaded:
        # Nothing loaded: check whether this is actually sequence data so
        # we can give a more helpful error message.
        successful_screed_load = False
        try:
            # CTB: could be kind of time consuming for big record, but at the
            # moment screed doesn't expose format detection cleanly.
            with screed.open(filename) as it:
                record = next(iter(it))
                successful_screed_load = True
        except Exception:           # narrowed from a bare `except:`
            pass

        if successful_screed_load:
            raise OSError("Error while reading signatures from '{}' - got sequences instead! Is this a FASTA/FASTQ file?".format(filename))

    if not loaded:
        raise OSError(
            "Error while reading signatures from '{}'.".format(filename))

    return db, dbtype
    # NOTE(review): this chunk begins inside a function whose `def` line is
    # outside the visible source; the indentation below reconstructs the
    # visible tail of that function.
    found = ix.search(sig, do_containment=True, threshold=threshold)
    if found:
        # NOTE(review): the loop variable rebinds `sig`, shadowing the
        # query signature searched above -- confirm this is intentional.
        for val, sig, _ in found:
            # uvig_317315 SRR1160888_95 length_20936_VirSor...
            # dedupe hits by the first tab-separated token of the name
            name = sig.name().split('\t')[0]
            if not name in phage_names:
                phage_sigs.append(sig)
                phage_names.add(name)
    return phage_sigs


# Housekeeping
params = {'ksize': args.k, 'n': 0, 'scaled': args.scaled}
ix = load_sbt_index(args.index)

# Search each (pro)phage candidate in the phage database; sort results by
# the leading (tab-delimited) token of each hit's name for stable output.
phage_sigs = sorted(
    search_ix(args.candidates, ix, params, args.min_containment),
    key=lambda x: x.name().split('\t')[0])

'''
Example header from gut phage database .fasta:

uvig_256501\tERR1190858_420 length_42504_VirSorter_cat_2

On VirSorter categories:

> Categories 1 and 4 represent the most confident assignments within each type meaning at least one hallmark viral gene is detected and an enrichment in viral‐like genes, 2 and 5 for ‘likely’ predictions containing either an enrichment in viral‐like genes or a hallmark gene, and 3 and 6 are ‘possible’ predictions. -- https://sfamjournals.onlinelibrary.wiley.com/doi/full/10.1111/1462-2920.15186
'''