def sbt_combine(args):
    parser = argparse.ArgumentParser()
    parser.add_argument('sbt_name', help='name to save SBT into')
    parser.add_argument('sbts', nargs='+',
                        help='SBTs to combine to a new SBT')
    parser.add_argument('-x', '--bf-size', type=float, default=1e5)

    sourmash_args.add_moltype_args(parser)

    args = parser.parse_args(args)
    moltype = sourmash_args.calculate_moltype(args)

    inp_files = list(args.sbts)
    notify('combining {} SBTs', len(inp_files))

    tree = sourmash_lib.load_sbt_index(inp_files.pop(0))

    for f in inp_files:
        new_tree = sourmash_lib.load_sbt_index(f)
        # TODO: check if parameters are the same for both trees!
        tree.combine(new_tree)

    notify('saving SBT under "{}".', args.sbt_name)
    tree.save(args.sbt_name)
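
# Usage sketch (not part of the original module): sbt_combine() takes an
# argv-style list, the way a CLI dispatcher would pass leftover arguments.
# The .sbt.json filenames below are hypothetical; the first name is the
# output index, the rest are the existing SBTs to merge.
def _example_sbt_combine():
    sbt_combine(['combined.sbt.json', 'animals.sbt.json', 'plants.sbt.json'])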

def main():
    p = argparse.ArgumentParser()
    p.add_argument('sbt')
    # the output file is written to unconditionally below, so require it
    p.add_argument('-o', '--output', type=argparse.FileType('wt'),
                   required=True)
    args = p.parse_args()

    db = sourmash_lib.load_sbt_index(args.sbt)

    for n, leaf in enumerate(db.leaves()):
        if n % 1000 == 0:
            print('... at leaf', n)
        name = leaf.data.name()

        # & output!
        args.output.write('{}\n'.format(name))

    print('got accessions from {} signatures'.format(n + 1))
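
# Sketch (assumed filename): the same leaf traversal done programmatically,
# without argparse -- load an SBT and collect one name per leaf signature.
def _example_leaf_names(sbt_file='genbank.sbt.json'):
    db = sourmash_lib.load_sbt_index(sbt_file)
    return [leaf.data.name() for leaf in db.leaves()]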

def gather_main(args):
    """
    Sample identifiers that share lineage with a given species, at each
    taxonomic rank, and record their signature filenames from an SBT.
    """
    p = argparse.ArgumentParser()
    p.add_argument('--debug', action='store_true')
    p.add_argument('spreadsheet')
    p.add_argument('species')
    p.add_argument('--sbt')
    p.add_argument('-o', '--output', type=argparse.FileType('wt'))
    args = p.parse_args(args)

    if args.debug:
        set_debug(args.debug)

    assignments, num_rows = load_taxonomy_assignments(args.spreadsheet)

    # find the lineage for the requested species
    found = False
    for ident, lineage in assignments.items():
        for vv in lineage:
            if vv.rank == 'species' and vv.name == args.species:
                found = True
                found_lineage = lineage
                break
        if found:
            break

    if not found:
        print('nothing found for {}; quitting'.format(args.species))
        sys.exit(-1)

    print('found:', ", ".join(lca_utils.zip_lineage(found_lineage)))

    lineage_search = dict(found_lineage)

    # group each identifier by the most specific rank it shares with the
    # target lineage (taxlist is reversed, so species is checked first)
    rank_found = defaultdict(list)
    rank_idents = defaultdict(list)
    taxlist = list(reversed(list(lca_utils.taxlist())))

    for ident, lineage in assignments.items():
        dd = dict(lineage)
        for k in taxlist:
            if dd.get(k) and dd.get(k) == lineage_search.get(k):
                rank_found[k].append(lineage)
                rank_idents[k].append(ident)
                break

    # pick up to 10 identifiers at random from each rank
    retrieve_idents = defaultdict(set)
    gimme_idents = {}
    for k in rank_found:
        print('at', k, 'found', len(rank_found.get(k)))
        num_to_extract = min(len(rank_idents[k]), 10)
        gimme = random.sample(rank_idents[k], num_to_extract)
        for g in gimme:
            gimme_idents[g] = k

    if not args.output or not args.sbt:
        print('no output arg or SBT arg given; quitting without extracting')
        sys.exit(-1)

    print('looking for:', len(gimme_idents))

    # walk the SBT leaves and write out matching signatures as CSV rows
    tree = sourmash_lib.load_sbt_index(args.sbt)
    w = csv.writer(args.output)

    for n, leaf in enumerate(tree.leaves()):
        if n % 1000 == 0:
            print('...', n)
        name = leaf.data.name()

        # hack for NCBI-style names, etc.
        name = name.split(' ')[0].split('.')[0]
        if name in gimme_idents:
            level = gimme_idents[name]
            level_n = taxlist.index(level)
            filename = leaf.data.d['filename']
            w.writerow([level, level_n, filename, leaf.data.name()])
            print('FOUND!', leaf.data.name(), level)
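
# Usage sketch (hypothetical paths and species name): sample signatures
# related to a target species from a taxonomy spreadsheet plus an SBT, and
# write a CSV of (rank, rank index, filename, signature name) rows.
def _example_gather():
    gather_main(['taxonomy.csv', 'Escherichia coli',
                 '--sbt', 'genbank.sbt.json', '-o', 'related.csv'])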

def categorize(args):
    parser = argparse.ArgumentParser()
    parser.add_argument('sbt_name', help='name of SBT to load')
    parser.add_argument('queries', nargs='+',
                        help='list of signatures to categorize')
    parser.add_argument('-q', '--quiet', action='store_true',
                        help='suppress non-error output')
    parser.add_argument('-k', '--ksize', type=int, default=None)
    parser.add_argument('--threshold', default=0.08, type=float)
    parser.add_argument('--traverse-directory', action="store_true")

    sourmash_args.add_moltype_args(parser)

    parser.add_argument('--csv', type=argparse.FileType('at'))
    parser.add_argument('--load-csv', default=None)

    args = parser.parse_args(args)
    set_quiet(args.quiet)
    moltype = sourmash_args.calculate_moltype(args)

    already_names = set()
    if args.load_csv:
        with open(args.load_csv, 'rt') as fp:
            r = csv.reader(fp)
            for row in r:
                already_names.add(row[0])

    tree = sourmash_lib.load_sbt_index(args.sbt_name)

    if args.traverse_directory:
        inp_files = set(sourmash_args.traverse_find_sigs(args.queries))
    else:
        inp_files = set(args.queries) - already_names

    inp_files = set(inp_files) - already_names

    notify('found {} files to query', len(inp_files))

    loader = sourmash_args.LoadSingleSignatures(inp_files,
                                                args.ksize, moltype)

    for queryfile, query, query_moltype, query_ksize in loader:
        notify('loaded query: {}... (k={}, {})', query.name()[:30],
               query_ksize, query_moltype)

        results = []
        search_fn = sourmash_lib.sbtmh.SearchMinHashesFindBest().search

        for leaf in tree.find(search_fn, query, args.threshold):
            if leaf.data.md5sum() != query.md5sum():  # ignore self.
                results.append((query.similarity(leaf.data), leaf.data))

        best_hit_sim = 0.0
        best_hit_query_name = ""
        if results:
            results.sort(key=lambda x: -x[0])   # reverse sort on similarity
            best_hit_sim, best_hit_query = results[0]
            notify('for {}, found: {:.2f} {}', query.name(),
                   best_hit_sim, best_hit_query.name())
            best_hit_query_name = best_hit_query.name()
        else:
            notify('for {}, no match found', query.name())

        if args.csv:
            w = csv.writer(args.csv)
            w.writerow([queryfile, best_hit_query_name, best_hit_sim])

    if loader.skipped_ignore:
        notify('skipped/ignore: {}', loader.skipped_ignore)
    if loader.skipped_nosig:
        notify('skipped/nosig: {}', loader.skipped_nosig)
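
# Usage sketch (hypothetical filenames): categorize a batch of signatures
# against an SBT at k=31, appending one row per query to a CSV. On a re-run,
# pointing --load-csv at the same CSV skips queries already categorized.
def _example_categorize():
    categorize(['genbank.sbt.json', 'query1.sig', 'query2.sig',
                '-k', '31', '--csv', 'categorized.csv'])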

def index(args):
    """
    Build a Sequence Bloom Tree index of the given signatures.
    """
    import sourmash_lib.sbtmh

    parser = argparse.ArgumentParser()
    parser.add_argument('sbt_name', help='name to save SBT into')
    parser.add_argument('signatures', nargs='+',
                        help='signatures to load into SBT')
    parser.add_argument('-q', '--quiet', action='store_true',
                        help='suppress non-error output')
    parser.add_argument('-k', '--ksize', type=int, default=None,
                        help='k-mer size for which to build the SBT.')
    parser.add_argument('-d', '--n_children', type=int, default=2,
                        help='Number of children for internal nodes')
    parser.add_argument('--traverse-directory', action='store_true',
                        help='load all signatures underneath this directory.')
    parser.add_argument('--append', action='store_true', default=False,
                        help='add signatures to an existing SBT.')
    parser.add_argument('-x', '--bf-size', type=float, default=1e5,
                        help='Bloom filter size used for internal nodes.')
    parser.add_argument('-s', '--sparseness', type=float, default=.0,
                        help='What percentage of internal nodes will not be saved. '
                             'Ranges from 0.0 (save all nodes) to 1.0 (no nodes saved)')

    sourmash_args.add_moltype_args(parser)

    args = parser.parse_args(args)

    set_quiet(args.quiet)
    moltype = sourmash_args.calculate_moltype(args)

    if args.append:
        tree = sourmash_lib.load_sbt_index(args.sbt_name)
    else:
        tree = sourmash_lib.create_sbt_index(args.bf_size,
                                             n_children=args.n_children)

    if args.traverse_directory:
        inp_files = list(sourmash_args.traverse_find_sigs(args.signatures))
    else:
        inp_files = list(args.signatures)

    if args.sparseness < 0 or args.sparseness > 1.0:
        error('sparseness must be in range [0.0, 1.0].')

    notify('loading {} files into SBT', len(inp_files))

    n = 0
    ksizes = set()
    moltypes = set()
    nums = set()
    scaleds = set()
    for f in inp_files:
        siglist = sig.load_signatures(f, ksize=args.ksize,
                                      select_moltype=moltype)

        # load all matching signatures in this file
        for ss in siglist:
            ksizes.add(ss.minhash.ksize)
            moltypes.add(sourmash_args.get_moltype(ss))
            nums.add(ss.minhash.num)
            scaleds.add(ss.minhash.scaled)

            leaf = sourmash_lib.sbtmh.SigLeaf(ss.md5sum(), ss)
            tree.add_node(leaf)
            n += 1

        # check to make sure we aren't loading incompatible signatures
        if len(ksizes) > 1 or len(moltypes) > 1:
            error('multiple k-mer sizes or molecule types present; fail.')
            error('specify --dna/--protein and --ksize as necessary')
            error('ksizes: {}; moltypes: {}',
                  ", ".join(map(str, ksizes)), ", ".join(moltypes))
            sys.exit(-1)

        if nums == {0} and len(scaleds) == 1:
            pass                           # good
        elif scaleds == {0} and len(nums) == 1:
            pass                           # also good
        else:
            error('trying to build an SBT with incompatible signatures.')
            error('nums = {}; scaleds = {}', repr(nums), repr(scaleds))
            sys.exit(-1)

    # did we load any!?
    if n == 0:
        error('no signatures found to load into tree!? failing.')
        sys.exit(-1)

    notify('loaded {} sigs; saving SBT under "{}"', n, args.sbt_name)
    tree.save(args.sbt_name, sparseness=args.sparseness)
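
# Usage sketch (hypothetical paths): build a k=31 SBT from every signature
# found under a directory, traversing subdirectories for .sig files.
def _example_index():
    index(['genbank.sbt.json', 'sigs/', '-k', '31', '--traverse-directory'])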

def watch(args):
    "Build a signature from raw FASTA/FASTQ coming in on stdin, search."
    from sourmash_lib.sbtmh import SearchMinHashesFindBest

    parser = argparse.ArgumentParser()
    parser.add_argument('sbt_name', help='name of SBT to search')
    parser.add_argument('inp_file', nargs='?', default='/dev/stdin')
    parser.add_argument('-q', '--quiet', action='store_true',
                        help='suppress non-error output')
    parser.add_argument('-o', '--output', type=argparse.FileType('wt'),
                        help='save signature generated from data here')
    parser.add_argument('--threshold', default=0.05, type=float,
                        help='minimum threshold for matches')
    parser.add_argument('--input-is-protein', action='store_true',
                        help='Consume protein sequences - no translation needed')

    sourmash_args.add_construct_moltype_args(parser)

    parser.add_argument('-n', '--num-hashes', type=int, default=DEFAULT_N,
                        help='number of hashes to use in each sketch (default: %(default)i)')
    parser.add_argument('--name', type=str, default='stdin',
                        help='name to use for generated signature')
    sourmash_args.add_ksize_arg(parser, DEFAULT_LOAD_K)

    args = parser.parse_args(args)
    set_quiet(args.quiet)

    if args.input_is_protein and args.dna:
        notify('WARNING: input is protein, turning off DNA hashing.')
        args.dna = False
        args.protein = True

    if args.dna and args.protein:
        notify('ERROR: cannot use "watch" with both DNA and protein.')
        sys.exit(-1)

    if args.dna:
        moltype = 'DNA'
        is_protein = False
    else:
        moltype = 'protein'
        is_protein = True

    tree = sourmash_lib.load_sbt_index(args.sbt_name)

    # check ksize from the SBT we are loading
    ksize = args.ksize
    if ksize is None:
        leaf = next(iter(tree.leaves()))
        tree_mh = leaf.data.minhash
        ksize = tree_mh.ksize

    E = sourmash_lib.MinHash(ksize=ksize, n=args.num_hashes,
                             is_protein=is_protein)
    streamsig = sig.SourmashSignature(E, filename='stdin', name=args.name)

    notify('Computing signature for k={}, {} from stdin', ksize, moltype)

    def do_search():
        search_fn = SearchMinHashesFindBest().search

        results = []
        for leaf in tree.find(search_fn, streamsig, args.threshold):
            results.append((streamsig.similarity(leaf.data), leaf.data))

        return results

    notify('reading sequences from stdin')
    screed_iter = screed.open(args.inp_file)
    watermark = WATERMARK_SIZE

    # iterate over input records
    n = 0
    for n, record in enumerate(screed_iter):
        # at each watermark, print status & check cardinality
        if n >= watermark:
            notify('\r... read {} sequences', n, end='')
            watermark += WATERMARK_SIZE

            if do_search():
                break

        if args.input_is_protein:
            E.add_protein(record.sequence)
        else:
            E.add_sequence(record.sequence, False)

    results = do_search()
    if not results:
        notify('... read {} sequences, no matches found.', n)
    else:
        results.sort(key=lambda x: -x[0])   # take best
        similarity, found_sig = results[0]
        print_results('FOUND: {}, at {:.3f}', found_sig.name(),
                      similarity)

    if args.output:
        notify('saving signature to {}', args.output.name)
        sig.save_signatures([streamsig], args.output)
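
# Usage sketch (hypothetical filenames): stream reads from a FASTA file as if
# they arrived on stdin, stop as soon as a match above the threshold appears,
# and save the accumulated streaming signature.
def _example_watch():
    watch(['genbank.sbt.json', 'reads.fa', '-o', 'stream.sig',
           '--threshold', '0.1'])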