def write(self, csv_writer, csvoutfp, outdir):
    containment = self.containment()
    similarity = self.similarity()
    q_name = self.query.filename
    bp = self.total_bp
    seqs = self.total_seq
    k = self.query.ksize
    num_q_kmers = len(self.query.kmers)
    (best_con, cdbg_min_oh,
     catlas_min_oh) = self.query.con_sim_upper_bounds(self.catlas,
                                                      self.kmer_idx)

    # output to results.csv!
    csv_writer.writerow([q_name, containment, similarity, bp, seqs, k,
                         num_q_kmers, best_con, cdbg_min_oh,
                         catlas_min_oh])
    csvoutfp.flush()

    # write out signature from retrieved contigs.
    sig_filename = os.path.basename(q_name) + '.contigs.sig'
    with open(os.path.join(outdir, sig_filename), 'wt') as fp:
        ss = sourmash_lib.SourmashSignature(self.contigs_minhash,
                                            name='nbhd:' + self.query.name,
                                            filename=sig_filename)
        sourmash_lib.save_signatures([ss], fp)

    # write out cDBG IDs
    cdbg_listname = os.path.basename(q_name) + '.cdbg_ids.txt.gz'
    with gzip.open(os.path.join(outdir, cdbg_listname), 'wt') as fp:
        fp.write("\n".join([str(x) for x in sorted(self.shadow)]))

    # write out frontier nodes by seed
    frontier_listname = os.path.basename(q_name) + '.frontier.txt.gz'
    with gzip.open(os.path.join(outdir, frontier_listname), 'wt') as fp:
        for node, seedlist in sorted(self.frontier.items()):
            fp.write('{},{}\n'.format(node,
                                      " ".join([str(x) for x in
                                                sorted(seedlist)])))

    # write response curve
    response_curve_filename = os.path.basename(q_name) + '.response.txt'
    response_curve_filename = os.path.join(outdir, response_curve_filename)

    cdbg_match_counts = self.query.cdbg_match_counts[self.catlas.name]
    search_utils.output_response_curve(response_curve_filename,
                                       cdbg_match_counts,
                                       self.kmer_idx,
                                       self.catlas.layer1_to_cdbg)
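# A hedged sketch (not part of this class) of reading back the results.csv
# that write() appends to. The path 'out_dir/results.csv' is a hypothetical
# example; the column names mirror the header written by the search driver
# elsewhere in this code ('query', 'containment', 'similarity', 'bp',
# 'contigs', 'ksize', 'num_query_kmers', 'best_containment',
# 'cdbg_min_overhead', 'catlas_min_overhead').
import csv

with open('out_dir/results.csv', newline='') as fp:
    for row in csv.DictReader(fp):
        # print one summary line per query
        print(row['query'], float(row['containment']), float(row['similarity']))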
def test_sourmash_signature_api():
    e = sourmash.MinHash(n=1, ksize=20)
    sig = sourmash.SourmashSignature(e)

    s = sourmash.save_signatures([sig])
    sig_x1 = sourmash.load_one_signature(s)
    sig_x2 = list(sourmash.load_signatures(s))[0]

    assert sig_x1 == sig
    assert sig_x2 == sig
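# A companion sketch (an assumption, not part of the original test suite):
# the same save/load round trip, but through an on-disk file using the
# file-handle form of save_signatures() seen elsewhere in this code.
def test_sourmash_signature_file_roundtrip():
    import tempfile

    e = sourmash.MinHash(n=1, ksize=20)
    sig = sourmash.SourmashSignature(e)

    with tempfile.TemporaryDirectory() as tmpdir:
        sigfile = os.path.join(tmpdir, 'example.sig')
        with open(sigfile, 'wt') as fp:
            sourmash.save_signatures([sig], fp)

        assert sourmash.load_one_signature(sigfile) == sig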
def main(args=sys.argv[1:]):
    p = argparse.ArgumentParser()
    p.add_argument('catlas_prefix', help='catlas prefix')
    p.add_argument('output')
    p.add_argument('--minsize', type=float, default=100)
    p.add_argument('--maxsize', type=float, default=10000)
    p.add_argument('--keep-fraction', type=float, default=0.1)
    p.add_argument('-k', '--ksize', default=31, type=int,
                   help='k-mer size (default: 31)')
    args = p.parse_args(args)

    print('minsize: {:g}'.format(args.minsize))
    print('maxsize: {:g}'.format(args.maxsize))

    basename = os.path.basename(args.catlas_prefix)
    catlas = os.path.join(args.catlas_prefix, 'catlas.csv')
    domfile = os.path.join(args.catlas_prefix, 'first_doms.txt')

    # load catlas DAG
    top_node_id, dag, dag_up, dag_levels, cdbg_to_catlas = \
        search_utils.load_dag(catlas)
    print('loaded {} nodes from catlas {}'.format(len(dag), catlas))

    # load mapping between dom nodes and cDBG/graph nodes:
    layer1_to_cdbg = search_utils.load_layer1_to_cdbg(cdbg_to_catlas, domfile)
    print('loaded {} layer 1 catlas nodes'.format(len(layer1_to_cdbg)))

    # calculate the cDBG shadow sizes for each catlas node.
    print('decorating catlas with shadow size info.')
    node_shadow_sizes = search_utils.decorate_catlas_with_shadow_sizes(
        layer1_to_cdbg, dag, dag_levels)

    # ...and load cdbg node sizes
    print('loading contig size info')
    cdbg_kmer_sizes, cdbg_weighted_kmer_sizes = \
        search_utils.load_cdbg_size_info(args.catlas_prefix)

    # decorate catlas with cdbg node sizes underneath them
    print('decorating catlas with contig size info.')
    node_kmer_sizes, node_weighted_kmer_sizes = \
        search_utils.decorate_catlas_with_kmer_sizes(
            layer1_to_cdbg, dag, dag_levels, cdbg_kmer_sizes,
            cdbg_weighted_kmer_sizes)

    ### ok, the real work: look at articulation of cDBG graph.

    # find highest nodes with kmer size less than given max_size
    def find_terminal_nodes(node_id, max_size):
        node_list = set()
        for sub_id in dag[node_id]:
            # shadow size
            size = node_kmer_sizes[sub_id]

            if size < max_size:
                node_list.add(sub_id)
            else:
                children = find_terminal_nodes(sub_id, max_size)
                node_list.update(children)

        return node_list

    print('finding terminal nodes for {}.'.format(args.maxsize))

    terminal = find_terminal_nodes(top_node_id, args.maxsize)
    print('...got {}'.format(len(terminal)))
    terminal = {n for n in terminal if node_kmer_sizes[n] > args.minsize}
    print('...down to {} between {} and {} in size.'.format(
        len(terminal), args.minsize, args.maxsize))

    # now, go through and calculate ratios
    x = []
    for node_id in terminal:
        # calculate: how many k-mers per cDBG node?
        kmer_size = node_kmer_sizes[node_id]
        shadow_size = node_shadow_sizes[node_id]

        ratio = math.log(kmer_size, 2) - math.log(shadow_size, 2)

        # track basic info
        x.append((ratio, node_id, shadow_size, kmer_size))

    print('terminal node stats for maxsize: {:g}'.format(args.maxsize))
    print('n tnodes:', len(terminal))
    print('total k-mers:', node_kmer_sizes[top_node_id])

    x.sort(reverse=True)
    for (k, v, a, b) in x[:10]:
        print('ratio: {:.3f}'.format(2**k), '/ shadow size:', a,
              '/ kmers:', b)
    print('... eliding {} nodes'.format(len(x) - 20))
    for (k, v, a, b) in x[-10:]:
        print('ratio: {:.3f}'.format(2**k), '/ shadow size:', a,
              '/ kmers:', b)

    # keep the last keep-fraction (default 10%) for examination
    keep_sum_kmer = args.keep_fraction * node_kmer_sizes[top_node_id]
    sofar = 0
    keep_terminal = set()
    for (k, v, a, b) in reversed(x):
        sofar += b
        if sofar > keep_sum_kmer:
            break
        keep_terminal.add(v)

    print('keeping last {} k-mers worth of nodes for examination.'.format(
        sofar))

    # build cDBG shadow ID list.
    cdbg_shadow = set()
    terminal_shadow = find_shadow(keep_terminal, dag)
    for x in terminal_shadow:
        cdbg_shadow.update(layer1_to_cdbg.get(x))

    #### extract contigs
    print('extracting contigs & building a sourmash signature')
    contigs = os.path.join(args.catlas_prefix, 'contigs.fa.gz')

    # track results as signature
    contigs_mh = sourmash_lib.MinHash(n=0, ksize=args.ksize, scaled=1000)

    total_bp = 0
    total_seqs = 0

    outfp = open(args.output, 'wt')
    for n, record in enumerate(screed.open(contigs)):
        if n and n % 10000 == 0:
            offset_f = total_seqs / len(cdbg_shadow)
            print('...at n {} ({:.1f}% of shadow)'.format(
                total_seqs, offset_f * 100), end='\r')

        # contig names == cDBG IDs
        contig_id = int(record.name)
        if contig_id not in cdbg_shadow:
            continue

        outfp.write('>{}\n{}\n'.format(record.name, record.sequence))
        contigs_mh.add_sequence(record.sequence)

        # track retrieved sequences in a minhash
        total_bp += len(record.sequence)
        total_seqs += 1

    # done - got all contigs!
    print('')
    print('fetched {} contigs, {} bp.'.format(total_seqs, total_bp))
    print('wrote contigs to {}'.format(args.output))

    with open(args.output + '.sig', 'wt') as fp:
        ss = sourmash_lib.SourmashSignature(contigs_mh)
        sourmash_lib.save_signatures([ss], fp)
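# A small worked check (illustrative numbers only, not from the script): the
# ratio stored for each terminal node above is log2(kmer_size / shadow_size),
# so the 2**ratio value printed in the report is just the average number of
# k-mers per cDBG node under that terminal node.
import math

kmer_size, shadow_size = 4096, 16        # hypothetical terminal node
ratio = math.log(kmer_size, 2) - math.log(shadow_size, 2)
assert abs(2**ratio - kmer_size / shadow_size) < 1e-9   # 256 k-mers per node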
def gather_main(args):
    """
    Do a greedy search for the hash components of a query against an LCA db.

    Here we don't actually do a least-common-ancestor search of any kind; we
    do essentially the same kind of search as we do in `sourmash gather`,
    with the main difference that we are implicitly combining different
    genomes of identical lineages.

    This takes advantage of the structure of the LCA db, where we store the
    full lineage information for each known hash, as opposed to storing only
    the least-common-ancestor information for it.
    """
    p = argparse.ArgumentParser(prog="sourmash lca gather")
    p.add_argument('query')
    p.add_argument('db', nargs='+')
    p.add_argument('-d', '--debug', action='store_true')
    p.add_argument('-o', '--output', type=argparse.FileType('wt'),
                   help='output CSV containing matches to this file')
    p.add_argument('--output-unassigned', type=argparse.FileType('wt'),
                   help='output unassigned portions of the query as a '
                        'signature to this file')
    p.add_argument('--ignore-abundance', action='store_true',
                   help='do NOT use k-mer abundances if present')
    args = p.parse_args(args)

    if args.debug:
        set_debug(args.debug)

    # load all the databases
    dblist, ksize, scaled = lca_utils.load_databases(args.db, None)

    # for each query, gather all the matches across databases
    query_sig = sourmash_args.load_query_signature(args.query, ksize, 'DNA')
    debug('classifying', query_sig.name())

    # make sure we're looking at the same scaled value as database
    query_sig.minhash = query_sig.minhash.downsample_scaled(scaled)

    # do the classification, output results
    found = []
    for result, f_unassigned, est_bp, remaining_mins in gather_signature(
            query_sig, dblist, args.ignore_abundance):
        # is this our first time through the loop? print headers, if so.
        if not len(found):
            print_results("")
            print_results("overlap p_query p_match ")
            print_results("--------- ------- --------")

        # output!
        pct_query = '{:.1f}%'.format(result.f_unique_to_query * 100)
        pct_match = '{:.1f}%'.format(result.f_match * 100)
        str_bp = format_bp(result.intersect_bp)
        name = format_lineage(result.lineage)

        equal_match_str = ""
        if result.n_equal_matches:
            equal_match_str = " (** {} equal matches)".format(
                result.n_equal_matches)

        print_results('{:9} {:>6} {:>6} {}{}', str_bp, pct_query,
                      pct_match, name, equal_match_str)

        found.append(result)

    if found:
        print_results('')
        if f_unassigned:
            print_results('{:.1f}% ({}) of hashes have no assignment.',
                          f_unassigned * 100, format_bp(est_bp))
        else:
            print_results('Query is completely assigned.')
        print_results('')
    # nothing found.
    else:
        est_bp = len(query_sig.minhash.get_mins()) * query_sig.minhash.scaled
        print_results('')
        print_results('No assignment for est {} of sequence.',
                      format_bp(est_bp))
        print_results('')

    if not found:
        sys.exit(0)

    if args.output:
        fieldnames = ['intersect_bp', 'f_match', 'f_unique_to_query',
                      'f_unique_weighted', 'average_abund', 'name',
                      'n_equal_matches'] + list(lca_utils.taxlist())

        w = csv.DictWriter(args.output, fieldnames=fieldnames)
        w.writeheader()
        for result in found:
            lineage = result.lineage
            d = dict(result._asdict())
            del d['lineage']

            for (rank, value) in lineage:
                d[rank] = value

            w.writerow(d)

    if args.output_unassigned:
        if not found:
            notify('nothing found - entire query signature unassigned.')
        elif not remaining_mins:
            notify('no unassigned hashes! not saving.')
        else:
            outname = args.output_unassigned.name
            notify('saving unassigned hashes to "{}"', outname)

            e = query_sig.minhash.copy_and_clear()
            e.add_many(remaining_mins)
            sourmash_lib.save_signatures([sourmash_lib.SourmashSignature(e)],
                                         args.output_unassigned)
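# A toy illustration (not sourmash code) of the lineage-flattening step used
# when writing the CSV above: each (rank, value) pair in result.lineage
# becomes its own column in the output row. The lineage values below are
# hypothetical examples.
lineage = [('superkingdom', 'Bacteria'), ('phylum', 'Proteobacteria')]
d = {'name': 'example match'}
for (rank, value) in lineage:
    d[rank] = value
assert d['phylum'] == 'Proteobacteria'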
def main(args=sys.argv[1:]):
    p = argparse.ArgumentParser()
    p.add_argument('catlas_prefix', help='catlas prefix')
    p.add_argument('output')
    p.add_argument('--minsize', type=float, default=100)
    p.add_argument('--maxsize', type=float, default=10000)
    p.add_argument('--keep-fraction', type=float, default=0.1)
    p.add_argument('-k', '--ksize', default=31, type=int,
                   help='k-mer size (default: 31)')
    args = p.parse_args(args)

    print('minsize: {:g}'.format(args.minsize))
    print('maxsize: {:g}'.format(args.maxsize))

    catlas_file = os.path.join(args.catlas_prefix, 'catlas.csv')
    domfile = os.path.join(args.catlas_prefix, 'first_doms.txt')
    sizefile = os.path.join(args.catlas_prefix, 'contigs.fa.gz.info.csv')

    # load catlas DAG
    catlas = CAtlas(catlas_file, domfile=domfile, sizefile=sizefile)
    print('loaded {} nodes from catlas {}'.format(len(catlas), catlas_file))
    print('loaded {} layer 1 catlas nodes'.format(len(catlas.layer1_to_cdbg)))

    # calculate the cDBG shadow sizes for each catlas node.
    print('decorating catlas with shadow size info.')
    catlas.decorate_with_shadow_sizes()

    # ok, the real work: look at articulation of cDBG graph.

    # find highest nodes with kmer size less than given max_size
    def find_terminal_nodes(node_id, max_size):
        node_list = set()
        for sub_id in catlas.children[node_id]:
            # shadow size
            size = catlas.kmer_sizes[sub_id]

            if size < max_size:
                node_list.add(sub_id)
            else:
                children = find_terminal_nodes(sub_id, max_size)
                node_list.update(children)

        return node_list

    print('finding terminal nodes for {}.'.format(args.maxsize))

    terminal = find_terminal_nodes(catlas.root, args.maxsize)
    print('...got {}'.format(len(terminal)))
    terminal = {n for n in terminal if catlas.kmer_sizes[n] > args.minsize}
    print('...down to {} between {} and {} in size.'.format(len(terminal),
                                                            args.minsize,
                                                            args.maxsize))

    # now, go through and calculate ratios
    x = []
    for node_id in terminal:
        # calculate: how many k-mers per cDBG node?
        kmer_size = catlas.kmer_sizes[node_id]
        shadow_size = catlas.shadow_sizes[node_id]

        ratio = math.log(kmer_size, 2) - math.log(shadow_size, 2)

        # track basic info
        x.append((ratio, node_id, shadow_size, kmer_size))

    print('terminal node stats for maxsize: {:g}'.format(args.maxsize))
    print('n tnodes:', len(terminal))
    print('total k-mers:', catlas.kmer_sizes[catlas.root])

    x.sort(reverse=True)
    for (k, v, a, b) in x[:10]:
        print('ratio: {:.3f}'.format(2**k), '/ shadow size:', a,
              '/ kmers:', b)
    print('... eliding {} nodes'.format(len(x) - 20))
    for (k, v, a, b) in x[-10:]:
        print('ratio: {:.3f}'.format(2**k), '/ shadow size:', a,
              '/ kmers:', b)

    # keep the last keep-fraction (default 10%) for examination
    keep_sum_kmer = args.keep_fraction * catlas.kmer_sizes[catlas.root]
    sofar = 0
    keep_terminal = set()
    for (k, v, a, b) in reversed(x):
        sofar += b
        if sofar > keep_sum_kmer:
            break
        keep_terminal.add(v)

    print('keeping last {} k-mers worth of nodes for '
          'examination.'.format(sofar))

    # build cDBG shadow ID list.
    cdbg_shadow = catlas.shadow(keep_terminal)

    # extract contigs
    print('extracting contigs & building a sourmash signature')
    contigs = os.path.join(args.catlas_prefix, 'contigs.fa.gz')

    # track results as signature
    contigs_mh = sourmash_lib.MinHash(n=0, ksize=args.ksize, scaled=1000)

    total_bp = 0
    total_seqs = 0

    outfp = open(args.output, 'wt')
    for n, record in enumerate(screed.open(contigs)):
        if n and n % 10000 == 0:
            offset_f = total_seqs / len(cdbg_shadow)
            print('...at n {} ({:.1f}% of shadow)'.format(total_seqs,
                                                          offset_f * 100),
                  end='\r')

        # contig names == cDBG IDs
        contig_id = int(record.name)
        if contig_id not in cdbg_shadow:
            continue

        outfp.write('>{}\n{}\n'.format(record.name, record.sequence))
        contigs_mh.add_sequence(record.sequence)

        # track retrieved sequences in a minhash
        total_bp += len(record.sequence)
        total_seqs += 1

    # done - got all contigs!
    print('')
    print('fetched {} contigs, {} bp.'.format(total_seqs, total_bp))
    print('wrote contigs to {}'.format(args.output))

    with open(args.output + '.sig', 'wt') as fp:
        ss = sourmash_lib.SourmashSignature(contigs_mh)
        sourmash_lib.save_signatures([ss], fp)
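# A hedged sketch of driving main() above directly, assuming this module is
# run as a script; the catlas directory and output filename are hypothetical
# examples, not paths from the original project.
if __name__ == '__main__':
    main(['example_catlas_dir', 'terminal_contigs.fa',
          '--minsize', '100', '--maxsize', '10000',
          '--keep-fraction', '0.1'])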
def main(args=sys.argv[1:]):
    p = argparse.ArgumentParser()
    p.add_argument('catlas_prefix', help='catlas prefix')
    p.add_argument('query')
    p.add_argument('output')
    p.add_argument('--threshold', default=0.0, type=float)
    p.add_argument('--minsize', default=0, type=int)
    p.add_argument('-k', '--ksize', default=31, type=int,
                   help='k-mer size (default: 31)')
    args = p.parse_args(args)

    print('threshold: {:.3f}'.format(args.threshold))

    basename = os.path.basename(args.catlas_prefix)
    catlas = os.path.join(args.catlas_prefix, 'catlas.csv')
    domfile = os.path.join(args.catlas_prefix, 'first_doms.txt')

    # load catlas DAG
    top_node_id, dag, dag_up, dag_levels, cdbg_to_catlas = \
        search_utils.load_dag(catlas)
    print('loaded {} nodes from catlas {}'.format(len(dag), catlas))

    # load mapping between dom nodes and cDBG/graph nodes:
    layer1_to_cdbg = search_utils.load_layer1_to_cdbg(cdbg_to_catlas, domfile)
    print('loaded {} layer 1 catlas nodes'.format(len(layer1_to_cdbg)))

    # calculate the cDBG shadow sizes for each catlas node.
    print('decorating catlas with shadow size info.')
    node_shadow_sizes = search_utils.decorate_catlas_with_shadow_sizes(
        layer1_to_cdbg, dag, dag_levels)

    # ...and load cdbg node sizes
    print('loading contig size info')
    cdbg_kmer_sizes, cdbg_weighted_kmer_sizes = \
        search_utils.load_cdbg_size_info(args.catlas_prefix)

    # decorate catlas with cdbg node sizes underneath them
    print('decorating catlas with contig size info.')
    node_kmer_sizes, node_weighted_kmer_sizes = \
        search_utils.decorate_catlas_with_kmer_sizes(
            layer1_to_cdbg, dag, dag_levels, cdbg_kmer_sizes,
            cdbg_weighted_kmer_sizes)

    # load k-mer index, query, etc. etc.
    kmer_idx = search_utils.load_kmer_index(args.catlas_prefix)

    bf = khmer.Nodetable(args.ksize, 1, 1)

    query_kmers = set()
    for record in screed.open(args.query):
        query_kmers.update(bf.get_kmer_hashes(record.sequence))

    print('got {} k-mers from {}'.format(len(query_kmers), args.query))

    # construct dict cdbg_id -> # of query k-mers
    cdbg_match_counts = kmer_idx.get_match_counts(query_kmers)

    total_match_kmers = sum(cdbg_match_counts.values())
    f_found = total_match_kmers / len(query_kmers)
    print('=> containment: {:.1f}%'.format(f_found * 100))
    print('done loading & counting query k-mers in cDBG.')

    if total_match_kmers == 0:
        print('no match k-mers!?')
        sys.exit(-1)

    # calculate the cDBG matching k-mers sizes for each catlas node.
    catlas_match_counts = kmer_idx.build_catlas_match_counts(
        cdbg_match_counts, dag, dag_levels, layer1_to_cdbg)

    ### ok, the real work: find nodes that have low # of k-mers in the query.

    def find_unassembled_nodes(node_id, threshold=0.0):
        node_list = set()
        for sub_id in dag[node_id]:
            n_matched = catlas_match_counts.get(sub_id, 0)
            size = node_kmer_sizes[sub_id]

            f_assembled = n_matched / size

            # if the fraction of assembled (query-matched) k-mers under this
            # node is at or below our threshold, KEEP the node. Otherwise,
            # descend into children.
            if f_assembled <= threshold:
                node_list.add(sub_id)
            else:
                children = find_unassembled_nodes(sub_id, threshold)
                node_list.update(children)

        return node_list

    print('finding unassembled nodes for threshold {}.'.format(
        args.threshold))

    terminal = find_unassembled_nodes(top_node_id, args.threshold)
    sum_kmers = sum([node_kmer_sizes[n] for n in terminal])
    sum_match_kmers = sum([catlas_match_counts.get(n, 0) for n in terminal])
    print('...got {} nodes, representing {} k-mers'.format(len(terminal),
                                                           sum_kmers))

    # now, go through all nodes and print out characteristics
    print('writing node info to {}'.format(args.output + '.csv'))
    with open(args.output + '.csv', 'wt') as fp:
        w = csv.writer(fp)
        w.writerow(['node_id', 'contained', 'n_kmers', 'n_weighted_kmers',
                    'average_weight', 'shadow_size'])

        for n in terminal:
            f_contained = catlas_match_counts.get(n, 0) / node_kmer_sizes[n]
            w.writerow([n, '{:.3f}'.format(f_contained),
                        node_kmer_sizes[n],
                        '{:.1f}'.format(node_weighted_kmer_sizes[n]),
                        '{:.2f}'.format(node_weighted_kmer_sizes[n] /
                                        node_kmer_sizes[n]),
                        node_shadow_sizes[n]])

    if args.minsize:
        print('minsize set: {}. filtering.'.format(args.minsize))
        new_terminal = set()
        for n in terminal:
            if node_kmer_sizes[n] >= args.minsize:
                new_terminal.add(n)

        print('removed {} nodes => {}'.format(
            len(terminal) - len(new_terminal), len(new_terminal)))
        terminal = new_terminal

    # build cDBG shadow ID list, tagged by parent catlas node.
    cdbg_id_to_node = {}
    for n in terminal:
        this_shadow = find_shadow([n], dag)
        for x in this_shadow:
            v = layer1_to_cdbg[x]
            for vv in v:
                cdbg_id_to_node[vv] = n

    #### extract contigs
    print('extracting contigs & building a sourmash signature')
    contigs = os.path.join(args.catlas_prefix, 'contigs.fa.gz')

    # track results as signature
    contigs_mh = sourmash_lib.MinHash(n=0, ksize=args.ksize, scaled=1000)

    total_bp = 0
    total_seqs = 0

    print('writing contigs to {}'.format(args.output + '.fa'))
    outfp = open(args.output + '.fa', 'wt')
    for n, record in enumerate(screed.open(contigs)):
        if n and n % 10000 == 0:
            offset_f = total_seqs / len(cdbg_id_to_node)
            print('...at n {} ({:.1f}% of shadow)'.format(total_seqs,
                                                          offset_f * 100),
                  end='\r')

        # contig names == cDBG IDs
        contig_id = int(record.name)
        catlas_parent = cdbg_id_to_node.get(contig_id)
        if catlas_parent is None:
            continue

        outfp.write('>{} {}\n{}\n'.format(record.name, catlas_parent,
                                          record.sequence))
        contigs_mh.add_sequence(record.sequence)

        # track retrieved sequences in a minhash
        total_bp += len(record.sequence)
        total_seqs += 1

    # done - got all contigs!
    print('')
    print('fetched {} contigs, {} bp.'.format(total_seqs, total_bp))

    print('writing sig to {}'.format(args.output + '.sig'))
    with open(args.output + '.sig', 'wt') as fp:
        ss = sourmash_lib.SourmashSignature(contigs_mh)
        sourmash_lib.save_signatures([ss], fp)
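# A toy check (illustrative numbers only) of the keep/descend rule used in
# find_unassembled_nodes() above: a node is kept when the fraction of its
# k-mers matched by the query ("assembled") is at or below the threshold.
n_matched, size, threshold = 30, 1000, 0.1
f_assembled = n_matched / size           # 0.03
assert f_assembled <= threshold          # so this node would be kept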
def main(argv):
    p = argparse.ArgumentParser()
    p.add_argument('catlas_prefix', help='catlas prefix')
    p.add_argument('--overhead', help='%% of overhead', type=float,
                   default=0.0)
    p.add_argument('output')
    p.add_argument('--min_containment', help="minimum containment",
                   type=float, default=1.0)
    p.add_argument('--max_overhead', help="largest overhead allowed",
                   type=float, default=1.0)
    p.add_argument('--query', help='query sequences', nargs='+')
    p.add_argument('--no-empty', action='store_true')
    p.add_argument('-k', '--ksize', default=31, type=int,
                   help='k-mer size (default: 31)')
    p.add_argument('--scaled', default=1000, type=float)
    p.add_argument('-v', '--verbose', action='store_true')

    args = p.parse_args(argv)

    # make sure all of the query sequences exist.
    for filename in args.query:
        if not os.path.exists(filename):
            print('query seq file {} does not exist.'.format(filename))
            sys.exit(-1)

    # create output directory if it doesn't exist.
    try:
        os.mkdir(args.output)
    except OSError:
        pass
    if not os.path.isdir(args.output):
        print('output {} is not a directory'.format(args.output))
        sys.exit(-1)

    # figure out catlas and domfile information.
    basename = os.path.basename(args.catlas_prefix)
    catlas = os.path.join(args.catlas_prefix, 'catlas.csv')
    domfile = os.path.join(args.catlas_prefix, 'first_doms.txt')

    # load catlas DAG
    top_node_id, dag, dag_up, dag_levels, catlas_to_cdbg = load_dag(catlas)
    print('loaded {} nodes from catlas {}'.format(len(dag), catlas))

    # load mapping between dom nodes and cDBG/graph nodes:
    layer1_to_cdbg = load_layer1_to_cdbg(catlas_to_cdbg, domfile)
    print('loaded {} layer 1 catlas nodes'.format(len(layer1_to_cdbg)))

    # find the contigs filename
    contigs = os.path.join(args.catlas_prefix, 'contigs.fa.gz')

    # ...and kmer index.
    ki_start = time.time()
    kmer_idx = load_kmer_index(args.catlas_prefix)
    print('loaded {} k-mers in index ({:.1f}s)'.format(
        len(kmer_idx.mphf_to_kmer), time.time() - ki_start))

    # calculate the k-mer sizes for each catlas node.
    node_sizes = kmer_idx.build_catlas_node_sizes(dag, dag_levels,
                                                  layer1_to_cdbg)

    # get a single ksize & scaled
    ksize = int(args.ksize)
    scaled = int(args.scaled)

    # record command line
    with open(os.path.join(args.output, 'command.txt'), 'wt') as fp:
        fp.write(str(sys.argv))
        fp.write("\n")

    # output results.csv in the output directory:
    csvoutfp = open(os.path.join(args.output, 'results.csv'), 'wt')
    csv_writer = csv.writer(csvoutfp)
    csv_writer.writerow(['query', 'containment', 'similarity', 'bp',
                         'contigs', 'ksize', 'num_query_kmers',
                         'best_containment', 'cdbg_min_overhead',
                         'catlas_min_overhead'])

    # iterate over each query, do the thing.
    for query in args.query:
        # ignore all the problems!
        try:
            print('----')
            print('QUERY FILE:', query)
            start_time = time.time()

            # build hashes for all the query k-mers
            print('loading query kmers...', end=' ')
            bf = khmer.Nodetable(ksize, 1, 1)

            query_kmers = set()
            query_name = None
            for record in screed.open(query):
                if query_name is None:
                    query_name = record.name
                query_kmers.update(bf.get_kmer_hashes(record.sequence))

            print('got {}'.format(len(query_kmers)))

            # construct dict cdbg_id -> # of query k-mers
            cdbg_match_counts = kmer_idx.get_match_counts(query_kmers)
            for k, v in cdbg_match_counts.items():
                assert v <= kmer_idx.get_cdbg_size(k), k

            total_match_kmers = sum(cdbg_match_counts.values())
            f_found = total_match_kmers / len(query_kmers)
            print('=> containment: {:.1f}%'.format(f_found * 100))
            print('done loading & counting query k-mers in cDBG.'
                  ' ({:.1f}s)'.format(time.time() - start_time))

            total_kmers_in_cdbg_matches = 0
            for cdbg_id in set(cdbg_match_counts.keys()):
                total_kmers_in_cdbg_matches += kmer_idx.get_cdbg_size(cdbg_id)

            cdbg_sim = total_match_kmers / total_kmers_in_cdbg_matches
            print('cdbg match node similarity: {:.1f}%'.format(
                cdbg_sim * 100))
            cdbg_min_overhead = (total_kmers_in_cdbg_matches -
                                 total_match_kmers) / \
                total_kmers_in_cdbg_matches
            print('min cdbg overhead: {}'.format(cdbg_min_overhead))

            # calculate the cDBG matching k-mers sizes for each catlas node.
            catlas_match_counts = kmer_idx.build_catlas_match_counts(
                cdbg_match_counts, dag, dag_levels, layer1_to_cdbg)

            # check a few things - we've propagated properly:
            assert sum(cdbg_match_counts.values()) == \
                catlas_match_counts[top_node_id]

            # ...and all nodes have no more matches than total k-mers.
            for k, v in catlas_match_counts.items():
                assert v <= node_sizes[k], k

            # calculate the minimum overhead of the search, based on level 1
            # nodes.
            catlas_min_overhead = 0
            if catlas_match_counts[top_node_id]:
                all_query_kmers = catlas_match_counts[top_node_id]
                total_kmers_in_query_nodes = 0
                for node_id, level in dag_levels.items():
                    if level == 1 and catlas_match_counts.get(node_id):
                        total_kmers_in_query_nodes += node_sizes[node_id]

                catlas_min_overhead = (total_kmers_in_query_nodes -
                                       all_query_kmers) / \
                    total_kmers_in_query_nodes
                print('minimum catlas overhead: {}'.format(
                    catlas_min_overhead))

            # gather results of all queries
            fuzzy = args.max_overhead != 1.0
            if fuzzy:
                max_oh = args.max_overhead
                min_con = args.min_containment
                total_frontier = collect_frontier(dag, top_node_id,
                                                  node_sizes,
                                                  catlas_match_counts,
                                                  max_overhead=max_oh,
                                                  min_containment=min_con)
            else:
                total_frontier = collect_frontier_exact(
                    dag, top_node_id, node_sizes, catlas_match_counts,
                    overhead=args.overhead, verbose=args.verbose)

            # calculate level 1 nodes for this frontier in the catlas
            total_shadow = find_shadow(total_frontier, dag)

            # calculate associated cDBG nodes
            cdbg_shadow = set()
            for x in total_shadow:
                cdbg_shadow.update(layer1_to_cdbg.get(x))

            # done with main loop! now extract contigs using cDBG shadow
            # node list.
            print('done searching! {} frontier, {} catlas shadow nodes, {}'
                  ' cdbg nodes.'.format(len(total_frontier),
                                        len(total_shadow),
                                        len(cdbg_shadow)))

            # track extracted info
            total_bp = 0
            total_seqs = 0

            # build check MinHash w/seed=42
            query_sig = build_query_mh_for_seed(42, ksize, scaled, query)

            # track minhash of retrieved contigs using original query minhash:
            contigs_minhash = query_sig.minhash.copy_and_clear()

            retrieve_start = time.time()

            # walk through the contigs, retrieving.
            print('extracting contigs...')
            for n, record in enumerate(
                    search_utils.get_contigs_by_cdbg(contigs, cdbg_shadow)):
                if n and n % 10000 == 0:
                    offset_f = total_seqs / len(cdbg_shadow)
                    print('...at n {} ({:.1f}% of shadow)'.format(
                        total_seqs, offset_f * 100), end='\r')

                # track retrieved sequences in a minhash
                contigs_minhash.add_sequence(str(record.sequence), True)
                total_bp += len(record.sequence)
                total_seqs += 1

            # done - got all contigs!
            print('...fetched {} contigs, {} bp matching combined frontiers. '
                  ' ({:.1f}s)'.format(total_seqs, total_bp,
                                      time.time() - retrieve_start))

            # calculate summary values of extracted contigs
            containment = query_sig.minhash.contained_by(contigs_minhash)
            similarity = query_sig.minhash.similarity(contigs_minhash)
            print('query inclusion by retrieved contigs:'
                  ' {:.3f}%'.format(containment * 100))
            print('query similarity to retrieved contigs:'
                  ' {:.3f}%'.format(similarity * 100))

            # recover from above.
            best_containment = f_found

            # output to results.csv!
            csv_writer.writerow([query, containment, similarity, total_bp,
                                 total_seqs, ksize, len(query_kmers),
                                 best_containment, cdbg_min_overhead,
                                 catlas_min_overhead])
            csvoutfp.flush()

            # write out signature from retrieved contigs.
            sig_filename = os.path.basename(query) + '.contigs.sig'
            with open(os.path.join(args.output, sig_filename), 'wt') as fp:
                ss = sourmash_lib.SourmashSignature(contigs_minhash,
                                                    name='nbhd:' + query_name,
                                                    filename=sig_filename)
                sourmash_lib.save_signatures([ss], fp)

            # write out cDBG IDs
            cdbg_listname = os.path.basename(query) + '.cdbg_ids.txt.gz'
            with gzip.open(os.path.join(args.output, cdbg_listname),
                           'wt') as fp:
                fp.write("\n".join([str(x) for x in cdbg_shadow]))

            # write out frontier nodes by seed
            frontier_listname = os.path.basename(query) + '.frontier.txt.gz'
            with gzip.open(os.path.join(args.output, frontier_listname),
                           'wt') as fp:
                for node, seedlist in total_frontier.items():
                    fp.write('{},{}\n'.format(
                        node, " ".join([str(x) for x in seedlist])))

            # write response curve
            response_curve_filename = os.path.basename(query) + \
                '.response.txt'
            response_curve_filename = os.path.join(args.output,
                                                   response_curve_filename)

            search_utils.output_response_curve(response_curve_filename,
                                               cdbg_match_counts,
                                               kmer_idx,
                                               layer1_to_cdbg)
            print('total time: {:.1f}s'.format(time.time() - start_time))
        except KeyboardInterrupt:
            raise
        except:
            traceback.print_exc()

    # end main loop!

    sys.exit(0)
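# A small worked example (illustrative numbers only, not from the search
# driver) of the two quantities computed per query above: containment of the
# query k-mers in the cDBG, and the minimum overhead implied by the matching
# cDBG nodes.
query_kmers = 1000
total_match_kmers = 900                  # query k-mers found in cDBG nodes
total_kmers_in_cdbg_matches = 1200       # all k-mers in those matching nodes

containment = total_match_kmers / query_kmers                     # 0.9
min_overhead = (total_kmers_in_cdbg_matches - total_match_kmers) \
    / total_kmers_in_cdbg_matches                                 # 0.25
assert (containment, min_overhead) == (0.9, 0.25)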
def main():
    p = argparse.ArgumentParser()
    p.add_argument('lca_filename')
    p.add_argument('sigfiles', nargs='+')
    p.add_argument('-k', '--ksize', default=31, type=int)
    p.add_argument('--output-unassigned', type=argparse.FileType('wt'),
                   help='output unassigned portions of the query as a '
                        'signature to this file')
    args = p.parse_args()

    # load lca info
    lca_db = lca_json.LCA_Database(args.lca_filename)
    taxfoo, hashval_to_lca, scaled = lca_db.get_database(args.ksize, SCALED)

    # load signatures
    siglist = []
    print('loading signatures from {} signature files'.format(
        len(args.sigfiles)))
    for sigfile in args.sigfiles:
        sigs = sourmash_lib.load_signatures(sigfile, ksize=args.ksize)
        sigs = list(sigs)
        siglist.extend(sigs)

    print('loaded {} signatures total at k={}'.format(len(siglist),
                                                      args.ksize))

    # downsample
    print('downsampling to scaled value: {}'.format(scaled))
    for sig in siglist:
        if sig.minhash.scaled < scaled:
            sig.minhash = sig.minhash.downsample_scaled(scaled)

    # now, extract hash values!
    hashvals = collections.defaultdict(int)
    for sig in siglist:
        for hashval in sig.minhash.get_mins():
            hashvals[hashval] += 1

    found = 0
    total = 0
    by_taxid = collections.defaultdict(int)

    unassigned_hashvals = set()

    # for every hash, get LCA of labels
    for hashval, count in hashvals.items():
        lca = hashval_to_lca.get(hashval)
        total += count

        if lca is None:
            by_taxid[0] += count
            unassigned_hashvals.add(hashval)
            continue

        by_taxid[lca] += count
        found += count

    print('found LCA classifications for', found, 'of', total, 'hashes')

    not_found = total - found

    # now, propagate counts up the taxonomic tree.
    by_taxid_lca = collections.defaultdict(int)
    for taxid, count in by_taxid.items():
        by_taxid_lca[taxid] += count

        parent = taxfoo.child_to_parent.get(taxid)
        while parent is not None and parent != 1:
            by_taxid_lca[parent] += count
            parent = taxfoo.child_to_parent.get(parent)

    total_count = sum(by_taxid.values())

    # sort by lineage length
    x = []
    for taxid, count in by_taxid_lca.items():
        x.append((len(taxfoo.get_lineage(taxid)), taxid, count))
    x.sort()

    # ...aaaaaand output.
    print('{}\t{}\t{}\t{}\t{}\t{}'.format('percent', 'below', 'at node',
                                          'code', 'taxid', 'name'))
    for _, taxid, count_below in x:
        if taxid == 0:
            continue

        percent = round(100 * count_below / total_count, 2)
        count_at = by_taxid[taxid]

        rank = taxfoo.node_to_info.get(taxid)
        if rank:
            rank = rank[0]
            classify_code = kraken_rank_code.get(rank, '-')
        else:
            classify_code = '-'

        name = taxfoo.taxid_to_names.get(taxid)
        if name:
            name = name[0]
        else:
            name = '-'

        print('{}\t{}\t{}\t{}\t{}\t{}'.format(percent, count_below, count_at,
                                              classify_code, taxid, name))

    if not_found:
        classify_code = 'U'
        percent = round(100 * not_found / total_count, 2)
        count_below = not_found
        count_at = not_found
        taxid = 0
        name = 'not classified'

        print('{}\t{}\t{}\t{}\t{}\t{}'.format(percent, count_below, count_at,
                                              classify_code, taxid, name))

    if args.output_unassigned:
        outname = args.output_unassigned.name
        print('saving unassigned hashes to "{}"'.format(outname))

        e = sourmash_lib.MinHash(ksize=args.ksize, n=0, scaled=scaled)
        e.add_many(unassigned_hashvals)
        sourmash_lib.save_signatures(
            [sourmash_lib.SourmashSignature('', e)],
            args.output_unassigned)
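# A toy illustration (not from the script above) of the count-propagation
# step: each taxid's count is added to every ancestor, stopping at the root
# (taxid 1). The tiny taxonomy below is a hypothetical example.
import collections

child_to_parent = {3: 2, 2: 1}           # 3 -> 2 -> 1 (root)
by_taxid = {3: 5, 2: 1}

by_taxid_lca = collections.defaultdict(int)
for taxid, count in by_taxid.items():
    by_taxid_lca[taxid] += count
    parent = child_to_parent.get(taxid)
    while parent is not None and parent != 1:
        by_taxid_lca[parent] += count
        parent = child_to_parent.get(parent)

assert dict(by_taxid_lca) == {3: 5, 2: 6}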