def gather_at_rank(mh, lca_db, lin_db, match_rank):
    """Run gather, and aggregate counts at the given taxonomic rank.

    Yields (lineage, count) pairs in descending count order, where count is
    the number of hashes shared between the (remaining) query and each match,
    after popping each match's lineage up to `match_rank`.
    """
    import copy
    # work on a copy so the caller's minhash is not consumed by gather.
    minhash = copy.copy(mh)
    query_sig = sourmash.SourmashSignature(minhash)

    # do the gather:
    counts = Counter()
    while 1:
        results = lca_db.gather(query_sig, threshold_bp=0)
        if not results:
            break

        (match, match_sig, _) = results[0]

        # retrieve lineage & pop to match_rank
        match_ident = get_ident(match_sig)
        match_lineage = lin_db.ident_to_lineage[match_ident]
        match_lineage = pop_to_rank(match_lineage, match_rank)

        # count at match_rank
        common = match_sig.minhash.count_common(query_sig.minhash)
        counts[match_lineage] += common

        # finish out gather algorithm! remove matched hashes and re-wrap
        # the shrinking minhash in a fresh signature for the next round.
        minhash.remove_many(match_sig.minhash.hashes)
        query_sig = sourmash.SourmashSignature(minhash)

    # return results, largest count first.
    for lin, count in counts.most_common():
        yield lin, count
def intersect(args):
    """
    Intersect one or more signatures by taking the intersection of hashes.

    Abundances are removed unless --abundances-from is given, in which case
    abundances for the surviving hashes are copied from that signature.
    (The old docstring claimed abundances were *always* removed, which the
    --abundances-from branch contradicts.)
    """
    set_quiet(args.quiet)
    moltype = sourmash_args.calculate_moltype(args)

    first_sig = None
    mins = None
    total_loaded = 0
    for sigfile in args.signatures:
        for sigobj in sourmash.load_signatures(sigfile, ksize=args.ksize,
                                               select_moltype=moltype,
                                               do_raise=True):
            # the first signature seen seeds the running intersection.
            if first_sig is None:
                first_sig = sigobj
                mins = set(sigobj.minhash.get_mins())

            mins.intersection_update(sigobj.minhash.get_mins())
            total_loaded += 1
        notify('loaded and intersected signatures from {}...', sigfile,
               end='\r')

    if total_loaded == 0:
        # fixed copy-paste from merge(): this command is 'intersect'.
        error("no signatures to intersect!?")
        sys.exit(-1)

    # forcibly turn off track_abundance, unless --abundances-from set.
    if not args.abundances_from:
        intersect_mh = first_sig.minhash.copy_and_clear()
        intersect_mh.track_abundance = False
        intersect_mh.add_many(mins)
        intersect_sigobj = sourmash.SourmashSignature(intersect_mh)
    else:
        notify('loading signature from {}, keeping abundances',
               args.abundances_from)
        abund_sig = sourmash.load_one_signature(args.abundances_from,
                                                ksize=args.ksize,
                                                select_moltype=moltype)
        if not abund_sig.minhash.track_abundance:
            error("--track-abundance not set on loaded signature?! exiting.")
            sys.exit(-1)

        intersect_mh = abund_sig.minhash.copy_and_clear()
        abund_mins = abund_sig.minhash.get_mins(with_abundance=True)

        # restrict to hashes present in every input signature, keeping the
        # abundances recorded in the --abundances-from signature.
        mins.intersection_update(abund_mins)
        abund_mins = { k: abund_mins[k] for k in mins }
        intersect_mh.set_abundances(abund_mins)
        intersect_sigobj = sourmash.SourmashSignature(intersect_mh)

    with FileOutput(args.output, 'wt') as fp:
        sourmash.save_signatures([intersect_sigobj], fp=fp)

    notify('loaded and intersected {} signatures', total_loaded)
def sig_import(args):
    """
    import a signature into sourmash format.

    Reads one or more mash-style JSON sketch files and writes them all out
    as a single sourmash signature file.
    """
    set_quiet(args.quiet)

    converted = []
    for filename in args.filenames:
        with open(filename) as fp:
            data = json.load(fp)

        ksize = data['kmer']
        sketch_size = data['sketchSize']

        # only this exact sketch flavor is supported.
        assert data['hashType'] == "MurmurHash3_x64_128"
        assert data['hashBits'] == 64
        assert data['hashSeed'] == 42

        sketch = data['sketches'][0]

        mh = sourmash.MinHash(ksize=ksize, n=sketch_size, is_protein=False)
        mh.add_many(sketch['hashes'])

        converted.append(sourmash.SourmashSignature(mh, filename=filename))

    with FileOutput(args.output, 'wt') as fp:
        sourmash.save_signatures(converted, fp)
def test_calculate_containment_at_rank_3():
    # two lineages with overlapping hashes (50% containment)
    first_hash = 12345678
    second_hash = 87654321
    ident = 'uniq'
    mh1, sig1, lin1 = make_sig_and_lin([first_hash], ident, 'a;b;c')
    lin2 = lca_utils.make_lineage('a;d')
    match_rank = "genus"

    # build the lineage -> hashes mapping, one hash per lineage
    lineage_hashD = defaultdict(test_gen_mh)
    lineage_hashD = add_hashes_at_ranks(lineage_hashD, [first_hash], lin1,
                                        match_rank)
    lineage_hashD = add_hashes_at_ranks(lineage_hashD, [second_hash], lin2,
                                        match_rank)

    # the query signature holds both hashes
    query_sig = sourmash.SourmashSignature(make_mh([first_hash, second_hash]),
                                           name='query')

    # calculate containment
    containmentD = calculate_containment_at_rank(lineage_hashD, query_sig,
                                                 match_rank)

    # superkingdom lineage that should have 100% containment
    lin3 = lca_utils.make_lineage('a')
    assert containmentD["superkingdom"][0][1] == 1.0
    # class should have 50% containment
    assert containmentD["class"][0][1] == 0.5
    # both phylum-level lineages contain half of the query
    assert [entry[1] for entry in containmentD["phylum"][:2]] == [0.5, 0.5]
def main():
    """Project read-signature abundances onto the k-mers of an assembly."""
    parser = argparse.ArgumentParser()
    parser.add_argument('contigs')   # this is an assembly
    parser.add_argument('read_sig')  # this contains sourmash sig with abunds
    parser.add_argument('-o', '--output', required=True)
    args = parser.parse_args()

    sigs = list(sourmash.load_file_as_signatures(args.read_sig))
    assert len(sigs) == 1
    read_sig = sigs[0]

    # sketch the assembly with the same parameters as the read signature.
    contigs_mh = read_sig.minhash.copy_and_clear()
    for record in screed.open(args.contigs):
        contigs_mh.add_sequence(record.sequence, force=True)

    # intersect the genome assembly with the read abundances, so we get
    # the abundances of only the k-mers that are in the assembly.
    abunds = {}
    for hashval in contigs_mh.hashes:
        abunds[hashval] = read_sig.minhash.hashes.get(hashval, 0)

    output_mh = read_sig.minhash.copy_and_clear()
    output_mh.set_abundances(abunds)
    out_sig = sourmash.SourmashSignature(output_mh)

    with open(args.output, 'wt') as fp:
        print(f"Saving output to '{args.output}'")
        sourmash.save_signatures([out_sig], fp)
def test_calculate_containment_at_rank_4():
    # add two (nonmatching) hashvals to query
    first_hash = 12345678
    second_hash = 87654321
    ident = 'uniq'
    mh1, sig1, lin1 = make_sig_and_lin([first_hash], ident, 'a;b;c')
    lin2 = lca_utils.make_lineage('a;d')
    match_rank = "genus"

    # build the lineage -> hashes mapping, one hash per lineage
    lineage_hashD = defaultdict(test_gen_mh)
    lineage_hashD = add_hashes_at_ranks(lineage_hashD, [first_hash], lin1,
                                        match_rank)
    lineage_hashD = add_hashes_at_ranks(lineage_hashD, [second_hash], lin2,
                                        match_rank)

    # query carries two extra hashes that match nothing
    query_mh = make_mh([first_hash, second_hash, 33333333, 44444444])
    query_sig = sourmash.SourmashSignature(query_mh, name='query')

    # calculate containment
    containmentD = calculate_containment_at_rank(lineage_hashD, query_sig,
                                                 match_rank)

    # superkingdom lineage that should have 50% containment
    lin3 = lca_utils.make_lineage('a')
    assert containmentD["superkingdom"][0][1] == 0.5
    # each class should have 25% containment
    assert containmentD["class"][0][1] == 0.25
    # both phylum lineages should each contain a quarter of the query
    assert [entry[1] for entry in containmentD["phylum"][:2]] == [0.25, 0.25]
def build_signature(p):
    """Build a sourmash signature from a single (header, sequence) pair."""
    name, sequence = p
    sketch = sourmash.MinHash(n=0, ksize=51, scaled=100)
    sketch.add_sequence(str(sequence), force=True)
    return sourmash.SourmashSignature(sketch, name=name)
def load_or_generate_sig_from_file(input_file, alphabet, ksize, scaled, ignore_abundance=False, translate=False):
    """Load a signature from a .sig file, or sketch one from a fasta file.

    Returns a SourmashSignature built from the file's records; the minhash
    flavor is chosen by determine_appropriate_fresh_minhash().
    """
    # NOTE(review): "" is the not-yet-assigned sentinel; both branches
    # overwrite it, so callers should never actually see the empty string.
    sig = ""
    if input_file.endswith(".sig"):
        # do I want to enable multiple sigs per file here?
        sig = sourmash.load_one_signature(input_file, ksize=ksize)
    else:
        # read file and add sigs
        records = try_reading_fasta_file(input_file)
        # build signature name from filename .. maybe just keep filename?
        #signame = os.path.basename(input_file.rsplit("_", 1)[0])
        # start with fresh minhash
        mh = determine_appropriate_fresh_minhash(alphabet, ksize, scaled,
                                                 ignore_abundance)
        if records:
            for record in records:
                if alphabet == "nucleotide" or translate:
                    mh.add_sequence(record.sequence, force=True)
                else:
                    mh.add_protein(record.sequence)
        # minhash --> signature, using filename as signature name.
        # NOTE(review): if `records` is falsy this still builds a signature
        # from an empty minhash -- confirm that is intended.
        sig = sourmash.SourmashSignature(mh, name=os.path.basename(input_file))
    return sig
def test_sort_by_rank_and_containment_2():
    # 1. three results, check that they sort by rank, containment
    hash_a = 12345678
    hash_b = 87654321
    hash_c = 33333333
    ident = 'uniq'
    mh1, sig1, lin1 = make_sig_and_lin([hash_a], ident, 'a;b;c')
    lin2 = lca_utils.make_lineage('a;d')
    match_rank = "genus"

    # two hashes assigned to lin1, one to lin2
    lineage_hashD = defaultdict(test_gen_mh)
    lineage_hashD = add_hashes_at_ranks(lineage_hashD, [hash_a, hash_c],
                                        lin1, match_rank)
    lineage_hashD = add_hashes_at_ranks(lineage_hashD, [hash_b], lin2,
                                        match_rank)

    # query has one extra hash that matches nothing
    query_sig = sourmash.SourmashSignature(
        make_mh([hash_a, hash_b, hash_c, 44444444]), name='query')

    superK_lin = lca_utils.make_lineage('a')
    phylum_match_lin = lca_utils.make_lineage('a;b')

    # calculate containment, then sort
    containmentD = calculate_containment_at_rank(lineage_hashD, query_sig,
                                                 match_rank)
    sorted_results = sort_by_rank_and_containment(containmentD, match_rank)

    # superkingdom first, containing 3/4 of the query
    assert sorted_results[0].lineage == superK_lin
    assert sorted_results[0].contained_at_rank == 0.75
    # phylum results should also be sorted by containment
    assert sorted_results[1].lineage[-1].rank == "phylum"
    assert sorted_results[1].contained_at_rank == 0.5
    assert sorted_results[2].lineage[-1].rank == "phylum"
    assert sorted_results[2].contained_at_rank == 0.25
    # class results
    assert sorted_results[3].lineage[-1].rank == "class"
    assert sorted_results[3].contained_at_rank == 0.5
def write(self, csv_writer, csvoutfp, outdir):
    """Write this hashval query's results: CSV row, cDBG IDs, contigs, sig."""
    hashval = self.query_hashval

    # output to results.csv!
    csv_writer.writerow([hashval, self.total_bp, self.total_seq])
    csvoutfp.flush()

    # TR add contigs folder
    q_name = str(hashval)
    contigs_dir = os.path.join(outdir, "contigs")

    # write out cDBG IDs
    cdbg_listname = os.path.basename(q_name) + '.cdbg_ids.txt.gz'
    with gzip.open(os.path.join(contigs_dir, cdbg_listname), 'wt') as fp:
        fp.write("\n".join(str(x) for x in sorted(self.cdbg_shadow)))

    # write out contigs
    contigs_outname = os.path.basename(q_name) + '.contigs.fa.gz'
    with gzip.open(os.path.join(contigs_dir, contigs_outname), 'wt') as fp:
        for name, sequence in self.contigs:
            fp.write('>{}\n{}\n'.format(name, sequence))

    # save minhash?
    if self.mh:
        ss = sourmash.SourmashSignature(
            self.mh, name='hashval query:{}'.format(q_name))
        sigfile = os.path.join(contigs_dir, q_name + '.contigs.sig')
        with open(sigfile, 'wt') as fp:
            sourmash.save_signatures([ss], fp)
def __init__(self, query_file, ksize, scaled, catlas_name, debug=True):
    """Load query k-mers and a scaled MinHash sketch from a sequence file.

    Populates self.kmers (hashes of all query k-mers), self.mh / self.sig
    (sourmash sketch of the query) and self.name (first record's name).
    """
    self.filename = query_file
    self.ksize = ksize
    self.kmers = set()
    self.name = None
    mh = MinHash(0, ksize, scaled=scaled)
    self.mh = mh
    self.catlas_name = catlas_name
    self.debug = debug

    notify('----')
    notify('QUERY FILE: {}', self.filename)

    # build hashes for all the query k-mers & create signature
    notify('loading query kmers...', end=' ')
    # the Nodetable is only ever used for get_kmer_hashes here, never
    # updated -- it serves as a k-mer hasher, not a counter.
    bf = khmer.Nodetable(ksize, 1, 1)

    for record in screed.open(self.filename):
        if self.name is None:
            # first record's name becomes the query name.
            self.name = record.name
        if len(record.sequence) >= int(ksize):
            self.kmers.update(bf.get_kmer_hashes(record.sequence))
        mh.add_sequence(record.sequence, True)

    self.sig = sourmash.SourmashSignature(mh, name=self.name,
                                          filename=self.filename)

    notify('got {} k-mers from query', len(self.kmers))

    # per-catlas match-count caches, filled in later by callers.
    self.cdbg_match_counts = {}
    self.catlas_match_counts = {}
def test_sourmash_signature_api():
    """Round-trip one signature through the save/load API surface."""
    mh = sourmash.MinHash(n=1, ksize=20)
    original = sourmash.SourmashSignature(mh)
    serialized = sourmash.save_signatures([original])

    via_load_one = sourmash.load_one_signature(serialized)
    via_load_many = list(sourmash.load_signatures(serialized))[0]

    assert via_load_one == original
    assert via_load_many == original
def merge(args):
    """
    Merge one or more signatures into a single output signature.

    Abundances are kept unless --flatten is given; without --flatten, all
    inputs must agree on abundance tracking (checked per-signature via
    _check_abundance_compatibility). Exits non-zero if nothing was loaded.
    """
    set_quiet(args.quiet)
    moltype = sourmash_args.calculate_moltype(args)

    first_sig = None
    mh = None
    total_loaded = 0

    # iterate over all the sigs from all the files.
    for sigfile in args.signatures:
        notify('loading signatures from {}...', sigfile, end='\r')
        this_n = 0
        for sigobj in sourmash.load_signatures(sigfile, ksize=args.ksize,
                                               select_moltype=moltype,
                                               do_raise=True):
            # first signature? initialize a bunch of stuff
            if first_sig is None:
                first_sig = sigobj
                mh = first_sig.minhash.copy_and_clear()

                # forcibly remove abundance?
                if args.flatten:
                    mh.track_abundance = False

            try:
                sigobj_mh = sigobj.minhash
                if not args.flatten:
                    _check_abundance_compatibility(first_sig, sigobj)
                else:
                    sigobj_mh.track_abundance = False

                mh.merge(sigobj_mh)
            except Exception:
                # narrowed from a bare 'except:' so KeyboardInterrupt and
                # SystemExit pass through without the error banner; the
                # exception is still re-raised either way.
                error("ERROR when merging signature '{}' ({}) from file {}",
                      sigobj.name(), sigobj.md5sum()[:8], sigfile)
                raise

            this_n += 1
            total_loaded += 1
        if this_n:
            notify('loaded and merged {} signatures from {}...', this_n,
                   sigfile, end='\r')

    if not total_loaded:
        error("no signatures to merge!?")
        sys.exit(-1)

    merged_sigobj = sourmash.SourmashSignature(mh)

    with FileOutput(args.output, 'wt') as fp:
        sourmash.save_signatures([merged_sigobj], fp=fp)

    notify('loaded and merged {} signatures', total_loaded)
def intersect(args):
    """
    Intersect one or more signatures by taking the intersection of hashes.

    This function always removes abundances.
    """
    p = SourmashArgumentParser(prog='sourmash signature intersect')
    p.add_argument('signatures', nargs='+')
    p.add_argument('-q', '--quiet', action='store_true',
                   help='suppress non-error output')
    p.add_argument('-o', '--output', type=argparse.FileType('wt'),
                   default=sys.stdout,
                   help='output signature to this file')
    sourmash_args.add_ksize_arg(p, DEFAULT_LOAD_K)
    sourmash_args.add_moltype_args(p)
    args = p.parse_args(args)
    set_quiet(args.quiet)
    moltype = sourmash_args.calculate_moltype(args)

    first_sig = None
    mins = None
    total_loaded = 0
    for sigfile in args.signatures:
        for sigobj in sourmash.load_signatures(sigfile, ksize=args.ksize,
                                               select_moltype=moltype,
                                               do_raise=True):
            # the first signature seen seeds the running intersection.
            if first_sig is None:
                first_sig = sigobj
                mins = set(sigobj.minhash.get_mins())

            mins.intersection_update(sigobj.minhash.get_mins())
            total_loaded += 1
        notify('loaded and intersected signatures from {}...', sigfile,
               end='\r')

    if total_loaded == 0:
        # fixed copy-paste from merge(): this command is 'intersect'.
        error("no signatures to intersect!?")
        sys.exit(-1)

    # forcibly turn off track_abundance
    intersect_mh = first_sig.minhash.copy_and_clear()
    _flatten(intersect_mh)
    intersect_mh.add_many(mins)
    intersect_sigobj = sourmash.SourmashSignature(intersect_mh)

    # (return value of save_signatures was previously bound but never used)
    sourmash.save_signatures([intersect_sigobj], fp=args.output)

    notify('loaded and intersected {} signatures', total_loaded)
def main():
    """Classify pickled node minhashes against an LCA database.

    For each node, runs gather, keeps leading matches contributing >= 10%
    of the query, computes the LCA of the kept lineages, and reports how
    many nodes are 'pure' (>= 95% of gathered signal at species/genus/strain
    level).
    """
    p = argparse.ArgumentParser()
    p.add_argument('node_mh_pickle')
    p.add_argument('lca_db')
    args = p.parse_args()

    # NOTE(review): pickle.load executes arbitrary code from the file --
    # only run this on trusted pickles.
    node_mhs = pickle.load(open(args.node_mh_pickle, 'rb'))

    lca_obj = LCA_Database()
    lca_obj.load(args.lca_db)

    databases = ((lca_obj, args.lca_db, 'LCA'),)

    d = {}  # NOTE(review): never used below.
    n_pure95 = 0
    total = 0
    for k, v in node_mhs.items():
        ss = sourmash.SourmashSignature(v)
        results = [ x[0] for x in gather_databases(ss, databases, 0, True) ]
        sum_f_uniq = sum([result.f_unique_to_query for result in results])

        # keep leading results until one falls below 10% of the query.
        keep_results = []
        for result in results:
            if result.f_unique_to_query < 0.10:
                break
            keep_results.append(result)

        if not keep_results:
            print('** no match for {}'.format(k))
            continue

        # map match names -> lineages via the LCA database index tables:
        # ident -> idx -> lid -> lineage.
        idents = [ result.name.split()[0].split('.')[0]
                   for result in keep_results ]
        idxlist = [ lca_obj.ident_to_idx[ident] for ident in idents ]
        lidlist = [ lca_obj.idx_to_lid[idx] for idx in idxlist ]
        lineages = [ lca_obj.lid_to_lineage[lid] for lid in lidlist ]

        # find the lowest common ancestor of all kept lineages.
        tree = lca_utils.build_tree(lineages)
        lca, reason = lca_utils.find_lca(tree)

        level = '*none*'
        if lca:
            level = lca[-1].rank
            lineage = ";".join(lca_utils.zip_lineage(lca,
                                                     truncate_empty=True))

        this_f_uniq = sum([ result.f_unique_to_query
                            for result in keep_results ])
        print('node {} matches {} @ {:.1f}'.format(
            k, level, this_f_uniq / sum_f_uniq * 100))

        if level in ('strain', 'genus', 'species') and \
           this_f_uniq / sum_f_uniq >= 0.95:
            n_pure95 += 1
        total += 1

    print('XXX', n_pure95, total)
def search_containment_at_rank(mh, lca_db, lin_db, match_rank,
                               ignore_abundance=False,
                               summarize_at_ranks=True):
    """Run search --containment, and aggregate at given rank and above.

    Returns (search_results, search_results_at_rank); the latter is an
    empty list when summarize_at_ranks is False.
    """
    results = []
    found_md5 = set()

    def gen_mh():
        # defaultdict needs a zero-arg factory producing an empty minhash
        # with the same parameters as the query.
        return mh.copy_and_clear()

    lin_hashes = defaultdict(gen_mh)
    query_hashes = set(mh.hashes)
    query_sig = sourmash.SourmashSignature(mh)

    # search
    search_iter = lca_db.search(query_sig, threshold=0, do_containment=True,
                                ignore_abundance=ignore_abundance,
                                best_only=False, unload_data=False)

    # iterate through matches, deduplicating by md5
    for (similarity, match_sig, filename) in search_iter:
        md5 = match_sig.md5sum()
        if md5 not in found_md5:
            found_md5.add(md5)
            match_lineage = get_lineage_at_match_rank(lin_db, match_sig,
                                                      match_rank)
            # store search results + lineage
            results.append((similarity, match_sig, filename, match_lineage))

            if summarize_at_ranks:
                # Keep track of matched hashes at higher taxonomic ranks
                intersected_hashes = query_hashes.intersection(
                    set(match_sig.minhash.hashes))
                lin_hashes = add_hashes_at_ranks(lin_hashes,
                                                 intersected_hashes,
                                                 match_lineage, match_rank)

    # sort and store results
    search_results = sort_and_store_search_results(results)
    search_results_at_rank = []
    if summarize_at_ranks:
        rank_containment = calculate_containment_at_rank(
            lin_hashes, query_sig, match_rank)
        search_results_at_rank = sort_by_rank_and_containment(
            rank_containment, match_rank)

    return search_results, search_results_at_rank
def subtract(args):
    """
    Subtract one or more signatures from another.

    Signatures with abundance tracking are rejected unless --flatten is set.
    """
    set_quiet(args.quiet)
    moltype = sourmash_args.calculate_moltype(args)

    from_sigfile = args.signature_from
    from_sigobj = sourmash.load_one_signature(from_sigfile, ksize=args.ksize,
                                              select_moltype=moltype)
    from_mh = from_sigobj.minhash
    if from_mh.track_abundance and not args.flatten:
        error('Cannot use subtract on signatures with abundance tracking, sorry!')
        sys.exit(1)

    remaining = set(from_mh.get_mins())
    notify('loaded signature from {}...', from_sigfile, end='\r')

    total_loaded = 0
    for sigfile in args.subtraction_sigs:
        for sigobj in sourmash.load_signatures(sigfile, ksize=args.ksize,
                                               select_moltype=moltype,
                                               do_raise=True):
            if sigobj.minhash.track_abundance and not args.flatten:
                error('Cannot use subtract on signatures with abundance tracking, sorry!')
                sys.exit(1)

            # drop any hashes present in this subtraction signature.
            remaining -= set(sigobj.minhash.get_mins())

            notify('loaded and subtracted signatures from {}...', sigfile,
                   end='\r')
            total_loaded += 1

    if not total_loaded:
        error("no signatures to subtract!?")
        sys.exit(-1)

    result_mh = from_sigobj.minhash.copy_and_clear()
    result_mh.add_many(remaining)
    result_sig = sourmash.SourmashSignature(result_mh)

    with FileOutput(args.output, 'wt') as fp:
        sourmash.save_signatures([result_sig], fp=fp)

    notify('loaded and subtracted {} signatures', total_loaded)
def create_signatures(file_list, ksize=21, verbose=False):
    """Ensure each genome file has a .sig sketched at the requested ksize."""
    sig_paths = [Path(f'{f}.sig') for f in file_list]
    gt = GenomeTools()
    if verbose:
        sig_paths = tqdm(sig_paths, total=len(sig_paths))

    for sig_path in sig_paths:
        if sig_path.is_file():
            existing = sourmash.load_one_signature(str(sig_path))
            if existing.minhash.ksize == ksize:
                # already sketched at this ksize -- nothing to do.
                continue

        # (re)build the sketch from the genome file (path minus '.sig').
        sketch = sourmash.MinHash(n=1000, ksize=ksize)
        genome = gt.read_fasta(sig_path.with_suffix(''))
        sketch.add_sequence(genome, True)

        new_sig = sourmash.SourmashSignature(sketch, name=sig_path.stem)
        with sig_path.open('wt') as handle:
            sourmash.save_signatures([new_sig], handle)
def write(self, csv_writer, csvoutfp, outdir, catlas_name):
    """Write search results for one query: CSV summary row, a signature
    of the retrieved contigs, cDBG IDs, frontier catlas nodes, and the
    response curve file."""
    containment = self.containment()
    similarity = self.similarity()
    q_name = self.query.filename
    bp = self.total_bp
    seqs = self.total_seq
    k = self.query.ksize
    num_q_kmers = len(self.query.kmers)
    # upper bounds on containment/similarity derived from the catlas.
    (best_con, cdbg_min_oh,
     catlas_min_oh) = self.query.con_sim_upper_bounds(self.catlas,
                                                      self.kmer_idx)

    # output to results.csv!
    csv_writer.writerow([q_name, containment, similarity, bp, seqs, k,
                         num_q_kmers, best_con, cdbg_min_oh,
                         catlas_min_oh, catlas_name])
    csvoutfp.flush()

    # write out signature from retrieved contigs.
    sig_filename = os.path.basename(q_name) + '.contigs.sig'
    with open(os.path.join(outdir, sig_filename), 'wt') as fp:
        ss = sourmash.SourmashSignature(self.contigs_minhash,
                                        name='nbhd:'+self.query.name,
                                        filename=sig_filename)
        sourmash.save_signatures([ss], fp)

    # write out cDBG IDs
    cdbg_listname = os.path.basename(q_name) + '.cdbg_ids.txt.gz'
    with gzip.open(os.path.join(outdir, cdbg_listname), 'wt') as fp:
        fp.write("\n".join([str(x) for x in sorted(self.shadow)]))

    # write out catlas nodes (one frontier node ID per line)
    frontier_listname = os.path.basename(q_name) + '.frontier.txt.gz'
    with gzip.open(os.path.join(outdir, frontier_listname), 'wt') as fp:
        for node in sorted(self.leaves):
            fp.write('{}\n'.format(node))

    # write response curve
    response_curve_filename = os.path.basename(q_name) + '.response.txt'
    response_curve_filename = os.path.join(outdir, response_curve_filename)
    cdbg_match_counts = self.query.cdbg_match_counts[self.catlas.name]
    search_utils.output_response_curve(response_curve_filename,
                                       cdbg_match_counts,
                                       self.kmer_idx,
                                       self.catlas.layer1_to_cdbg)
def calculate_containment_at_rank(lineage_hashD, query_sig, match_rank):
    """Compute query containment for each matched lineage, grouped by rank.

    Returns a dict mapping rank -> list of
    (lineage, containment, intersect_bp, lineage_sig) tuples.
    """
    by_rank = defaultdict(list)
    scaled_val = int(query_sig.minhash.scaled)
    ksize = int(query_sig.minhash.ksize)

    for lineage, matched_hashes in lineage_hashD.items():
        rank = lineage[-1].rank
        # estimated bp covered by this lineage's matched hashes.
        intersect_bp = get_match_bp(scaled_val, ksize,
                                    num_matched_hashes=len(matched_hashes))
        # wrap the matched hashes in a signature so contained_by() applies.
        lineage_sig = sourmash.SourmashSignature(matched_hashes)
        containment = query_sig.contained_by(lineage_sig)
        by_rank[rank].append((lineage, containment, intersect_bp,
                              lineage_sig))

    return by_rank
def compare_sigs(sag_id, sag_file, mhr_path, sig_path, mg_sig_list, jacc_threshold):
    """Compare a SAG signature against metagenome signatures by Jaccard.

    Both the SAG signature and the recruit list are cached on disk and
    reused on subsequent calls. Returns a tuple of
    [sag_id, contig_name, contig_name_prefix] rows passing the threshold.
    """
    sag_subcontigs = s_utils.get_seqs(sag_file)

    if isfile(o_join(mhr_path, sag_id + '.mhr_recruits.tsv')):
        # cached recruit list exists: load and reuse it.
        logging.info('[SABer]: Loading %s and MetaG signature recruit list\n'
                     % sag_id)
        with open(o_join(mhr_path, sag_id + '.mhr_recruits.tsv'),
                  'r') as mhr_in:
            pass_list = [ x.rstrip('\n').split('\t')
                          for x in mhr_in.readlines() ]
    else:
        # Calculate\Load MinHash Signatures with SourMash for SAG subseqs
        if isfile(o_join(sig_path, sag_id + '.SAG.sig')):
            logging.info('[SABer]: Loading Signature for %s\n' % sag_id)
            sag_sig = sourmash.signature.load_one_signature(
                o_join(sig_path, sag_id + '.SAG.sig'))
        else:
            logging.info('[SABer]: Building Signature for %s\n' % sag_id)
            sag_minhash = sourmash.MinHash(n=0, ksize=51, scaled=100)
            for sg_head in sag_subcontigs:
                sag_subseq = str(sag_subcontigs[sg_head].seq)
                sag_minhash.add_sequence(sag_subseq, force=True)
            sag_sig = sourmash.SourmashSignature(sag_minhash, name=sag_id)
            # cache the freshly built signature for next time.
            with open(o_join(sig_path, sag_id + '.SAG.sig'),
                      'w') as sags_out:
                sourmash.signature.save_signatures([sag_sig], fp=sags_out)
        logging.info('[SABer]: Comparing %s and MetaG signature\n' % sag_id)
        pass_list = []
        for mg_sig in mg_sig_list:
            jacc_sim = mg_sig.similarity(sag_sig)
            mg_nm = mg_sig.name()
            if jacc_sim >= jacc_threshold:
                # keep [sag id, contig name, contig name without final _N].
                pass_list.append([sag_id, mg_nm, mg_nm.rsplit('_', 1)[0]])

        # cache the recruit list for next time.
        with open(o_join(mhr_path, sag_id + '.mhr_recruits.tsv'),
                  'w') as mhr_out:
            mhr_out.write('\n'.join(['\t'.join(x) for x in pass_list]))

    pass_list = tuple(pass_list)

    return pass_list
def sig_import(args):
    """
    import a signature into sourmash format.

    Parses mash-style JSON sketch files and writes the converted
    signatures to the output stream.
    """
    p = SourmashArgumentParser(prog='sourmash signature import')
    p.add_argument('filenames', nargs='+')
    p.add_argument('-q', '--quiet', action='store_true',
                   help='suppress non-error output')
    p.add_argument('-o', '--output', type=argparse.FileType('wt'),
                   default=sys.stdout,
                   help='output signature to this file')
    args = p.parse_args(args)
    set_quiet(args.quiet)

    converted = []
    for filename in args.filenames:
        with open(filename) as fp:
            data = json.load(fp)

        ksize = data['kmer']
        sketch_size = data['sketchSize']

        # only this exact sketch flavor is supported.
        assert data['hashType'] == "MurmurHash3_x64_128"
        assert data['hashBits'] == 64
        assert data['hashSeed'] == 42

        sketch = data['sketches'][0]

        mh = sourmash.MinHash(ksize=ksize, n=sketch_size, is_protein=False)
        mh.add_many(sketch['hashes'])

        converted.append(sourmash.SourmashSignature(mh, filename=filename))

    sourmash.save_signatures(converted, args.output)
def add_singleton_sigs(sbt, input_file, ksize, scaled, alphabet,
                       ignore_abundance=False, translate=False):
    """Add one signature per record (or per stored sig) to the SBT."""
    if input_file.endswith(".sig"):
        # already sketched: load every signature and add each directly.
        for sig in sourmash.signature.load_signatures(
                input_file, ksize=ksize, select_moltype=alphabet):
            if sig.minhash:
                sbt.add_node(SigLeaf(sig.md5sum(), sig))
    else:
        # read file and build one signature per fasta record.
        records = try_reading_fasta_file(input_file)
        if records:
            for n, record in enumerate(records):
                signame = (record.name).rsplit("\t", 1)[0]
                if n % 10000 == 0:
                    sys.stderr.write(f"... building {n}th sig, {signame}\n")

                # start each record with a fresh minhash.
                mh = determine_appropriate_fresh_minhash(
                    alphabet, ksize, scaled, ignore_abundance)
                if alphabet == "nucleotide" or translate:
                    mh.add_sequence(record.sequence, force=True)
                else:
                    mh.add_protein(record.sequence)

                # minhash --> signature, then into the tree.
                sig = sourmash.SourmashSignature(mh, name=signame)
                if sig.minhash:
                    sbt.add_node(SigLeaf(sig.md5sum(), sig))
    return sbt
# write out hashes # let's try building a sig. we will use this sig later to intersect with sample-specific sigs new_mins = set(counts.keys()) print(len(new_mins)) with open(outhashes, "w") as out: for hsh in new_mins: out.write(str(hsh) + '\n') if len(new_mins) > 0: minhash = MinHash( n=0, ksize=ksize, scaled=scaled ) # scaled=1 so we keep all (though these were previously at some other scaled val) minhash.add_many(set(counts.keys())) # write sig to file sigobj = sourmash.SourmashSignature( minhash, name=f"aggregated_hashvals_above_{min_count}", filename=f"generated with drop_unique_hashes.py") sigobjs += [sigobj] ## this part only handles one output file -- doesn't take care of case with many ksizes/moltypes with open(outsig, 'wt') as sigout: sourmash.save_signatures(sigobjs, sigout) #notify('wrote signature to {}', args.output) # write out hashes to a text file # this part is from # https://github.com/dib-lab/sourmash/blob/7661087aa0b0e81bfec82a58002463d7c699528a/utils/hashvals-to-signature.py #ksize = int(snakemake.params.get("ksize", 7)) #do some checking here?
def make_sig_and_lin(hashvals, ident, lin, ksize=3, scaled=1):
    """Build (minhash, signature, lineage) test fixtures from hash values."""
    minhash = make_mh(hashvals)
    signature = sourmash.SourmashSignature(minhash, name=ident)
    lineage = lca_utils.make_lineage(lin)
    return minhash, signature, lineage
def main(args=sys.argv[1:]):
    """Find catlas 'terminal' nodes within a k-mer size band, keep the
    lowest-ratio fraction of them, and extract their cDBG contigs plus a
    sourmash signature of the extracted sequence."""
    p = argparse.ArgumentParser()
    p.add_argument('catlas_prefix', help='catlas prefix')
    p.add_argument('output')
    p.add_argument('--minsize', type=float, default=100)
    p.add_argument('--maxsize', type=float, default=10000)
    p.add_argument('--keep-fraction', type=float, default=0.1)
    p.add_argument('-k', '--ksize', default=31, type=int,
                   help='k-mer size (default: 31)')
    args = p.parse_args(args)

    print('minsize: {:g}'.format(args.minsize))
    print('maxsize: {:g}'.format(args.maxsize))

    # load catlas DAG
    catlas = CAtlas(args.catlas_prefix, load_sizefile=True)
    print('loaded {} nodes from catlas {}'.format(len(catlas), catlas))
    print('loaded {} layer 1 catlas nodes'.format(
        len(catlas.layer1_to_cdbg)))

    # calculate the cDBG shadow sizes for each catlas node.
    print('decorating catlas with shadow size info.')
    catlas.decorate_with_shadow_sizes()

    # ok, the real work: look at articulation of cDBG graph.

    # find highest nodes with kmer size less than given max_size
    def find_terminal_nodes(node_id, max_size):
        # recursively descend until each subtree's k-mer count < max_size.
        node_list = set()
        for sub_id in catlas.children[node_id]:
            # shadow size
            size = catlas.kmer_sizes[sub_id]
            if size < max_size:
                node_list.add(sub_id)
            else:
                children = find_terminal_nodes(sub_id, max_size)
                node_list.update(children)

        return node_list

    print('finding terminal nodes for {}.'.format(args.maxsize))

    terminal = find_terminal_nodes(catlas.root, args.maxsize)
    print('...got {}'.format(len(terminal)))
    terminal = {n for n in terminal if catlas.kmer_sizes[n] > args.minsize}
    print('...down to {} between {} and {} in size.'.format(
        len(terminal), args.minsize, args.maxsize))

    # now, go through and calculate ratios
    x = []
    for node_id in terminal:
        # calculate: how many k-mers per cDBG node?
        kmer_size = catlas.kmer_sizes[node_id]
        shadow_size = catlas.shadow_sizes[node_id]

        # log2 of (k-mers / shadow size) for this node.
        ratio = math.log(kmer_size, 2) - math.log(shadow_size, 2)

        # track basic info
        x.append((ratio, node_id, shadow_size, kmer_size))

    print('terminal node stats for maxsize: {:g}'.format(args.maxsize))
    print('n tnodes:', len(terminal))
    print('total k-mers:', catlas.kmer_sizes[catlas.root])

    x.sort(reverse=True)
    for (k, v, a, b) in x[:10]:
        print('ratio: {:.3f}'.format(2**k), '/ shadow size:', a,
              '/ kmers:', b)
    print('... eliding {} nodes'.format(len(x) - 20))
    for (k, v, a, b) in x[-10:]:
        print('ratio: {:.3f}'.format(2**k), '/ shadow size:', a,
              '/ kmers:', b)

    # keep the last keep-fraction (default 10%) for examination
    keep_sum_kmer = args.keep_fraction * catlas.kmer_sizes[catlas.root]
    sofar = 0
    keep_terminal = set()
    for (k, v, a, b) in reversed(x):
        sofar += b
        if sofar > keep_sum_kmer:
            break
        keep_terminal.add(v)

    # NOTE(review): the two adjacent string literals below concatenate
    # without a space, so the message prints 'forexamination.'
    print('keeping last {} k-mers worth of nodes for'
          'examination.'.format(sofar))

    # build cDBG shadow ID list.
    cdbg_shadow = catlas.shadow(keep_terminal)

    # extract contigs
    print('extracting contigs & building a sourmash signature')
    contigs = os.path.join(args.catlas_prefix, 'contigs.fa.gz')

    # track results as signature
    contigs_mh = sourmash.MinHash(n=0, ksize=args.ksize, scaled=1000)

    total_bp = 0
    total_seqs = 0

    # NOTE(review): outfp is never explicitly closed -- relies on
    # interpreter exit to flush; consider a `with` block.
    outfp = open(args.output, 'wt')
    for n, record in enumerate(screed.open(contigs)):
        if n and n % 10000 == 0:
            offset_f = total_seqs / len(cdbg_shadow)
            print('...at n {} ({:.1f}% of shadow)'.format(
                total_seqs, offset_f * 100), end='\r')

        # contig names == cDBG IDs
        contig_id = int(record.name)
        if contig_id not in cdbg_shadow:
            continue

        outfp.write('>{}\n{}\n'.format(record.name, record.sequence))
        contigs_mh.add_sequence(record.sequence)

        # track retrieved sequences in a minhash
        total_bp += len(record.sequence)
        total_seqs += 1

    # done - got all contigs!
    print('')
    print('fetched {} contigs, {} bp.'.format(total_seqs, total_bp))
    print('wrote contigs to {}'.format(args.output))

    with open(args.output + '.sig', 'wt') as fp:
        ss = sourmash.SourmashSignature(contigs_mh)
        sourmash.save_signatures([ss], fp)
def main(argv):
    """Convert BCALM unitigs into a gxt graph file plus a bgzf contigs file.

    Also writes a per-contig info CSV and two sourmash signatures (input
    unitigs vs. output contigs) for before/after comparison.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('bcalm_unitigs')
    parser.add_argument('gxt_out')
    parser.add_argument('contigs_out')
    parser.add_argument('-k', '--ksize', type=int, default=31)
    parser.add_argument('-d', '--debug', action='store_true')
    parser.add_argument('-P', '--pendants', action="store_true",
                        help="don't remove low abundance pendants")
    parser.add_argument('-a', '--abundance', nargs='?', type=float,
                        default=1.1)
    parser.add_argument('--randomize', help='randomize cDBG order')
    args = parser.parse_args(argv)

    k = args.ksize

    trim = not args.pendants
    trim_cutoff = args.abundance
    unitigs = args.bcalm_unitigs
    debug = args.debug

    if args.debug:
        logging.basicConfig(filename='bcalm_to_gxt.log', filemode='w',
                            level=logging.DEBUG)
    else:
        logging.basicConfig(filename='bcalm_to_gxt.log', filemode='w',
                            level=logging.WARNING)

    logging.debug("starting bcalm_to_gxt run.")

    gxtfp = open(args.gxt_out, 'wt')
    contigsfp = bgzf.open(args.contigs_out, 'wb')
    info_filename = args.contigs_out + '.info.csv'
    info_fp = open(info_filename, 'wt')

    # NOTE(review): both sketches hardcode ksize 31 even when -k differs --
    # confirm that is intended.
    in_mh = sourmash.MinHash(0, 31, scaled=1000)
    out_mh = sourmash.MinHash(0, 31, scaled=1000)

    # load in the basic graph structure from the BCALM output file
    neighbors, sequences, mean_abunds, sizes = read_bcalm(unitigs, debug, k)

    # record input k-mers in a minhash
    for seq in sequences.values():
        in_mh.add_sequence(seq)

    # make order deterministic by reordering around min value of first,
    # last, and reverse complementing sequences appropriately
    print('reordering...')
    reordering = {}  # NOTE(review): never used below.

    # first, put sequences in specific orientation
    sequence_list = []
    for key in neighbors:
        v = sequences[key]

        # pick lexicographically smaller of forward & reverse complement.
        v2 = screed.rc(v)
        if v > v2:
            v = v2

        sequence_list.append((v, key))
        # sequences is drained here; the emptiness assert below checks it.
        del sequences[key]

    # sort all sequences:
    sequence_list.sort(reverse=True)

    if args.randomize:
        print('(!! randomizing order per --randomize !!)')
        random.shuffle(sequence_list)

    # ok, now remap all the things.
    remapping = {}
    new_sequences = {}

    # remap sequences; consume sequence_list while iterating
    new_key = 0
    while sequence_list:
        sequence, old_key = sequence_list.pop()
        remapping[old_key] = new_key
        new_sequences[new_key] = sequence
        new_key += 1

    # remap other things
    new_neighbors = collections.defaultdict(set)
    for old_key, vv in neighbors.items():
        new_vv = [remapping[v] for v in vv]
        new_neighbors[remapping[old_key]] = set(new_vv)

    new_mean_abunds = {}
    for old_key, value in mean_abunds.items():
        new_mean_abunds[remapping[old_key]] = value

    new_sizes = {}
    for old_key, value in sizes.items():
        new_sizes[remapping[old_key]] = value

    assert len(sequences) == 0
    print('...done')

    sequences = new_sequences
    mean_abunds = new_mean_abunds
    sizes = new_sizes
    neighbors = new_neighbors

    # if we are removing pendants, we need to relabel the contigs so they
    # are consecutive integers starting from 0. If not, we create dummy
    # data structures to make the interface the same elsewhere in the data
    if trim:
        print('removing pendants...')
        non_pendants = set(v for v, N in neighbors.items()
                           if len(N) > 1 or mean_abunds[v] > trim_cutoff)
        contract_degree_two(non_pendants, neighbors, sequences,
                            mean_abunds, sizes, k)
    else:
        non_pendants = list(neighbors.keys())
    aliases = {x: i for i, x in enumerate(sorted(non_pendants))}
    n = len(aliases)

    # write out sequences & compute offsets
    offsets = {}
    kv_list = sorted(aliases.items(), key=lambda x: x[1])
    for x, i in kv_list:
        offsets[x] = contigsfp.tell()
        contigsfp.write('>{}\n{}\n'.format(i, sequences[x]))
        out_mh.add_sequence(sequences[x])
    contigsfp.close()

    print('... done! {} unitigs'.format(n))

    # start the gxt file by writing the number of nodes (unitigs))
    gxtfp.write('{}\n'.format(n))

    # write out all of the links, in 'from to' format.
    n_edges = 0
    for v, N in sorted(neighbors.items()):
        for u in sorted(N):
            gxtfp.write('{} {}\n'.format(aliases[v], aliases[u]))
            n_edges += 1

    print('{} vertices, {} edges'.format(n, n_edges))

    info_fp.write('contig_id,offset,mean_abund,n_kmers\n')
    for v, i in aliases.items():
        info_fp.write('{},{},{:.3f},{}\n'.format(i, offsets[v],
                                                 mean_abunds[v],
                                                 sizes[v]))

    # output two sourmash signatures: one for input contigs, one for
    # output contigs.
    in_sig = sourmash.SourmashSignature(in_mh, filename=args.bcalm_unitigs)
    sourmash.save_signatures([in_sig],
                             open(args.bcalm_unitigs + '.sig', 'wt'))

    out_sig = sourmash.SourmashSignature(out_mh, filename=args.contigs_out)
    sourmash.save_signatures([out_sig],
                             open(args.contigs_out + '.sig', 'wt'))
def main():
    """Build a sourmash signature from a file of hash values (one per line)."""
    parser = argparse.ArgumentParser()
    parser.add_argument('hashfile')           # file that contains hashes
    parser.add_argument('-o', '--output', default=None,
                        help='file to output signature to')
    parser.add_argument('-k', '--ksize', default=None, type=int)
    parser.add_argument('--scaled', default=None, type=int)
    parser.add_argument('--num', default=None, type=int)
    parser.add_argument('--name', default='', help='signature name')
    parser.add_argument('--filename', default='',
                        help='filename to add to signature')
    args = parser.parse_args()

    # argument validation: --num and --scaled are mutually exclusive;
    # --ksize and --output are required.
    if args.scaled and args.num:
        error('cannot specify both --num and --scaled! exiting.')
        return -1
    if not args.ksize:
        error('must specify --ksize')
        return -1
    if not args.output:
        error('must specify --output')
        return -1

    # slurp all distinct hash values from the input file.
    hashes = {int(line.strip()) for line in open(args.hashfile, 'rt')}

    if not hashes:
        error("ERROR, no hashes loaded from {}!", args.hashfile)
        return -1

    notify('loaded {} distinct hashes from {}', len(hashes), args.hashfile)

    # decide how the MinHash is bounded: --scaled wins, then --num,
    # otherwise default num to the number of hashes loaded.
    scaled = args.scaled or 0
    num = 0
    if not scaled:
        if args.num:
            num = args.num
        else:
            notify('setting --num automatically from the number of hashes.')
            num = len(hashes)

    # construct the MinHash and load the hashes into it.
    minhash = MinHash(n=num, ksize=args.ksize, scaled=scaled)
    minhash.add_many(hashes)

    # warn when the MinHash dropped hashes (possible with --scaled or a
    # small --num).
    if len(minhash) < len(hashes):
        notify("WARNING: loaded {} hashes, but only {} made it into MinHash.",
               len(hashes), len(minhash))
        if scaled:
            notify("This is probably because of the scaled argument.")
        elif args.num:
            notify("This is probably because your --num is set to {}",
                   args.num)

    if num > len(minhash):
        notify("WARNING: --num set to {}, but only {} hashes in signature.",
               num, len(minhash))

    # wrap in a signature and write it out.
    sig = sourmash.SourmashSignature(minhash, name=args.name,
                                     filename=args.filename)
    with open(args.output, 'wt') as outfp:
        sourmash.save_signatures([sig], outfp)
    notify('wrote signature to {}', args.output)
def subtract(args):
    """
    subtract one or more signatures from another
    """
    parser = SourmashArgumentParser(prog='sourmash signature subtract')
    parser.add_argument('signature_from')
    parser.add_argument('subtraction_sigs', nargs='+')
    parser.add_argument('-q', '--quiet', action='store_true',
                        help='suppress non-error output')
    parser.add_argument('-o', '--output', type=argparse.FileType('wt'),
                        default=sys.stdout,
                        help='output signature to this file')
    parser.add_argument('--flatten', action='store_true',
                        help='remove abundance from signatures before subtracting')
    sourmash_args.add_ksize_arg(parser, DEFAULT_LOAD_K)
    sourmash_args.add_moltype_args(parser)
    args = parser.parse_args(args)

    set_quiet(args.quiet)
    moltype = sourmash_args.calculate_moltype(args)

    # load the signature we will subtract *from*.
    from_sigfile = args.signature_from
    from_sigobj = sourmash.load_one_signature(from_sigfile, ksize=args.ksize,
                                              select_moltype=moltype)
    from_mh = from_sigobj.minhash

    # subtraction is undefined for abundance-tracked signatures unless
    # the user asked to --flatten them first.
    if from_mh.track_abundance and not args.flatten:
        error(
            'Cannot use subtract on signatures with abundance tracking, sorry!'
        )
        sys.exit(1)

    remaining_mins = set(from_mh.get_mins())
    notify('loaded signature from {}...', from_sigfile, end='\r')

    # strip out the hashes belonging to each subtraction signature.
    n_loaded = 0
    for sigfile in args.subtraction_sigs:
        for ss in sourmash.load_signatures(sigfile, ksize=args.ksize,
                                           select_moltype=moltype,
                                           do_raise=True):
            if ss.minhash.track_abundance and not args.flatten:
                error(
                    'Cannot use subtract on signatures with abundance tracking, sorry!'
                )
                sys.exit(1)

            remaining_mins -= set(ss.minhash.get_mins())
            notify('loaded and subtracted signatures from {}...', sigfile,
                   end='\r')
            n_loaded += 1

    if not n_loaded:
        error("no signatures to subtract!?")
        sys.exit(-1)

    # build a fresh signature from whatever hashes survived.
    result_mh = from_sigobj.minhash.copy_and_clear()
    result_mh.add_many(remaining_mins)
    sourmash.save_signatures([sourmash.SourmashSignature(result_mh)],
                             fp=args.output)

    notify('loaded and subtracted {} signatures', n_loaded)
def merge(args):
    """
    merge one or more signatures.

    Loads every signature from the given files (filtered by ksize and
    moltype), merges their hashes into a single MinHash, and writes one
    merged signature to --output.  With --flatten, abundances are
    discarded; otherwise all signatures must agree on abundance tracking
    (checked via _check_abundance_compatibility).
    """
    p = SourmashArgumentParser(prog='sourmash signature merge')
    p.add_argument('signatures', nargs='+')
    p.add_argument('-q', '--quiet', action='store_true',
                   help='suppress non-error output')
    p.add_argument('-o', '--output', type=argparse.FileType('wt'),
                   default=sys.stdout,
                   help='output signature to this file')
    p.add_argument('--flatten', action='store_true',
                   help='Remove abundances from all signatures.')
    sourmash_args.add_ksize_arg(p, DEFAULT_LOAD_K)
    sourmash_args.add_moltype_args(p)
    args = p.parse_args(args)

    set_quiet(args.quiet)
    moltype = sourmash_args.calculate_moltype(args)

    first_sig = None
    mh = None
    total_loaded = 0

    # iterate over all the sigs from all the files.
    for sigfile in args.signatures:
        notify('loading signatures from {}...', sigfile, end='\r')
        this_n = 0
        for sigobj in sourmash.load_signatures(sigfile, ksize=args.ksize,
                                               select_moltype=moltype,
                                               do_raise=True):
            # first signature? initialize the merged MinHash from it.
            if first_sig is None:
                first_sig = sigobj
                mh = first_sig.minhash.copy_and_clear()

                # forcibly remove abundance?
                if mh.track_abundance and args.flatten:
                    _flatten(mh)

            try:
                if not args.flatten:
                    _check_abundance_compatibility(first_sig, sigobj)

                mh.merge(sigobj.minhash)
            except Exception:
                # Report which signature failed to merge, then re-raise so
                # the caller sees the original error.  Deliberately not a
                # bare 'except:' -- KeyboardInterrupt/SystemExit should not
                # be reported as a merge failure.
                error("ERROR when merging signature '{}' ({}) from file {}",
                      sigobj.name(), sigobj.md5sum()[:8], sigfile)
                raise

            this_n += 1
            total_loaded += 1
        if this_n:
            notify('loaded and merged {} signatures from {}...', this_n,
                   sigfile, end='\r')

    if not total_loaded:
        error("no signatures to merge!?")
        sys.exit(-1)

    merged_sigobj = sourmash.SourmashSignature(mh)
    # save_signatures writes directly to the output file object; the return
    # value was previously bound to an unused local, dropped here.
    sourmash.save_signatures([merged_sigobj], fp=args.output)

    notify('loaded and merged {} signatures', total_loaded)