def sig_import(args):
    """
    import a signature into sourmash format.
    """
    set_quiet(args.quiet)

    siglist = []
    for filename in args.filenames:
        with open(filename) as fp:
            x = json.loads(fp.read())

        ksize = x['kmer']
        num = x['sketchSize']

        assert x['hashType'] == "MurmurHash3_x64_128"
        assert x['hashBits'] == 64
        assert x['hashSeed'] == 42

        xx = x['sketches'][0]
        hashes = xx['hashes']

        mh = sourmash.MinHash(ksize=ksize, n=num, is_protein=False)
        mh.add_many(hashes)

        s = sourmash.SourmashSignature(mh, filename=filename)
        siglist.append(s)

    with FileOutput(args.output, 'wt') as fp:
        sourmash.save_signatures(siglist, fp)
def write(self, csv_writer, csvoutfp, outdir):
    hashval = self.query_hashval
    bp = self.total_bp
    seqs = self.total_seq

    # output to results.csv!
    csv_writer.writerow([hashval, bp, seqs])
    csvoutfp.flush()

    # TR add contigs folder
    # write out cDBG IDs
    q_name = str(hashval)
    cdbg_listname = os.path.basename(q_name) + '.cdbg_ids.txt.gz'
    with gzip.open(os.path.join(outdir, "contigs", cdbg_listname), 'wt') as fp:
        fp.write("\n".join([str(x) for x in sorted(self.cdbg_shadow)]))

    # write out contigs
    contigs_outname = os.path.basename(q_name) + '.contigs.fa.gz'
    with gzip.open(os.path.join(outdir, "contigs", contigs_outname), 'wt') as fp:
        for name, sequence in self.contigs:
            fp.write('>{}\n{}\n'.format(name, sequence))

    # save minhash?
    if self.mh:
        ss = sourmash.SourmashSignature(self.mh,
                                        name='hashval query:{}'.format(q_name))
        sigfile = os.path.join(outdir, "contigs", q_name + '.contigs.sig')
        with open(sigfile, 'wt') as fp:
            sourmash.save_signatures([ss], fp)
def rename(args):
    """
    rename one or more signatures.
    """
    set_quiet(args.quiet, args.quiet)
    moltype = sourmash_args.calculate_moltype(args)

    progress = sourmash_args.SignatureLoadingProgress()

    outlist = []
    for filename in args.sigfiles:
        debug('loading {}', filename)
        siglist = sourmash_args.load_file_as_signatures(filename,
                                                        ksize=args.ksize,
                                                        select_moltype=moltype,
                                                        traverse=True,
                                                        progress=progress)

        for sigobj in siglist:
            sigobj._name = args.name
            outlist.append(sigobj)

    with FileOutput(args.output, 'wt') as fp:
        sourmash.save_signatures(outlist, fp=fp)

    notify("set name to '{}' on {} signatures", args.name, len(outlist))
def main():
    p = argparse.ArgumentParser()
    p.add_argument('contigs')            # this is an assembly
    p.add_argument('read_sig')           # this contains sourmash sig with abunds
    p.add_argument('-o', '--output', required=True)
    args = p.parse_args()

    siglist = sourmash.load_file_as_signatures(args.read_sig)
    siglist = list(siglist)
    assert len(siglist) == 1
    sig = siglist[0]

    contigs_mh = sig.minhash.copy_and_clear()
    for record in screed.open(args.contigs):
        contigs_mh.add_sequence(record.sequence, force=True)

    # intersect the genome assembly with the read abundances,
    # so now we get the abundances of only the k-mers that are in the
    # assembly.
    abunds = {}
    for hashval in contigs_mh.hashes:
        abunds[hashval] = sig.minhash.hashes.get(hashval, 0)

    output_mh = sig.minhash.copy_and_clear()
    output_mh.set_abundances(abunds)

    out_sig = sourmash.SourmashSignature(output_mh)
    with open(args.output, 'wt') as fp:
        print(f"Saving output to '{args.output}'")
        sourmash.save_signatures([out_sig], fp)
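A minimal sketch of the same abundance-transfer pattern on toy data, assuming a MinHash built with track_abundance=True; the sequences, k-mer size, and scaled value below are made up for illustration.

import sourmash

# toy "read" sketch with abundances (assumed parameters, purely illustrative)
reads_mh = sourmash.MinHash(n=0, ksize=21, scaled=1, track_abundance=True)
reads_mh.add_sequence("ATGCGATCGTAGCTAGCTAGCTAGCTAGCATCG" * 3)

# toy "assembly" sketch built with the same parameters
asm_mh = reads_mh.copy_and_clear()
asm_mh.add_sequence("ATGCGATCGTAGCTAGCTAGCTAGCTAGCATCG")

# keep only the assembly hashes, carrying the read abundances over
abunds = {h: reads_mh.hashes.get(h, 0) for h in asm_mh.hashes}

out_mh = reads_mh.copy_and_clear()
out_mh.set_abundances(abunds)
out_sig = sourmash.SourmashSignature(out_mh, name="toy abundance transfer")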
def flatten(args):
    """
    flatten a signature, removing abundances.
    """
    p = SourmashArgumentParser(prog='sourmash signature flatten')
    p.add_argument('signatures', nargs='+')
    p.add_argument('-q', '--quiet', action='store_true',
                   help='suppress non-error output')
    p.add_argument('-o', '--output', type=argparse.FileType('wt'),
                   default=sys.stdout,
                   help='output signature to this file')
    p.add_argument('--md5', default=None,
                   help='select signatures whose md5 contains this substring')
    p.add_argument('--name', default=None,
                   help='select signatures whose name contains this substring')
    sourmash_args.add_ksize_arg(p, DEFAULT_LOAD_K)
    sourmash_args.add_moltype_args(p)
    args = p.parse_args(args)
    set_quiet(args.quiet)
    moltype = sourmash_args.calculate_moltype(args)

    outlist = []
    total_loaded = 0
    for filename in args.signatures:
        siglist = sourmash.load_signatures(filename, ksize=args.ksize,
                                           select_moltype=moltype,
                                           do_raise=True)
        siglist = list(siglist)
        total_loaded += len(siglist)

        # select!
        if args.md5 is not None:
            siglist = [ss for ss in siglist if args.md5 in ss.md5sum()]
        if args.name is not None:
            siglist = [ss for ss in siglist if args.name in ss.name()]

        for ss in siglist:
            flattened_mh = ss.minhash.copy_and_clear()
            flattened_mh.track_abundance = False
            flattened_mh.add_many(ss.minhash.get_mins())

            ss.minhash = flattened_mh

        outlist.extend(siglist)

    sourmash.save_signatures(outlist, fp=args.output)

    notify("loaded {} total that matched ksize & molecule type",
           total_loaded)
    notify("extracted {} signatures from {} file(s)", len(outlist),
           len(args.signatures))
def get_target_sig(sample_name):
    genome = sample_name
    mh = sourmash.MinHash(n=1000, ksize=31)
    for record in screed.open(genome):
        mh.add_sequence(record.sequence, True)

    sig = SourmashSignature(mh, name=genome)
    with open(sample_name + '.sig', 'wt') as fp:
        save_signatures([sig], fp)
def intersect(args):
    """
    intersect one or more signatures by taking the intersection of hashes.

    This function always removes abundances.
    """
    set_quiet(args.quiet)
    moltype = sourmash_args.calculate_moltype(args)

    first_sig = None
    mins = None
    total_loaded = 0

    for sigfile in args.signatures:
        for sigobj in sourmash.load_signatures(sigfile, ksize=args.ksize,
                                               select_moltype=moltype,
                                               do_raise=True):
            if first_sig is None:
                first_sig = sigobj
                mins = set(sigobj.minhash.get_mins())

            mins.intersection_update(sigobj.minhash.get_mins())
            total_loaded += 1
        notify('loaded and intersected signatures from {}...', sigfile,
               end='\r')

    if total_loaded == 0:
        error("no signatures to merge!?")
        sys.exit(-1)

    # forcibly turn off track_abundance, unless --abundances-from set.
    if not args.abundances_from:
        intersect_mh = first_sig.minhash.copy_and_clear()
        intersect_mh.track_abundance = False
        intersect_mh.add_many(mins)
        intersect_sigobj = sourmash.SourmashSignature(intersect_mh)
    else:
        notify('loading signature from {}, keeping abundances',
               args.abundances_from)
        abund_sig = sourmash.load_one_signature(args.abundances_from,
                                                ksize=args.ksize,
                                                select_moltype=moltype)
        if not abund_sig.minhash.track_abundance:
            error("--track-abundance not set on loaded signature?! exiting.")
            sys.exit(-1)

        intersect_mh = abund_sig.minhash.copy_and_clear()
        abund_mins = abund_sig.minhash.get_mins(with_abundance=True)

        # do one last intersection
        mins.intersection_update(abund_mins)
        abund_mins = { k: abund_mins[k] for k in mins }

        intersect_mh.set_abundances(abund_mins)
        intersect_sigobj = sourmash.SourmashSignature(intersect_mh)

    with FileOutput(args.output, 'wt') as fp:
        sourmash.save_signatures([intersect_sigobj], fp=fp)

    notify('loaded and intersected {} signatures', total_loaded)
def downsample(args):
    """
    downsample a scaled signature.
    """
    set_quiet(args.quiet)
    moltype = sourmash_args.calculate_moltype(args)

    if not args.num and not args.scaled:
        error('must specify either --num or --scaled value')
        sys.exit(-1)

    if args.num and args.scaled:
        error('cannot specify both --num and --scaled')
        sys.exit(-1)

    output_list = []
    total_loaded = 0
    for sigfile in args.signatures:
        siglist = sourmash.load_signatures(sigfile, ksize=args.ksize,
                                           select_moltype=moltype,
                                           do_raise=True)

        for sigobj in siglist:
            mh = sigobj.minhash
            notify('loading and downsampling signature from {}...', sigfile,
                   end='\r')
            total_loaded += 1
            if args.scaled:
                if mh.scaled:
                    mh_new = mh.downsample_scaled(args.scaled)
                else:
                    # try to turn a num into a scaled - first check: can we?
                    max_hash = get_max_hash_for_scaled(args.scaled)
                    mins = mh.get_mins()
                    if max(mins) < max_hash:
                        raise ValueError("this num MinHash does not have enough hashes to convert it into a scaled MinHash.")

                    mh_new = copy.copy(mh)
                    _set_num_scaled(mh_new, 0, args.scaled)
            elif args.num:
                if mh.num:
                    mh_new = mh.downsample_n(args.num)
                else:
                    # try to turn a scaled into a num - first check: can we?
                    if len(mh) < args.num:
                        raise ValueError("this scaled MinHash has only {} hashes".format(len(mh)))

                    mh_new = copy.copy(mh)
                    _set_num_scaled(mh_new, args.num, 0)

            sigobj.minhash = mh_new

            output_list.append(sigobj)

    with FileOutput(args.output, 'wt') as fp:
        sourmash.save_signatures(output_list, fp=fp)

    notify("loaded and downsampled {} signatures", total_loaded)
def extract(args):
    """
    extract signatures.
    """
    p = SourmashArgumentParser(prog='sourmash signature extract')
    p.add_argument('signatures', nargs='+')
    p.add_argument('-q', '--quiet', action='store_true',
                   help='suppress non-error output')
    p.add_argument('-o', '--output', type=argparse.FileType('wt'),
                   default=sys.stdout,
                   help='output signature to this file')
    p.add_argument('--md5', default=None,
                   help='select signatures whose md5 contains this substring')
    p.add_argument('--name', default=None,
                   help='select signatures whose name contains this substring')
    sourmash_args.add_ksize_arg(p, DEFAULT_LOAD_K)
    sourmash_args.add_moltype_args(p)
    args = p.parse_args(args)
    set_quiet(args.quiet)
    moltype = sourmash_args.calculate_moltype(args)

    outlist = []
    total_loaded = 0
    for filename in args.signatures:
        siglist = sourmash.load_signatures(filename, ksize=args.ksize,
                                           select_moltype=moltype,
                                           do_raise=True)
        siglist = list(siglist)
        total_loaded += len(siglist)

        # select!
        if args.md5 is not None:
            siglist = [ss for ss in siglist if args.md5 in ss.md5sum()]
        if args.name is not None:
            siglist = [ss for ss in siglist if args.name in ss.name()]

        outlist.extend(siglist)

    notify("loaded {} total that matched ksize & molecule type",
           total_loaded)
    if not outlist:
        error("no matching signatures!")
        sys.exit(-1)

    sourmash.save_signatures(outlist, fp=args.output)
    notify("extracted {} signatures from {} file(s)", len(outlist),
           len(args.signatures))
def merge(args):
    """
    merge one or more signatures.
    """
    set_quiet(args.quiet)
    moltype = sourmash_args.calculate_moltype(args)

    first_sig = None
    mh = None
    total_loaded = 0

    # iterate over all the sigs from all the files.
    for sigfile in args.signatures:
        notify('loading signatures from {}...', sigfile, end='\r')
        this_n = 0
        for sigobj in sourmash.load_signatures(sigfile, ksize=args.ksize,
                                               select_moltype=moltype,
                                               do_raise=True):

            # first signature? initialize a bunch of stuff
            if first_sig is None:
                first_sig = sigobj
                mh = first_sig.minhash.copy_and_clear()

                # forcibly remove abundance?
                if args.flatten:
                    mh.track_abundance = False

            try:
                sigobj_mh = sigobj.minhash
                if not args.flatten:
                    _check_abundance_compatibility(first_sig, sigobj)
                else:
                    sigobj_mh.track_abundance = False

                mh.merge(sigobj_mh)
            except:
                error("ERROR when merging signature '{}' ({}) from file {}",
                      sigobj.name(), sigobj.md5sum()[:8], sigfile)
                raise

            this_n += 1
            total_loaded += 1
        if this_n:
            notify('loaded and merged {} signatures from {}...', this_n,
                   sigfile, end='\r')

    if not total_loaded:
        error("no signatures to merge!?")
        sys.exit(-1)

    merged_sigobj = sourmash.SourmashSignature(mh)

    with FileOutput(args.output, 'wt') as fp:
        sourmash.save_signatures([merged_sigobj], fp=fp)

    notify('loaded and merged {} signatures', total_loaded)
def filter(args):
    """
    filter hashes by abundance in all of the signatures
    """
    set_quiet(args.quiet)
    moltype = sourmash_args.calculate_moltype(args)

    progress = sourmash_args.SignatureLoadingProgress()

    outlist = []
    total_loaded = 0
    for filename in args.signatures:
        siglist = sourmash_args.load_file_as_signatures(filename,
                                                        ksize=args.ksize,
                                                        select_moltype=moltype,
                                                        traverse=True,
                                                        progress=progress)
        siglist = list(siglist)
        total_loaded += len(siglist)

        # select!
        if args.md5 is not None:
            siglist = [ss for ss in siglist if args.md5 in ss.md5sum()]
        if args.name is not None:
            siglist = [ss for ss in siglist if args.name in ss.name()]

        for ss in siglist:
            mh = ss.minhash
            if not mh.track_abundance:
                notify('ignoring signature {} - track_abundance not set.', ss)
                continue

            abunds = mh.get_mins(with_abundance=True)
            abunds2 = {}
            for k, v in abunds.items():
                if v >= args.min_abundance:
                    if args.max_abundance is None or v <= args.max_abundance:
                        abunds2[k] = v

            filtered_mh = mh.copy_and_clear()
            filtered_mh.set_abundances(abunds2)

            ss.minhash = filtered_mh

        outlist.extend(siglist)

    with FileOutput(args.output, 'wt') as fp:
        sourmash.save_signatures(outlist, fp=fp)

    notify("loaded {} total that matched ksize & molecule type",
           total_loaded)
    notify("extracted {} signatures from {} file(s)", len(outlist),
           len(args.signatures))
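A small sketch of the abundance-filtering step in isolation, on a hand-built MinHash; the hash values and threshold below are placeholders, not real data, and the get_mins() call mirrors the (older) API used above.

import sourmash

# toy abundance MinHash (assumed parameters; hash values are made up)
mh = sourmash.MinHash(n=0, ksize=31, scaled=1, track_abundance=True)
mh.set_abundances({1: 1, 2: 3, 3: 10})

min_abundance = 2          # hypothetical threshold
abunds = mh.get_mins(with_abundance=True)
kept = {k: v for k, v in abunds.items() if v >= min_abundance}

filtered_mh = mh.copy_and_clear()
filtered_mh.set_abundances(kept)
assert set(filtered_mh.get_mins()) == {2, 3}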
def subtract(args):
    """
    subtract one or more signatures from another
    """
    set_quiet(args.quiet)
    moltype = sourmash_args.calculate_moltype(args)

    from_sigfile = args.signature_from
    from_sigobj = sourmash.load_one_signature(from_sigfile, ksize=args.ksize,
                                              select_moltype=moltype)

    from_mh = from_sigobj.minhash
    if from_mh.track_abundance and not args.flatten:
        error('Cannot use subtract on signatures with abundance tracking, sorry!')
        sys.exit(1)

    subtract_mins = set(from_mh.get_mins())

    notify('loaded signature from {}...', from_sigfile, end='\r')

    total_loaded = 0
    for sigfile in args.subtraction_sigs:
        for sigobj in sourmash.load_signatures(sigfile, ksize=args.ksize,
                                               select_moltype=moltype,
                                               do_raise=True):
            if sigobj.minhash.track_abundance and not args.flatten:
                error('Cannot use subtract on signatures with abundance tracking, sorry!')
                sys.exit(1)

            subtract_mins -= set(sigobj.minhash.get_mins())

            notify('loaded and subtracted signatures from {}...', sigfile,
                   end='\r')
            total_loaded += 1

    if not total_loaded:
        error("no signatures to subtract!?")
        sys.exit(-1)

    subtract_mh = from_sigobj.minhash.copy_and_clear()
    subtract_mh.add_many(subtract_mins)

    subtract_sigobj = sourmash.SourmashSignature(subtract_mh)

    with FileOutput(args.output, 'wt') as fp:
        sourmash.save_signatures([subtract_sigobj], fp=fp)

    notify('loaded and subtracted {} signatures', total_loaded)
def create_signatures(file_list, ksize=21, verbose=False):
    file_list = [Path(str(f) + '.sig') for f in file_list]
    gt = GenomeTools()
    if verbose:
        file_list = tqdm(file_list, total=len(file_list))

    for f in file_list:
        # skip genomes that already have a signature at this ksize
        if f.is_file():
            sig = sourmash.load_one_signature(str(f))
            if sig.minhash.ksize == ksize:
                continue

        # otherwise, (re)build the signature from the genome FASTA
        minhash = sourmash.MinHash(n=1000, ksize=ksize)
        genome = gt.read_fasta(f.with_suffix(''))
        minhash.add_sequence(genome, True)

        sig = sourmash.SourmashSignature(minhash, name=f.stem)
        with f.open('wt') as handle:
            sourmash.save_signatures([sig], handle)
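A hedged usage sketch for create_signatures(); the FASTA paths are hypothetical, and GenomeTools.read_fasta() is assumed to be provided by the surrounding project.

from pathlib import Path
import sourmash

# hypothetical genome FASTA files (the '.sig' suffix is added by the function)
genomes = [Path('genomes/sample_A.fa'), Path('genomes/sample_B.fa')]

create_signatures(genomes, ksize=21, verbose=True)

# each genome should now have a companion .sig file next to it
sig = sourmash.load_one_signature(str(genomes[0]) + '.sig')
print(sig.minhash.ksize, len(sig.minhash))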
def subtract(args):
    """
    subtract one or more signatures from another
    """
    p = SourmashArgumentParser(prog='sourmash signature subtract')
    p.add_argument('signature_from')
    p.add_argument('subtraction_sigs', nargs='+')
    p.add_argument('-q', '--quiet', action='store_true',
                   help='suppress non-error output')
    p.add_argument('-o', '--output', type=argparse.FileType('wt'),
                   default=sys.stdout,
                   help='output signature to this file')
    p.add_argument('--flatten', action='store_true',
                   help='remove abundance from signatures before subtracting')
    sourmash_args.add_ksize_arg(p, DEFAULT_LOAD_K)
    sourmash_args.add_moltype_args(p)
    args = p.parse_args(args)
    set_quiet(args.quiet)
    moltype = sourmash_args.calculate_moltype(args)

    from_sigfile = args.signature_from
    from_sigobj = sourmash.load_one_signature(from_sigfile, ksize=args.ksize,
                                              select_moltype=moltype)

    from_mh = from_sigobj.minhash
    if from_mh.track_abundance and not args.flatten:
        error('Cannot use subtract on signatures with abundance tracking, sorry!')
        sys.exit(1)

    subtract_mins = set(from_mh.get_mins())

    notify('loaded signature from {}...', from_sigfile, end='\r')

    total_loaded = 0
    for sigfile in args.subtraction_sigs:
        for sigobj in sourmash.load_signatures(sigfile, ksize=args.ksize,
                                               select_moltype=moltype,
                                               do_raise=True):
            if sigobj.minhash.track_abundance and not args.flatten:
                error('Cannot use subtract on signatures with abundance tracking, sorry!')
                sys.exit(1)

            subtract_mins -= set(sigobj.minhash.get_mins())

            notify('loaded and subtracted signatures from {}...', sigfile,
                   end='\r')
            total_loaded += 1

    if not total_loaded:
        error("no signatures to subtract!?")
        sys.exit(-1)

    subtract_mh = from_sigobj.minhash.copy_and_clear()
    subtract_mh.add_many(subtract_mins)

    subtract_sigobj = sourmash.SourmashSignature(subtract_mh)

    output_json = sourmash.save_signatures([subtract_sigobj], fp=args.output)

    notify('loaded and subtracted {} signatures', total_loaded)
def write(self, csv_writer, csvoutfp, outdir, catlas_name):
    containment = self.containment()
    similarity = self.similarity()
    q_name = self.query.filename
    bp = self.total_bp
    seqs = self.total_seq
    k = self.query.ksize
    num_q_kmers = len(self.query.kmers)
    (best_con, cdbg_min_oh,
     catlas_min_oh) = self.query.con_sim_upper_bounds(self.catlas,
                                                      self.kmer_idx)

    # output to results.csv!
    csv_writer.writerow([q_name, containment, similarity, bp, seqs, k,
                         num_q_kmers, best_con, cdbg_min_oh,
                         catlas_min_oh, catlas_name])
    csvoutfp.flush()

    # write out signature from retrieved contigs.
    sig_filename = os.path.basename(q_name) + '.contigs.sig'
    with open(os.path.join(outdir, sig_filename), 'wt') as fp:
        ss = sourmash.SourmashSignature(self.contigs_minhash,
                                        name='nbhd:' + self.query.name,
                                        filename=sig_filename)
        sourmash.save_signatures([ss], fp)

    # write out cDBG IDs
    cdbg_listname = os.path.basename(q_name) + '.cdbg_ids.txt.gz'
    with gzip.open(os.path.join(outdir, cdbg_listname), 'wt') as fp:
        fp.write("\n".join([str(x) for x in sorted(self.shadow)]))

    # write out catlas nodes
    frontier_listname = os.path.basename(q_name) + '.frontier.txt.gz'
    with gzip.open(os.path.join(outdir, frontier_listname), 'wt') as fp:
        for node in sorted(self.leaves):
            fp.write('{}\n'.format(node))

    # write response curve
    response_curve_filename = os.path.basename(q_name) + '.response.txt'
    response_curve_filename = os.path.join(outdir, response_curve_filename)
    cdbg_match_counts = self.query.cdbg_match_counts[self.catlas.name]
    search_utils.output_response_curve(response_curve_filename,
                                       cdbg_match_counts,
                                       self.kmer_idx,
                                       self.catlas.layer1_to_cdbg)
def cat(args):
    """
    concatenate all signatures into one file.
    """
    set_quiet(args.quiet)

    encountered_md5sums = defaultdict(int)   # used by --unique
    progress = sourmash_args.SignatureLoadingProgress()

    siglist = []
    for sigfile in args.signatures:
        this_siglist = []
        try:
            loader = sourmash_args.load_file_as_signatures(sigfile,
                                                           traverse=True,
                                                           progress=progress)
            n_loaded = 0
            for sig in loader:
                n_loaded += 1

                md5 = sig.md5sum()
                encountered_md5sums[md5] += 1
                if args.unique and encountered_md5sums[md5] > 1:
                    continue

                siglist.append(sig)
        except Exception as exc:
            error(str(exc))
            error('(continuing)')

        notify('loaded {} signatures from {}...', n_loaded, sigfile, end='\r')

    notify('loaded {} signatures total.', len(siglist))

    with FileOutput(args.output, 'wt') as fp:
        sourmash.save_signatures(siglist, fp=fp)

    notify('output {} signatures', len(siglist))

    multiple_md5 = [1 for cnt in encountered_md5sums.values() if cnt > 1]
    if multiple_md5:
        notify('encountered {} MinHashes multiple times', sum(multiple_md5))
        if args.unique:
            notify('...and removed the duplicates, because --unique was specified.')
def test_sourmash_signature_api():
    e = sourmash.MinHash(n=1, ksize=20)
    sig = sourmash.SourmashSignature(e)

    s = sourmash.save_signatures([sig])
    sig_x1 = sourmash.load_one_signature(s)
    sig_x2 = list(sourmash.load_signatures(s))[0]

    assert sig_x1 == sig
    assert sig_x2 == sig
def rename(args):
    """
    rename one or more signatures.
    """
    p = SourmashArgumentParser(prog='sourmash signature rename')
    p.add_argument('sigfiles', nargs='+')
    p.add_argument('name')
    p.add_argument('-q', '--quiet', action='store_true',
                   help='suppress non-error output')
    p.add_argument('-d', '--debug', action='store_true',
                   help='output debugging output')
    p.add_argument('-o', '--output', help='output to this file')
    sourmash_args.add_ksize_arg(p, DEFAULT_LOAD_K)
    sourmash_args.add_moltype_args(p)
    args = p.parse_args(args)
    set_quiet(args.quiet, args.quiet)
    moltype = sourmash_args.calculate_moltype(args)

    outlist = []
    for filename in args.sigfiles:
        debug('loading {}', filename)
        siglist = sourmash.load_signatures(filename, ksize=args.ksize,
                                           select_moltype=moltype)

        for sigobj in siglist:
            sigobj.d['name'] = args.name
            outlist.append(sigobj)

    if args.output:
        fp = open(args.output, 'wt')
    else:
        fp = sys.stdout

    sourmash.save_signatures(outlist, fp=fp)

    if args.output:
        fp.close()

    notify("set name to '{}' on {} signatures", args.name, len(outlist))
def intersect(args):
    """
    intersect one or more signatures by taking the intersection of hashes.

    This function always removes abundances.
    """
    p = SourmashArgumentParser(prog='sourmash signature intersect')
    p.add_argument('signatures', nargs='+')
    p.add_argument('-q', '--quiet', action='store_true',
                   help='suppress non-error output')
    p.add_argument('-o', '--output', type=argparse.FileType('wt'),
                   default=sys.stdout,
                   help='output signature to this file')
    sourmash_args.add_ksize_arg(p, DEFAULT_LOAD_K)
    sourmash_args.add_moltype_args(p)
    args = p.parse_args(args)
    set_quiet(args.quiet)
    moltype = sourmash_args.calculate_moltype(args)

    first_sig = None
    mins = None
    total_loaded = 0

    for sigfile in args.signatures:
        for sigobj in sourmash.load_signatures(sigfile, ksize=args.ksize,
                                               select_moltype=moltype,
                                               do_raise=True):
            if first_sig is None:
                first_sig = sigobj
                mins = set(sigobj.minhash.get_mins())

            mins.intersection_update(sigobj.minhash.get_mins())
            total_loaded += 1
        notify('loaded and intersected signatures from {}...', sigfile,
               end='\r')

    if total_loaded == 0:
        error("no signatures to merge!?")
        sys.exit(-1)

    # forcibly turn off track_abundance
    intersect_mh = first_sig.minhash.copy_and_clear()
    _flatten(intersect_mh)
    intersect_mh.add_many(mins)
    intersect_sigobj = sourmash.SourmashSignature(intersect_mh)

    output_json = sourmash.save_signatures([intersect_sigobj], fp=args.output)

    notify('loaded and intersected {} signatures', total_loaded)
def flatten(args):
    """
    flatten a signature, removing abundances.
    """
    set_quiet(args.quiet)
    moltype = sourmash_args.calculate_moltype(args)

    progress = sourmash_args.SignatureLoadingProgress()

    outlist = []
    total_loaded = 0
    for filename in args.signatures:
        siglist = sourmash_args.load_file_as_signatures(filename,
                                                        ksize=args.ksize,
                                                        select_moltype=moltype,
                                                        traverse=True,
                                                        progress=progress)
        siglist = list(siglist)
        total_loaded += len(siglist)

        # select!
        if args.md5 is not None:
            siglist = [ss for ss in siglist if args.md5 in ss.md5sum()]
        if args.name is not None:
            siglist = [ss for ss in siglist if args.name in ss.name()]

        for ss in siglist:
            flattened_mh = ss.minhash.copy_and_clear()
            flattened_mh.track_abundance = False
            flattened_mh.add_many(ss.minhash.get_mins())

            ss.minhash = flattened_mh

        outlist.extend(siglist)

    with FileOutput(args.output, 'wt') as fp:
        sourmash.save_signatures(outlist, fp=fp)

    notify("loaded {} total that matched ksize & molecule type",
           total_loaded)
    notify("extracted {} signatures from {} file(s)", len(outlist),
           len(args.signatures))
def test_linear_index_load():
    sig2 = utils.get_test_data('2.fa.sig')
    sig47 = utils.get_test_data('47.fa.sig')
    sig63 = utils.get_test_data('63.fa.sig')

    ss2 = sourmash.load_one_signature(sig2, ksize=31)
    ss47 = sourmash.load_one_signature(sig47)
    ss63 = sourmash.load_one_signature(sig63)

    with utils.TempDirectory() as location:
        from sourmash import save_signatures

        filename = os.path.join(location, 'foo')
        with open(filename, 'wt') as fp:
            sourmash.save_signatures([ss2, ss47, ss63], fp)

        linear = LinearIndex.load(filename)

        x = {ss2, ss47, ss63}
        assert set(linear.signatures()) == x, linear.signatures

    assert linear.filename == filename
def flatten(args):
    """
    flatten a signature, removing abundances.
    """
    p = SourmashArgumentParser(prog='sourmash signature flatten')
    p.add_argument('signatures', nargs='+')
    p.add_argument('-q', '--quiet', action='store_true',
                   help='suppress non-error output')
    p.add_argument('-o', '--output', type=argparse.FileType('wt'),
                   default=sys.stdout,
                   help='output signature to this file')
    p.add_argument('--md5', default=None,
                   help='select signatures whose md5 contains this substring')
    p.add_argument('--name', default=None,
                   help='select signatures whose name contains this substring')
    sourmash_args.add_ksize_arg(p, DEFAULT_LOAD_K)
    sourmash_args.add_moltype_args(p)
    args = p.parse_args(args)
    set_quiet(args.quiet)
    moltype = sourmash_args.calculate_moltype(args)

    outlist = []
    total_loaded = 0
    for filename in args.signatures:
        siglist = sourmash.load_signatures(filename, ksize=args.ksize,
                                           select_moltype=moltype,
                                           do_raise=True)
        siglist = list(siglist)
        total_loaded += len(siglist)

        # select!
        if args.md5 is not None:
            siglist = [ ss for ss in siglist if args.md5 in ss.md5sum() ]
        if args.name is not None:
            siglist = [ ss for ss in siglist if args.name in ss.name() ]

        for ss in siglist:
            flattened_mh = ss.minhash.copy_and_clear()
            _flatten(flattened_mh)
            flattened_mh.add_many(ss.minhash.get_mins())

            ss.minhash = flattened_mh

        outlist.extend(siglist)

    output_json = sourmash.save_signatures(outlist, fp=args.output)

    notify("loaded {} total that matched ksize & molecule type",
           total_loaded)
    notify("extracted {} signatures from {} file(s)", len(outlist),
           len(args.signatures))
def sig_import(args):
    """
    import a signature into sourmash format.
    """
    p = SourmashArgumentParser(prog='sourmash signature import')
    p.add_argument('filenames', nargs='+')
    p.add_argument('-q', '--quiet', action='store_true',
                   help='suppress non-error output')
    p.add_argument('-o', '--output', type=argparse.FileType('wt'),
                   default=sys.stdout,
                   help='output signature to this file')
    args = p.parse_args(args)
    set_quiet(args.quiet)

    siglist = []
    for filename in args.filenames:
        with open(filename) as fp:
            x = json.loads(fp.read())

        ksize = x['kmer']
        num = x['sketchSize']

        assert x['hashType'] == "MurmurHash3_x64_128"
        assert x['hashBits'] == 64
        assert x['hashSeed'] == 42

        xx = x['sketches'][0]
        hashes = xx['hashes']

        mh = sourmash.MinHash(ksize=ksize, n=num, is_protein=False)
        mh.add_many(hashes)

        s = sourmash.SourmashSignature(mh, filename=filename)
        siglist.append(s)

    sourmash.save_signatures(siglist, args.output)
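A minimal sketch of the input this importer expects, inferred only from the keys and assertions above; the hash values are placeholders, not real MurmurHash3 output.

import json

# hypothetical input record with just the fields sig_import() reads
example = {
    "kmer": 21,
    "sketchSize": 1000,
    "hashType": "MurmurHash3_x64_128",
    "hashBits": 64,
    "hashSeed": 42,
    "sketches": [
        {"hashes": [12345678901234567, 2345678901234567890]},  # placeholder values
    ],
}

with open("example_import.json", "wt") as fp:
    json.dump(example, fp)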
def extract(args):
    """
    extract signatures.
    """
    set_quiet(args.quiet)
    moltype = sourmash_args.calculate_moltype(args)

    progress = sourmash_args.SignatureLoadingProgress()

    outlist = []
    total_loaded = 0
    for filename in args.signatures:
        siglist = sourmash_args.load_file_as_signatures(filename,
                                                        ksize=args.ksize,
                                                        select_moltype=moltype,
                                                        traverse=True,
                                                        progress=progress)
        siglist = list(siglist)
        total_loaded += len(siglist)

        # select!
        if args.md5 is not None:
            siglist = [ss for ss in siglist if args.md5 in ss.md5sum()]
        if args.name is not None:
            siglist = [ss for ss in siglist if args.name in ss.name()]

        outlist.extend(siglist)

    notify("loaded {} total that matched ksize & molecule type",
           total_loaded)
    if not outlist:
        error("no matching signatures!")
        sys.exit(-1)

    with FileOutput(args.output, 'wt') as fp:
        sourmash.save_signatures(outlist, fp=fp)

    notify("extracted {} signatures from {} file(s)", len(outlist),
           len(args.signatures))
def sketch(args):
    cwd = os.getcwd()
    db_path = os.path.join(cwd, args.name + '.db')

    # check for the existence of the database and tables
    if not os.path.exists(db_path):
        print("Database file not found. Please make sure the name is correct or run mashpit build.")
        exit(0)

    fasta_folder = os.path.join(cwd, 'fasta')
    if not os.path.exists(fasta_folder):
        print("Fasta folder not found.")
        exit(0)

    sig_file_name = args.name + '.sig'
    all_fasta_path = os.path.join(fasta_folder, "*_skesa.fasta")
    genomes_list = glob.glob(all_fasta_path)

    minhashes = []
    for genome in genomes_list:
        mh = MinHash(n=1000, ksize=31)
        for record in screed.open(genome):
            mh.add_sequence(record.sequence, True)
        minhashes.append(mh)

    siglist = []
    for i in range(len(minhashes)):
        # derive the signature name from the file name: drop the folder
        # prefix and the '_skesa.fasta' suffix
        basename = os.path.basename(genomes_list[i])
        signame = basename[:-len('_skesa.fasta')]
        siglist.append(SourmashSignature(minhashes[i], name=signame))

    with open(sig_file_name, 'w') as f:
        save_signatures(siglist, fp=f)
def rename(args):
    """
    rename one or more signatures.
    """
    p = SourmashArgumentParser(prog='sourmash signature rename')
    p.add_argument('sigfiles', nargs='+')
    p.add_argument('name')
    p.add_argument('-q', '--quiet', action='store_true',
                   help='suppress non-error output')
    p.add_argument('-d', '--debug', action='store_true',
                   help='output debugging output')
    p.add_argument('-o', '--output', help='output to this file')
    sourmash_args.add_ksize_arg(p, DEFAULT_LOAD_K)
    sourmash_args.add_moltype_args(p)
    args = p.parse_args(args)
    set_quiet(args.quiet, args.quiet)
    moltype = sourmash_args.calculate_moltype(args)

    outlist = []
    for filename in args.sigfiles:
        debug('loading {}', filename)
        siglist = sourmash.load_signatures(filename, ksize=args.ksize,
                                           select_moltype=moltype)

        for sigobj in siglist:
            sigobj.d['name'] = args.name
            outlist.append(sigobj)

    if args.output:
        fp = open(args.output, 'wt')
    else:
        fp = sys.stdout

    output_json = sourmash.save_signatures(outlist, fp=fp)

    if args.output:
        fp.close()

    notify("set name to '{}' on {} signatures", args.name, len(outlist))
for i, dataset in enumerate(sbt.leaves()):
    dataset_mins = dataset.data.minhash.get_mins()
    del dataset._data

    query_mins -= set(dataset_mins)
    if not query_mins:
        break

    if i % 100 == 0:
        print(f"Progress: {i} sigs processed, query has {len(query_mins)} hashes left")

new_mh = query.minhash.copy_and_clear()
if new_mh.track_abundance:
    new_mh.set_abundances({
        k: v
        for k, v in query.minhash.get_mins(with_abundance=True).items()
        if k in query_mins
    })
else:
    new_mh.add_many(query_mins)
query.minhash = new_mh

output = args.query + ".unassigned"
if args.output:
    output = args.output

with open(output, "w") as fp:
    sourmash.save_signatures([query], fp)
def main(args=sys.argv[1:]):
    p = argparse.ArgumentParser()
    p.add_argument('catlas_prefix', help='catlas prefix')
    p.add_argument('output')
    p.add_argument('--minsize', type=float, default=100)
    p.add_argument('--maxsize', type=float, default=10000)
    p.add_argument('--keep-fraction', type=float, default=0.1)
    p.add_argument('-k', '--ksize', default=31, type=int,
                   help='k-mer size (default: 31)')
    args = p.parse_args(args)

    print('minsize: {:g}'.format(args.minsize))
    print('maxsize: {:g}'.format(args.maxsize))

    # load catlas DAG
    catlas = CAtlas(args.catlas_prefix, load_sizefile=True)
    print('loaded {} nodes from catlas {}'.format(len(catlas), catlas))
    print('loaded {} layer 1 catlas nodes'.format(len(catlas.layer1_to_cdbg)))

    # calculate the cDBG shadow sizes for each catlas node.
    print('decorating catlas with shadow size info.')
    catlas.decorate_with_shadow_sizes()

    # ok, the real work: look at articulation of cDBG graph.

    # find highest nodes with kmer size less than given max_size
    def find_terminal_nodes(node_id, max_size):
        node_list = set()
        for sub_id in catlas.children[node_id]:
            # shadow size
            size = catlas.kmer_sizes[sub_id]

            if size < max_size:
                node_list.add(sub_id)
            else:
                children = find_terminal_nodes(sub_id, max_size)
                node_list.update(children)

        return node_list

    print('finding terminal nodes for {}.'.format(args.maxsize))

    terminal = find_terminal_nodes(catlas.root, args.maxsize)
    print('...got {}'.format(len(terminal)))
    terminal = {n for n in terminal if catlas.kmer_sizes[n] > args.minsize}
    print('...down to {} between {} and {} in size.'.format(
        len(terminal), args.minsize, args.maxsize))

    # now, go through and calculate ratios
    x = []
    for node_id in terminal:
        # calculate: how many k-mers per cDBG node?
        kmer_size = catlas.kmer_sizes[node_id]
        shadow_size = catlas.shadow_sizes[node_id]

        ratio = math.log(kmer_size, 2) - math.log(shadow_size, 2)

        # track basic info
        x.append((ratio, node_id, shadow_size, kmer_size))

    print('terminal node stats for maxsize: {:g}'.format(args.maxsize))
    print('n tnodes:', len(terminal))
    print('total k-mers:', catlas.kmer_sizes[catlas.root])

    x.sort(reverse=True)
    for (k, v, a, b) in x[:10]:
        print('ratio: {:.3f}'.format(2**k), '/ shadow size:', a, '/ kmers:', b)
    print('... eliding {} nodes'.format(len(x) - 20))
    for (k, v, a, b) in x[-10:]:
        print('ratio: {:.3f}'.format(2**k), '/ shadow size:', a, '/ kmers:', b)

    # keep the last keep-fraction (default 10%) for examination
    keep_sum_kmer = args.keep_fraction * catlas.kmer_sizes[catlas.root]
    sofar = 0
    keep_terminal = set()
    for (k, v, a, b) in reversed(x):
        sofar += b
        if sofar > keep_sum_kmer:
            break
        keep_terminal.add(v)

    print('keeping last {} k-mers worth of nodes for '
          'examination.'.format(sofar))

    # build cDBG shadow ID list.
    cdbg_shadow = catlas.shadow(keep_terminal)

    # extract contigs
    print('extracting contigs & building a sourmash signature')
    contigs = os.path.join(args.catlas_prefix, 'contigs.fa.gz')

    # track results as signature
    contigs_mh = sourmash.MinHash(n=0, ksize=args.ksize, scaled=1000)

    total_bp = 0
    total_seqs = 0

    outfp = open(args.output, 'wt')
    for n, record in enumerate(screed.open(contigs)):
        if n and n % 10000 == 0:
            offset_f = total_seqs / len(cdbg_shadow)
            print('...at n {} ({:.1f}% of shadow)'.format(total_seqs,
                                                          offset_f * 100),
                  end='\r')

        # contig names == cDBG IDs
        contig_id = int(record.name)
        if contig_id not in cdbg_shadow:
            continue

        outfp.write('>{}\n{}\n'.format(record.name, record.sequence))
        contigs_mh.add_sequence(record.sequence)

        # track retrieved sequences in a minhash
        total_bp += len(record.sequence)
        total_seqs += 1

    # done - got all contigs!
    print('')
    print('fetched {} contigs, {} bp.'.format(total_seqs, total_bp))
    print('wrote contigs to {}'.format(args.output))

    with open(args.output + '.sig', 'wt') as fp:
        ss = sourmash.SourmashSignature(contigs_mh)
        sourmash.save_signatures([ss], fp)
def main(argv):
    parser = argparse.ArgumentParser()
    parser.add_argument('bcalm_unitigs')
    parser.add_argument('gxt_out')
    parser.add_argument('contigs_out')
    parser.add_argument('-k', '--ksize', type=int, default=31)
    parser.add_argument('-d', '--debug', action='store_true')
    parser.add_argument('-P', '--pendants', action="store_true",
                        help="don't remove low abundance pendants")
    parser.add_argument('-a', '--abundance', nargs='?', type=float,
                        default=1.1)
    parser.add_argument('--randomize', help='randomize cDBG order')
    args = parser.parse_args(argv)

    k = args.ksize

    trim = not args.pendants
    trim_cutoff = args.abundance
    unitigs = args.bcalm_unitigs
    debug = args.debug

    if args.debug:
        logging.basicConfig(filename='bcalm_to_gxt.log', filemode='w',
                            level=logging.DEBUG)
    else:
        logging.basicConfig(filename='bcalm_to_gxt.log', filemode='w',
                            level=logging.WARNING)

    logging.debug("starting bcalm_to_gxt run.")

    gxtfp = open(args.gxt_out, 'wt')
    contigsfp = bgzf.open(args.contigs_out, 'wb')
    info_filename = args.contigs_out + '.info.csv'
    info_fp = open(info_filename, 'wt')

    in_mh = sourmash.MinHash(0, 31, scaled=1000)
    out_mh = sourmash.MinHash(0, 31, scaled=1000)

    # load in the basic graph structure from the BCALM output file
    neighbors, sequences, mean_abunds, sizes = read_bcalm(unitigs, debug, k)

    # record input k-mers in a minhash
    for seq in sequences.values():
        in_mh.add_sequence(seq)

    # make order deterministic by reordering around min value of first, last,
    # and reverse complementing sequences appropriately
    print('reordering...')
    reordering = {}

    # first, put sequences in specific orientation
    sequence_list = []
    for key in neighbors:
        v = sequences[key]

        # pick lexicographically smaller of forward & reverse complement.
        v2 = screed.rc(v)
        if v > v2:
            v = v2

        sequence_list.append((v, key))
        del sequences[key]

    # sort all sequences:
    sequence_list.sort(reverse=True)

    if args.randomize:
        print('(!! randomizing order per --randomize !!)')
        random.shuffle(sequence_list)

    # ok, now remap all the things.
    remapping = {}
    new_sequences = {}

    # remap sequences
    new_key = 0
    while sequence_list:              # consume while iterating
        sequence, old_key = sequence_list.pop()
        remapping[old_key] = new_key
        new_sequences[new_key] = sequence
        new_key += 1

    # remap other things
    new_neighbors = collections.defaultdict(set)
    for old_key, vv in neighbors.items():
        new_vv = [ remapping[v] for v in vv ]
        new_neighbors[remapping[old_key]] = set(new_vv)

    new_mean_abunds = {}
    for old_key, value in mean_abunds.items():
        new_mean_abunds[remapping[old_key]] = value

    new_sizes = {}
    for old_key, value in sizes.items():
        new_sizes[remapping[old_key]] = value

    assert len(sequences) == 0
    print('...done')

    sequences = new_sequences
    mean_abunds = new_mean_abunds
    sizes = new_sizes
    neighbors = new_neighbors

    # if we are removing pendants, we need to relabel the contigs so they are
    # consecutive integers starting from 0.  If not, we create dummy data
    # structures to make the interface the same elsewhere in the data
    if trim:
        print('removing pendants...')
        non_pendants = set(v for v, N in neighbors.items()
                           if len(N) > 1 or mean_abunds[v] > trim_cutoff)
        contract_degree_two(non_pendants, neighbors, sequences, mean_abunds,
                            sizes, k)
    else:
        non_pendants = list(neighbors.keys())
    aliases = {x: i for i, x in enumerate(sorted(non_pendants))}
    n = len(aliases)

    # write out sequences & compute offsets
    offsets = {}
    kv_list = sorted(aliases.items(), key=lambda x: x[1])
    for x, i in kv_list:
        offsets[x] = contigsfp.tell()
        contigsfp.write('>{}\n{}\n'.format(i, sequences[x]))
        out_mh.add_sequence(sequences[x])
    contigsfp.close()

    print('... done! {} unitigs'.format(n))

    # start the gxt file by writing the number of nodes (unitigs)
    gxtfp.write('{}\n'.format(n))

    # write out all of the links, in 'from to' format.
    n_edges = 0
    for v, N in sorted(neighbors.items()):
        for u in sorted(N):
            gxtfp.write('{} {}\n'.format(aliases[v], aliases[u]))
            n_edges += 1

    print('{} vertices, {} edges'.format(n, n_edges))

    info_fp.write('contig_id,offset,mean_abund,n_kmers\n')
    for v, i in aliases.items():
        info_fp.write('{},{},{:.3f},{}\n'.format(i, offsets[v],
                                                 mean_abunds[v],
                                                 sizes[v]))

    # output two sourmash signatures: one for input contigs, one for
    # output contigs.
    in_sig = sourmash.SourmashSignature(in_mh, filename=args.bcalm_unitigs)
    sourmash.save_signatures([ in_sig ], open(args.bcalm_unitigs + '.sig', 'wt'))

    out_sig = sourmash.SourmashSignature(out_mh, filename=args.contigs_out)
    sourmash.save_signatures([ out_sig ], open(args.contigs_out + '.sig', 'wt'))
def main():
    p = argparse.ArgumentParser()
    p.add_argument('zipfile')
    p.add_argument('signatures', nargs='*')
    p.add_argument('--sig-pathlist')
    p.add_argument('--compression', type=int, default=9)
    p.add_argument('--ksize', type=int)
    # can we accept multiple and write mult sigfiles in one pass?
    p.add_argument('--scaled', type=int)
    p.add_argument('--alphabet')
    args = p.parse_args()

    zf = zipfile.ZipFile(args.zipfile, 'w')

    siglist = [x.rstrip() for x in open(args.sig_pathlist)]
    all_sigs = siglist + args.signatures

    # is this still needed? feel like we accept aliases now...
    if args.alphabet == "nucleotide":
        args.alphabet = "DNA"

    n = 0
    all_md5 = set()
    sig_scaled = None
    downsample = False
    for i, filename in enumerate(all_sigs):
        if n % 10000 == 0:
            print(f"... processing {n}th signature; currently reading signatures from '{filename}'")

        for sig in sourmash.load_file_as_signatures(filename, ksize=args.ksize,
                                                    select_moltype=args.alphabet):
            # zip needs a unique name for each signature. Use sig md5sum.
            md5 = sig.md5sum()
            # if this is a duplicate md5sum, add _{number} to make it unique.
            if md5 in all_md5:
                sys.stderr.write(f"{str(sig)} has an md5sum identical to one already in the zipfile ({md5})")
                i = 0
                full_md5 = f"{md5}_{i}"
                while full_md5 in all_md5:
                    i += 1
                    full_md5 = f"{md5}_{i}"
                md5 = full_md5
                sys.stderr.write(f"...adding unique md5 {md5} instead")
            all_md5.add(md5)
            md5_name = 'signatures/' + md5 + '.sig'

            # once, check we can downsample
            if args.scaled and not sig_scaled:
                sig_scaled = sig.minhash.scaled
                if args.scaled < sig_scaled:
                    print(f"Can't downsample: desired scaled {args.scaled} is smaller than original scaled, {sig_scaled}. Exiting!")
                    sys.exit(-1)
                else:
                    downsample = True

            # if need to downsample, do it
            if downsample:
                sig.minhash = sig.minhash.downsample(scaled=args.scaled)

            sigstr = sourmash.save_signatures([sig], compression=args.compression)
            zf.writestr(md5_name, sigstr)
            n += 1

    print(f"wrote {n} signatures to '{args.zipfile}'")
    return 0
def main():
    p = argparse.ArgumentParser()
    p.add_argument('hashfile')                       # file that contains hashes
    p.add_argument('-o', '--output', default=None,
                   help='file to output signature to')
    p.add_argument('-k', '--ksize', default=None, type=int)
    p.add_argument('--scaled', default=None, type=int)
    p.add_argument('--num', default=None, type=int)
    p.add_argument('--name', default='', help='signature name')
    p.add_argument('--filename', default='',
                   help='filename to add to signature')
    args = p.parse_args()

    # check arguments.
    if args.scaled and args.num:
        error('cannot specify both --num and --scaled! exiting.')
        return -1

    if not args.ksize:
        error('must specify --ksize')
        return -1

    if not args.output:
        error('must specify --output')
        return -1

    # first, load in all the hashes
    hashes = set()
    for line in open(args.hashfile, 'rt'):
        hashval = int(line.strip())
        hashes.add(hashval)

    if not hashes:
        error("ERROR, no hashes loaded from {}!", args.hashfile)
        return -1

    notify('loaded {} distinct hashes from {}', len(hashes), args.hashfile)

    # now, create the MinHash object that we'll use.
    scaled = 0
    num = 0
    if args.scaled:
        scaled = args.scaled
    elif args.num:
        num = args.num
    else:
        notify('setting --num automatically from the number of hashes.')
        num = len(hashes)

    # construct empty MinHash object according to args
    minhash = MinHash(n=num, ksize=args.ksize, scaled=scaled)

    # add hashes into!
    minhash.add_many(hashes)

    if len(minhash) < len(hashes):
        notify("WARNING: loaded {} hashes, but only {} made it into MinHash.",
               len(hashes), len(minhash))
        if scaled:
            notify("This is probably because of the scaled argument.")
        elif args.num:
            notify("This is probably because your --num is set to {}",
                   args.num)

    if num > len(minhash):
        notify("WARNING: --num set to {}, but only {} hashes in signature.",
               num, len(minhash))

    sigobj = sourmash.SourmashSignature(minhash, name=args.name,
                                        filename=args.filename)

    with open(args.output, 'wt') as fp:
        sourmash.save_signatures([sigobj], fp)
    notify('wrote signature to {}', args.output)
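A small round-trip sketch of the same idea driven directly from Python rather than via a hash file; the hash values and output path below are invented for illustration.

import sourmash
from sourmash import MinHash

# toy hash values standing in for a real hashfile (purely illustrative)
hashes = {1222222222, 3444444444, 5666666666}

mh = MinHash(n=len(hashes), ksize=31, scaled=0)
mh.add_many(hashes)

sig = sourmash.SourmashSignature(mh, name="from-hashvals-example")
with open("hashvals_example.sig", "wt") as fp:
    sourmash.save_signatures([sig], fp)

# reload and confirm the hashes survived the round trip
roundtrip = sourmash.load_one_signature("hashvals_example.sig", ksize=31)
assert set(roundtrip.minhash.get_mins()) == hashes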
def downsample(args):
    """
    downsample a scaled signature.
    """
    p = SourmashArgumentParser(prog='sourmash signature downsample')
    p.add_argument('signatures', nargs="+")
    p.add_argument('--scaled', type=int, default=0,
                   help='scaled value to downsample to')
    p.add_argument('--num', type=int, default=0,
                   help='num value to downsample to')
    p.add_argument('-q', '--quiet', action='store_true',
                   help='suppress non-error output')
    p.add_argument('-o', '--output', type=argparse.FileType('wt'),
                   default=sys.stdout,
                   help='output signature to this file')
    sourmash_args.add_ksize_arg(p, DEFAULT_LOAD_K)
    sourmash_args.add_moltype_args(p)
    args = p.parse_args(args)
    set_quiet(args.quiet)
    moltype = sourmash_args.calculate_moltype(args)

    if not args.num and not args.scaled:
        error('must specify either --num or --scaled value')
        sys.exit(-1)

    if args.num and args.scaled:
        error('cannot specify both --num and --scaled')
        sys.exit(-1)

    output_list = []
    total_loaded = 0
    for sigfile in args.signatures:
        siglist = sourmash.load_signatures(sigfile, ksize=args.ksize,
                                           select_moltype=moltype,
                                           do_raise=True)

        for sigobj in siglist:
            mh = sigobj.minhash
            notify('loading and downsampling signature from {}...', sigfile,
                   end='\r')
            total_loaded += 1
            if args.scaled:
                if mh.scaled:
                    mh_new = mh.downsample_scaled(args.scaled)
                else:
                    # try to turn a num into a scaled - first check: can we?
                    max_hash = get_max_hash_for_scaled(args.scaled)
                    mins = mh.get_mins()
                    if max(mins) < max_hash:
                        raise ValueError("this num MinHash does not have enough hashes to convert it into a scaled MinHash.")

                    mh_new = copy.copy(mh)
                    _set_num_scaled(mh_new, 0, args.scaled)
            elif args.num:
                if mh.num:
                    mh_new = mh.downsample_n(args.num)
                else:
                    # try to turn a scaled into a num - first check: can we?
                    if len(mh) < args.num:
                        raise ValueError("this scaled MinHash has only {} hashes".format(len(mh)))

                    mh_new = copy.copy(mh)
                    _set_num_scaled(mh_new, args.num, 0)

            sigobj.minhash = mh_new

            output_list.append(sigobj)

    output_json = sourmash.save_signatures(output_list, fp=args.output)

    notify("loaded and downsampled {} signatures", total_loaded)
def merge(args):
    """
    merge one or more signatures.
    """
    p = SourmashArgumentParser(prog='sourmash signature merge')
    p.add_argument('signatures', nargs='+')
    p.add_argument('-q', '--quiet', action='store_true',
                   help='suppress non-error output')
    p.add_argument('-o', '--output', type=argparse.FileType('wt'),
                   default=sys.stdout,
                   help='output signature to this file')
    p.add_argument('--flatten', action='store_true',
                   help='Remove abundances from all signatures.')
    sourmash_args.add_ksize_arg(p, DEFAULT_LOAD_K)
    sourmash_args.add_moltype_args(p)
    args = p.parse_args(args)
    set_quiet(args.quiet)
    moltype = sourmash_args.calculate_moltype(args)

    first_sig = None
    mh = None
    total_loaded = 0

    # iterate over all the sigs from all the files.
    for sigfile in args.signatures:
        notify('loading signatures from {}...', sigfile, end='\r')
        this_n = 0
        for sigobj in sourmash.load_signatures(sigfile, ksize=args.ksize,
                                               select_moltype=moltype,
                                               do_raise=True):

            # first signature? initialize a bunch of stuff
            if first_sig is None:
                first_sig = sigobj
                mh = first_sig.minhash.copy_and_clear()

                # forcibly remove abundance?
                if mh.track_abundance and args.flatten:
                    _flatten(mh)

            try:
                if not args.flatten:
                    _check_abundance_compatibility(first_sig, sigobj)

                mh.merge(sigobj.minhash)
            except:
                error("ERROR when merging signature '{}' ({}) from file {}",
                      sigobj.name(), sigobj.md5sum()[:8], sigfile)
                raise

            this_n += 1
            total_loaded += 1
        if this_n:
            notify('loaded and merged {} signatures from {}...', this_n,
                   sigfile, end='\r')

    if not total_loaded:
        error("no signatures to merge!?")
        sys.exit(-1)

    merged_sigobj = sourmash.SourmashSignature(mh)

    output_json = sourmash.save_signatures([merged_sigobj], fp=args.output)

    notify('loaded and merged {} signatures', total_loaded)
    out.write(str(hsh) + '\n')

if len(new_mins) > 0:
    minhash = MinHash(n=0, ksize=ksize, scaled=scaled)
    # scaled=1 so we keep all (though these were previously at some other scaled val)
    minhash.add_many(set(counts.keys()))

    # write sig to file
    sigobj = sourmash.SourmashSignature(minhash,
                                        name=f"aggregated_hashvals_above_{min_count}",
                                        filename="generated with drop_unique_hashes.py")
    sigobjs += [sigobj]

## this part only handles one output file -- doesn't take care of case with
## many ksizes/moltypes
with open(outsig, 'wt') as sigout:
    sourmash.save_signatures(sigobjs, sigout)
    # notify('wrote signature to {}', args.output)

# write out hashes to a text file
# this part is from
# https://github.com/dib-lab/sourmash/blob/7661087aa0b0e81bfec82a58002463d7c699528a/utils/hashvals-to-signature.py

# ksize = int(snakemake.params.get("ksize", 7))
# do some checking here?
# if scaled == 0:
#     num = int(snakemake.params.get("num_hashes", 0))
#     if num == 0:
#         notify('setting --num automatically from the number of hashes.')
#         num = len(counts.keys())  # can you access keys this way from a Counter object?