def test_similarity_downsample(track_abundance):
    """similarity() rejects mismatched max_hash unless downsample=True."""
    shared_kwargs = dict(n=0, ksize=20, track_abundance=track_abundance)
    wide_mh = sourmash.MinHash(max_hash=2**63, **shared_kwargs)
    narrow_mh = sourmash.MinHash(max_hash=2**2, **shared_kwargs)

    for hashval in (1, 5):
        wide_mh.add_hash(hashval)
    assert len(wide_mh.get_mins()) == 2

    for hashval in (1, 5):
        # 5 should be discarded due to max_hash
        narrow_mh.add_hash(hashval)
    assert len(narrow_mh.get_mins()) == 1

    wide_sig = SourmashSignature(wide_mh)
    narrow_sig = SourmashSignature(narrow_mh)

    # mismatch in max_hash
    with pytest.raises(ValueError):
        wide_sig.similarity(narrow_sig)

    downsampled = wide_sig.similarity(narrow_sig, downsample=True)
    assert round(downsampled, 1) == 1.0
def test_compare_ne(track_abundance):
    """Same content under different names must compare as different."""
    signatures = []
    for label in ('foo', 'bar'):
        mh = sourmash.MinHash(n=1, ksize=20, track_abundance=track_abundance)
        mh.add("AT" * 10)
        signatures.append(SourmashSignature(mh, name=label))

    foo_sig, bar_sig = signatures
    assert foo_sig != bar_sig
def test_load_one_fail_multisig(track_abundance):
    """load_one_signature must raise ValueError when handed two signatures."""
    minhashes = [
        sourmash.MinHash(n=1, ksize=20, track_abundance=track_abundance)
        for _ in range(2)
    ]
    serialized = save_signatures([SourmashSignature(mh) for mh in minhashes])

    with pytest.raises(ValueError):
        load_one_signature(serialized)
def test_compare(track_abundance):
    """Same content and same name must compare equal.

    The original version only compared the two MinHash objects and never
    looked at the signatures it built, so signature equality (the case the
    test is named for) went unchecked. Both levels are asserted now.
    """
    e = sourmash.MinHash(n=1, ksize=20, track_abundance=track_abundance)
    e.add("AT" * 10)
    sig1 = SourmashSignature(e, name='foo')

    f = sourmash.MinHash(n=1, ksize=20, track_abundance=track_abundance)
    f.add("AT" * 10)
    sig2 = SourmashSignature(f, name='foo')

    # underlying sketches match ...
    assert e == f
    # ... and so do the signatures wrapping them (same name, same content)
    assert sig1 == sig2
def test_compare_ne2_reverse(track_abundance):
    """A named signature and a filename-only signature differ, in both orders."""
    def _at_minhash():
        mh = sourmash.MinHash(n=1, ksize=20, track_abundance=track_abundance)
        mh.add("AT" * 10)
        return mh

    named_sig = SourmashSignature(_at_minhash(), name='foo')
    file_sig = SourmashSignature(_at_minhash(), filename='b')

    # inequality has to hold regardless of operand order
    assert file_sig != named_sig
    assert named_sig != file_sig
def test_memmap():
    """to_memmap must hand back unchanged data and a '.mmap' filename."""
    siglist = [
        SourmashSignature(sourmash.MinHash(n=1, ksize=ksize))
        for ksize in (20, 25)
    ]

    memmapped, filename = to_memmap(np.array(siglist))

    # the data must not change as a result of memory-mapping
    np.testing.assert_array_equal(memmapped, siglist)
    assert filename.endswith(".mmap")
def test_save_minified(track_abundance):
    """Saved signatures are a single line and round-trip with their names."""
    specs = ((20, "foo"), (25, "bar baz"))
    signatures = [
        SourmashSignature(
            sourmash.MinHash(n=1, ksize=ksize,
                             track_abundance=track_abundance),
            name=label)
        for ksize, label in specs
    ]

    serialized = save_signatures(signatures)
    assert '\n' not in serialized
    assert len(serialized.split('\n')) == 1

    loaded = list(load_signatures(serialized))
    assert len(loaded) == 2

    loaded_names = [sig.name() for sig in loaded]
    assert 'foo' in loaded_names
    assert 'bar baz' in loaded_names
def build_signature(p):
    """Build a signature from a ``(header, sequence)`` pair.

    The sequence is stringified and sketched with ksize=51 / scaled=100
    (``force=True`` is passed to ``add_sequence``); the header becomes the
    signature name.
    """
    header, seq = p
    sketch = sourmash.MinHash(n=0, ksize=51, scaled=100)
    sketch.add_sequence(str(seq), force=True)
    return sourmash.SourmashSignature(sketch, name=header)
def test_save_load_multisig_json():
    """Two signatures survive a JSON save/load round trip."""
    first, second = (
        SourmashSignature(sourmash.MinHash(n=1, ksize=ksize))
        for ksize in (20, 25)
    )

    serialized = save_signatures_json([first, second])
    loaded = list(load_signatures_json(serialized))
    print(serialized)

    assert len(loaded) == 2
    # order not guaranteed, so test membership rather than position
    for sig in (first, second):
        assert sig in loaded
    assert first != second
def main(argv):
    """Map every hash value found in each contig to that contig's integer id.

    Reads the contigs file, sketches each record with a scaled MinHash, and
    dumps the resulting ``{hashval: contig_id}`` dict to ``picklefile``.
    Record names must be parseable by ``int()``.
    """
    parser = argparse.ArgumentParser(description=__doc__)
    for positional in ('contigs', 'picklefile'):
        parser.add_argument(positional)
    parser.add_argument('-k', '--ksize', type=int, default=31)
    parser.add_argument('--scaled', type=int, default=10000)
    args = parser.parse_args(argv)

    # template sketch; every contig gets its own cleared copy
    template_mh = sourmash.MinHash(0, args.ksize, scaled=args.scaled)
    hashval_to_contig_id = {}

    notify('reading contigs from {}', args.contigs)
    for record in screed.open(args.contigs):
        contig_id = int(record.name)

        contig_mh = template_mh.copy_and_clear()
        contig_mh.add_sequence(record.sequence, force=True)

        # later contigs overwrite earlier ones for a shared hash value
        hashval_to_contig_id.update(
            dict.fromkeys(contig_mh.get_mins(), contig_id))

    notify('saving {} hashval -> cdbg_id mappings to {}',
           len(hashval_to_contig_id), args.picklefile)
    with open(args.picklefile, 'wb') as dumpfp:
        dump(hashval_to_contig_id, dumpfp)
def sig_import(args):
    """
    import a signature into sourmash format.

    Each file in ``args.filenames`` is parsed as JSON carrying the keys
    'kmer', 'sketchSize', 'hashType', 'hashBits', 'hashSeed' and 'sketches';
    only the first sketch of each file is converted. All resulting
    signatures are written to ``args.output``.
    """
    set_quiet(args.quiet)

    def _convert(filename):
        # one JSON file -> one SourmashSignature
        with open(filename) as fp:
            doc = json.load(fp)

        ksize = doc['kmer']
        num = doc['sketchSize']

        # only this hash configuration is accepted
        assert doc['hashType'] == "MurmurHash3_x64_128"
        assert doc['hashBits'] == 64
        assert doc['hashSeed'] == 42

        hashes = doc['sketches'][0]['hashes']

        mh = sourmash.MinHash(ksize=ksize, n=num, is_protein=False)
        mh.add_many(hashes)
        return sourmash.SourmashSignature(mh, filename=filename)

    siglist = [_convert(filename) for filename in args.filenames]

    with FileOutput(args.output, 'wt') as fp:
        sourmash.save_signatures(siglist, fp)
def test_save_load_multisig(track_abundance):
    """Two signatures survive a save/load round trip."""
    first, second = (
        SourmashSignature(
            sourmash.MinHash(n=1, ksize=ksize,
                             track_abundance=track_abundance))
        for ksize in (20, 25)
    )

    serialized = save_signatures([first, second])
    loaded = list(load_signatures(serialized))
    print(serialized)

    assert len(loaded) == 2
    # order not guaranteed, so test membership rather than position
    for sig in (first, second):
        assert sig in loaded
    assert first != second
def clustermap(prefix, outdir):
    """Compare k-mer sketches of unmapped and mapped regions and plot them.

    Sequences are read from ``{outdir}/{prefix}_unmappedregions.fasta`` and
    ``{outdir}/{prefix}_mappedregions.fasta``, sketched with sourmash
    (k = 31, n = 1000) and compared pairwise with Jaccard similarity. The
    similarity table is written to
    ``{outdir}/kmer/{prefix}_sourmash_distances.tsv`` and a seaborn cluster
    map to ``{outdir}/kmer/sourmash_clustermap.jpg``.

    Parameters
    ----------
    prefix: str
        Name of the genome
    outdir: str
        Output directory
    """
    logging.info("Running clustermap analysis with Sourmash")
    # Unmapped regions first, mapped regions second.
    regions_fasta = [
        '{outdir}/{prefix}_unmappedregions.fasta'.format(outdir=outdir,
                                                         prefix=prefix),
        '{outdir}/{prefix}_mappedregions.fasta'.format(outdir=outdir,
                                                       prefix=prefix)
    ]
    minhashes = list()
    id_records = list()
    for r in regions_fasta:
        # One sketch object per FASTA file; records are accumulated into it.
        E = sourmash.MinHash(n=1000, ksize=31)
        for record in SeqIO.parse(r, format='fasta'):
            E.add_sequence(str(record.seq))
            # Tag each record id with its origin: '_Um' = unmapped file,
            # '_M' = mapped file.
            if r == '{outdir}/{prefix}_unmappedregions.fasta'.format(
                    outdir=outdir, prefix=prefix):
                newid = ''.join([record.id, '_Um'])
                id_records.append(newid)
            else:
                newid = ''.join([record.id, '_M'])
                id_records.append(newid)
            # NOTE(review): the nesting of this append is assumed — it is
            # placed inside the record loop so that `minhashes` and
            # `id_records` stay index-aligned (they are indexed together
            # below). With this nesting every record of a file appends the
            # SAME `E` object, so all rows from one file share a single
            # accumulated sketch. If per-record sketches were intended, `E`
            # needs to be created inside the record loop — confirm.
            minhashes.append(E)
    # simil maps a tagged record id to its list of Jaccard values against
    # every sketch in `minhashes` (duplicate ids overwrite earlier entries).
    simil = dict()
    for i, e in enumerate(minhashes):
        jac = list()
        for j, e2 in enumerate(minhashes):
            # e2 is the same object as minhashes[j]
            x = e.jaccard(minhashes[j])
            jac.append(x)
        simil[id_records[i]] = jac
    array = {k: np.array(v) for k, v in simil.items()}
    # X: one row per record id (used for the plot);
    # sour_dist: one column per record id (written to disk).
    X = pd.DataFrame.from_dict(array, orient='index')
    sour_dist = pd.DataFrame.from_dict(simil)
    # NOTE(review): nothing here creates '{outdir}/kmer'; presumably the
    # caller has already made it — confirm.
    sour_path = '{outdir}/kmer'.format(outdir=outdir)
    sour_dist.to_csv(os.path.join(
        sour_path, '{prefix}_sourmash_distances.tsv'.format(prefix=prefix)),
        sep='\t', index=False)
    plt.figure(figsize=(15, 10))
    sns.set(style='white', font_scale=1.2, palette='Spectral')
    ax = sns.clustermap(X)
    sns.despine()
    ax.savefig(os.path.join(sour_path, 'sourmash_clustermap.jpg'))
    plt.clf()
    logging.info("Clustermap analysis complete")
def get_target_sig(sample_name):
    """Sketch every record of ``sample_name`` and save ``sample_name + '.sig'``.

    All records are folded into a single MinHash (n=1000, ksize=31); the
    signature is named after the input path. Nothing is returned.
    """
    sketch = sourmash.MinHash(n=1000, ksize=31)
    for rec in screed.open(sample_name):
        # second positional argument is passed as True, as in the original
        sketch.add_sequence(rec.sequence, True)

    signature = SourmashSignature(sketch, name=sample_name)
    with open(sample_name + '.sig', 'wt') as out_fp:
        save_signatures([signature], out_fp)
def test_binary_fp(tmpdir, track_abundance):
    """save_signatures must accept a file handle opened in binary mode."""
    mh = sourmash.MinHash(n=1, ksize=20, track_abundance=track_abundance)
    mh.add("AT" * 10)

    sig_path = str(tmpdir.join("1.sig"))
    with open(sig_path, 'wb') as binary_fp:
        save_signatures([SourmashSignature(mh)], binary_fp)
def test_load_one_succeed(track_abundance):
    """load_one_signature returns the single signature that was saved."""
    original = SourmashSignature(
        sourmash.MinHash(n=1, ksize=20, track_abundance=track_abundance))

    reloaded = load_one_signature(save_signatures([original]))

    assert original == reloaded
def test_hashable(track_abundance):
    """Signatures must be usable as set members (and so as dict keys)."""
    mh = sourmash.MinHash(n=1, ksize=20, track_abundance=track_abundance)
    mh.add("AT" * 10)

    # building the set hashes the signature; a TypeError here means failure
    seen = {SourmashSignature(mh)}
def test_sourmash_signature_api():
    """The top-level sourmash save/load helpers round-trip a signature."""
    sig = sourmash.SourmashSignature(sourmash.MinHash(n=1, ksize=20))
    serialized = sourmash.save_signatures([sig])

    via_load_one = sourmash.load_one_signature(serialized)
    via_load_all = list(sourmash.load_signatures(serialized))[0]

    for loaded in (via_load_one, via_load_all):
        assert loaded == sig
def test_roundtrip(track_abundance):
    """A saved-then-loaded signature is fully similar to the original."""
    mh = sourmash.MinHash(n=1, ksize=20, track_abundance=track_abundance)
    mh.add("AT" * 10)
    original = SourmashSignature(mh)

    reloaded = list(load_signatures(save_signatures([original])))[0]
    # touch the attribute, as the original test did
    _ = reloaded.minhash

    assert original.similarity(reloaded) == 1.0
    assert reloaded.similarity(original) == 1.0
def test_load_compressed(track_abundance):
    """Compressed signatures load, both from memory and from a .gz fixture.

    The original version assigned ``load_signatures(sigfile)`` to a variable
    and stopped, so the fixture's content was never consumed or checked. The
    result is now materialised and asserted to be non-empty.
    """
    e1 = sourmash.MinHash(n=1, ksize=20, track_abundance=track_abundance)
    sig1 = SourmashSignature(e1)

    # in-memory round trip through compressed output
    x = save_signatures([sig1], compression=5)
    y = load_one_signature(x)
    assert sig1 == y

    # on-disk gzip fixture
    sigfile = utils.get_test_data('genome-s10+s11.sig.gz')
    # load_signatures presumably yields lazily; list() forces the read
    # (and is harmless if it already returns a list)
    sigs = list(load_signatures(sigfile))
    # NOTE(review): assumes the fixture holds at least one signature
    assert sigs
def test_roundtrip_empty(track_abundance):
    """Edge case: an empty minhash round-trips and has similarity 0."""
    original = SourmashSignature(
        sourmash.MinHash(n=1, ksize=20, track_abundance=track_abundance))

    reloaded = list(load_signatures(save_signatures([original])))[0]
    # touch the attribute, as the original test did
    _ = reloaded.minhash

    assert original.similarity(reloaded) == 0
    assert reloaded.similarity(original) == 0
def main():
    """Write per-fragment LCA assignments for genome contigs to a CSV file.

    Command line: ``lca_db genome [genome ...] output [--fragment N]``.

    Every record of every genome file is cut into windows of ``--fragment``
    bases (default 100000). Each window is sketched with the ksize/scaled
    values reported by the LCA database, and one CSV row is written per
    entry returned by ``summarize`` for that window. Windows whose sketch is
    empty produce no rows.

    Changes from the previous version:
      * the output file is managed by ``with`` so it is always closed;
      * it is opened with ``newline=''`` as the csv module requires;
      * write-only counters and the unused ``min_value`` were removed.

    Returns:
        0 on completion.
    """
    p = argparse.ArgumentParser()
    p.add_argument('lca_db')
    p.add_argument('genome', nargs='+')
    p.add_argument('output')
    p.add_argument('--fragment', default=100000, type=int)
    args = p.parse_args()

    db, ksize, scaled = lca_utils.load_single_database(args.lca_db)
    # template sketch; each fragment works on a cleared copy
    mh_factory = sourmash.MinHash(n=0, ksize=ksize, scaled=scaled)
    print('**', ksize, scaled)

    with open(args.output, 'wt', newline='') as outfp:
        w = csv.writer(outfp)
        w.writerow(['filename', 'contig', 'begin', 'end', 'lca', 'lca_rank'])

        #
        # iterate over all contigs in genome file
        #
        for genome in args.genome:
            for record in screed.open(genome):
                # fragment longer contigs into smaller regions
                for start in range(0, len(record.sequence), args.fragment):
                    seq = record.sequence[start:start + args.fragment]

                    mh = mh_factory.copy_and_clear()
                    mh.add_sequence(seq, force=True)
                    if not mh:
                        # nothing was sketched for this fragment
                        continue

                    # NOTE(review): the trailing 1 is passed through
                    # unchanged; presumably a threshold — confirm against
                    # summarize().
                    lineage_counts = summarize(mh.get_mins(), [db], 1)

                    for k in lineage_counts:
                        lca = lca_utils.display_lineage(
                            k, truncate_empty=False)
                        try:
                            lca_rank = k[-1].rank
                        except IndexError:
                            # empty lineage: no rank to report
                            lca_rank = "none"

                        # 'end' is start + fragment even for a short tail
                        w.writerow((genome, record.name, start,
                                    start + args.fragment, lca, lca_rank))

    return 0
def determine_appropriate_fresh_minhash(alphabet, ksize, scaled_val, ignore_abundance=False):
    """Return a new, empty scaled MinHash configured for ``alphabet``.

    Args:
        alphabet: one of "nucleotide", "protein", "dayhoff" or "hp".
        ksize: k-mer size. For the three protein-based alphabets it is
            multiplied by 3 before being handed to sourmash (the original
            comment notes that the add_protein method does k/3).
        scaled_val: value passed as ``scaled``.
        ignore_abundance: when True, abundance tracking is switched off;
            the default is to track abundance.

    Returns:
        A ``sourmash.MinHash`` created with ``n=0``.

    Raises:
        ValueError: if ``alphabet`` is not one of the supported names.
            Previously this case fell through to ``return mh`` and failed
            with an UnboundLocalError.
    """
    # default behavior is to track abundance
    abund = not ignore_abundance

    if alphabet == "nucleotide":
        return sourmash.MinHash(ksize=ksize, n=0, scaled=scaled_val,
                                track_abundance=abund, is_protein=False)

    # alphabet -> (dayhoff, hp) flags for the protein-based encodings
    protein_flags = {
        "protein": (False, False),
        "dayhoff": (True, False),
        "hp": (False, True),
    }
    try:
        dayhoff, hp = protein_flags[alphabet]
    except KeyError:
        raise ValueError(
            "unknown alphabet {!r}; expected one of: nucleotide, protein, "
            "dayhoff, hp".format(alphabet)) from None

    # need to multiply by 3 to get same ksize, bc add_protein method does k/3
    return sourmash.MinHash(ksize=ksize * 3, n=0, scaled=scaled_val,
                            track_abundance=abund, is_protein=True,
                            dayhoff=dayhoff, hp=hp)
def test_str(track_abundance):
    """Signatures should be printable; str() and repr() must agree."""
    mh = sourmash.MinHash(n=1, ksize=20, track_abundance=track_abundance)
    mh.add("AT" * 10)
    sig = SourmashSignature(mh)

    print(sig)
    for render in (str, repr):
        assert render(sig) == 'SourmashSignature(59502a74)'

    # once a name is set it is included in the rendering
    sig._name = 'fizbar'
    for render in (str, repr):
        assert render(sig) == 'SourmashSignature(\'fizbar\', 59502a74)'
def test_roundtrip_max_hash(track_abundance):
    """max_hash is preserved across a save/load round trip."""
    mh = sourmash.MinHash(n=0, ksize=20, track_abundance=track_abundance,
                          max_hash=10)
    mh.add_hash(5)
    original = SourmashSignature(mh)

    reloaded = list(load_signatures(save_signatures([original])))[0]
    reloaded_mh = reloaded.minhash

    assert mh.max_hash == reloaded_mh.max_hash
    assert original.similarity(reloaded) == 1.0
    assert reloaded.similarity(original) == 1.0
def create_signatures(file_list, ksize=21, verbose=False):
    """Write a ``<file>.sig`` signature next to every file in ``file_list``.

    A signature already on disk is kept when its minhash ksize equals
    ``ksize``; otherwise the genome is read via GenomeTools, sketched
    (n=1000) and the ``.sig`` file is (re)written. With ``verbose`` the
    iteration is wrapped in a tqdm progress bar.
    """
    sig_paths = [Path(str(f) + '.sig') for f in file_list]
    gt = GenomeTools()

    iterable = tqdm(sig_paths, total=len(sig_paths)) if verbose else sig_paths

    for sig_path in iterable:
        # up-to-date signature already present -> nothing to do
        if (sig_path.is_file()
                and sourmash.load_one_signature(
                    str(sig_path)).minhash.ksize == ksize):
            continue

        sketch = sourmash.MinHash(n=1000, ksize=ksize)
        # with_suffix('') strips '.sig', giving back the source file path
        genome = gt.read_fasta(sig_path.with_suffix(''))
        sketch.add_sequence(genome, True)

        signature = sourmash.SourmashSignature(sketch, name=sig_path.stem)
        with sig_path.open('wt') as handle:
            sourmash.save_signatures([signature], handle)
def test_sourmash_scaled(datadir, ksize):
    """A goetia scaled sketch must match sourmash's on the same FASTA file."""
    import sourmash

    fasta_path = datadir('random-20-a.fa')

    goetia_sketch = SourmashSketch.Sketch.build(0, 31, False, False, False,
                                                42, 1000)
    reference_mh = sourmash.MinHash(0, 31, scaled=1000)

    # goetia side: stream the file through its processor
    SourmashSketch.Processor.build(goetia_sketch).process(fasta_path)

    # sourmash side: add every record by hand
    for rec in read_fastx(fasta_path):
        reference_mh.add_sequence(rec.sequence)

    assert goetia_sketch.to_sourmash().similarity(reference_mh) == 1.0
def sig_import(args):
    """
    import a signature into sourmash format.

    ``args`` is the raw argument list for 'sourmash signature import'.
    Each input file is parsed as JSON carrying the keys 'kmer',
    'sketchSize', 'hashType', 'hashBits', 'hashSeed' and 'sketches'; only
    the first sketch of each file is converted. The signatures are written
    to the ``-o/--output`` file (stdout by default).
    """
    p = SourmashArgumentParser(prog='sourmash signature import')
    p.add_argument('filenames', nargs='+')
    p.add_argument('-q', '--quiet', action='store_true',
                   help='suppress non-error output')
    p.add_argument('-o', '--output', type=argparse.FileType('wt'),
                   default=sys.stdout,
                   help='output signature to this file')
    args = p.parse_args(args)
    set_quiet(args.quiet)

    def _convert(filename):
        # one JSON file -> one SourmashSignature
        with open(filename) as fp:
            doc = json.load(fp)

        ksize = doc['kmer']
        num = doc['sketchSize']

        # only this hash configuration is accepted
        assert doc['hashType'] == "MurmurHash3_x64_128"
        assert doc['hashBits'] == 64
        assert doc['hashSeed'] == 42

        hashes = doc['sketches'][0]['hashes']

        mh = sourmash.MinHash(ksize=ksize, n=num, is_protein=False)
        mh.add_many(hashes)
        return sourmash.SourmashSignature(mh, filename=filename)

    siglist = [_convert(filename) for filename in args.filenames]

    sourmash.save_signatures(siglist, args.output)
def compare_sigs(sag_id, sag_file, mhr_path, sig_path, mg_sig_list, jacc_threshold):
    """Recruit metagenome signatures that are similar to one SAG.

    Results are cached on disk: if ``<mhr_path>/<sag_id>.mhr_recruits.tsv``
    exists, its rows are read back and returned. Otherwise the SAG signature
    is loaded from (or built and saved to) ``<sig_path>/<sag_id>.SAG.sig``,
    compared against every signature in ``mg_sig_list``, and the hits are
    written to the recruits TSV.

    Args:
        sag_id: identifier used in file names, log messages and output rows.
        sag_file: passed to ``s_utils.get_seqs`` to obtain the subcontigs.
        mhr_path: directory holding the recruits TSV.
        sig_path: directory holding the SAG signature file.
        mg_sig_list: iterable of signatures offering ``similarity()`` and
            ``name()``.
        jacc_threshold: minimum similarity (inclusive) for a hit.

    Returns:
        A tuple of rows. Freshly computed rows are
        ``[sag_id, mg_name, mg_name.rsplit('_', 1)[0]]``; cached rows are
        whatever tab-separated fields the TSV contains.

    NOTE(review): the source was flattened, so the nesting of the final
    write and of the ``tuple()`` conversion is assumed (write inside the
    ``else`` branch, conversion at function level) — confirm.
    """
    # Read even when the cached TSV is used below.
    sag_subcontigs = s_utils.get_seqs(sag_file)
    if isfile(o_join(mhr_path, sag_id + '.mhr_recruits.tsv')):
        # Cached result: each line is one tab-separated recruit row.
        logging.info('[SABer]: Loading %s and MetaG signature recruit list\n' % sag_id)
        with open(o_join(mhr_path, sag_id + '.mhr_recruits.tsv'), 'r') as mhr_in:
            pass_list = [
                x.rstrip('\n').split('\t') for x in mhr_in.readlines()
            ]
    else:
        # Load the SAG MinHash signature if it was saved earlier,
        # otherwise build it from the SAG subsequences and save it.
        if isfile(o_join(sig_path, sag_id + '.SAG.sig')):
            logging.info('[SABer]: Loading Signature for %s\n' % sag_id)
            sag_sig = sourmash.signature.load_one_signature(
                o_join(sig_path, sag_id + '.SAG.sig'))
        else:
            logging.info('[SABer]: Building Signature for %s\n' % sag_id)
            # One sketch accumulates every subcontig of this SAG.
            sag_minhash = sourmash.MinHash(n=0, ksize=51, scaled=100)
            for sg_head in sag_subcontigs:
                sag_subseq = str(sag_subcontigs[sg_head].seq)
                sag_minhash.add_sequence(sag_subseq, force=True)
            sag_sig = sourmash.SourmashSignature(sag_minhash, name=sag_id)
            with open(o_join(sig_path, sag_id + '.SAG.sig'), 'w') as sags_out:
                sourmash.signature.save_signatures([sag_sig], fp=sags_out)
        logging.info('[SABer]: Comparing %s and MetaG signature\n' % sag_id)
        pass_list = []
        for mg_sig in mg_sig_list:
            jacc_sim = mg_sig.similarity(sag_sig)
            mg_nm = mg_sig.name()
            if jacc_sim >= jacc_threshold:
                # Third field is the name with its last '_'-suffix removed.
                pass_list.append([sag_id, mg_nm, mg_nm.rsplit('_', 1)[0]])
        # Persist the hits so the next call takes the cached branch.
        with open(o_join(mhr_path, sag_id + '.mhr_recruits.tsv'), 'w') as mhr_out:
            mhr_out.write('\n'.join(['\t'.join(x) for x in pass_list]))
    pass_list = tuple(pass_list)
    return pass_list
# Build a protein MinHash signature for the first record of the input file
# by hashing its k-mers directly with mmh3.
K = 21

import sys
import screed
import mmh3
import sourmash
print('imported sourmash:', sourmash, file=sys.stderr)
from sourmash import MinHash
import sourmash.signature

# only the first record of the file given on the command line is used
record = next(iter(screed.open(sys.argv[1])))
print('loaded', record.name, file=sys.stderr)

mh = sourmash.MinHash(ksize=K, n=500, is_protein=True)
prot_ksize = K // 3

for kmer in kmers(record.sequence, prot_ksize):
    signed_hash = mmh3.hash64(kmer, seed=42)[0]
    # convert to unsigned int if negative: reduce modulo 2**64
    mh.add_hash(signed_hash % 2**64)

s = sourmash.signature.SourmashSignature('', mh, name=record.name)
print(sourmash.signature.save_signatures([s]))