def make_estimators():
    """Build a fresh list of estimators, one per k-mer size and molecule type.

    Reads ``args`` and ``ksizes`` from the enclosing scope.  For every k-mer
    size a protein estimator is appended first (when ``args.protein`` is
    set), followed by a DNA estimator (when ``args.dna`` is set).  Only the
    DNA estimator receives ``with_cardinality``.
    """
    seed = args.seed
    # A non-zero --scaled value is turned into an upper bound on hash values;
    # 0 is passed through otherwise.
    max_hash = 2**64 / float(args.scaled) if args.scaled else 0

    estimators = []
    for ksize in ksizes:
        if args.protein:
            estimators.append(sourmash_lib.Estimators(
                ksize=ksize,
                n=args.num_hashes,
                is_protein=True,
                track_abundance=args.track_abundance,
                max_hash=max_hash,
                seed=seed))
        if args.dna:
            estimators.append(sourmash_lib.Estimators(
                ksize=ksize,
                n=args.num_hashes,
                is_protein=False,
                with_cardinality=args.with_cardinality,
                track_abundance=args.track_abundance,
                max_hash=max_hash,
                seed=seed))
    return estimators
def make_estimators():
    """Return a new list of estimators for every requested k-mer size.

    Uses ``args`` and ``ksizes`` from the enclosing scope.  Per k-mer size,
    a protein estimator comes first (if ``args.protein``) and a DNA
    estimator second (if ``args.dna``); only the DNA one is given
    ``with_cardinality``.
    """
    estimators = []
    for ksize in ksizes:
        if args.protein:
            estimators.append(sourmash_lib.Estimators(
                ksize=ksize,
                n=args.num_hashes,
                protein=True,
                track_abundance=args.track_abundance))
        if args.dna:
            estimators.append(sourmash_lib.Estimators(
                ksize=ksize,
                n=args.num_hashes,
                protein=False,
                with_cardinality=args.with_cardinality,
                track_abundance=args.track_abundance))
    return estimators
def test_name_3(track_abundance):
    """An explicit ``name`` is reported by ``name()`` even when a filename is set."""
    estimator = sourmash_lib.Estimators(n=1, ksize=20,
                                        track_abundance=track_abundance)
    named_sig = SourmashSignature('*****@*****.**', estimator,
                                  name='foo', filename='foo.txt')

    reported = named_sig.name()
    assert reported == 'foo'
def test_save_load_multisig():
    """Two signatures saved together must both come back from one load."""
    e1 = sourmash_lib.Estimators(n=1, ksize=20)
    sig1 = SourmashSignature('*****@*****.**', e1)

    e2 = sourmash_lib.Estimators(n=1, ksize=20)
    sig2 = SourmashSignature('*****@*****.**', e2)

    x = save_signatures([sig1, sig2])
    # Materialize the result before calling len() / `in` on it: the sibling
    # round-trip tests wrap load_signatures() in list(), so it may be a
    # one-shot iterator.  list() is a no-op copy if it already is a list.
    y = list(load_signatures(x))

    print(x)

    assert len(y) == 2
    assert sig1 in y  # order not guaranteed, note.
    assert sig2 in y
    assert sig1 != sig2
def to_sourmashsignature(obj, is_protein=False, email=None, name='', filename=''):
    """Wrap the hashes held by a ``MinSketch`` in a ``SourmashSignature``.

    Raises ``ValueError`` when ``obj`` is not a ``MinSketch`` or when it was
    not built with ``mash_hashfun``.  Every hash in ``obj._heapset`` is added
    to a new estimator configured from ``obj.maxsize``, ``obj.nsize`` and
    ``obj.seed``; cardinality and abundance tracking are switched off.
    """
    if not isinstance(obj, MinSketch):
        raise ValueError("The obj must be a MinSketch.")
    if obj._hashfun is not mash_hashfun:
        raise ValueError("The only accepted hash function is %s." % str(mash_hashfun))

    settings = dict(
        n=obj.maxsize,
        ksize=obj.nsize,
        is_protein=is_protein,
        with_cardinality=False,
        track_abundance=False,
        max_hash=0,  # ??? (carried over: the right value here is unconfirmed)
        seed=obj.seed,
    )
    estimator = sourmash_lib.Estimators(**settings)

    for hashval in obj._heapset:
        estimator.mh.add_hash(hashval)

    return sourmash_lib.signature.SourmashSignature(email, estimator, name, filename)
def test_roundtrip(track_abundance):
    """A saved-then-loaded signature is fully similar to the original."""
    estimator = sourmash_lib.Estimators(n=1, ksize=20,
                                        track_abundance=track_abundance)
    estimator.add("AT" * 10)
    original = SourmashSignature('*****@*****.**', estimator)

    serialized = save_signatures([original])
    reloaded = list(load_signatures(serialized))[0]
    # Attribute access retained from the original test; the value is unused.
    reloaded_estimator = reloaded.estimator

    # Similarity must be 1.0 whichever side does the comparing.
    for left, right in ((original, reloaded), (reloaded, original)):
        assert left.similarity(right) == 1.0
def test_roundtrip_empty_email():
    """A signature with an empty email survives a save/load round trip."""
    e = sourmash_lib.Estimators(n=1, ksize=20)
    e.add("AT" * 10)
    sig = SourmashSignature('', e)

    s = save_signatures([sig])
    # Materialize before indexing: the sibling round-trip tests wrap
    # load_signatures() in list(), so it may not return a sequence.
    siglist = list(load_signatures(s))
    sig2 = siglist[0]

    assert sig.similarity(sig2) == 1.0
    assert sig2.similarity(sig) == 1.0
def test_roundtrip_empty(track_abundance):
    """Edge case: an estimator with no hashes still round-trips, with similarity 0."""
    blank = sourmash_lib.Estimators(n=1, ksize=20,
                                    track_abundance=track_abundance)
    original = SourmashSignature('*****@*****.**', blank)

    serialized = save_signatures([original])
    reloaded = list(load_signatures(serialized))[0]
    # Attribute access retained from the original test; the value is unused.
    reloaded_estimator = reloaded.estimator

    for left, right in ((original, reloaded), (reloaded, original)):
        assert left.similarity(right) == 0
def minhash(sequence_gen, ksize, n, **kwargs):
    '''
    Lazily build one estimator per (name, sequence) pair in sequence_gen.

    Extra keyword arguments are forwarded to sourmash_lib.Estimators
    unchanged; the options named by the original author (with their
    stated defaults) are
    - with_cardinality=False
    - track_abundance=False

    Each item yielded is a tuple (name, estimator), so the identifier that
    came in with the sequence keeps travelling with its estimator.
    '''
    for seq_id, sequence in sequence_gen:
        estimator = sourmash_lib.Estimators(n=n, ksize=ksize, **kwargs)
        estimator.add_sequence(sequence, force=True)
        yield seq_id, estimator
def test_roundtrip_seed(track_abundance):
    """A non-default seed is preserved across a save/load round trip."""
    seeded = sourmash_lib.Estimators(n=1, ksize=20,
                                     track_abundance=track_abundance,
                                     seed=10)
    seeded.mh.add_hash(5)
    original = SourmashSignature('*****@*****.**', seeded)

    serialized = save_signatures([original])
    reloaded = list(load_signatures(serialized))[0]
    reloaded_estimator = reloaded.estimator

    assert seeded.seed == reloaded_estimator.seed
    for left, right in ((original, reloaded), (reloaded, original)):
        assert left.similarity(right) == 1.0
def _load_one_signature(sketch, email, name, filename, ignore_md5sum=False):
    """Helper function to unpack and check one signature block only.

    ``sketch`` is a mapping with at least ``ksize``, ``mins`` and ``num``;
    ``molecule``, ``seed``, ``max_hash``, ``abundances`` and ``cardinality``
    are optional.  Unless ``ignore_md5sum`` is true, ``sketch['md5sum']``
    must equal the md5 of the rebuilt signature.

    Returns the rebuilt ``SourmashSignature``; ``name`` / ``filename`` are
    stored on it when non-empty.  Raises ``Exception`` for an unknown
    molecule type or an md5 mismatch.
    """
    ksize = sketch['ksize']
    mins = list(map(int, sketch['mins']))
    n = int(sketch['num'])

    molecule = sketch.get('molecule', 'dna')
    if molecule == 'protein':
        is_protein = True
    elif molecule == 'dna':
        is_protein = False
    else:
        raise Exception("unknown molecule type: {}".format(molecule))

    max_hash = int(sketch.get('max_hash', 0))
    # Single seed lookup: an earlier, un-coerced duplicate of this line was
    # always overwritten here and has been removed.
    seed = int(sketch.get('seed', sourmash_lib.DEFAULT_SEED))

    # The presence of an 'abundances' entry decides whether the estimator
    # tracks abundance.
    track_abundance = 'abundances' in sketch

    e = sourmash_lib.Estimators(ksize=ksize, n=n, is_protein=is_protein,
                                track_abundance=track_abundance,
                                max_hash=max_hash, seed=seed)

    if track_abundance:
        abundances = list(map(int, sketch['abundances']))
        e.mh.set_abundances(dict(zip(mins, abundances)))
    else:
        for m in mins:
            e.mh.add_hash(m)

    if 'cardinality' in sketch:
        e.hll = FakeHLL(int(sketch['cardinality']))

    sig = SourmashSignature(email, e)

    if not ignore_md5sum:
        md5sum = sketch['md5sum']
        if md5sum != sig.md5sum():
            raise Exception('error loading - md5 of estimator does not match')

    if name:
        sig.d['name'] = name
    if filename:
        sig.d['filename'] = filename

    return sig
def import_csv(self, args):
    """Import a CSV file full of signatures/hashes.

    Each non-empty row must have five columns: hash function, hash seed,
    k-mer size, name, and a whitespace-separated list of integer hashes.
    Only ``murmur64`` with seed 42 is supported.

    Raises ``ValueError`` for an unsupported hash function/seed, a row with
    the wrong number of columns, or a row without any hashes.
    """
    p = argparse.ArgumentParser()
    p.add_argument('mash_csvfile')
    p.add_argument('-o', '--output', type=argparse.FileType('wt'),
                   default=sys.stdout)
    p.add_argument('--email', type=str, default='')
    args = p.parse_args(args)

    with open(args.mash_csvfile, 'r') as fp:
        reader = csv.reader(fp)
        siglist = []
        for row in reader:
            if not row:
                # csv.reader yields [] for blank lines; nothing to import.
                continue

            hashfn, hashseed, ksize, name, hashes = row
            hashseed = int(hashseed)

            # only support a limited import type, for now ;)
            # Explicit raises rather than assert, so the check still runs
            # under `python -O`.
            if hashfn != 'murmur64':
                raise ValueError(
                    'unsupported hash function: {!r}'.format(hashfn))
            if hashseed != 42:
                raise ValueError(
                    'unsupported hash seed: {}'.format(hashseed))

            ksize = int(ksize)

            # split() with no separator tolerates repeated/trailing spaces.
            hashes = list(map(int, hashes.split()))
            if not hashes:
                raise ValueError('no hashes found for {!r}'.format(name))

            e = sourmash_lib.Estimators(len(hashes), ksize)
            for h in hashes:
                e.mh.add_hash(h)
            s = sig.SourmashSignature(args.email, e, filename=name)
            siglist.append(s)
            print('loaded signature:', name,
                  s.md5sum()[:8], file=sys.stderr)

        print('saving %d signatures to YAML' % (len(siglist), ),
              file=sys.stderr)
        sig.save_signatures(siglist, args.output)
def _load_one_signature(sketch, email, name, filename, ignore_md5sum=False):
    """Rebuild a single signature from its parsed ``sketch`` mapping.

    A sketch whose ``type`` is ``'composition'`` becomes a
    ``SourmashCompositeSignature``; anything else becomes a plain
    ``SourmashSignature`` sized to the number of ``mins``.  Unless
    ``ignore_md5sum`` is set, the stored ``md5sum`` must match the rebuilt
    signature, otherwise ``Exception`` is raised.  Non-empty ``name`` and
    ``filename`` are recorded on the returned signature.
    """
    ksize = sketch['ksize']
    prime = sketch['prime']

    if sketch.get('type') != 'composition':
        hashes = list(map(int, sketch['mins']))
        estimator = sourmash_lib.Estimators(ksize=ksize, max_prime=prime,
                                            n=len(hashes))
        for hashval in hashes:
            estimator.mh.add_hash(hashval)
        loaded = SourmashSignature(email, estimator)
    else:
        prefixsize = sketch['prefixsize']
        # NOTE(review): 'subsketches' is indexed by key here but iterated as
        # a sequence of items just below -- the two uses look inconsistent;
        # confirm the intended schema.  Behaviour is left unchanged.
        num = int(sketch['subsketches']['num'])
        estimator = sourmash_lib.CompositionSketch(ksize=ksize,
                                                   max_prime=prime,
                                                   n=num,
                                                   prefixsize=prefixsize)
        for item in sketch['subsketches']:
            raw_index = item['num']
            item_mins = item['mins']
            index = int(raw_index)
            for hashval in map(int, item_mins):
                estimator.sketches[index].mh.add_hash(hashval)
        loaded = SourmashCompositeSignature(email, estimator)

    if not ignore_md5sum:
        expected = sketch['md5sum']
        if expected != loaded.md5sum():
            raise Exception('error loading - md5 of estimator does not match')

    if name:
        loaded.d['name'] = name
    if filename:
        loaded.d['filename'] = filename

    return loaded
def import_csv(args):
    """Import a CSV file full of signatures/hashes.

    Each non-empty row must have five columns: hash function, hash seed,
    k-mer size, name, and a whitespace-separated list of integer hashes.
    Only ``murmur64`` with seed 42 is supported.

    Raises ``ValueError`` for an unsupported hash function/seed, a row with
    the wrong number of columns, or a row without any hashes.
    """
    p = argparse.ArgumentParser()
    p.add_argument('mash_csvfile')
    p.add_argument('-o', '--output', type=argparse.FileType('wt'),
                   default=sys.stdout, help='(default: stdout)')
    p.add_argument('--email', type=str, default='',
                   help='(default: %(default)s)')
    args = p.parse_args(args)

    with open(args.mash_csvfile, 'r') as fp:
        reader = csv.reader(fp)
        siglist = []
        for row in reader:
            if not row:
                # csv.reader yields [] for blank lines; nothing to import.
                continue

            hashfn, hashseed, ksize, name, hashes = row
            hashseed = int(hashseed)

            # only support a limited import type, for now ;)
            # Explicit raises rather than assert, so the check still runs
            # under `python -O`.
            if hashfn != 'murmur64':
                raise ValueError(
                    'unsupported hash function: {!r}'.format(hashfn))
            if hashseed != 42:
                raise ValueError(
                    'unsupported hash seed: {}'.format(hashseed))

            ksize = int(ksize)

            # split() with no separator tolerates repeated/trailing spaces.
            hashes = list(map(int, hashes.split()))
            if not hashes:
                raise ValueError('no hashes found for {!r}'.format(name))

            e = sourmash_lib.Estimators(len(hashes), ksize)
            e.add_many(hashes)
            s = sig.SourmashSignature(args.email, e, filename=name)
            siglist.append(s)
            notify('loaded signature: {} {}', name, s.md5sum()[:8])

        notify('saving {} signatures to JSON', len(siglist))
        sig.save_signatures(siglist, args.output)
def _load_one_signature(sketch, email, name, filename, ignore_md5sum=False):
    """Rebuild a single signature from its parsed ``sketch`` mapping.

    Reads ``ksize``, ``mins`` and ``num`` from ``sketch`` and adds every min
    to a new estimator.  Unless ``ignore_md5sum`` is set, the stored
    ``md5sum`` must match the rebuilt signature, otherwise ``Exception`` is
    raised.  Non-empty ``name`` and ``filename`` are recorded on the result.
    """
    ksize = sketch['ksize']
    hashes = list(map(int, sketch['mins']))
    num = int(sketch['num'])

    estimator = sourmash_lib.Estimators(ksize=ksize, n=num)
    for hashval in hashes:
        estimator.mh.add_hash(hashval)

    loaded = SourmashSignature(email, estimator)

    if not ignore_md5sum:
        expected = sketch['md5sum']
        if expected != loaded.md5sum():
            raise Exception('error loading - md5 of estimator does not match')

    if name:
        loaded.d['name'] = name
    if filename:
        loaded.d['filename'] = filename

    return loaded
def handler(event, context):
    """Stream a FASTA file over HTTP, sketch it, and return the saved signature text.

    The incoming ``event`` is only logged; the sketch parameters and source
    URL are currently hard-coded.  ``context`` is unused.
    """
    print("Received Event: " + json.dumps(event, indent=2))

    # TODO: parse args from event
    args = {
        'protein': True,
        'n': 500,
        'k': 31,
        # 'url': 'http://athyra.oxli.org/~luizirber/missing.fa',
        'url': 'http://athyra.oxli.org/~luizirber/reads_lt_90.fasta',
        'email': '*****@*****.**',
    }

    print("Creating estimators")
    E = sourmash_lib.Estimators(ksize=args['k'],
                                n=args['n'],
                                protein=args['protein'])

    print("Opening file")
    with closing(requests.get(args['url'], stream=True)) as response:
        records = screed.fasta.fasta_iter(response.raw)
        for count, record in enumerate(records):
            # Progress line every 500 records (including the very first).
            if count % 500 == 0:
                print("%d reads" % count)
            if not args['protein']:
                E.add_sequence(record.sequence)
            else:
                E.mh.add_protein(record.sequence)

    print("Outputing signature")
    result = signature.SourmashSignature(args['email'], E,
                                         filename=args['url'])

    buf = StringIO("")
    signature.save_signatures([result], buf)
    return buf.getvalue()
def test_name_2():
    """With no explicit name, ``name()`` reports the filename."""
    estimator = sourmash_lib.Estimators(n=1, ksize=20)
    file_sig = SourmashSignature('*****@*****.**', estimator,
                                 filename='foo.txt')

    reported = file_sig.name()
    assert reported == 'foo.txt'
def _json_next_signature(iterable, email=None, name=None, filename=None,
                         ignore_md5sum=False, prefix_item='abundances.item',
                         ijson=ijson):
    """Consume one signature object from a parse-event stream and rebuild it.

    - iterable: yields (prefix, event, value) triples, such as the stream
      returned by ijson.parse()
    - email: passed to the SourmashSignature constructor
    - name: stored on the signature when non-empty
    - filename: stored on the signature when non-empty
    - ignore_md5sum: skip comparing the stored md5sum with the rebuilt one
    - prefix_item: forwarded to _json_next_atomic_array for array values
    - ijson: ijson backend, forwarded to _json_next_atomic_array

    Raises Exception on an unknown molecule type or an md5 mismatch.
    """
    from .signature import FakeHLL, SourmashSignature

    fields = dict()
    prefix, event, value = next(iterable)
    if event == 'start_map':
        prefix, event, value = next(iterable)
    while event != 'end_map':
        key = value
        if key in ('mins', 'abundances'):
            # Array-valued entries are read in one go by the array helper.
            value = _json_next_atomic_array(iterable,
                                            prefix_item=prefix_item,
                                            ijson=ijson)
        else:
            prefix, event, value = next(iterable)
        fields[key] = value
        prefix, event, value = next(iterable)

    ksize = fields['ksize']
    mins = fields['mins']
    num = fields['num']

    molecule = fields.get('molecule', 'dna')
    if molecule not in ('protein', 'dna'):
        raise Exception("unknown molecule type: {}".format(molecule))
    is_protein = molecule == 'protein'

    track_abundance = 'abundances' in fields

    estimator = sourmash_lib.Estimators(ksize=ksize, n=num,
                                        protein=is_protein,
                                        track_abundance=track_abundance)

    if track_abundance:
        abundances = list(map(int, fields['abundances']))
        estimator.mh.set_abundances(dict(zip(mins, abundances)))
    else:
        for hashval in mins:
            estimator.mh.add_hash(hashval)

    if 'cardinality' in fields:
        estimator.hll = FakeHLL(fields['cardinality'])

    loaded = SourmashSignature(email, estimator)

    if not ignore_md5sum:
        expected = fields['md5sum']
        if expected != loaded.md5sum():
            raise Exception('error loading - md5 of estimator does not match')

    if name:
        loaded.d['name'] = name
    if filename:
        loaded.d['filename'] = filename

    return loaded
def test_name_4(track_abundance):
    """With neither name nor filename, ``name()`` is the first 8 md5 characters."""
    estimator = sourmash_lib.Estimators(n=1, ksize=20,
                                        track_abundance=track_abundance)
    anonymous = SourmashSignature('*****@*****.**', estimator)

    reported = anonymous.name()
    assert reported == anonymous.md5sum()[:8]
def watch(self, args):
    """Build a signature from raw FASTA/FASTQ coming in on stdin, search.

    Sequences are read from /dev/stdin and added to one estimator.  Every
    WATERMARK_SIZE records the tree named by ``sbt_name`` is searched, and
    reading stops as soon as a search returns a match.  The best match (or
    the absence of one) is reported, and the streamed signature is saved
    when ``--output`` is given.

    Exits with status -1 if both DNA and protein are requested.
    """
    from sourmash_lib.sbt import SBT, GraphFactory
    from sourmash_lib.sbtmh import search_minhashes, SigLeaf
    from sourmash_lib.sbtmh import SearchMinHashesFindBest

    parser = argparse.ArgumentParser()
    parser.add_argument('sbt_name')
    parser.add_argument('-o', '--output', type=argparse.FileType('wt'))
    parser.add_argument('-k', '--ksize', type=int, default=DEFAULT_K)
    parser.add_argument('--threshold', default=0.05, type=float)
    parser.add_argument('--input-is-protein', action='store_true')
    sourmash_args.add_moltype_args(parser, default_dna=True)
    parser.add_argument('-n', '--num-hashes', type=int,
                        default=DEFAULT_N,
                        help='number of hashes to use in each sketch (default: %(default)i)')
    parser.add_argument('--name', type=str, default='stdin')
    args = parser.parse_args(args)

    if args.input_is_protein and args.dna:
        print('WARNING: input is protein, turning off DNA hash computing.',
              file=sys.stderr)
        args.dna = False
        args.protein = True

    if args.dna and args.protein:
        notify('ERROR: cannot use "watch" with both DNA and protein.')
        # Previously the error was reported but execution carried on as DNA;
        # stop here so the reported error is actually fatal.
        sys.exit(-1)

    if args.dna:
        moltype = 'DNA'
        is_protein = False
    else:
        moltype = 'protein'
        is_protein = True

    E = sourmash_lib.Estimators(ksize=args.ksize, n=args.num_hashes,
                                protein=is_protein)
    streamsig = sig.SourmashSignature('', E, filename='stdin',
                                      name=args.name)

    notify('Computing signature for k={}, {} from stdin',
           args.ksize, moltype)

    tree = SBT.load(args.sbt_name, leaf_loader=SigLeaf.load)

    def do_search():
        # Search the tree with the signature as it stands right now.
        search_fn = SearchMinHashesFindBest().search

        results = []
        for leaf in tree.find(search_fn, streamsig, args.threshold):
            results.append((streamsig.similarity(leaf.data),
                            leaf.data))

        return results

    notify('reading sequences from stdin')
    screed_iter = screed.open('/dev/stdin')
    watermark = WATERMARK_SIZE

    # iterate over input records
    n = 0
    for n, record in enumerate(screed_iter):
        # at each watermark, print status & check cardinality
        if n >= watermark:
            notify('... read {} sequences', n)
            watermark += WATERMARK_SIZE

            if do_search():
                break

        if args.input_is_protein:
            E.mh.add_protein(record.sequence)
        else:
            E.add_sequence(record.sequence, False)

    results = do_search()
    if not results:
        notify('... read {} sequences, no matches found.', n)
    else:
        results.sort(key=lambda x: -x[0])   # take best
        similarity, found_sig = results[0]
        notify('FOUND: {}, at {:.3f}', found_sig.name(),
               similarity)

    if args.output:
        sig.save_signatures([streamsig], args.output)
def sbt_gather(args):
    """Iteratively decompose a query signature into its best SBT matches.

    Each round finds the best-matching leaf for the remaining query hashes,
    records the fraction of that leaf (``f_genome``) and of the original
    query (``f_query``) covered by the shared hashes, removes the leaf's
    hashes from the query and repeats.  The loop stops when nothing matches
    or fewer than 5 hashes are shared.

    Results go to the log, and optionally to ``--output`` (text), ``--csv``
    and ``--save-matches`` (signatures).  Exits with status -1 when the
    query, or a best match, carries neither max_hash nor cardinality.
    """
    from sourmash_lib.sbt import SBT, GraphFactory
    from sourmash_lib.sbtmh import search_minhashes, SigLeaf
    from sourmash_lib.sbtmh import SearchMinHashesFindBestIgnoreMaxHash

    parser = argparse.ArgumentParser()
    parser.add_argument('sbt_name', help='name of SBT to search')
    parser.add_argument('query', help='query signature')
    parser.add_argument('-k', '--ksize', type=int, default=DEFAULT_K)
    parser.add_argument('--threshold', default=0.05, type=float)
    parser.add_argument('-o', '--output', type=argparse.FileType('wt'))
    parser.add_argument('--csv', type=argparse.FileType('wt'))
    parser.add_argument('--save-matches', type=argparse.FileType('wt'))

    sourmash_args.add_moltype_args(parser)

    args = parser.parse_args(args)
    moltype = sourmash_args.calculate_moltype(args)

    tree = SBT.load(args.sbt_name, leaf_loader=SigLeaf.load)
    query = sourmash_args.load_query_signature(args.query,
                                               select_ksize=args.ksize,
                                               select_moltype=moltype)
    query_moltype = sourmash_args.get_moltype(query)
    query_ksize = query.estimator.ksize
    notify('loaded query: {}... (k={}, {})', query.name()[:30],
           query_ksize, query_moltype)

    if query.estimator.max_hash == 0:
        error('query signature needs to be created with --scaled')
        error('or using --with-cardinality.')
        sys.exit(-1)

    notify('query signature has max_hash: {}', query.estimator.max_hash)
    orig_query = query

    R_metagenome = 2**64 / float(orig_query.estimator.max_hash)

    # Work on a copy of the query's hashes so the original stays intact.
    new_mins = query.estimator.get_hashes()
    e = sourmash_lib.Estimators(ksize=args.ksize, n=len(new_mins))
    e.update(query.estimator)
    query = sig.SourmashSignature('', e)

    # second denominator - metagenome size (orig_query is never modified,
    # so this is computed once rather than every round)
    query_n_mins = len(orig_query.estimator.get_hashes())

    sum_found = 0.
    found = []
    while True:
        search_fn = SearchMinHashesFindBestIgnoreMaxHash().search

        results = []
        # use super low threshold for this part of the search
        for leaf in tree.find(search_fn, query, 0.00001):
            results.append((query.estimator.similarity_ignore_maxhash(
                leaf.data.estimator), leaf.data))

        if not len(results):          # no matches at all!
            break

        # take the best result
        results.sort(key=lambda x: -x[0])   # reverse sort on similarity
        best_sim, best_ss = results[0]

        # subtract found hashes from search hashes, construct new search
        new_mins = set(query.estimator.get_hashes())
        found_mins = best_ss.estimator.get_hashes()

        if best_ss.estimator.max_hash:
            R_genome = 2**64 / float(best_ss.estimator.max_hash)
        elif best_ss.estimator.hll:
            genome_size = best_ss.estimator.hll.estimate_cardinality()
            genome_max_hash = max(found_mins)
            R_genome = float(genome_size) / float(genome_max_hash)
        else:
            error('Best hash match in sbt_gather has no cardinality')
            error('Please prepare database of sequences with --scaled')
            error('...or with --with-cardinality')
            sys.exit(-1)

        # Compare both sides at the coarser of the two sampling rates.
        R_comparison = max(R_metagenome, R_genome)
        new_max_hash = 2**64 / float(R_comparison)
        new_mins = set([i for i in new_mins if i < new_max_hash])
        found_mins = set([i for i in found_mins if i < new_max_hash])

        # intersection:
        intersect_mins = new_mins.intersection(found_mins)

        if len(intersect_mins) < 5:   # hard cutoff for now
            notify('found only {} hashes in common.', len(intersect_mins))
            notify('this is below a sane threshold => exiting.')
            break

        # first denominator - genome size
        genome_n_mins = len(found_mins)
        f_genome = len(intersect_mins) / float(genome_n_mins)

        f_query = len(intersect_mins) / float(query_n_mins)

        # print interim & save
        notify('found: {:.2f} {:.2f} {}', f_genome, f_query,
               best_ss.name())
        found.append((f_genome, best_ss))
        # Accumulate the query fraction; without this the "total fraction"
        # reported below was always 0.  Matched hashes are removed from the
        # query each round, so the per-round fractions do not overlap.
        sum_found += f_query

        new_mins -= set(found_mins)
        e = sourmash_lib.Estimators(ksize=args.ksize, n=len(new_mins))
        e.add_many(new_mins)
        query = sig.SourmashSignature('', e)

    notify('found {}, total fraction {:.3f}', len(found), sum_found)
    notify('')

    if not found:
        sys.exit(0)

    found.sort(key=lambda x: x[0])
    found.reverse()

    notify('Composition:')
    for (frac, leaf_sketch) in found:
        notify('{:.2f} {}', frac, leaf_sketch.name())

    if args.output:
        print('Composition:', file=args.output)
        for (frac, leaf_sketch) in found:
            print('{:.2f} {}'.format(frac, leaf_sketch.name()),
                  file=args.output)

    if args.csv:
        fieldnames = ['fraction', 'name', 'sketch_kmers']
        w = csv.DictWriter(args.csv, fieldnames=fieldnames)

        w.writeheader()
        for (frac, leaf_sketch) in found:
            # A match accepted via max_hash may have no cardinality counter
            # (the loop above only requires one of the two); leave the
            # column empty rather than crash.
            hll = leaf_sketch.estimator.hll
            cardinality = hll.estimate_cardinality() if hll else ''
            w.writerow(dict(fraction=frac, name=leaf_sketch.name(),
                            sketch_kmers=cardinality))

    if args.save_matches:
        outname = args.save_matches.name
        notify('saving all matches to "{}"', outname)
        sig.save_signatures([ss for (f, ss) in found],
                            args.save_matches)
def compute(self, args):
    """Compute the signature for one or more files.

    For every input file one estimator per requested k-mer size is filled
    from the file's sequences, and the resulting signatures are written
    either to ``--output`` or to ``<basename>.sig`` in the current
    directory.  A file whose ``.sig`` already exists is skipped unless
    ``--output`` or ``--force`` is given.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('filenames', nargs='+')
    parser.add_argument('--protein', action='store_true')
    parser.add_argument('--input-is-protein', action='store_true')
    parser.add_argument('-k', '--ksizes',
                        default=str(DEFAULT_K),
                        help='comma-separated list of k-mer sizes')
    parser.add_argument('-n', '--num-hashes', type=int,
                        default=DEFAULT_N,
                        help='number of hashes to use in each sketch')
    parser.add_argument('-f', '--force', action='store_true')
    parser.add_argument('-o', '--output', type=argparse.FileType('wt'))
    parser.add_argument('--email', type=str, default='')
    args = parser.parse_args(args)

    print('computing signatures for files:', args.filenames,
          file=sys.stderr)

    # k-mer sizes arrive as one comma-separated string; a value without a
    # comma splits into a single-element list.
    ksizes = [int(token) for token in args.ksizes.split(',')]

    print('Computing signature for ksizes: %s' % str(ksizes),
          file=sys.stderr)

    for filename in args.filenames:
        sigfile = os.path.basename(filename) + '.sig'
        already_done = (not args.output and not args.force
                        and os.path.exists(sigfile))
        if already_done:
            print('skipping', filename, '- already done',
                  file=sys.stderr)
            continue

        # one estimator for each ksize
        estimators = [
            sourmash_lib.Estimators(ksize=k, n=args.num_hashes,
                                    protein=args.protein)
            for k in ksizes
        ]

        # consume & calculate signatures
        print('... reading sequences from', filename, file=sys.stderr)
        for count, record in enumerate(screed.open(filename)):
            if count and count % 10000 == 0:
                print('...', filename, count, file=sys.stderr)

            sequence = record.sequence
            for estimator in estimators:
                if args.input_is_protein:
                    estimator.mh.add_protein(sequence)
                else:
                    estimator.add_sequence(sequence, args.force)

        # convert into a signature
        siglist = []
        for estimator in estimators:
            siglist.append(sig.SourmashSignature(args.email, estimator,
                                                 filename=filename))

        # save!
        if args.output:
            sig.save_signatures(siglist, args.output)
        else:
            with open(sigfile, 'w') as out_fp:
                sig.save_signatures(siglist, out_fp)
def kmers(seq, k):
    """Yield every length-``k`` window of ``seq``, left to right.

    Yields nothing when ``seq`` is shorter than ``k``.
    """
    for start in range(len(seq) - k + 1):
        yield seq[start:start + k]

###

K = 21

import sys
import screed
import mmh3
import sourmash_lib
print('imported sourmash:', sourmash_lib, file=sys.stderr)
from sourmash_lib import MinHash
import sourmash_lib.signature

# Only the first record of the input file is sketched.
record = next(iter(screed.open(sys.argv[1])))
print('loaded', record.name, file=sys.stderr)

E = sourmash_lib.Estimators(ksize=K, n=500, protein=True)
# Protein k-mer length is a third of the nucleotide k-mer size.
prot_ksize = int(K / 3)
mh = E.mh

# NOTE(review): translate() is not defined in this snippet -- presumably it
# is defined earlier in the file; confirm.
for trans in translate(record.sequence):
    for kmer in kmers(trans, prot_ksize):
        # Named hashval rather than `hash` so the builtin is not shadowed.
        hashval = mmh3.hash128(kmer, seed=42)
        mh.add_hash(hashval)

s = sourmash_lib.signature.SourmashSignature('', E, name=record.name)
print(sourmash_lib.signature.save_signatures([s]))
p_index[i] + "_sequences.txt", 'r') # /data/scratch/kjacks21/ print("Working on bag " + str(i) + "...") for j, seq in enumerate(seqs): if ((j % 1000) == 0): now = dt.datetime.now() print("Currently on bag " + str(i) + ", instance " + str(j) + " at time " + str(now - start)) # set max_sim to 1 since the sequence is in the same bag if (j >= patients[i, 1] and j <= patients[i, 2]): max_sim = 1 #print("max_sim = 1") else: # write seq file (test1.txt) in fasta format #with open("test1.txt", 'w') as f: # f.write(">\n"+str(seq)) E1 = sourmash_lib.Estimators(n=50, ksize=10) E1.add(seq.strip()) max_sim = 0 for b_seq in bag: prev_max_sim = max_sim # write b_seq fasta file (test2.txt) for mash #with open("test2.txt", 'w') as f: # f.write(">\n"+str(b_seq)) # calculate mash dist between #proc = sp.Popen(["/home/kyle/Documents/cs584/mash/mash-Linux64-v1.1.1/mash","dist","-k","15","-r","-p","3","test1.txt","test2.txt"], stdout=sp.PIPE) #output = str((proc.stdout.readline()), 'UTF8') #mash_dist = float((output.split('\t'))[2]) E2 = sourmash_lib.Estimators(n=50, ksize=10)
def test_md5(track_abundance):
    """The md5 of a one-hash signature matches the pinned digest."""
    estimator = sourmash_lib.Estimators(n=1, ksize=20,
                                        track_abundance=track_abundance)
    estimator.mh.add_hash(5)
    hashed_sig = SourmashSignature('*****@*****.**', estimator)
    print(hashed_sig._save())

    expected = 'eae27d77ca20db309e056e3d2dcd7d69'
    # The actual digest is the assertion message, so a failure shows it.
    assert hashed_sig.md5sum() == expected, hashed_sig.md5sum()
def test_md5():
    """The md5 of a one-hash signature matches the pinned digest."""
    estimator = sourmash_lib.Estimators(n=1, ksize=20)
    estimator.mh.add_hash(5)
    hashed_sig = SourmashSignature('*****@*****.**', estimator)
    print(hashed_sig.save())

    expected = 'e4da3b7fbbce2345d7772b0674a318d5'
    # The actual digest is the assertion message, so a failure shows it.
    assert hashed_sig.md5sum() == expected, hashed_sig.md5sum()
def sbt_gather(self, args):
    """Iteratively decompose a query signature into its best SBT matches.

    Exactly one query signature matching ``--ksize`` and the molecule type
    must be found in ``query``.  Each round takes the best-matching leaf
    for the remaining query hashes, records it with its estimated fraction
    of the original query, removes the leaf's hashes from the query and
    repeats until nothing matches or the first match falls below
    ``--threshold``.

    Results are printed, and optionally written to ``--output`` (text) and
    ``--csv``.  Raises ``Exception`` when both --dna and --protein are
    given; exits with status -1 when the query cannot be selected uniquely.
    """
    from sourmash_lib.sbt import SBT, GraphFactory
    from sourmash_lib.sbtmh import search_minhashes, SigLeaf
    from sourmash_lib.sbtmh import SearchMinHashesFindBest

    parser = argparse.ArgumentParser()
    parser.add_argument('sbt_name')
    parser.add_argument('query')
    parser.add_argument('-k', '--ksize', type=int, default=DEFAULT_K)
    parser.add_argument('--threshold', default=0.05, type=float)
    parser.add_argument('-o', '--output', type=argparse.FileType('wt'))
    parser.add_argument('--csv', type=argparse.FileType('wt'))

    sourmash_args.add_moltype_args(parser)

    args = parser.parse_args(args)

    if args.protein:
        if args.dna is True:
            raise Exception('cannot specify both --dna and --protein!')
        args.dna = False

    moltype = None
    if args.protein:
        moltype = 'protein'
    elif args.dna:
        moltype = 'dna'

    # Loaded once; a second, identical SBT.load() further down was removed.
    tree = SBT.load(args.sbt_name, leaf_loader=SigLeaf.load)
    sl = sig.load_signatures(args.query, select_ksize=args.ksize,
                             select_moltype=moltype)
    sl = list(sl)
    if len(sl) != 1:
        # Both halves of the message go to stderr (the second half used to
        # go to stdout).
        print('When loading query from "{}",'.format(args.query),
              file=sys.stderr)
        print('{} query signatures matching ksize and molecule type; need exactly one.'.format(len(sl)),
              file=sys.stderr)
        sys.exit(-1)

    query = sl[0]

    query_moltype = 'UNKNOWN'
    if query.estimator.is_molecule_type('dna'):
        query_moltype = 'DNA'
    elif query.estimator.is_molecule_type('protein'):
        query_moltype = 'protein'
    query_ksize = query.estimator.ksize
    print('loaded query: {}... (k={}, {})'.format(query.name()[:30],
                                                  query_ksize,
                                                  query_moltype))

    orig_query = query

    sum_found = 0.
    found = []
    while True:
        search_fn = SearchMinHashesFindBest().search

        results = []
        # use super low threshold for this part of the search
        for leaf in tree.find(search_fn, query, 0.00001):
            results.append((query.similarity(leaf.data), leaf.data))

        if not len(results):          # no matches at all!
            break

        # take the best result
        results.sort(key=lambda x: -x[0])   # reverse sort on similarity
        best_sim, best_ss = results[0]
        sim = best_ss.similarity(orig_query)

        # adjust by size of leaf (kmer cardinality of original genome)
        # Both sides need a cardinality counter; without one the fraction
        # cannot be estimated and is reported as 0.
        if best_ss.estimator.hll and orig_query.estimator.hll:
            leaf_kmers = best_ss.estimator.hll.estimate_cardinality()
            query_kmers = orig_query.estimator.hll.estimate_cardinality()
            # float() keeps this a true division even where `/` on two
            # ints would floor.
            f_of_total = leaf_kmers / float(query_kmers) * sim
        else:
            f_of_total = 0

        if not found and sim < args.threshold:
            print('best match: {}'.format(best_ss.name()))
            print('similarity is {:.5f} of db signature;'.format(sim))
            print('this is below specified threshold => exiting.')
            break

        # subtract found hashes from search hashes, construct new search
        new_mins = set(query.estimator.mh.get_mins())
        found_mins = best_ss.estimator.mh.get_mins()

        # print interim & save
        print('found: {:.2f} {} {}'.format(f_of_total,
                                           len(new_mins),
                                           best_ss.name()))
        found.append((f_of_total, best_ss, sim))
        sum_found += f_of_total

        new_mins -= set(found_mins)
        e = sourmash_lib.Estimators(ksize=args.ksize, n=len(new_mins))
        for m in new_mins:
            e.mh.add_hash(m)
        new_ss = sig.SourmashSignature('foo', e)
        query = new_ss

    print('found {}, total fraction {:.3f}'.format(len(found), sum_found))
    print('')

    if not found:
        sys.exit(0)

    # Sort on the fraction only: comparing whole tuples falls through to
    # comparing signature objects whenever two fractions tie (e.g. all 0).
    found.sort(key=lambda x: x[0])
    found.reverse()

    print('Composition:')
    for (frac, leaf_sketch, sim) in found:
        print('{:.2f} {}'.format(frac, leaf_sketch.name()))

    if args.output:
        print('Composition:', file=args.output)
        for (frac, leaf_sketch, sim) in found:
            print('{:.2f} {}'.format(frac, leaf_sketch.name()),
                  file=args.output)

    if args.csv:
        fieldnames = ['fraction', 'name', 'similarity', 'sketch_kmers']
        w = csv.DictWriter(args.csv, fieldnames=fieldnames)

        w.writeheader()
        for (frac, leaf_sketch, sim) in found:
            # Matches without a cardinality counter are accepted above, so
            # leave the column empty for them rather than crash.
            hll = leaf_sketch.estimator.hll
            cardinality = hll.estimate_cardinality() if hll else ''
            w.writerow(dict(fraction=frac, name=leaf_sketch.name(),
                            similarity=sim,
                            sketch_kmers=cardinality))