def save_minhash(minhash_gen, handle=None, email=''):
    '''Wrap (name, estimator) pairs in signatures and serialize them.

    Background:
      issue: https://github.com/dib-lab/sourmash/issues/131
      suggested lead: https://github.com/dib-lab/sourmash/blob/master/utils/compute-dna-mh-another-way.py
      relevant set of functions: https://github.com/dib-lab/sourmash/blob/master/sourmash_lib/signature.py

    Arguments:
      minhash_gen: iterable yielding 2-item (name, estimator) pairs.
      handle: optional open file handle; when given it is forwarded to
        signature.save_signatures().
      email: forwarded to every SourmashSignature that is created.

    Returns whatever signature.save_signatures() returns.

    Usage:

        from itertools import islice
        print(save_minhash(islice(gen, 2)))

        fp = '/some/path/to.json'
        with open(fp, 'w+') as outfile:
            save_minhash(islice(gen, 2), handle=outfile)
    '''
    progress = progressbar.ProgressBar(max_value=progressbar.UnknownLength)
    signatures = []

    print('Generating signatures ...')
    for done, (name, estimator) in enumerate(minhash_gen, start=1):
        # Every signature is kept in memory until the final save ... bad,
        # TODO: stream into file handle
        signatures.append(
            signature.SourmashSignature(
                email, estimator, name=name, filename='zoo'))
        progress.update(done)

    print('\nSaving signatures ...')
    if handle is not None:
        return signature.save_signatures(signatures, handle)
    return signature.save_signatures(signatures)
def test_sourmash_compare_with_abundance_1():
    """Search one abundance-tracking signature against an identical one.

    Two estimators (ksize=5, n=5, track_abundance=True) are fed the same
    sequence 'ATGGA', saved as e1.sig / e2.sig in a temporary directory, and
    `sourmash search e1.sig e2.sig -k 5` is run there.  The test passes when
    the string '1.000' appears in the captured output.
    """
    # NOTE(review): the test name says "compare" but the command exercised
    # below is `search` -- confirm which one was intended.
    with utils.TempDirectory() as location:
        # create two signatures
        E1 = Estimators(ksize=5, n=5, protein=False, track_abundance=True)
        E2 = Estimators(ksize=5, n=5, protein=False, track_abundance=True)

        E1.mh.add_sequence('ATGGA')
        E2.mh.add_sequence('ATGGA')

        s1 = signature.SourmashSignature('', E1, filename='e1', name='e1')
        s2 = signature.SourmashSignature('', E2, filename='e2', name='e2')

        # Write through context managers so each file is closed (and its
        # buffer flushed) before the script below reads it, instead of
        # relying on garbage collection of an anonymous file object.
        with open(os.path.join(location, 'e1.sig'), 'w') as fp:
            signature.save_signatures([s1], fp)
        with open(os.path.join(location, 'e2.sig'), 'w') as fp:
            signature.save_signatures([s2], fp)

        status, out, err = utils.runscript(
            'sourmash', ['search', 'e1.sig', 'e2.sig', '-k', '5'],
            in_directory=location)
        assert '1.000' in out
def import_csv(self, args):
    """Import a CSV file full of signatures/hashes.

    Command-line arguments (parsed from `args`):
      mash_csvfile  path of the CSV file to read
      -o/--output   file to write the signatures to (default: stdout)
      --email       e-mail stored in every signature (default: '')

    Each row must unpack into five columns: hash function, hash seed, ksize,
    name, and a space-separated list of integer hashes.  Only rows with hash
    function 'murmur64' and seed 42 are accepted (enforced with `assert`).
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('mash_csvfile')
    parser.add_argument('-o', '--output', type=argparse.FileType('wt'),
                        default=sys.stdout)
    parser.add_argument('--email', type=str, default='')
    args = parser.parse_args(args)

    with open(args.mash_csvfile, 'r') as csv_fp:
        siglist = []
        for row in csv.reader(csv_fp):
            hashfn, hashseed = row[0], int(row[1])

            # only support a limited import type, for now ;)
            assert hashfn == 'murmur64'
            assert hashseed == 42

            _, _, ksize, name, hashes = row
            ksize = int(ksize)
            hash_values = [int(tok) for tok in hashes.strip().split(' ')]

            # one sketch slot per imported hash
            estimator = sourmash_lib.Estimators(len(hash_values), ksize)
            for value in hash_values:
                estimator.mh.add_hash(value)

            imported = sig.SourmashSignature(args.email, estimator,
                                             filename=name)
            siglist.append(imported)
            print('loaded signature:', name, imported.md5sum()[:8],
                  file=sys.stderr)

        print('saving %d signatures to YAML' % (len(siglist), ),
              file=sys.stderr)
        sig.save_signatures(siglist, args.output)
def handler(event, context):
    """Sketch a remote FASTA file and return the serialized signature.

    `event` is only logged and `context` is unused; the sketch parameters
    are currently hard-coded below.  The file at the configured URL is
    streamed, every record is added to one estimator, and the text written
    by signature.save_signatures() is returned as a string.
    """
    print("Received Event: " + json.dumps(event, indent=2))

    # TODO: parse args from event
    args = {
        'protein': True,
        'n': 500,
        'k': 31,
        # 'url': 'http://athyra.oxli.org/~luizirber/missing.fa',
        'url': 'http://athyra.oxli.org/~luizirber/reads_lt_90.fasta',
        'email': '*****@*****.**',
    }

    print("Creating estimators")
    estimator = sourmash_lib.Estimators(
        ksize=args['k'], n=args['n'], protein=args['protein'])

    print("Opening file")
    with closing(requests.get(args['url'], stream=True)) as response:
        records = screed.fasta.fasta_iter(response.raw)
        for index, record in enumerate(records):
            # progress line every 500 records (including the first)
            if index % 500 == 0:
                print("%d reads" % index)

            sequence = record.sequence
            if args['protein']:
                estimator.mh.add_protein(sequence)
            else:
                estimator.add_sequence(sequence)

    print("Outputing signature")
    result = signature.SourmashSignature(
        args['email'], estimator, filename=args['url'])

    # serialize into an in-memory buffer so the text can be returned
    buffer = StringIO("")
    signature.save_signatures([result], buffer)
    return buffer.getvalue()
def commit(file, client, db, cell, ksize, n):
    '''Dump a (mongodb) cursor to a data cell.

    For each document, start a new line in the output.

    file argument: Filename prefix w/o extension.

    \b
    {"_id":"86853586-5e9...
    {"_id":"689e59b8-514...
    {"_id":"6d9bff35-aab...

    This is important bc/ it circumvents the need to hold more than one record
    in memory, both on import and export. Note also that this is the same
    output format as ...

    \b
    $ mongoexport --db foo --collection bar --out bar.json

    ... and can be reimported by

    $ mongoimport --db foo --collection bar2 bar.json

    Example:

    $ zoo commit --db zika --cell survey --n 5 surveytest
    '''
    click.echo('Dumping data cell.')
    database = MongoClient(client)[db]

    # initialize minhash: one estimator per requested k-mer size
    # (ksize arrives as a comma-separated string)
    ksizes = [int(size) for size in ksize.split(',')]
    estimators = {k: Estimators(ksize=k, n=n) for k in ksizes}

    progress = ProgressBar(max_value=UnknownLength)

    with open(file + '.json', 'w+') as dump:
        for seen, doc in enumerate(database[cell].find(), start=1):
            # calculate fresh md5 hash for each record.
            # Neither the primary key (because it is random)
            # nor the checksum should figure in the checksum.
            primary_key = doc.pop('_id')
            doc.pop('md5', None)
            doc['md5'] = hash_dict(doc)
            doc['_id'] = primary_key

            line = json.dumps(doc, indent=None, sort_keys=True)
            dump.write(line + '\n')

            # update aggregate minhash for collection
            for estimator in estimators.values():
                estimator.add_sequence(doc['sequence'], force=True)

            # update progress bar
            progress.update(seen)

    # save minhash: wrap each estimator in a signature named after the cell
    signatures = {
        k: signature.SourmashSignature(
            estimator=estimator, name=cell, email='', filename='')
        for k, estimator in estimators.items()
    }
    with open(file + '.zoo', 'w+') as sig_out:
        signature_json.save_signatures_json(
            signatures.values(), fp=sig_out, indent=4, sort_keys=True)

    click.echo('\nDone.')
def watch(self, args):
    """Build a signature from raw FASTA/FASTQ coming in on stdin, search.

    Records are read from /dev/stdin and added to a single estimator.  Each
    time the record index reaches the current watermark (advanced in steps
    of WATERMARK_SIZE) the growing signature is searched against the SBT
    named on the command line, and reading stops as soon as that search
    returns anything.  A final search is run after the loop; the best hit is
    reported, and the stream signature is saved when -o/--output was given.
    """
    from sourmash_lib.sbt import SBT, GraphFactory
    from sourmash_lib.sbtmh import search_minhashes, SigLeaf
    from sourmash_lib.sbtmh import SearchMinHashesFindBest

    parser = argparse.ArgumentParser()
    parser.add_argument('sbt_name')
    parser.add_argument('-o', '--output', type=argparse.FileType('wt'))
    parser.add_argument('-k', '--ksize', type=int, default=DEFAULT_K)
    parser.add_argument('--threshold', default=0.05, type=float)
    parser.add_argument('--input-is-protein', action='store_true')
    sourmash_args.add_moltype_args(parser, default_dna=True)
    parser.add_argument(
        '-n', '--num-hashes', type=int, default=DEFAULT_N,
        help='number of hashes to use in each sketch (default: %(default)i)')
    parser.add_argument('--name', type=str, default='stdin')
    args = parser.parse_args(args)

    # protein input overrides a DNA request
    if args.input_is_protein and args.dna:
        print('WARNING: input is protein, turning off DNA hash computing.',
              file=sys.stderr)
        args.dna = False
        args.protein = True

    # NOTE(review): this only reports the conflict; execution continues and
    # the DNA branch below wins -- confirm that is intended.
    if args.dna and args.protein:
        notify('ERROR: cannot use "watch" with both DNA and protein.')

    is_protein = not args.dna
    moltype = 'protein' if is_protein else 'DNA'

    E = sourmash_lib.Estimators(ksize=args.ksize, n=args.num_hashes,
                                protein=is_protein)
    streamsig = sig.SourmashSignature('', E, filename='stdin',
                                      name=args.name)

    notify('Computing signature for k={}, {} from stdin',
           args.ksize, moltype)

    tree = SBT.load(args.sbt_name, leaf_loader=SigLeaf.load)

    def do_search():
        # (similarity, signature) for every leaf the tree search yields
        search_fn = SearchMinHashesFindBest().search
        return [
            (streamsig.similarity(leaf.data), leaf.data)
            for leaf in tree.find(search_fn, streamsig, args.threshold)
        ]

    notify('reading sequences from stdin')
    screed_iter = screed.open('/dev/stdin')
    watermark = WATERMARK_SIZE

    # iterate over input records
    n = 0
    for n, record in enumerate(screed_iter):
        # at each watermark, print status & check cardinality
        if n >= watermark:
            notify('... read {} sequences', n)
            watermark += WATERMARK_SIZE

            if do_search():
                break

        sequence = record.sequence
        if args.input_is_protein:
            E.mh.add_protein(sequence)
        else:
            E.add_sequence(sequence, False)

    hits = do_search()
    if hits:
        # take best: highest similarity first
        hits.sort(key=lambda hit: -hit[0])
        similarity, found_sig = hits[0]
        notify('FOUND: {}, at {:.3f}', found_sig.name(), similarity)
    else:
        # NOTE(review): n is the last record index, not a record count.
        notify('... read {} sequences, no matches found.', n)

    if args.output:
        sig.save_signatures([streamsig], args.output)
def sbt_gather(self, args):
    """Decompose a query signature into its best matches from an SBT.

    Command-line arguments (parsed from `args`):
      sbt_name      SBT to search
      query         signature file; must yield exactly one signature for the
                    selected ksize / molecule type, otherwise exit code -1
      -k/--ksize    k-mer size to select (default: DEFAULT_K)
      --threshold   minimum similarity required of the first match
      -o/--output   optional text file for the composition report
      --csv         optional CSV file for the composition report

    The search is iterative: find the best match for the current query,
    record it, remove that match's hashes from the query and repeat.  It
    stops when a round returns nothing, or when the very first best match is
    below --threshold.  Exits with status 0 when nothing was found.

    Raises:
      Exception: when both --dna and --protein are given.
    """
    from sourmash_lib.sbt import SBT
    from sourmash_lib.sbtmh import SigLeaf, SearchMinHashesFindBest

    parser = argparse.ArgumentParser()
    parser.add_argument('sbt_name')
    parser.add_argument('query')
    parser.add_argument('-k', '--ksize', type=int, default=DEFAULT_K)
    parser.add_argument('--threshold', default=0.05, type=float)
    parser.add_argument('-o', '--output', type=argparse.FileType('wt'))
    parser.add_argument('--csv', type=argparse.FileType('wt'))

    sourmash_args.add_moltype_args(parser)

    args = parser.parse_args(args)

    if args.protein:
        if args.dna is True:
            raise Exception('cannot specify both --dna and --protein!')
        args.dna = False

    moltype = None
    if args.protein:
        moltype = 'protein'
    elif args.dna:
        moltype = 'dna'

    # Load the tree once; it is reused for every search round below.
    tree = SBT.load(args.sbt_name, leaf_loader=SigLeaf.load)

    sl = list(sig.load_signatures(args.query,
                                  select_ksize=args.ksize,
                                  select_moltype=moltype))
    if len(sl) != 1:
        # Both halves of the error message go to stderr.
        print('When loading query from "{}",'.format(args.query),
              file=sys.stderr)
        print('{} query signatures matching ksize and molecule type; '
              'need exactly one.'.format(len(sl)), file=sys.stderr)
        sys.exit(-1)

    query = sl[0]

    query_moltype = 'UNKNOWN'
    if query.estimator.is_molecule_type('dna'):
        query_moltype = 'DNA'
    elif query.estimator.is_molecule_type('protein'):
        query_moltype = 'protein'
    query_ksize = query.estimator.ksize
    print('loaded query: {}... (k={}, {})'.format(query.name()[:30],
                                                  query_ksize,
                                                  query_moltype))

    orig_query = query

    sum_found = 0.
    found = []
    while True:
        # A new searcher is created for every round, as in the original.
        # NOTE(review): presumably it carries best-so-far state -- confirm.
        search_fn = SearchMinHashesFindBest().search

        # use super low threshold for this part of the search
        results = [(query.similarity(leaf.data), leaf.data)
                   for leaf in tree.find(search_fn, query, 0.00001)]

        if not results:  # no matches at all!
            break

        # take the best result
        results.sort(key=lambda x: -x[0])  # reverse sort on similarity
        _, best_ss = results[0]
        sim = best_ss.similarity(orig_query)

        # adjust by size of leaf (kmer cardinality of original genome);
        # needs a cardinality estimator on both sides and a non-zero
        # denominator, otherwise the fraction is reported as 0.
        f_of_total = 0
        leaf_hll = best_ss.estimator.hll
        query_hll = orig_query.estimator.hll
        if leaf_hll and query_hll:
            leaf_kmers = leaf_hll.estimate_cardinality()
            query_kmers = query_hll.estimate_cardinality()
            if query_kmers:
                f_of_total = leaf_kmers / query_kmers * sim

        if not found and sim < args.threshold:
            print('best match: {}'.format(best_ss.name()))
            print('similarity is {:.5f} of db signature;'.format(sim))
            print('this is below specified threshold => exiting.')
            break

        # subtract found hashes from search hashes, construct new search
        new_mins = set(query.estimator.mh.get_mins())
        found_mins = best_ss.estimator.mh.get_mins()

        # print interim & save
        print('found: {:.2f} {} {}'.format(f_of_total,
                                           len(new_mins),
                                           best_ss.name()))
        found.append((f_of_total, best_ss, sim))
        sum_found += f_of_total

        new_mins -= set(found_mins)
        e = sourmash_lib.Estimators(ksize=args.ksize, n=len(new_mins))
        for m in new_mins:
            e.mh.add_hash(m)
        query = sig.SourmashSignature('foo', e)

    print('found {}, total fraction {:.3f}'.format(len(found), sum_found))
    print('')

    if not found:
        sys.exit(0)

    # Largest fraction first.  Sort on the fraction only: comparing whole
    # tuples would fall through to comparing signature objects on ties.
    found.sort(key=lambda item: item[0], reverse=True)

    print('Composition:')
    for (frac, leaf_sketch, sim) in found:
        print('{:.2f} {}'.format(frac, leaf_sketch.name()))

    if args.output:
        print('Composition:', file=args.output)
        for (frac, leaf_sketch, sim) in found:
            print('{:.2f} {}'.format(frac, leaf_sketch.name()),
                  file=args.output)

    if args.csv:
        fieldnames = ['fraction', 'name', 'similarity', 'sketch_kmers']
        w = csv.DictWriter(args.csv, fieldnames=fieldnames)

        w.writeheader()
        for (frac, leaf_sketch, sim) in found:
            # Same guard as in the search loop: a sketch without a
            # cardinality estimator gets an empty sketch_kmers field.
            hll = leaf_sketch.estimator.hll
            cardinality = hll.estimate_cardinality() if hll else None
            w.writerow(dict(fraction=frac, name=leaf_sketch.name(),
                            similarity=sim,
                            sketch_kmers=cardinality))
def build_siglist(email, Elist, filename, name=None):
    """Return a list with one SourmashSignature per estimator in `Elist`.

    `email`, `filename` and `name` are passed unchanged to every signature;
    the result keeps the iteration order of `Elist`.
    """
    siglist = []
    for estimator in Elist:
        siglist.append(
            sig.SourmashSignature(email, estimator,
                                  filename=filename, name=name))
    return siglist
def compute(self, args):
    """Compute the signature for one or more files.

    Command-line arguments (parsed from `args`):
      filenames           one or more sequence files
      --protein           passed to every estimator as `protein`
      --input-is-protein  add records with add_protein instead of
                          add_sequence
      -k/--ksizes         comma-separated list of k-mer sizes
      -n/--num-hashes     number of hashes to use in each sketch
      -f/--force          recompute existing .sig files; also passed as the
                          second argument of add_sequence
      -o/--output         write signatures here instead of '<basename>.sig'
      --email             e-mail stored in every signature

    For every input file one estimator per k-mer size is filled and the
    resulting signatures are saved together.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('filenames', nargs='+')
    parser.add_argument('--protein', action='store_true')
    parser.add_argument('--input-is-protein', action='store_true')
    parser.add_argument('-k', '--ksizes', default=str(DEFAULT_K),
                        help='comma-separated list of k-mer sizes')
    parser.add_argument('-n', '--num-hashes', type=int, default=DEFAULT_N,
                        help='number of hashes to use in each sketch')
    parser.add_argument('-f', '--force', action='store_true')
    parser.add_argument('-o', '--output', type=argparse.FileType('wt'))
    parser.add_argument('--email', type=str, default='')
    args = parser.parse_args(args)

    print('computing signatures for files:', args.filenames,
          file=sys.stderr)

    # get list of k-mer sizes for which to compute sketches;
    # split(',') also covers the single-value case
    ksizes = [int(k) for k in args.ksizes.split(',')]

    print('Computing signature for ksizes: %s' % str(ksizes),
          file=sys.stderr)

    # for each file, load & compute sketch.
    for filename in args.filenames:
        sigfile = os.path.basename(filename) + '.sig'
        if not args.output and os.path.exists(sigfile) and not args.force:
            print('skipping', filename, '- already done', file=sys.stderr)
            continue

        # one estimator for each ksize
        Elist = [
            sourmash_lib.Estimators(ksize=k, n=args.num_hashes,
                                    protein=args.protein)
            for k in ksizes
        ]

        # consume & calculate signatures
        print('... reading sequences from', filename, file=sys.stderr)
        for n, record in enumerate(screed.open(filename)):
            # progress line every 10000 records, skipping the first
            if n % 10000 == 0 and n:
                print('...', filename, n, file=sys.stderr)

            sequence = record.sequence
            for E in Elist:
                if args.input_is_protein:
                    E.mh.add_protein(sequence)
                else:
                    E.add_sequence(sequence, args.force)

        # convert into a signature
        siglist = []
        for E in Elist:
            siglist.append(
                sig.SourmashSignature(args.email, E, filename=filename))

        # save!
        if args.output:
            sig.save_signatures(siglist, args.output)
        else:
            with open(sigfile, 'w') as fp:
                sig.save_signatures(siglist, fp)
record['sequence'] = str(i) record['_id'] = _id db.survey.insert_one(record) # db.survey.count() # 33 json_dump('survey.json', db.survey.find()) # now create a minhash of the entire sequences to search e16 = Estimators(ksize=16, n=1000) e31 = Estimators(ksize=31, n=1000) fn = 'survey.sig' cursor = db.survey.find({}, {'sequence': 1, '_id': 0}) for record in cursor: for k, v in record.items(): e16.add_sequence(v, force=True) e31.add_sequence(v, force=True) # force bc/ ValueError: invalid DNA character in sequence: Y s16 = signature.SourmashSignature(email='', estimator=e16, name='survey', filename=fn) s31 = signature.SourmashSignature(email='', estimator=e31, name='survey', filename=fn) with open(fn, 'w+') as outsig: signature.save_signatures([s16, s31], fp=outsig)