def make_minhashes():
    """Create one empty MinHash per k-mer size and enabled molecule type.

    Reads the free names ``args`` and ``ksizes``.  When ``args.scaled`` is
    greater than 1 every sketch gets ``max_hash = MAX_HASH / scaled``
    (rounded to an int); otherwise ``max_hash`` stays 0.

    Returns a list holding, for each ksize in order, the protein sketch
    (when ``args.protein`` is set) followed by the DNA sketch (when
    ``args.dna`` is set).
    """
    seed = args.seed

    hash_ceiling = 0
    if args.scaled and args.scaled > 1:
        hash_ceiling = int(
            round(sourmash_lib.MAX_HASH / float(args.scaled), 0))

    def new_sketch(ksize, is_protein):
        # all sketches share everything except ksize and molecule type
        return sourmash_lib.MinHash(ksize=ksize, n=args.num_hashes,
                                    is_protein=is_protein,
                                    track_abundance=args.track_abundance,
                                    max_hash=hash_ceiling, seed=seed)

    # one minhash for each ksize (and each requested molecule type)
    sketches = []
    for ksize in ksizes:
        if args.protein:
            sketches.append(new_sketch(ksize, True))
        if args.dna:
            sketches.append(new_sketch(ksize, False))
    return sketches
def test_similarity_downsample(track_abundance):
    """Comparing sketches with different max_hash requires downsample=True."""
    wide_mh = sourmash_lib.MinHash(n=0, ksize=20,
                                   track_abundance=track_abundance,
                                   max_hash=2**63)
    narrow_mh = sourmash_lib.MinHash(n=0, ksize=20,
                                     track_abundance=track_abundance,
                                     max_hash=2**2)

    for hashval in (1, 5):
        wide_mh.add_hash(hashval)
    assert len(wide_mh.get_mins()) == 2

    # 5 should be discarded due to max_hash
    for hashval in (1, 5):
        narrow_mh.add_hash(hashval)
    assert len(narrow_mh.get_mins()) == 1

    wide_sig = SourmashSignature(wide_mh)
    narrow_sig = SourmashSignature(narrow_mh)

    # mismatch in max_hash
    with pytest.raises(ValueError):
        wide_sig.similarity(narrow_sig)

    score = wide_sig.similarity(narrow_sig, downsample=True)
    assert round(score, 1) == 1.0
def test_load_one_fail_multisig(track_abundance):
    """load_one_signature must raise ValueError when given two signatures."""
    sigs = [
        SourmashSignature(
            sourmash_lib.MinHash(n=1, ksize=20,
                                 track_abundance=track_abundance))
        for _ in range(2)
    ]
    serialized = save_signatures(sigs)

    with pytest.raises(ValueError):
        load_one_signature(serialized)
def test_compare_ne(track_abundance):
    """Same content but different names -> signatures compare unequal."""

    def named_signature(label):
        mh = sourmash_lib.MinHash(n=1, ksize=20,
                                  track_abundance=track_abundance)
        mh.add("AT" * 10)
        return SourmashSignature(mh, name=label)

    first = named_signature('foo')
    second = named_signature('bar')

    assert first != second
def test_compare(track_abundance):
    """Same content and same name -> the signatures compare equal.

    Two MinHash sketches are filled with the same sequence and wrapped in
    signatures carrying the same name; both the sketches and the
    signatures are expected to be equal.
    """
    e = sourmash_lib.MinHash(n=1, ksize=20, track_abundance=track_abundance)
    e.add("AT" * 10)
    sig1 = SourmashSignature(e, name='foo')

    f = sourmash_lib.MinHash(n=1, ksize=20, track_abundance=track_abundance)
    f.add("AT" * 10)
    sig2 = SourmashSignature(f, name='foo')

    # the underlying sketches hold the same hashes...
    assert e == f
    # ...and the signatures built from them (the stated subject of this
    # test) must be equal too; previously sig1/sig2 were never compared.
    assert sig1 == sig2
def test_compare_ne2_reverse(track_abundance):
    """Same content, one named and one with only a filename -> unequal.

    The inequality is checked with the operands in both orders.
    """
    named_mh = sourmash_lib.MinHash(n=1, ksize=20,
                                    track_abundance=track_abundance)
    named_mh.add("AT" * 10)
    named_sig = SourmashSignature(named_mh, name='foo')

    file_mh = sourmash_lib.MinHash(n=1, ksize=20,
                                   track_abundance=track_abundance)
    file_mh.add("AT" * 10)
    file_sig = SourmashSignature(file_mh, filename='b')

    assert file_sig != named_sig
    assert named_sig != file_sig
def test_save_minified(track_abundance):
    """Saved signatures are emitted on a single line and still load back."""
    mh_foo = sourmash_lib.MinHash(n=1, ksize=20,
                                  track_abundance=track_abundance)
    sig_foo = SourmashSignature(mh_foo, name="foo")

    mh_bar = sourmash_lib.MinHash(n=1, ksize=25,
                                  track_abundance=track_abundance)
    sig_bar = SourmashSignature(mh_bar, name="bar baz")

    serialized = save_signatures([sig_foo, sig_bar])

    # no newlines anywhere in the output
    assert '\n' not in serialized
    assert len(serialized.split('\n')) == 1

    loaded = list(load_signatures(serialized))
    assert len(loaded) == 2

    loaded_names = [s.name() for s in loaded]
    assert 'foo' in loaded_names
    assert 'bar baz' in loaded_names
def test_save_load_multisig(track_abundance):
    """Two distinct signatures saved together both come back on load."""
    sig_k20 = SourmashSignature(
        sourmash_lib.MinHash(n=1, ksize=20, track_abundance=track_abundance))
    sig_k25 = SourmashSignature(
        sourmash_lib.MinHash(n=1, ksize=25, track_abundance=track_abundance))

    serialized = save_signatures([sig_k20, sig_k25])
    loaded = list(load_signatures(serialized))

    print(serialized)

    assert len(loaded) == 2
    # order not guaranteed, note.
    assert sig_k20 in loaded
    assert sig_k25 in loaded
    assert sig_k20 != sig_k25
def test_save_load_multisig_json():
    """Round-trip two signatures through the JSON save/load helpers."""
    first_mh = sourmash_lib.MinHash(n=1, ksize=20)
    first_sig = SourmashSignature('*****@*****.**', first_mh)

    second_mh = sourmash_lib.MinHash(n=1, ksize=20)
    second_sig = SourmashSignature('*****@*****.**', second_mh)

    serialized = save_signatures_json([first_sig, second_sig])
    loaded = list(load_signatures_json(serialized))

    print(serialized)

    assert len(loaded) == 2
    # order not guaranteed, note.
    assert first_sig in loaded
    assert second_sig in loaded
    assert first_sig != second_sig
def test_load_one_succeed(track_abundance):
    """load_one_signature returns the single signature that was saved."""
    original = SourmashSignature(
        sourmash_lib.MinHash(n=1, ksize=20, track_abundance=track_abundance))

    reloaded = load_one_signature(save_signatures([original]))

    assert original == reloaded
def test_hashable(track_abundance):
    """Signatures must be usable as members of sets (i.e. be hashable)."""
    mh = sourmash_lib.MinHash(n=1, ksize=20, track_abundance=track_abundance)
    mh.add("AT" * 10)
    signature = SourmashSignature(mh)

    # adding to a set hashes the signature; a TypeError here means failure
    members = set()
    members.add(signature)
def test_sourmash_signature_api():
    """The top-level ``sourmash`` module exposes save/load for signatures."""
    mh = sourmash.MinHash(n=1, ksize=20)
    original = sourmash.SourmashSignature(mh)

    serialized = sourmash.save_signatures([original])

    via_load_one = sourmash.load_one_signature(serialized)
    via_load_many = list(sourmash.load_signatures(serialized))[0]

    assert via_load_one == original
    assert via_load_many == original
def test_roundtrip(track_abundance):
    """A saved-then-loaded signature is fully similar to the original."""
    mh = sourmash_lib.MinHash(n=1, ksize=20, track_abundance=track_abundance)
    mh.add("AT" * 10)
    original = SourmashSignature(mh)

    serialized = save_signatures([original])
    reloaded = list(load_signatures(serialized))[0]
    reloaded_mh = reloaded.minhash  # the loaded signature exposes a minhash

    assert original.similarity(reloaded) == 1.0
    assert reloaded.similarity(original) == 1.0
def test_roundtrip_empty(track_abundance):
    """Edge case: an empty minhash survives a round trip, similarity 0."""
    empty_mh = sourmash_lib.MinHash(n=1, ksize=20,
                                    track_abundance=track_abundance)
    original = SourmashSignature(empty_mh)

    serialized = save_signatures([original])
    reloaded = list(load_signatures(serialized))[0]
    reloaded_mh = reloaded.minhash  # the loaded signature exposes a minhash

    assert original.similarity(reloaded) == 0
    assert reloaded.similarity(original) == 0
def test_str(track_abundance):
    """str() and repr() of a signature show its name (if any) and md5 prefix."""
    mh = sourmash_lib.MinHash(n=1, ksize=20, track_abundance=track_abundance)
    mh.add("AT" * 10)
    signature = SourmashSignature(mh)

    print(signature)

    unnamed_form = 'SourmashSignature(59502a74)'
    assert str(signature) == unnamed_form
    assert repr(signature) == unnamed_form

    # once a name is set it shows up, quoted, ahead of the md5 prefix
    signature.d['name'] = 'fizbar'
    named_form = "SourmashSignature('fizbar', 59502a74)"
    assert str(signature) == named_form
    assert repr(signature) == named_form
def make_minhashes():
    """Create one empty MinHash per k-mer size and enabled molecule type.

    Reads the free names ``args`` and ``ksizes``; ``args.scaled`` is handed
    straight to the MinHash constructor.

    Returns a list holding, for each ksize in order, the protein sketch
    (when ``args.protein`` is set) followed by the DNA sketch (when
    ``args.dna`` is set).
    """
    seed = args.seed

    # one minhash for each ksize and each enabled molecule type
    sketches = []
    for ksize in ksizes:
        # (enabled?, is_protein) in the order the sketches are emitted
        for enabled, is_protein in ((args.protein, True), (args.dna, False)):
            if not enabled:
                continue
            sketches.append(
                sourmash_lib.MinHash(ksize=ksize, n=args.num_hashes,
                                     is_protein=is_protein,
                                     track_abundance=args.track_abundance,
                                     scaled=args.scaled, seed=seed))
    return sketches
def test_roundtrip_max_hash(track_abundance):
    """max_hash is preserved when a signature is saved and loaded again."""
    mh = sourmash_lib.MinHash(n=0, ksize=20, track_abundance=track_abundance,
                              max_hash=10)
    mh.add_hash(5)
    original = SourmashSignature(mh)

    serialized = save_signatures([original])
    reloaded = list(load_signatures(serialized))[0]
    reloaded_mh = reloaded.minhash

    assert mh.max_hash == reloaded_mh.max_hash

    assert original.similarity(reloaded) == 1.0
    assert reloaded.similarity(original) == 1.0
def import_csv(args):
    """Import a CSV file full of signatures/hashes.

    ``args`` is the command-line argument list.  Each CSV row is expected
    to hold five columns: hash function, seed, ksize, name and a
    space-separated list of integer hashes.  Only ``murmur64`` with seed 42
    is accepted (enforced with assertions).  One signature is built per
    row and all of them are saved to ``--output`` (stdout by default).
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('mash_csvfile')
    parser.add_argument('-o', '--output', type=argparse.FileType('wt'),
                        default=sys.stdout, help='(default: stdout)')
    parser.add_argument('--email', type=str, default='',
                        help='(default: %(default)s)')
    args = parser.parse_args(args)

    with open(args.mash_csvfile, 'r') as fp:
        siglist = []
        for row in csv.reader(fp):
            hashfn = row[0]
            hashseed = int(row[1])

            # only support a limited import type, for now ;)
            assert hashfn == 'murmur64'
            assert hashseed == 42

            _, _, ksize, name, hashes = row
            ksize = int(ksize)
            hashvals = [int(h) for h in hashes.strip().split(' ')]

            # sketch sized to hold exactly the imported hashes
            mh = sourmash_lib.MinHash(len(hashvals), ksize)
            mh.add_many(hashvals)

            new_sig = sig.SourmashSignature(args.email, mh, filename=name)
            siglist.append(new_sig)
            notify('loaded signature: {} {}', name, new_sig.md5sum()[:8])

        notify('saving {} signatures to JSON', len(siglist))
        sig.save_signatures(siglist, args.output)
def main(args=sys.argv[1:]):
    """Build per-group k-mer abundance sketches from a catlas and write them.

    Loads the catlas under ``catlas_prefix``, selects the nodes returned by
    ``partition_catlas`` for ``--maxsize`` whose k-mer size exceeds
    ``--minsize``, maps every cDBG node in each selected node's shadow to
    that node, accumulates the contig sequences of each group into two
    MinHash objects (an abundance-tracking one at ``--ksize`` and a k=31
    one at ``--scaled``), and hands everything to ``compute_matrix``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('catlas_prefix', help='catlas prefix')
    parser.add_argument('output')
    parser.add_argument('--maxsize', type=float, default=20000)
    parser.add_argument('--minsize', type=float, default=5000)
    parser.add_argument('--min-abund', type=float, default=0)
    parser.add_argument('-k', '--ksize', default=5, type=int,
                        help='k-mer size for vectors')
    parser.add_argument('--scaled', type=int, default=1000)
    args = parser.parse_args(args)

    print('minsize: {:g}'.format(args.minsize))
    print('maxsize: {:g}'.format(args.maxsize))
    print('ksize: {}'.format(args.ksize))
    print('min_abund: {}'.format(args.min_abund))

    contigs = os.path.join(args.catlas_prefix, 'contigs.fa.gz')

    catlas = CAtlas(args.catlas_prefix, load_sizefile=True,
                    min_abund=args.min_abund)
    catlas.decorate_with_shadow_sizes()

    # everything is loaded!

    # find highest nodes with kmer size less than given max_size,
    # then drop the ones that are not larger than minsize
    print('finding terminal nodes for {}.'.format(args.maxsize))
    candidates = partition_catlas(catlas, args.maxsize)
    nodes = {node for node in candidates
             if catlas.kmer_sizes[node] > args.minsize}

    print('{} nodes between {} and {} in k-mer size'.format(
        len(nodes), args.minsize, args.maxsize))

    n_level1_in_shadow = len(catlas.shadow(nodes))
    n_level1_total = sum(len(v) for v in catlas.layer1_to_cdbg.values())
    print('containing {} level1 nodes of {} total'.format(
        n_level1_in_shadow, n_level1_total))

    node_kmers = sum(catlas.kmer_sizes[node] for node in nodes)
    total_kmers = catlas.kmer_sizes[catlas.root]
    print('containing {} kmers of {} total ({:.1f}%)'.format(
        node_kmers, total_kmers, node_kmers / total_kmers * 100))

    # now build cdbg -> subtree/group ID
    cdbg_to_group = {}
    for node in nodes:
        for cdbg_id in catlas.shadow([node]):
            # TODO remove cdbg vertices with no kmers
            # each cDBG node may belong to at most one group
            assert cdbg_id not in cdbg_to_group
            cdbg_to_group[cdbg_id] = node

    # record group info - here we are using the MinHash class to track
    # k-mer abundances in group_info, as well as using group_ident to
    # to track k=31 MinHashes for identification of each group.
    group_info = {
        node: sourmash_lib.MinHash(n=0, ksize=args.ksize, scaled=1,
                                   track_abundance=1)
        for node in nodes
    }
    group_ident = {
        node: sourmash_lib.MinHash(n=0, ksize=31, scaled=args.scaled)
        for node in nodes
    }

    # aaaaaand iterate over contigs, collecting abundances from all contigs
    # in a group.
    for record_n, record in enumerate(screed.open(contigs)):
        if record_n % 10000 == 0:
            print('...', record_n, end='\r')

        group_id = cdbg_to_group.get(int(record.name))

        # only contigs under a node that meets the size criteria are tracked
        if group_id is None:
            continue

        # keep/measure abundances! @CTB are actually doing anything abund?
        group_info[group_id].add_sequence(record.sequence, True)

        # update group idents.
        group_ident[group_id].add_sequence(record.sequence, True)

    # ok, now we have a pile of k-mer vectors of size 4**args.ksize;
    # output in numpy format.
    compute_matrix(group_info, group_ident, args.ksize, args.output)
def main(args=sys.argv[1:]):
    """Extract contigs under the lowest-ratio terminal catlas nodes.

    Steps, as performed below:

    1. Load the catlas DAG, the layer-1 -> cDBG mapping and the size
       decorations from ``catlas_prefix``.
    2. Collect "terminal" nodes: the highest nodes whose k-mer size is
       below ``--maxsize``, then keep those larger than ``--minsize``.
    3. Rank them by k-mers per shadow node (log2 ratio) and keep nodes from
       the low-ratio end until their k-mer total would exceed
       ``--keep-fraction`` of all k-mers.
    4. Write every contig in the shadow of the kept nodes to ``output``
       (FASTA) and a sourmash signature of those contigs to
       ``output + '.sig'``.
    """
    p = argparse.ArgumentParser()
    p.add_argument('catlas_prefix', help='catlas prefix')
    p.add_argument('output')
    p.add_argument('--minsize', type=float, default=100)
    p.add_argument('--maxsize', type=float, default=10000)
    p.add_argument('--keep-fraction', type=float, default=0.1)
    p.add_argument('-k', '--ksize', default=31, type=int,
                   help='k-mer size (default: 31)')
    args = p.parse_args(args)

    print('minsize: {:g}'.format(args.minsize))
    print('maxsize: {:g}'.format(args.maxsize))

    catlas = os.path.join(args.catlas_prefix, 'catlas.csv')
    domfile = os.path.join(args.catlas_prefix, 'first_doms.txt')

    # load catlas DAG
    top_node_id, dag, dag_up, dag_levels, cdbg_to_catlas = \
        search_utils.load_dag(catlas)
    print('loaded {} nodes from catlas {}'.format(len(dag), catlas))

    # load mapping between dom nodes and cDBG/graph nodes:
    layer1_to_cdbg = search_utils.load_layer1_to_cdbg(cdbg_to_catlas, domfile)
    print('loaded {} layer 1 catlas nodes'.format(len(layer1_to_cdbg)))

    # calculate the cDBG shadow sizes for each catlas node.
    print('decorating catlas with shadow size info.')
    node_shadow_sizes = search_utils.decorate_catlas_with_shadow_sizes(
        layer1_to_cdbg, dag, dag_levels)

    # ...and load cdbg node sizes
    print('loading contig size info')
    cdbg_kmer_sizes, cdbg_weighted_kmer_sizes = \
        search_utils.load_cdbg_size_info(args.catlas_prefix)

    # decorate catlas with cdbg node sizes underneath them
    print('decorating catlas with contig size info.')
    node_kmer_sizes, node_weighted_kmer_sizes = \
        search_utils.decorate_catlas_with_kmer_sizes(
            layer1_to_cdbg, dag, dag_levels, cdbg_kmer_sizes,
            cdbg_weighted_kmer_sizes)

    ### ok, the real work: look at articulation of cDBG graph.

    # find highest nodes with kmer size less than given max_size
    def find_terminal_nodes(node_id, max_size):
        """Return the set of highest descendants of `node_id` whose k-mer
        size is below `max_size`, recursing into children that are too big."""
        node_list = set()
        for sub_id in dag[node_id]:
            size = node_kmer_sizes[sub_id]

            if size < max_size:
                node_list.add(sub_id)
            else:
                children = find_terminal_nodes(sub_id, max_size)
                node_list.update(children)

        return node_list

    print('finding terminal nodes for {}.'.format(args.maxsize))

    terminal = find_terminal_nodes(top_node_id, args.maxsize)
    print('...got {}'.format(len(terminal)))
    terminal = {n for n in terminal if node_kmer_sizes[n] > args.minsize}
    print('...down to {} between {} and {} in size.'.format(
        len(terminal), args.minsize, args.maxsize))

    # now, go through and calculate ratios
    x = []
    for node_id in terminal:
        # calculate: how many k-mers per cDBG node?
        kmer_size = node_kmer_sizes[node_id]
        shadow_size = node_shadow_sizes[node_id]

        ratio = math.log(kmer_size, 2) - math.log(shadow_size, 2)

        # track basic info
        x.append((ratio, node_id, shadow_size, kmer_size))

    print('terminal node stats for maxsize: {:g}'.format(args.maxsize))
    print('n tnodes:', len(terminal))
    print('total k-mers:', node_kmer_sizes[top_node_id])

    # highest ratio first
    x.sort(reverse=True)
    for (k, v, a, b) in x[:10]:
        print('ratio: {:.3f}'.format(2**k), '/ shadow size:', a,
              '/ kmers:', b)
    print('... eliding {} nodes'.format(len(x) - 20))
    for (k, v, a, b) in x[-10:]:
        print('ratio: {:.3f}'.format(2**k), '/ shadow size:', a,
              '/ kmers:', b)

    # keep the last keep-fraction (default 10%) for examination
    keep_sum_kmer = args.keep_fraction * node_kmer_sizes[top_node_id]
    sofar = 0
    keep_terminal = set()
    for (k, v, a, b) in reversed(x):
        sofar += b
        if sofar > keep_sum_kmer:
            break
        keep_terminal.add(v)

    print(
        'keeping last {} k-mers worth of nodes for examination.'.format(sofar))

    # build cDBG shadow ID list.
    cdbg_shadow = set()
    terminal_shadow = find_shadow(keep_terminal, dag)
    for shadow_id in terminal_shadow:
        cdbg_shadow.update(layer1_to_cdbg.get(shadow_id))

    #### extract contigs
    print('extracting contigs & building a sourmash signature')
    contigs = os.path.join(args.catlas_prefix, 'contigs.fa.gz')

    # track results as signature
    contigs_mh = sourmash_lib.MinHash(n=0, ksize=args.ksize, scaled=1000)

    total_bp = 0
    total_seqs = 0

    # the context manager guarantees the FASTA output is flushed and closed
    # before we report it as written (it was previously left open)
    with open(args.output, 'wt') as outfp:
        for n, record in enumerate(screed.open(contigs)):
            # progress report; skipped when the shadow is empty, which would
            # otherwise divide by zero
            if n and n % 10000 == 0 and cdbg_shadow:
                offset_f = total_seqs / len(cdbg_shadow)
                print('...at n {} ({:.1f}% of shadow)'.format(
                    total_seqs, offset_f * 100), end='\r')

            # contig names == cDBG IDs
            contig_id = int(record.name)
            if contig_id not in cdbg_shadow:
                continue

            outfp.write('>{}\n{}\n'.format(record.name, record.sequence))

            # track retrieved sequences in a minhash
            contigs_mh.add_sequence(record.sequence)

            total_bp += len(record.sequence)
            total_seqs += 1

    # done - got all contigs!
    print('')
    print('fetched {} contigs, {} bp.'.format(total_seqs, total_bp))
    print('wrote contigs to {}'.format(args.output))

    with open(args.output + '.sig', 'wt') as fp:
        ss = sourmash_lib.SourmashSignature(contigs_mh)
        sourmash_lib.save_signatures([ss], fp)
### K = 21 import sys, screed import mmh3 import sourmash_lib print('imported sourmash:', sourmash_lib, file=sys.stderr) from sourmash_lib import MinHash import sourmash_lib.signature record = next(iter(screed.open(sys.argv[1]))) print('loaded', record.name, file=sys.stderr) revcomp = reverse(complement((record.sequence))) mh = sourmash_lib.MinHash(ksize=K, n=500, is_protein=False) # # compute the actual hashes to insert by breaking down the sequence # into k-mers and applying MurmurHash to each one; here, the only # interesting thing that is done by add_hash is to keep only the # (numerically) lowest n=500 hashes. # # this method of hash computation is exactly how sourmash does it # internally, and should be approximately the same as what mash does. # for fwd_kmer in kmers(record.sequence, K): rev_kmer = reverse(complement(fwd_kmer)) if fwd_kmer < rev_kmer: kmer = fwd_kmer
def _json_next_signature(iterable,
                         name=None,
                         filename=None,
                         ignore_md5sum=False,
                         prefix_item='abundances.item',
                         ijson=ijson):
    """Unpack and check exactly one signature block from a JSON event stream.

    - iterable: an iterable of ``(prefix, event, value)`` triples such as
      the one returned by ijson.parse()
    - name: if truthy, stored as the signature's ``name``
    - filename: if truthy, stored as the signature's ``filename``
    - ignore_md5sum: when False, the ``md5sum`` field of the block must
      equal the md5sum of the rebuilt signature, else an Exception is raised
    - prefix_item: passed on to the array reader; required when parsing
      nested JSON structures
    - ijson: ijson backend to use.

    Returns the rebuilt SourmashSignature.  Raises Exception on an unknown
    ``molecule`` value or an md5 mismatch.
    """
    from .signature import SourmashSignature

    # collect every key/value pair of the map into `fields`
    fields = {}
    prefix, event, value = next(iterable)
    if event == 'start_map':
        prefix, event, value = next(iterable)
    while event != 'end_map':
        key = value
        if key in ('mins', 'abundances'):
            # array-valued fields are consumed by the dedicated helper
            value = _json_next_atomic_array(iterable,
                                            prefix_item=prefix_item,
                                            ijson=ijson)
        else:
            prefix, event, value = next(iterable)
        fields[key] = value
        prefix, event, value = next(iterable)

    ksize = fields['ksize']
    mins = fields['mins']
    n = fields['num']
    if n == 0xffffffff:               # load legacy signatures where n == -1
        n = 0
    max_hash = fields.get('max_hash', 0)
    seed = fields.get('seed', sourmash_lib.DEFAULT_SEED)

    molecule = fields.get('molecule', 'DNA')
    is_protein = (molecule == 'protein')
    if not is_protein and molecule.upper() != 'DNA':
        raise Exception("unknown molecule type: {}".format(molecule))

    # abundance tracking is inferred from the presence of the field
    track_abundance = 'abundances' in fields

    e = sourmash_lib.MinHash(ksize=ksize, n=n, is_protein=is_protein,
                             track_abundance=track_abundance,
                             max_hash=max_hash, seed=seed)

    if track_abundance:
        abundances = [int(a) for a in fields['abundances']]
        e.set_abundances(dict(zip(mins, abundances)))
    else:
        for hashval in mins:
            e.add_hash(hashval)

    sig = SourmashSignature(e)

    if not ignore_md5sum and fields['md5sum'] != sig.md5sum():
        raise Exception('error loading - md5 of minhash does not match')

    if name:
        sig.d['name'] = name
    if filename:
        sig.d['filename'] = filename

    return sig
def test_name_3(track_abundance):
    """With both name and filename given, name() reports the name."""
    mh = sourmash_lib.MinHash(n=1, ksize=20, track_abundance=track_abundance)
    signature = SourmashSignature(mh, name='foo', filename='foo.txt')

    reported = signature.name()
    assert reported == 'foo'
def main(args=sys.argv[1:]):
    """Extract contigs under catlas nodes poorly covered by a query.

    Steps, as performed below:

    1. Load the catlas DAG, the layer-1 -> cDBG mapping, size decorations
       and the k-mer index from ``catlas_prefix``.
    2. Hash all k-mers of the ``query`` sequences and count, per cDBG node
       and per catlas node, how many of them match.
    3. Descend from the top node, keeping every node whose matched
       fraction is at or below ``--threshold``.
    4. Write per-node statistics to ``output + '.csv'``, optionally drop
       nodes with fewer than ``--minsize`` k-mers, then write the contigs
       in the shadow of the remaining nodes to ``output + '.fa'`` and
       their sourmash signature to ``output + '.sig'``.

    Exits with status -1 when the query yields no k-mers or none of them
    match.
    """
    p = argparse.ArgumentParser()
    p.add_argument('catlas_prefix', help='catlas prefix')
    p.add_argument('query')
    p.add_argument('output')
    p.add_argument('--threshold', default=0.0, type=float)
    p.add_argument('--minsize', default=0, type=int)
    p.add_argument('-k', '--ksize', default=31, type=int,
                   help='k-mer size (default: 31)')
    args = p.parse_args(args)

    print('threshold: {:.3f}'.format(args.threshold))

    catlas = os.path.join(args.catlas_prefix, 'catlas.csv')
    domfile = os.path.join(args.catlas_prefix, 'first_doms.txt')

    # load catlas DAG
    top_node_id, dag, dag_up, dag_levels, cdbg_to_catlas = \
        search_utils.load_dag(catlas)
    print('loaded {} nodes from catlas {}'.format(len(dag), catlas))

    # load mapping between dom nodes and cDBG/graph nodes:
    layer1_to_cdbg = search_utils.load_layer1_to_cdbg(cdbg_to_catlas, domfile)
    print('loaded {} layer 1 catlas nodes'.format(len(layer1_to_cdbg)))

    # calculate the cDBG shadow sizes for each catlas node.
    print('decorating catlas with shadow size info.')
    node_shadow_sizes = search_utils.decorate_catlas_with_shadow_sizes(
        layer1_to_cdbg, dag, dag_levels)

    # ...and load cdbg node sizes
    print('loading contig size info')
    cdbg_kmer_sizes, cdbg_weighted_kmer_sizes = \
        search_utils.load_cdbg_size_info(args.catlas_prefix)

    # decorate catlas with cdbg node sizes underneath them
    print('decorating catlas with contig size info.')
    node_kmer_sizes, node_weighted_kmer_sizes = \
        search_utils.decorate_catlas_with_kmer_sizes(
            layer1_to_cdbg, dag, dag_levels, cdbg_kmer_sizes,
            cdbg_weighted_kmer_sizes)

    # load k-mer index, query, etc. etc.
    kmer_idx = search_utils.load_kmer_index(args.catlas_prefix)

    bf = khmer.Nodetable(args.ksize, 1, 1)

    query_kmers = set()
    for record in screed.open(args.query):
        query_kmers.update(bf.get_kmer_hashes(record.sequence))

    print('got {} k-mers from {}'.format(len(query_kmers), args.query))

    # an empty query would otherwise divide by zero in the containment
    # calculation below; fail the same way as the "no matches" case
    if not query_kmers:
        print('no k-mers in query!?')
        sys.exit(-1)

    # construct dict cdbg_id -> # of query k-mers
    cdbg_match_counts = kmer_idx.get_match_counts(query_kmers)

    total_match_kmers = sum(cdbg_match_counts.values())
    f_found = total_match_kmers / len(query_kmers)
    print('=> containment: {:.1f}%'.format(f_found * 100))
    print('done loading & counting query k-mers in cDBG.')

    if total_match_kmers == 0:
        print('no match k-mers!?')
        sys.exit(-1)

    # calculate the cDBG matching k-mers sizes for each catlas node.
    catlas_match_counts = kmer_idx.build_catlas_match_counts(
        cdbg_match_counts, dag, dag_levels, layer1_to_cdbg)

    ### ok, the real work: find nodes that have low # of k-mers in the query.
    def find_unassembled_nodes(node_id, threshold=0.0):
        """Return the highest descendants of `node_id` whose fraction of
        query-matched k-mers is at or below `threshold`."""
        node_list = set()
        for sub_id in dag[node_id]:
            n_matched = catlas_match_counts.get(sub_id, 0)
            size = node_kmer_sizes[sub_id]
            f_assembled = n_matched / size

            # if the fraction of k-mers under this node that match the
            # query is at or below our threshold, KEEP the node.
            # Otherwise, descend into children.
            if f_assembled <= threshold:
                node_list.add(sub_id)
            else:
                children = find_unassembled_nodes(sub_id, threshold)
                node_list.update(children)

        return node_list

    print('finding unassembled nodes for threshold {}.'.format(args.threshold))

    terminal = find_unassembled_nodes(top_node_id, args.threshold)
    sum_kmers = sum(node_kmer_sizes[n] for n in terminal)
    print('...got {} nodes, representing {} k-mers'.format(
        len(terminal), sum_kmers))

    # now, go through all nodes and print out characteristics
    print('writing node info to {}'.format(args.output + '.csv'))
    # newline='' lets the csv module control line endings itself
    with open(args.output + '.csv', 'wt', newline='') as fp:
        w = csv.writer(fp)

        w.writerow(['node_id', 'contained', 'n_kmers', 'n_weighted_kmers',
                    'average_weight', 'shadow_size'])
        for n in terminal:
            f_contained = catlas_match_counts.get(n, 0) / node_kmer_sizes[n]
            w.writerow([n,
                        '{:.3f}'.format(f_contained),
                        node_kmer_sizes[n],
                        '{:.1f}'.format(node_weighted_kmer_sizes[n]),
                        '{:.2f}'.format(node_weighted_kmer_sizes[n] /
                                        node_kmer_sizes[n]),
                        node_shadow_sizes[n]])

    if args.minsize:
        print('minsize set: {}. filtering.'.format(args.minsize))
        new_terminal = {n for n in terminal
                        if node_kmer_sizes[n] >= args.minsize}
        print('removed {} nodes => {}'.format(
            len(terminal) - len(new_terminal), len(new_terminal)))
        terminal = new_terminal

    # build cDBG shadow ID list, tagged by parent catlas node.
    cdbg_id_to_node = {}
    for n in terminal:
        this_shadow = find_shadow([n], dag)
        for shadow_id in this_shadow:
            for cdbg_id in layer1_to_cdbg[shadow_id]:
                cdbg_id_to_node[cdbg_id] = n

    #### extract contigs
    print('extracting contigs & building a sourmash signature')
    contigs = os.path.join(args.catlas_prefix, 'contigs.fa.gz')

    # track results as signature
    contigs_mh = sourmash_lib.MinHash(n=0, ksize=args.ksize, scaled=1000)

    total_bp = 0
    total_seqs = 0

    print('writing contigs to {}'.format(args.output + '.fa'))
    # the context manager guarantees the FASTA output is flushed and closed
    # (it was previously left open)
    with open(args.output + '.fa', 'wt') as outfp:
        for n, record in enumerate(screed.open(contigs)):
            # progress report; skipped when nothing is selected, which
            # would otherwise divide by zero
            if n and n % 10000 == 0 and cdbg_id_to_node:
                offset_f = total_seqs / len(cdbg_id_to_node)
                print('...at n {} ({:.1f}% of shadow)'.format(
                    total_seqs, offset_f * 100), end='\r')

            # contig names == cDBG IDs
            contig_id = int(record.name)
            catlas_parent = cdbg_id_to_node.get(contig_id)
            if catlas_parent is None:
                continue

            outfp.write('>{} {}\n{}\n'.format(record.name, catlas_parent,
                                              record.sequence))

            # track retrieved sequences in a minhash
            contigs_mh.add_sequence(record.sequence)

            total_bp += len(record.sequence)
            total_seqs += 1

    # done - got all contigs!
    print('')
    print('fetched {} contigs, {} bp.'.format(total_seqs, total_bp))

    print('writing sig to {}'.format(args.output + '.sig'))
    with open(args.output + '.sig', 'wt') as fp:
        ss = sourmash_lib.SourmashSignature(contigs_mh)
        sourmash_lib.save_signatures([ss], fp)
def build_new_signature(mins):
    """Wrap the hashes in `mins` in a fresh signature.

    The MinHash is sized to ``len(mins)`` and uses ``query_ksize`` from the
    enclosing scope; the signature is created with an empty first argument.
    """
    sketch = sourmash_lib.MinHash(ksize=query_ksize, n=len(mins))
    sketch.add_many(mins)
    new_signature = sig.SourmashSignature('', sketch)
    return new_signature
def gather(args):
    """Iteratively find the best database matches to a query signature.

    ``args`` is the command-line argument list.  The query must have been
    computed with ``--scaled`` (non-zero ``max_hash``).  On each round the
    best match across all databases is found, its hashes are subtracted
    from the query, and the search repeats until nothing matches or the
    overlap drops below ``--threshold-bp``.

    Results are printed as they are found and optionally written to a CSV
    (``--output``), a signature file of the matches (``--save-matches``)
    and a signature of the leftover hashes (``--output-unassigned``).

    Exits with status -1 on unusable input and 0 when nothing was found.
    """
    from sourmash_lib.sbt import SBT, GraphFactory
    from sourmash_lib.sbtmh import search_minhashes, SigLeaf
    from sourmash_lib.sbtmh import SearchMinHashesFindBestIgnoreMaxHash

    parser = argparse.ArgumentParser()
    parser.add_argument('query', help='query signature')
    parser.add_argument('databases', help='signatures/SBTs to search',
                        nargs='+')
    parser.add_argument('-o', '--output', type=argparse.FileType('wt'),
                        help='output CSV containing matches to this file')
    parser.add_argument(
        '--save-matches', type=argparse.FileType('wt'),
        help='save the matched signatures from the database to this file.')
    parser.add_argument('--threshold-bp', type=float, default=5e4,
                        help='threshold (in bp) for reporting results')
    parser.add_argument(
        '--output-unassigned', type=argparse.FileType('wt'),
        help='output unassigned portions of the query as a signature to this file')
    parser.add_argument('--scaled', type=float,
                        help='downsample query to this scaled factor')
    parser.add_argument('-q', '--quiet', action='store_true',
                        help='suppress non-error output')

    sourmash_args.add_ksize_arg(parser, DEFAULT_LOAD_K)
    sourmash_args.add_moltype_args(parser)

    args = parser.parse_args(args)
    set_quiet(args.quiet)
    moltype = sourmash_args.calculate_moltype(args)

    # load the query signature & figure out all the things
    query = sourmash_args.load_query_signature(args.query,
                                               select_ksize=args.ksize,
                                               select_moltype=moltype)
    query_moltype = sourmash_args.get_moltype(query)
    query_ksize = query.minhash.ksize
    notify('loaded query: {}... (k={}, {})', query.name()[:30],
           query_ksize, query_moltype)

    # verify signature was computed right.
    if query.minhash.max_hash == 0:
        error('query signature needs to be created with --scaled')
        sys.exit(-1)

    # downsample if requested
    if args.scaled:
        notify('downsampling query from scaled={} to {}',
               query.minhash.scaled, int(args.scaled))
        query.minhash = query.minhash.downsample_scaled(args.scaled)

    # empty?
    if not query.minhash.get_mins():
        error('no query hashes!? exiting.')
        sys.exit(-1)

    # set up the search databases
    databases = sourmash_args.load_sbts_and_sigs(args.databases,
                                                 query_ksize, query_moltype)

    if not len(databases):
        error('Nothing found to search!')
        sys.exit(-1)

    orig_query = query
    orig_mins = orig_query.minhash.get_hashes()

    # calculate the band size/resolution R for the genome
    R_metagenome = sourmash_lib.MAX_HASH / float(orig_query.minhash.max_hash)

    # define a function to do a 'best' search and get only top match.
    def find_best(dblist, query):
        """Return (similarity, signature, filename) of the best match to
        `query` across `dblist`, or (None, None, None) if nothing matches."""
        results = []
        for (sbt_or_siglist, filename, is_sbt) in dblist:
            search_fn = SearchMinHashesFindBestIgnoreMaxHash().search

            # each result remembers the database file it came from, so the
            # filename reported is that of the best match rather than of
            # whichever database happened to be searched last
            if is_sbt:
                tree = sbt_or_siglist

                for leaf in tree.find(search_fn, query, 0.0):
                    leaf_e = leaf.data.minhash
                    similarity = query.minhash.similarity_ignore_maxhash(
                        leaf_e)
                    if similarity > 0.0:
                        results.append((similarity, leaf.data, filename))
            else:
                for ss in sbt_or_siglist:
                    similarity = query.minhash.similarity_ignore_maxhash(
                        ss.minhash)
                    if similarity > 0.0:
                        results.append((similarity, ss, filename))

        if not results:
            return None, None, None

        # take the best result
        results.sort(key=lambda x: -x[0])   # reverse sort on similarity
        best_similarity, best_leaf, best_filename = results[0]
        return best_similarity, best_leaf, best_filename

    # define a function to build new signature object from set of mins
    def build_new_signature(mins):
        e = sourmash_lib.MinHash(ksize=query_ksize, n=len(mins))
        e.add_many(mins)
        return sig.SourmashSignature('', e)

    # xxx
    def format_bp(bp):
        """Render a base-pair count with a bp/kbp/Mbp/Gbp unit."""
        bp = float(bp)
        if bp < 500:
            return '{:.0f} bp '.format(bp)
        elif bp <= 500e3:
            return '{:.1f} kbp'.format(round(bp / 1e3, 1))
        elif bp < 500e6:
            return '{:.1f} Mbp'.format(round(bp / 1e6, 1))
        elif bp < 500e9:
            return '{:.1f} Gbp'.format(round(bp / 1e9, 1))
        return '???'

    # construct a new query that doesn't have the max_hash attribute set.
    new_mins = query.minhash.get_hashes()
    query = build_new_signature(new_mins)

    sum_found = 0.
    found = []
    GatherResult = namedtuple(
        'GatherResult',
        'intersect_bp, f_orig_query, f_match, f_unique_to_query, filename, name, md5, leaf'
    )
    while 1:
        best_similarity, best_leaf, filename = find_best(databases, query)
        if not best_leaf:          # no matches at all!
            break

        # subtract found hashes from search hashes, construct new search
        query_mins = set(query.minhash.get_hashes())
        found_mins = best_leaf.minhash.get_hashes()

        # figure out what the resolution of the banding on the genome is,
        # based either on an explicit --scaled parameter, or on genome
        # cardinality (deprecated)
        if not best_leaf.minhash.max_hash:
            error('Best hash match in sbt_gather has no max_hash')
            error('Please prepare database of sequences with --scaled')
            sys.exit(-1)

        R_genome = best_leaf.minhash.scaled

        # pick the highest R / lowest resolution
        R_comparison = max(R_metagenome, R_genome)

        # CTB: these could probably be replaced by minhash.downsample_scaled.
        new_max_hash = sourmash_lib.MAX_HASH / float(R_comparison)
        query_mins = set([i for i in query_mins if i < new_max_hash])
        found_mins = set([i for i in found_mins if i < new_max_hash])
        orig_mins = set([i for i in orig_mins if i < new_max_hash])

        # calculate intersection:
        intersect_mins = query_mins.intersection(found_mins)
        intersect_orig_mins = orig_mins.intersection(found_mins)
        intersect_bp = R_comparison * len(intersect_orig_mins)
        sum_found += len(intersect_mins)

        if intersect_bp < args.threshold_bp:   # hard cutoff for now
            notify('found less than {} in common. => exiting',
                   format_bp(intersect_bp))
            break

        # calculate fractions wrt first denominator - genome size
        genome_n_mins = len(found_mins)
        f_match = len(intersect_mins) / float(genome_n_mins)
        f_orig_query = len(intersect_orig_mins) / float(len(orig_mins))

        # calculate fractions wrt second denominator - metagenome size
        query_n_mins = len(orig_query.minhash.get_hashes())
        f_unique_to_query = len(intersect_mins) / float(query_n_mins)

        if not len(found):                # first result? print header.
            print_results("")
            print_results("overlap p_query p_match ")
            print_results("--------- ------- --------")

        result = GatherResult(intersect_bp=intersect_bp,
                              f_orig_query=f_orig_query,
                              f_match=f_match,
                              f_unique_to_query=f_unique_to_query,
                              filename=filename,
                              md5=best_leaf.md5sum(),
                              name=best_leaf.name(),
                              leaf=best_leaf)

        # print interim result & save in a list for later use
        pct_query = '{:.1f}%'.format(result.f_orig_query * 100)
        pct_genome = '{:.1f}%'.format(result.f_match * 100)

        name = result.leaf._display_name(40)

        print_results('{:9} {:>6} {:>6} {}',
                      format_bp(result.intersect_bp), pct_query, pct_genome,
                      name)
        found.append(result)

        # construct a new query, minus the previous one.
        query_mins -= set(found_mins)
        query = build_new_signature(query_mins)

    # basic reporting
    print_results('\nfound {} matches total;', len(found))

    sum_found /= len(orig_query.minhash.get_hashes())
    print_results('the recovered matches hit {:.1f}% of the query',
                  sum_found * 100)
    print_results('')

    if not found:
        sys.exit(0)

    if args.output:
        fieldnames = ['intersect_bp', 'f_orig_query', 'f_match',
                      'f_unique_to_query', 'name', 'filename', 'md5']
        w = csv.DictWriter(args.output, fieldnames=fieldnames)
        w.writeheader()
        for result in found:
            d = dict(result._asdict())
            del d['leaf']                 # actual signature not in CSV.
            w.writerow(d)

    if args.save_matches:
        outname = args.save_matches.name
        notify('saving all matches to "{}"', outname)
        sig.save_signatures([r.leaf for r in found], args.save_matches)

    if args.output_unassigned:
        if not found:
            notify('nothing found - entire query signature unassigned.')
        if not query.minhash.get_mins():
            notify('no unassigned hashes! not saving.')
        else:
            outname = args.output_unassigned.name
            notify('saving unassigned hashes to "{}"', outname)

            # new_max_hash is the value computed in the last search round
            e = sourmash_lib.MinHash(ksize=query_ksize, n=0,
                                     max_hash=new_max_hash)
            e.add_many(query.minhash.get_mins())
            sig.save_signatures([sig.SourmashSignature('', e)],
                                args.output_unassigned)
def test_name_4(track_abundance):
    """Without a name or filename, name() is the first 8 md5sum characters."""
    mh = sourmash_lib.MinHash(n=1, ksize=20, track_abundance=track_abundance)
    signature = SourmashSignature(mh)

    expected = signature.md5sum()[:8]
    assert signature.name() == expected
def watch(args):
    """Build a signature from raw FASTA/FASTQ coming in on stdin, search.

    `args` is the list of command-line arguments for this subcommand: the
    name of an SBT to search and an optional input file (default
    '/dev/stdin').

    Records are streamed into a single MinHash.  Each time the number of
    hashed records reaches the next multiple of WATERMARK_SIZE, the SBT is
    searched with the signature built so far, and reading stops as soon as
    a search returns any result.  A final search is always run after the
    loop; the highest-similarity match is printed, or a "no matches"
    message if there is none.  With --output, the streamed signature is
    saved.
    """
    from sourmash_lib.sbt import SBT
    from sourmash_lib.sbtmh import SigLeaf
    from sourmash_lib.sbtmh import SearchMinHashesFindBest

    parser = argparse.ArgumentParser()
    parser.add_argument('sbt_name', help='name of SBT to search')
    parser.add_argument('inp_file', nargs='?', default='/dev/stdin')
    parser.add_argument('-q', '--quiet', action='store_true',
                        help='suppress non-error output')
    parser.add_argument('-o', '--output', type=argparse.FileType('wt'),
                        help='save signature generated from data here')
    parser.add_argument('--threshold', default=0.05, type=float,
                        help='minimum threshold for matches')
    parser.add_argument(
        '--input-is-protein', action='store_true',
        help='Consume protein sequences - no translation needed')
    sourmash_args.add_construct_moltype_args(parser)
    parser.add_argument(
        '-n', '--num-hashes', type=int, default=DEFAULT_N,
        help='number of hashes to use in each sketch (default: %(default)i)')
    parser.add_argument('--name', type=str, default='stdin',
                        help='name to use for generated signature')
    sourmash_args.add_ksize_arg(parser, DEFAULT_LOAD_K)

    args = parser.parse_args(args)
    set_quiet(args.quiet)

    if args.input_is_protein and args.dna:
        notify('WARNING: input is protein, turning off DNA hashing.')
        args.dna = False
        args.protein = True

    if args.dna and args.protein:
        # NOTE(review): this only reports the conflict; execution continues
        # and the DNA branch below is taken.  Confirm whether it should exit.
        notify('ERROR: cannot use "watch" with both DNA and protein.')

    if args.dna:
        moltype = 'DNA'
        is_protein = False
    else:
        moltype = 'protein'
        is_protein = True

    tree = SBT.load(args.sbt_name, leaf_loader=SigLeaf.load)

    def get_ksize(tree):
        """Walk nodes in `tree` to find out ksize"""
        # Returns None implicitly when the tree holds no SigLeaf node.
        for node in tree.nodes.values():
            if isinstance(node, SigLeaf):
                return node.data.minhash.ksize

    # deduce ksize from the SBT we are loading
    ksize = args.ksize
    if ksize is None:
        ksize = get_ksize(tree)

    E = sourmash_lib.MinHash(ksize=ksize, n=args.num_hashes,
                             is_protein=is_protein)
    streamsig = sig.SourmashSignature('', E, filename='stdin',
                                      name=args.name)

    notify('Computing signature for k={}, {} from stdin', ksize, moltype)

    def do_search():
        """Search the SBT with the signature built so far.

        Returns a list of (similarity, signature) tuples, one per leaf
        returned by tree.find().
        """
        search_fn = SearchMinHashesFindBest().search
        return [(streamsig.similarity(leaf.data), leaf.data)
                for leaf in tree.find(search_fn, streamsig, args.threshold)]

    notify('reading sequences from stdin')
    screed_iter = screed.open(args.inp_file)
    watermark = WATERMARK_SIZE

    # iterate over input records.  `n` counts the records hashed so far, so
    # it is correct both after an early break (current record not hashed)
    # and after the input is exhausted (every record hashed).
    n = 0
    for record in screed_iter:
        # at each watermark, print status & check for a match
        if n >= watermark:
            notify('\r... read {} sequences', n, end='')
            watermark += WATERMARK_SIZE

            if do_search():
                break

        if args.input_is_protein:
            E.add_protein(record.sequence)
        else:
            E.add_sequence(record.sequence, False)
        n += 1

    results = do_search()
    if not results:
        notify('... read {} sequences, no matches found.', n)
    else:
        results.sort(key=lambda x: -x[0])  # take best
        similarity, found_sig = results[0]
        print_results('FOUND: {}, at {:.3f}', found_sig.name(), similarity)

    if args.output:
        notify('saving signature to {}', args.output.name)
        sig.save_signatures([streamsig], args.output)
def main(args=sys.argv[1:]):
    """Save per-group k-mer abundance vectors for size-selected catlas nodes.

    Loads the catlas found under `catlas_prefix`, selects nodes with
    partition_catlas() using --maxsize and keeps those whose k-mer size is
    greater than --minsize, then accumulates k-mer abundances from
    'contigs.fa.gz' for every selected node.

    Writes three files:
      * `output`            - numpy matrix, one row per selected node and
                              4**ksize columns
      * `output`.node_ids   - pickled dict of node id -> matrix row index
      * `output`.node_mh    - pickled dict of node id -> k=31 MinHash
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('catlas_prefix', help='catlas prefix')
    parser.add_argument('output')
    parser.add_argument('--maxsize', type=float, default=10000)
    parser.add_argument('--minsize', type=float, default=2000)
    parser.add_argument('--min-abund', type=float, default=0)
    parser.add_argument('-k', '--ksize', default=5, type=int,
                        help='k-mer size for vectors')
    parser.add_argument('--scaled', type=int, default=1000)
    args = parser.parse_args(args)

    print('minsize: {:g}'.format(args.minsize))
    print('maxsize: {:g}'.format(args.maxsize))
    print('ksize: {}'.format(args.ksize))
    print('min_abund: {}'.format(args.min_abund))

    prefix = args.catlas_prefix
    basename = os.path.basename(prefix)  # not used below
    catlas = os.path.join(prefix, 'catlas.csv')
    domfile = os.path.join(prefix, 'first_doms.txt')

    # load catlas DAG
    dag_info = search_utils.load_dag(catlas)
    top_node_id, dag, dag_up, dag_levels, cdbg_to_catlas = dag_info
    print('loaded {} nodes from catlas {}'.format(len(dag), catlas))

    # load mapping between dom nodes and cDBG/graph nodes:
    layer1_to_cdbg = search_utils.load_layer1_to_cdbg(cdbg_to_catlas, domfile)
    print('loaded {} layer 1 catlas nodes'.format(len(layer1_to_cdbg)))

    # find the contigs filename
    contigs = os.path.join(prefix, 'contigs.fa.gz')

    # ...and catlas node sizes
    print('loading contig size info')
    cdbg_sizes = search_utils.load_cdbg_size_info(prefix,
                                                  min_abund=args.min_abund)
    cdbg_kmer_sizes, cdbg_weighted_kmer_sizes = cdbg_sizes
    node_sizes = search_utils.decorate_catlas_with_kmer_sizes(
        layer1_to_cdbg, dag, dag_levels, cdbg_kmer_sizes,
        cdbg_weighted_kmer_sizes)
    node_kmer_sizes, node_weighted_kmer_sizes = node_sizes

    ### everything is loaded!

    # find highest nodes with kmer size less than given max_size
    print('finding terminal nodes for {}.'.format(args.maxsize))
    candidates = partition_catlas(dag, top_node_id, node_kmer_sizes,
                                  args.maxsize)
    nodes = {node for node in candidates
             if node_kmer_sizes[node] > args.minsize}

    print('{} nodes between {} and {} in k-mer size'.format(
        len(nodes), args.minsize, args.maxsize))
    n_level1 = len(find_shadow(nodes, dag))
    print('containing {} level1 nodes of {} total'.format(
        n_level1, len(layer1_to_cdbg)))

    total_kmers = node_kmer_sizes[top_node_id]
    node_kmers = sum(node_kmer_sizes[node] for node in nodes)
    print('containing {} kmers of {} total ({:.1f}%)'.format(
        node_kmers, total_kmers, node_kmers / total_kmers * 100))

    ### now build cdbg -> subtree/group ID
    cdbg_to_group = {}
    for node in nodes:
        for level1_node in find_shadow([node], dag):
            for cdbg_id in layer1_to_cdbg[level1_node]:
                # skip cDBG nodes with no (or zero) recorded k-mer size
                if not cdbg_kmer_sizes.get(cdbg_id):
                    continue
                assert cdbg_id not in cdbg_to_group
                cdbg_to_group[cdbg_id] = node

    # record group info - here we are using the MinHash class to track
    # k-mer abundances in group_info, as well as using group_ident to
    # track k=31 MinHashes for identification of each group.
    group_info = {}
    group_ident = {}
    for node in nodes:
        group_info[node] = sourmash_lib.MinHash(n=0, ksize=args.ksize,
                                                scaled=1, track_abundance=1)
        group_ident[node] = sourmash_lib.MinHash(n=0, ksize=31,
                                                 scaled=args.scaled)

    # iterate over contigs, collecting abundances from all contigs
    # in a group.
    for record_n, record in enumerate(screed.open(contigs)):
        if record_n % 10000 == 0:
            print('...', record_n, end='\r')

        group_id = cdbg_to_group.get(int(record.name))
        if group_id is None:
            # not under a node that meets the size criteria
            continue

        # keep/measure abundances!
        group_info[group_id].add_sequence(record.sequence, True)

        # update group idents.
        group_ident[group_id].add_sequence(record.sequence, True)

    # ok, now we have a pile of k-mer vectors of size 4**args.ksize;
    # output in numpy format.

    # first, make a consistently ordered list of all k-mers, converted
    # into (deduplicated, sorted) hashes.
    all_kmer_hashes = sorted({hash_murmur(kmer)
                              for kmer in make_all(args.ksize)})

    # now, build a matrix of GROUP_N rows x 4**ksize columns, where each
    # row will be the set of k-mer abundances associated with each group.
    n_groups = len(group_info)
    n_columns = 4**args.ksize
    print('creating', n_groups, n_columns)
    V = numpy.zeros((n_groups, n_columns), dtype=numpy.uint16)

    node_id_to_group_idx = {}
    for row, (node, mh) in enumerate(group_info.items()):
        if row % 1000 == 0:
            print('...', row, len(group_info))

        abundances = dict(mh.get_mins(with_abundance=True))
        V[row] = numpy.array([abundances.get(hashval, 0)
                              for hashval in all_kmer_hashes])
        node_id_to_group_idx[node] = row

    # save!
    print('saving matrix of size {} to {}'.format(str(V.shape), args.output))
    with open(args.output, 'wb') as fp:
        numpy.save(fp, V)

    with open(args.output + '.node_ids', 'wb') as fp:
        pickle.dump(node_id_to_group_idx, fp)

    with open(args.output + '.node_mh', 'wb') as fp:
        pickle.dump(group_ident, fp)
def gather(args):
    """Report the database matches that gather_databases() finds for a query.

    `args` is the list of command-line arguments for this subcommand: a
    query signature followed by one or more signatures/SBTs to search.

    The query must have been created with --scaled (max_hash != 0) and may
    be downsampled further with --scaled.  Each result yielded by
    gather_databases() is printed as it arrives, followed by a summary
    line.  If nothing is found the process exits with status 0 after the
    summary.  Otherwise, depending on options, the matches are written as
    CSV (--output), the matching signatures are saved (--save-matches),
    and the remaining unassigned hashes are saved as a signature
    (--output-unassigned).

    Exits with status -1 when the query is unusable or no databases load.
    """
    from .search import gather_databases, format_bp

    parser = argparse.ArgumentParser()
    parser.add_argument('query', help='query signature')
    parser.add_argument('databases', help='signatures/SBTs to search',
                        nargs='+')
    parser.add_argument('--traverse-directory', action='store_true',
                        help='search all signatures underneath directories.')
    parser.add_argument('-o', '--output', type=argparse.FileType('wt'),
                        help='output CSV containing matches to this file')
    parser.add_argument(
        '--save-matches', type=argparse.FileType('wt'),
        help='save the matched signatures from the database to this file.')
    parser.add_argument('--threshold-bp', type=float, default=5e4,
                        help='threshold (in bp) for reporting results')
    parser.add_argument(
        '--output-unassigned', type=argparse.FileType('wt'),
        help='output unassigned portions of the query as a signature to this file')
    parser.add_argument('--scaled', type=float, default=0,
                        help='downsample query to this scaled factor')
    parser.add_argument('-q', '--quiet', action='store_true',
                        help='suppress non-error output')
    parser.add_argument('--ignore-abundance', action='store_true',
                        help='do NOT use k-mer abundances if present')
    parser.add_argument('-d', '--debug', action='store_true')

    sourmash_args.add_ksize_arg(parser, DEFAULT_LOAD_K)
    sourmash_args.add_moltype_args(parser)

    args = parser.parse_args(args)
    set_quiet(args.quiet)
    moltype = sourmash_args.calculate_moltype(args)

    # load the query signature & figure out all the things
    query = sourmash_args.load_query_signature(args.query,
                                               ksize=args.ksize,
                                               select_moltype=moltype)
    notify('loaded query: {}... (k={}, {})', query.name()[:30],
           query.minhash.ksize, sourmash_args.get_moltype(query))

    # verify signature was computed right.
    if query.minhash.max_hash == 0:
        error('query signature needs to be created with --scaled')
        sys.exit(-1)

    # downsample if requested
    if args.scaled:
        notify('downsampling query from scaled={} to {}',
               query.minhash.scaled, int(args.scaled))
        query.minhash = query.minhash.downsample_scaled(args.scaled)

    # empty?
    if not query.minhash.get_mins():
        error('no query hashes!? exiting.')
        sys.exit(-1)

    # set up the search databases
    databases = sourmash_args.load_sbts_and_sigs(args.databases, query,
                                                 False,
                                                 args.traverse_directory)

    if not databases:
        error('Nothing found to search!')
        sys.exit(-1)

    found = []
    # Default for the case where gather_databases() yields nothing: treat
    # the whole query as missed so the summary below reports 0.0% instead
    # of failing on an unbound name.
    weighted_missed = 1
    for result, weighted_missed, new_max_hash, next_query in \
            gather_databases(query, databases, args.threshold_bp,
                             args.ignore_abundance):
        if not found:
            # first result? print header.
            print_results("")
            print_results("overlap p_query p_match ")
            print_results("--------- ------- --------")

        # print interim result & save in a list for later use
        pct_query = '{:.1f}%'.format(result.f_unique_weighted * 100)
        pct_genome = '{:.1f}%'.format(result.f_match * 100)
        name = result.leaf._display_name(40)

        print_results('{:9} {:>6} {:>6} {}',
                      format_bp(result.intersect_bp), pct_query, pct_genome,
                      name)
        found.append(result)

    # basic reporting
    print_results('\nfound {} matches total;', len(found))
    print_results('the recovered matches hit {:.1f}% of the query',
                  (1 - weighted_missed) * 100)
    print_results('')

    if not found:
        sys.exit(0)

    # From here on at least one result exists, so new_max_hash and
    # next_query are bound to the values from the last iteration.

    if args.output:
        fieldnames = ['intersect_bp', 'f_orig_query', 'f_match',
                      'f_unique_to_query', 'f_unique_weighted',
                      'average_abund', 'name', 'filename', 'md5']
        w = csv.DictWriter(args.output, fieldnames=fieldnames)
        w.writeheader()
        for result in found:
            d = dict(result._asdict())
            del d['leaf']  # actual signature not in CSV.
            w.writerow(d)

    if args.save_matches:
        outname = args.save_matches.name
        notify('saving all matches to "{}"', outname)
        sig.save_signatures([r.leaf for r in found], args.save_matches)

    if args.output_unassigned:
        # Check the remaining (post-gather) query: that is what gets saved.
        if not next_query.minhash.get_mins():
            notify('no unassigned hashes! not saving.')
        else:
            outname = args.output_unassigned.name
            notify('saving unassigned hashes to "{}"', outname)

            e = sourmash_lib.MinHash(ksize=query.minhash.ksize, n=0,
                                     max_hash=new_max_hash)
            e.add_many(next_query.minhash.get_mins())
            sig.save_signatures([sig.SourmashSignature(e)],
                                args.output_unassigned)