def _find_signatures(self, minhash, threshold, containment=False,
                     ignore_scaled=False):
    """
    Do a Jaccard similarity or containment search, yield results.

    This is essentially a fast implementation of find that collects all
    the signatures with overlapping hash values. Note that similarity
    searches (containment=False) will not be returned in sorted order.
    """
    # make sure we're looking at the same scaled value as database
    if self.scaled > minhash.scaled:
        minhash = minhash.downsample_scaled(self.scaled)
    elif self.scaled < minhash.scaled and not ignore_scaled:
        # note that containment can be calculated w/o matching scaled.
        raise ValueError(
            "lca db scaled is {} vs query {}; must downsample".format(
                self.scaled, minhash.scaled))

    query_mins = set(minhash.get_mins())

    # collect matching hashes for the query:
    c = Counter()
    for hashval in query_mins:
        idx_list = self.hashval_to_idx.get(hashval, [])
        for idx in idx_list:
            c[idx] += 1

    debug('number of matching signatures for hashes: {}', len(c))

    # for each match, in order of largest overlap,
    for idx, count in c.most_common():
        # pull in the hashes. This reconstructs & caches all input
        # minhashes, which is kinda memory intensive...!
        # NOTE: one future low-mem optimization could be to support doing
        # this piecemeal by iterating across all the hashes, instead.
        match_sig = self._signatures[idx]
        match_mh = match_sig.minhash
        match_size = len(match_mh)

        # calculate the containment or similarity
        if containment:
            score = count / len(query_mins)
        else:
            # query_mins is size of query signature
            # match_size is size of match signature
            # count is overlap
            score = count / (len(query_mins) + match_size - count)

        # ...and return.
        if score >= threshold:
            yield score, match_sig, self.filename
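# --- usage sketch (illustrative, not part of the original file) ---
# _find_signatures() yields (score, match_sig, filename) tuples and backs
# both similarity and containment queries: with containment=False the score
# is Jaccard, overlap / (|query| + |match| - overlap); with containment=True
# it is overlap / |query|. Here `db` is assumed to be an already-loaded LCA
# database and `query_sig` an already-loaded query signature.

def best_jaccard_match(db, query_sig, threshold=0.1):
    "Return the highest-scoring (score, match_sig, filename) above threshold."
    best = None
    for score, match_sig, filename in db._find_signatures(query_sig.minhash,
                                                          threshold):
        if best is None or score > best[0]:
            best = (score, match_sig, filename)
    return best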
def _signatures(self):
    "Create a _signatures member dictionary that contains {idx: sigobj}."
    from sourmash import MinHash, SourmashSignature

    is_protein = False
    is_hp = False
    is_dayhoff = False
    if self.moltype == 'protein':
        is_protein = True
    elif self.moltype == 'hp':
        is_hp = True
    elif self.moltype == 'dayhoff':
        is_dayhoff = True

    minhash = MinHash(n=0, ksize=self.ksize, scaled=self.scaled,
                      is_protein=is_protein, hp=is_hp, dayhoff=is_dayhoff)

    debug('creating signatures for LCA DB...')
    mhd = defaultdict(minhash.copy_and_clear)
    temp_vals = defaultdict(list)

    # invert the hashval_to_idx dictionary
    for (hashval, idlist) in self.hashval_to_idx.items():
        for idx in idlist:
            temp_hashes = temp_vals[idx]
            temp_hashes.append(hashval)

            # 50 is an arbitrary number. If you really want to
            # micro-optimize, the list is resized and grows in this pattern:
            # 0, 4, 8, 16, 25, 35, 46, 58, 72, 88, ...
            # (from https://github.com/python/cpython/blob/b2b4a51f7463a0392456f7772f33223e57fa4ccc/Objects/listobject.c#L57)
            if len(temp_hashes) > 50:
                mhd[idx].add_many(temp_hashes)

                # Sigh, python 2... when it goes away,
                # we can do `temp_hashes.clear()` instead.
                del temp_vals[idx]

    # loop over temp_vals again to add any remaining hashes
    # (each remaining list of hashes has at most 50 items)
    for sig, vals in temp_vals.items():
        mhd[sig].add_many(vals)

    sigd = {}
    for idx, mh in mhd.items():
        ident = self.idx_to_ident[idx]
        name = self.ident_to_name[ident]
        sigd[idx] = SourmashSignature(mh, name=name)

    debug('=> {} signatures!', len(sigd))
    return sigd
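# --- illustrative sketch of the buffered-inversion pattern used above ---
# _signatures() inverts {hashval: [idx, ...]} into per-signature MinHashes,
# buffering hash values and flushing them ~50 at a time via add_many()
# rather than adding one hash per call. The same pattern in plain Python,
# with the `flush` callable standing in for mhd[idx].add_many; these names
# are hypothetical, not sourmash API.

from collections import defaultdict

def invert_batched(hashval_to_idx, flush, batch_size=50):
    "Invert {hashval: [idx, ...]}, calling flush(idx, hashvals) in batches."
    pending = defaultdict(list)
    for hashval, idx_list in hashval_to_idx.items():
        for idx in idx_list:
            pending[idx].append(hashval)
            if len(pending[idx]) > batch_size:
                flush(idx, pending[idx])
                del pending[idx]

    # flush whatever is left (each remaining list has at most batch_size items)
    for idx, hashvals in pending.items():
        flush(idx, hashvals)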
def make_lca_counts(dblist, lowest_rank='phylum', min_num=0, min_hashes=5,
                    prefix='oddities'):
    """
    Collect counts of all the LCAs in the list of databases.
    """
    assert len(dblist) == 1

    keep_ranks = ['root']
    for rank in lca_utils.taxlist():
        keep_ranks.append(rank)
        if rank == lowest_rank:
            break
    print('keeping hashvals at following ranks:', keep_ranks)
    print('min number of lineages:', min_num)
    print('min number of shared hashes:', min_hashes)
    print('---')

    # gather all hashvalue assignments from across all the databases
    assignments = defaultdict(set)
    for lca_db in dblist:
        for hashval, idx_list in lca_db.hashval_to_idx.items():
            if min_num and len(idx_list) < min_num:
                continue

            for idx in idx_list:
                lid = lca_db.idx_to_lid.get(idx)
                if lid is not None:
                    lineage = lca_db.lid_to_lineage[lid]
                    assignments[hashval].add(lineage)

    # now convert to trees -> do LCA & counts
    counts = defaultdict(int)
    mixdict = defaultdict(set)
    for hashval, lineages in assignments.items():
        # for each list of tuple_info [(rank, name), ...] build
        # a tree that lets us discover the lowest common ancestor.
        debug("{}", lineages)
        tree = lca_utils.build_tree(lineages)

        # now find either a leaf or the first node with multiple
        # children; that's our lowest-common-ancestor node.
        lca, reason = lca_utils.find_lca(tree)

        # find cross-superkingdom hashes, and record combinations of lineages
        # that have them.
        rank = 'root'
        if lca:
            rank = lca[-1].rank

        if rank in keep_ranks:
            xx = []
            for lineage in lineages:
                xx.append(tuple(lineage))
            xx = tuple(xx)

            mixdict[xx].add(hashval)

        counts[lca] += 1

    # sort combinations of lineages by the number of confused hashvals.
    mixdict_items = list(mixdict.items())
    mixdict_items.sort(key=lambda x: -len(x[1]))

    confused_hashvals = set()

    fp = open(prefix + '.csv', 'wt')
    w = csv.writer(fp)
    w.writerow(['cluster', 'num_lineages', 'shared_kmers', 'ksize', 'rank',
                'lca', 'ident1', 'lineage1', 'ident2', 'lineage2'])

    #
    # find candidate lineages, then evaluate pairwise intersections.
    #

    for cluster_n, (lineages, hashvals) in enumerate(mixdict_items):
        # insist on at least min_hashes shared hashvals
        if len(hashvals) < min_hashes:
            continue

        # display summary:
        print('cluster {} has {} assignments for {} hashvals / {} bp'.format(
            cluster_n, len(lineages), len(hashvals),
            dblist[0].scaled * len(hashvals)))
        confused_hashvals.update(hashvals)

        tree = lca_utils.build_tree(lineages)
        lca, reason = lca_utils.find_lca(tree)
        if lca:
            rank = lca[-1].rank
        else:
            rank = 'root'
        print(' rank & lca:', rank, lca_utils.display_lineage(lca))

        # for lineage_n, lineage in enumerate(lineages):
        #    print('* ', lca_utils.display_lineage(lineage))

        # now, identify all members of these lineages by their index:
        all_idxs = []
        for lineage_n, lineage in enumerate(lineages):
            lids = dblist[0].lineage_to_lids[lineage]
            for lid in lids:
                idxs = dblist[0].lid_to_idx[lid]
                all_idxs.extend(idxs)
                for idx in idxs:
                    ident = dblist[0].idx_to_ident[idx]

        # run through and look at all pairs of genomes in these lineages;
        # filter so that we're comparing across lineages with the right
        # LCA, and with significant intersection.
        pair_n = 0
        candidates = []
        for i in range(len(all_idxs)):
            idx1 = all_idxs[i]
            lid1 = dblist[0].idx_to_lid[idx1]
            lin1 = dblist[0].lid_to_lineage[lid1]
            for j in range(i):
                idx2 = all_idxs[j]
                lid2 = dblist[0].idx_to_lid[idx2]
                lin2 = dblist[0].lid_to_lineage[lid2]

                ident1 = dblist[0].idx_to_ident[idx1]
                ident2 = dblist[0].idx_to_ident[idx2]

                debug("{} x {}", ident1, ident2)

                this_tree = lca_utils.build_tree([lin1, lin2])
                this_lca, this_reason = lca_utils.find_lca(this_tree)

                # weed out pairs that don't have the desired LCA...
                if lca != this_lca:
                    continue

                mh1 = dblist[0]._signatures[idx1]
                mh2 = dblist[0]._signatures[idx2]
                mins1 = set(mh1.get_mins())
                mins2 = set(mh2.get_mins())
                intersect_size = len(mins1.intersection(mins2))

                # ...and weed out pairs that don't have enough k-mer
                # intersection.
                if intersect_size < min_hashes:
                    continue

                candidates.append((pair_n, ident1, lin1, ident2, lin2,
                                   intersect_size))

                # write summary to CSV for find-oddities-examine.py to use
                w.writerow(['cluster{}.{}'.format(cluster_n, pair_n),
                            len(lineages),
                            intersect_size * dblist[0].scaled,
                            dblist[0].ksize,
                            rank,
                            lca_utils.display_lineage(lca),
                            ident1, lca_utils.display_lineage(lin1),
                            ident2, lca_utils.display_lineage(lin2)])

                pair_n += 1

        print(' Candidate genome pairs for these lineages:')
        for pair_n, ident1, lin1, ident2, lin2, intersection_size in candidates:
            print(' cluster.pair {}.{} share {} bases'.format(
                cluster_n, pair_n, intersection_size * dblist[0].scaled))
            print(' - {} ({})'.format(ident1, lca_utils.display_lineage(lin1)))
            print(' - {} ({})'.format(ident2, lca_utils.display_lineage(lin2)))
            print('')

        print('')

    return counts, confused_hashvals
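# --- usage sketch (illustrative, not part of the original script) ---
# make_lca_counts() expects a single-element list of LCA databases and, as a
# side effect, writes '<prefix>.csv' for find-oddities-examine.py. This
# hypothetical driver assumes `db` is an already-loaded LCA database object.

def report_oddities(db, rank='superkingdom'):
    "Summarize cross-rank hash sharing for one LCA database."
    counts, confused_hashvals = make_lca_counts([db], lowest_rank=rank,
                                                min_num=2, min_hashes=10,
                                                prefix='oddities')
    print('{} LCA assignments total; {} confused hashvals'.format(
        sum(counts.values()), len(confused_hashvals)))
    return counts, confused_hashvals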
def index(args):
    """
    main function for building an LCA database.
    """
    if args.start_column < 2:
        error('error, --start-column cannot be less than 2')
        sys.exit(-1)

    set_quiet(args.quiet, args.debug)

    args.scaled = int(args.scaled)

    if args.ksize is None:
        args.ksize = DEFAULT_LOAD_K

    moltype = sourmash_args.calculate_moltype(args, default='DNA')

    notify('Building LCA database with ksize={} scaled={} moltype={}.',
           args.ksize, args.scaled, moltype)

    # first, load taxonomy spreadsheet
    delimiter = ','
    if args.tabs:
        delimiter = '\t'
    assignments, num_rows = load_taxonomy_assignments(args.csv,
                                          delimiter=delimiter,
                                          start_column=args.start_column,
                                          use_headers=not args.no_headers,
                                          force=args.force)

    notify('{} distinct identities in spreadsheet out of {} rows.',
           len(assignments), num_rows)
    notify('{} distinct lineages in spreadsheet out of {} rows.',
           len(set(assignments.values())), num_rows)

    db = LCA_Database(args.ksize, args.scaled, moltype)

    # notify('finding signatures...')
    if args.traverse_directory:
        yield_all_files = False           # only pick up *.sig files?
        if args.force:
            yield_all_files = True
        inp_files = list(sourmash_args.traverse_find_sigs(args.signatures,
                                          yield_all_files=yield_all_files))
    else:
        inp_files = list(args.signatures)

    # track duplicates
    md5_to_name = {}

    #
    # main loop, connecting lineage ID to signature.
    #

    n = 0
    total_n = len(inp_files)
    record_duplicates = set()
    record_no_lineage = set()
    record_remnants = set(assignments)
    record_used_lineages = set()
    record_used_idents = set()
    n_skipped = 0
    for filename in inp_files:
        n += 1
        for sig in load_signatures(filename, ksize=args.ksize,
                                   select_moltype=moltype):
            notify(u'\r\033[K', end=u'')
            notify('\r... loading signature {} ({} of {}); skipped {} so far',
                   sig.name()[:30], n, total_n, n_skipped, end='')
            debug(filename, sig.name())

            # block off duplicates.
            if sig.md5sum() in md5_to_name:
                debug('WARNING: in file {}, duplicate md5sum: {}; skipping',
                      filename, sig.md5sum())
                record_duplicates.add(filename)
                continue

            md5_to_name[sig.md5sum()] = sig.name()

            # parse identifier, potentially with splitting
            ident = sig.name()
            if args.split_identifiers:  # hack for NCBI-style names, etc.
                # split on space...
                ident = ident.split(' ')[0]
                # ...and on period.
                ident = ident.split('.')[0]

            lineage = assignments.get(ident)

            # punt if no lineage and --require-taxonomy
            if lineage is None and args.require_taxonomy:
                debug('(skipping, because --require-taxonomy was specified)')
                n_skipped += 1
                continue

            # add the signature into the database.
            db.insert(sig, ident=ident, lineage=lineage)

            if lineage:
                # remove from our list of remaining ident -> lineage
                record_remnants.remove(ident)

                # track ident as used
                record_used_idents.add(ident)
                record_used_lineages.add(lineage)

            # track lineage info - either no lineage, or this lineage used.
            else:
                debug('WARNING: no lineage assignment for {}.', ident)
                record_no_lineage.add(ident)

    # end main add signatures loop

    if n_skipped:
        notify('... loaded {} signatures; skipped {} because of --require-taxonomy.',
               total_n, n_skipped)
    else:
        notify('... loaded {} signatures.', total_n)

    # check -- did we find any signatures?
    if n == 0:
        error('ERROR: no signatures found. ??')
        if args.traverse_directory and not args.force:
            error('(note, with --traverse-directory, you may want to use -f)')
        sys.exit(1)

    # check -- did the signatures we found have any hashes?
    if not db.hashval_to_idx:
        error('ERROR: no hash values found - are there any signatures?')
        sys.exit(1)

    notify('loaded {} hashes at ksize={} scaled={}', len(db.hashval_to_idx),
           args.ksize, args.scaled)

    # summarize:
    notify('{} assigned lineages out of {} distinct lineages in spreadsheet.',
           len(record_used_lineages), len(set(assignments.values())))
    unused_lineages = set(assignments.values()) - record_used_lineages

    notify('{} identifiers used out of {} distinct identifiers in spreadsheet.',
           len(record_used_idents), len(set(assignments)))
    assert record_used_idents.issubset(set(assignments))
    unused_identifiers = set(assignments) - record_used_idents

    # now, save!
    db_outfile = args.lca_db_out
    if not (db_outfile.endswith('.lca.json') or
            db_outfile.endswith('.lca.json.gz')):      # logic -> db.save
        db_outfile += '.lca.json'
    notify('saving to LCA DB: {}'.format(db_outfile))

    db.save(db_outfile)

    ## done!

    # output a record of stuff if requested/available:
    if record_duplicates or record_no_lineage or record_remnants or unused_lineages:
        if record_duplicates:
            notify('WARNING: {} duplicate signatures.', len(record_duplicates))
        if record_no_lineage:
            notify('WARNING: no lineage provided for {} signatures.',
                   len(record_no_lineage))
        if record_remnants:
            notify('WARNING: no signatures for {} spreadsheet rows.',
                   len(record_remnants))
        if unused_lineages:
            notify('WARNING: {} unused lineages.', len(unused_lineages))

        if unused_identifiers:
            notify('WARNING: {} unused identifiers.', len(unused_identifiers))

        if args.report:
            notify("generating a report and saving in '{}'", args.report)
            generate_report(record_duplicates,
                            record_no_lineage,
                            record_remnants,
                            unused_lineages,
                            unused_identifiers, args.report)
        else:
            notify('(You can use --report to generate a detailed report.)')
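# --- usage sketch (illustrative; exact option names depend on your sourmash
# version, so treat the flags below as assumptions) ---
# index() backs the `sourmash lca index` command; the args.* attributes used
# above correspond to command-line options roughly like this:
#
#     sourmash lca index taxonomy.csv out.lca.json sigs/*.sig \
#         --ksize 31 --scaled 10000 \
#         --split-identifiers --require-taxonomy --report report.csv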