def classify_signature(query_sig, dblist, threshold):
    """
    Assign a lineage to 'query_sig' from the given list of databases.

    A lineage is only considered if at least 'threshold' hash values
    support it.

    Returns (lineage, status).  'status' is one of 'nomatch', 'found',
    or 'disagree'.  For 'nomatch' the lineage is an empty list; otherwise
    it is whatever lca_utils.find_lca() reports for the tree built here.

    The work happens in two stages:

    * first, collect the lineage assignments for each hash value in the
      query.  (Tools such as kraken do this during database preparation;
      here it is done dynamically on every call.)

    * then count, across all hash values, how often each lineage shows
      up, discard the low-abundance ones (below 'threshold'), and take
      the LCA of what remains.
    """
    # stage 1: per-hashval assignments, pooled over every database
    hashvals = query_sig.minhash.get_mins()
    assignments = lca_utils.gather_assignments(hashvals, dblist)

    # stage 2: reduce each hashval to its LCA and count occurrences
    counts = lca_utils.count_lca_for_assignments(assignments)
    ranked = counts.most_common()
    debug(ranked)

    # Build one tree out of the "significant" LCAs only.  'ranked' is
    # ordered by decreasing count, so the first entry under the threshold
    # means every later entry is under it too.
    tree = {}
    for lineage, n_hits in ranked:
        if n_hits < threshold:
            break
        lca_utils.build_tree([lineage], tree)

    # nothing made it past the threshold
    if not tree:
        return [], 'nomatch'

    # lowest common ancestor of everything that survived the filter
    lca, reason = lca_utils.find_lca(tree)
    reached_leaf = (reason == 0)
    if reached_leaf:
        debug('END', lca)
    else:
        # stopped at an internal node => the lineages disagree
        debug('MULTI', lca)
    status = 'found' if reached_leaf else 'disagree'

    debug('lineage is:', lca)
    return lca, status
def test_build_tree_4():
    """Building into an existing tree adds a sibling under the shared root."""
    first = [LineagePair('rank1', 'name1'), LineagePair('rank2', 'name2a')]
    second = [LineagePair('rank1', 'name1'), LineagePair('rank2', 'name2b')]

    # the second call receives the tree produced by the first one
    tree = build_tree([first])
    tree = build_tree([second], tree)

    expected = {
        LineagePair('rank1', 'name1'): {
            LineagePair('rank2', 'name2a'): {},
            LineagePair('rank2', 'name2b'): {},
        },
    }
    assert tree == expected
def make_lca_counts(dblist):
    """
    Count how many hash values map to each LCA across the given databases.

    Returns a defaultdict(int) keyed by the LCA reported by
    lca_utils.find_lca(), with the number of hash values as the value.

    NOTE (CTB): this could usefully be converted to a generator function.
    """
    # pool the lineages of every hash value over all the databases
    lineages_by_hashval = defaultdict(set)
    for db in dblist:
        for hashval, lineage_ids in db.hashval_to_lineage_id.items():
            lineages_by_hashval[hashval].update(
                db.lineage_dict[lid] for lid in lineage_ids)

    # reduce each hash value's lineages to a single LCA and tally it
    lca_counts = defaultdict(int)
    for lineage_set in lineages_by_hashval.values():
        debug(lineage_set)

        # build a tree from the lineages, then walk down to either a leaf
        # or the first node with several children: that node is the LCA.
        tree = lca_utils.build_tree(lineage_set)
        lca, _reason = lca_utils.find_lca(tree)
        lca_counts[lca] += 1

    return lca_counts
def test_find_lca_2():
    """find_lca stops at the shared parent when two lineages diverge."""
    lineages = [
        [LineagePair('rank1', 'name1'), LineagePair('rank2', 'name2a')],
        [LineagePair('rank1', 'name1'), LineagePair('rank2', 'name2b')],
    ]
    tree = build_tree(lineages)

    result = find_lca(tree)

    # only the common 'rank1' prefix is returned, with a second element of 2
    expected = ((LineagePair('rank1', 'name1'),), 2)
    assert result == expected
def test_build_tree():
    """A single two-rank lineage becomes a two-level nested dict."""
    lineage = [LineagePair('rank1', 'name1'), LineagePair('rank2', 'name2')]

    tree = build_tree([lineage])

    expected = {
        LineagePair('rank1', 'name1'): {
            LineagePair('rank2', 'name2'): {},
        },
    }
    assert tree == expected
def test_find_lca():
    """find_lca on a single-lineage tree returns the whole lineage and 0."""
    lineage = [LineagePair('rank1', 'name1'), LineagePair('rank2', 'name2')]
    tree = build_tree([lineage])

    result = find_lca(tree)

    expected_lineage = (
        LineagePair('rank1', 'name1'),
        LineagePair('rank2', 'name2'),
    )
    assert result == (expected_lineage, 0)
def test_build_tree_5():
    """build_tree rejects an empty list of lineages with ValueError."""
    no_lineages = []
    with pytest.raises(ValueError):
        build_tree(no_lineages)
def test_build_tree_3():
    """A lineage entry whose name is empty is left out of the tree."""
    # 'rank2' carries an empty name
    lineage = [LineagePair('rank1', 'name1'), LineagePair('rank2', '')]

    tree = build_tree([lineage])

    expected = {LineagePair('rank1', 'name1'): {}}
    assert tree == expected
def main():
    """
    Report the lineage of frequently observed hash values.

    Loads a hashval reverse index ('revindex') and one or more LCA
    databases ('db'), then:

    * counts, for each hash value in the reverse index, how many of its
      recorded abundances are >= --abundance-threshold;
    * for every hash value whose count is >= --sample-threshold, pools
      its lineage assignments from all databases, computes their LCA and
      prints it to stderr; hash values with no assignment are tallied as
      unknown under their count;
    * prints a summary to stderr and a CSV distribution of the unknowns
      ('commonality,n,sum_n') to stdout.

    Command-line arguments are read from sys.argv.
    """
    p = argparse.ArgumentParser()
    p.add_argument('-k', '--ksize', default=DEFAULT_KSIZE, type=int)
    p.add_argument('--sample-threshold', default=DEFAULT_SAMPLE_THRESHOLD,
                   type=int)
    p.add_argument('--abundance-threshold', default=DEFAULT_ABUND_THRESHOLD,
                   type=int)
    p.add_argument('revindex')
    p.add_argument('db', nargs='+')
    args = p.parse_args()

    idx = revindex_utils.HashvalRevindex(args.revindex)

    # the loader also returns ksize and scaled; neither is used below
    lca_db_list, _ksize, _scaled = lca_utils.load_databases(args.db, SCALED)

    # per hash value: number of abundances at or above the threshold
    sample_counts = collections.Counter()
    for hashval, abunds in idx.hashval_to_abunds.items():
        sample_counts[hashval] += sum(
            1 for abund in abunds if abund >= args.abundance_threshold)

    total = 0
    found = 0
    unknown = collections.defaultdict(int)
    for hashval, count in sample_counts.most_common():
        # most_common() is sorted by decreasing count, so once we drop
        # below --sample-threshold nothing later can qualify.
        if count < args.sample_threshold:
            break
        total += 1

        # pool this hash value's lineages from every database
        lca_set = set()
        for lca_db in lca_db_list:
            lca_set.update(lca_db.get_lineage_assignments(hashval))

        if not lca_set:
            unknown[count] += 1
            continue

        # build a tree from the lineages, then find either a leaf or the
        # first node with multiple children: that is the LCA node.
        tree = lca_utils.build_tree(lca_set)
        lca, _reason = lca_utils.find_lca(tree)

        print('hash {}, in {} samples; lineage: {}'.format(
            hashval, count, ";".join(lca_utils.zip_lineage(lca))),
            file=sys.stderr)
        found += 1

    # guard against total == 0 (no hash value reached the threshold),
    # which would otherwise raise ZeroDivisionError.
    percent_found = found / total * 100 if total else 0.0
    print('found {} of {} ({:.2f}%)'.format(found, total, percent_found),
          file=sys.stderr)
    print('outputting distribution of unknowns', file=sys.stderr)

    # cumulative distribution of unknown hash values, by increasing count
    print('commonality,n,sum_n')
    sofar = 0
    for commonality, n_unknown in sorted(unknown.items()):
        sofar += n_unknown
        print('{},{},{}'.format(commonality, n_unknown, sofar))
def compare_csv(args):
    """
    Compare a classify-style taxonomy spreadsheet with a custom one.

    'args' is the list of command-line arguments to parse.  Both
    spreadsheets are loaded with load_taxonomy_assignments(); identifiers
    present in only one of them are reported and then ignored.  For every
    shared identifier whose lineages differ, the two lineages are merged
    into one tree and the result of lca_utils.find_lca() decides whether
    they are reported as 'compatible' (reason == 0) or 'incompatible'.
    Summary counts are sent through notify(), per-identifier results
    through print_results().

    Exits with status -1 if --start-column is less than 2.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('csv1', help='taxonomy spreadsheet output by classify')
    parser.add_argument('csv2', help='custom taxonomy spreadsheet')
    parser.add_argument('-d', '--debug', action='store_true')
    parser.add_argument('-C', '--start-column', default=2, type=int,
                        help='column at which taxonomic assignments start')
    parser.add_argument('--tabs', action='store_true',
                        help='input spreadsheet is tab-delimited (default: commas)')
    parser.add_argument('--no-headers', action='store_true',
                        help='no headers present in taxonomy spreadsheet')
    parser.add_argument('-f', '--force', action='store_true')
    args = parser.parse_args(args)

    if args.start_column < 2:
        error('error, --start-column cannot be less than 2')
        sys.exit(-1)

    if args.debug:
        set_debug(args.debug)

    # --- load the classify-style spreadsheet (lineage starts at column 3)
    notify('loading classify output from: {}', args.csv1)
    assignments0, num_rows0 = load_taxonomy_assignments(args.csv1,
                                                        start_column=3)
    distinct0 = set(assignments0.values())
    notify('loaded {} distinct lineages, {} rows', len(distinct0), num_rows0)
    notify('----')

    # --- load the custom taxonomy spreadsheet
    delimiter = '\t' if args.tabs else ','

    notify('loading custom spreadsheet from: {}', args.csv2)
    assignments, num_rows = load_taxonomy_assignments(
        args.csv2,
        delimiter=delimiter,
        start_column=args.start_column,
        use_headers=not args.no_headers,
        force=args.force)
    distinct = set(assignments.values())
    notify('loaded {} distinct lineages, {} rows', len(distinct), num_rows)

    # --- identifiers that appear in only one of the two spreadsheets
    ids_classify = set(assignments0.keys())
    ids_custom = set(assignments.keys())
    missing_1 = ids_classify - ids_custom
    missing_2 = ids_custom - ids_classify

    if missing_2:
        notify('missing {} assignments in classify spreadsheet.',
               len(missing_2))
    if missing_1:
        notify('missing {} assignments in custom spreadsheet.',
               len(missing_1))
    if not (missing_1 or missing_2):
        notify('note: all IDs are in both spreadsheets!')
    else:
        notify('(these will not be evaluated any further)')

    # --- compare lineages for the identifiers both spreadsheets share
    common = set(assignments0.keys())
    common.intersection_update(assignments.keys())

    n_total = len(common)
    n_different = 0
    n_compat = 0
    n_incompat = 0
    incompat_rank = defaultdict(int)

    for ident in common:
        lineage0 = assignments0[ident]
        lineage1 = assignments[ident]
        if lineage0 == lineage1:
            continue
        n_different += 1

        # put both lineages into a single tree and look for their LCA
        tree = lca_utils.build_tree([lineage0])
        lca_utils.build_tree([lineage1], tree)
        lca, reason = lca_utils.find_lca(tree)

        if reason == 0:
            # compatible lineages
            n_compat += 1
            print_results("{},compatible,{}", ident,
                          ";".join(zip_lineage(lca)))
            continue

        n_incompat += 1
        print_results("{},incompatible,{}", ident,
                      ";".join(zip_lineage(lca)))

        # file the disagreement under the rank of the last LCA entry; an
        # empty LCA falls back to the first rank yielded by taxlist().
        first_rank = next(iter(lca_utils.taxlist()))
        rank = lca[-1].rank if lca else first_rank
        incompat_rank[rank] += 1

    notify("{} total assignments, {} differ between spreadsheets.",
           n_total, n_different)
    notify("{} are compatible (one lineage is ancestor of another.",
           n_compat)
    notify("{} are incompatible (there is a disagreement in the trees).",
           n_incompat)

    if n_incompat:
        for rank in lca_utils.taxlist():
            notify('{} incompatible at rank {}', incompat_rank[rank], rank)