def lca_star(self, taxonomy_list, min_tree_depth=3, majority_threshold=0.51):
    """Find the LCA within a list of taxonomies after filtering those
    taxonomies by tree depth. One can also vary what constitutes a majority
    consensus for the counts, with the default being 51%.

    Args:
        taxonomy_list (list): list of taxonomy names or IDs
        min_tree_depth (int): the minimum allowable tree depth of taxon to be
            considered within the taxonomy list; those found sooner in the
            tree will be filtered out of consideration
        majority_threshold (float): 0-1; the fraction of taxonomy counts which
            constitutes a majority; a lower fraction will classify with less
            confidence deeper in the tree while a higher threshold will
            classify with more confidence higher in the tree

    Returns:
        dict of 'taxonomy' and 'pvalue'. When every entry is removed by the
        depth filter the result is ``{"taxonomy": "1", "pvalue": 1.0}``.

    Example:
        >>> tree = Tree("ref/ncbi_taxonomy_tree.txt")
        >>> taxonomy_list = ['gamma subgroup', 'RNA similarity group I', 'purple photosynthetic bacteria and relatives', 'not Bacteria Haeckel 1894', 'purple photosynthetic bacteria and relatives', 'gamma subgroup', 'gamma subgroup', 'purple photosynthetic bacteria and relatives', 'purple photosynthetic bacteria and relatives']
        >>> tree.lca_star(taxonomy_list)
        {'pvalue': 0.012791848981090311, 'taxonomy': '1224'}

    """
    # tree depth based filter
    taxonomy_list = self.filter_taxonomy_list(taxonomy_list, min_tree_depth)

    # all have been filtered; fall back to taxonomy "1"
    # (presumably the root node ID -- confirm against the tree file)
    if not taxonomy_list:
        return {"taxonomy": "1", "pvalue": 1.}

    taxonomy_counts = Counter(taxonomy_list)
    majority_cutoff = len(taxonomy_list) * majority_threshold

    # Only the single most frequent entry is needed, so request exactly one
    # and unpack it once instead of sorting the full counter twice.
    top_taxonomy, top_count = taxonomy_counts.most_common(1)[0]

    if top_count > majority_cutoff:
        # majority based on existing taxonomy counts alone
        majority = top_taxonomy
        p = nettleton_pvalue(taxonomy_list, majority)
    else:
        # create majority from lca
        majority, lineages = self.lca_majority(taxonomy_list, majority_cutoff)
        aggregate_counts = self.counts_to_majority_list(
            taxonomy_counts, lineages, majority)
        p = nettleton_pvalue(aggregate_counts, majority)

    return {"taxonomy": majority, "pvalue": p}
def process_orfs_with_tree(orf_assignments, tree, output, aggregation_method,
                           majority_threshold=0.51, table_name="refseq"):
    """Assign a consensus taxonomy to each contig and write one row per ORF.

    A tab-separated header is written first, followed by one line for every
    ORF of every contig. Each line repeats the contig-level lineage and its
    error value next to the ORF's own annotation.

    Args:
        orf_assignments (dict): contig name -> dict of ORF index -> 4-item
            sequence unpacked as (function, taxonomy ID, bitscore, evalue)
        tree (Tree): taxonomic tree object
        output (filehandle): writable handle receiving the table
        aggregation_method (str): "lca-majority" or "lca"; any other value is
            handled as a simple majority vote
        majority_threshold (float): majority fraction passed to
            ``tree.lca_star`` for the "lca-majority" method
        table_name (str): prefix for the product, evalue and bitscore column
            names in the header
    """
    header = (
        "contig",
        "orf",
        "taxonomy",
        "erfc",
        "orf_taxonomy",
        "%s_product" % table_name,
        "%s_evalue" % table_name,
        "%s_bitscore" % table_name,
    )
    print(*header, sep="\t", file=output)

    for contig, orfs in orf_assignments.items():
        # second field of every ORF record is its taxonomy ID
        tax_ids = [record[1] for record in orfs.values()]

        if aggregation_method == "lca-majority":
            consensus = tree.lca_star(tax_ids, majority_threshold=majority_threshold)
            contig_taxonomy = consensus["taxonomy"]
            error_function = consensus["pvalue"]
        else:
            if aggregation_method == "lca":
                # TODO incorporate threshold into LCAs?
                contig_taxonomy = tree.lca(tax_ids)
            else:
                # simple majority
                contig_taxonomy = BlastHits(tax_ids).majority()
            error_function = nettleton_pvalue(tax_ids, contig_taxonomy)

        # Collect the named ranks along the lineage, keyed by one letter.
        ranks = {}
        for tax_id in tree.taxonomic_lineage(contig_taxonomy):
            node = tree.tree[tax_id]
            level = node.tax_level
            # does not account for "no rank" and some other cases of
            # "unclassified"
            if level not in TAX_LEVELS:
                continue
            rank_key = "k" if level == "superkingdom" else level[0]
            ranks[rank_key] = node.taxonomy
        lineage = validate_lineage(ranks)

        for idx in sorted(orfs):
            orf_function, orf_tax_id, bitscore, evalue = orfs[idx]
            orf_taxonomy = tree.tree[orf_tax_id].taxonomy
            row = (
                contig,
                "%s_%s" % (contig, idx),
                lineage,
                error_function,
                orf_taxonomy,
                orf_function,
                evalue,
                bitscore,
            )
            print(*row, sep="\t", file=output)