def run(self, input_tree, rd_thresholds, output_dir):
    """Calculate number of taxa for specified relative divergence thresholds.

    The input tree is rerooted on each phylum-level lineage in turn using
    the external `genometreetk outgroup` command. For every rooting, nodes
    where the relative divergence crosses a rank threshold are labelled
    with that rank, and the number of such rank labels below the root and
    below each named node is tabulated.

    Files written:
      <output_dir>/taxonomy.tsv
      <output_dir>/<phylum>/rerooted.tree
      <output_dir>/<phylum>/rd_ranks.tree
      <output_dir>/<phylum>/rd_ranks.tsv
      <output_dir>/mean_rd_ranks.tsv

    Parameters
    ----------
    input_tree : str
        Name of input tree.
    rd_thresholds : d[rank] -> threshold
        Relative divergence threshold for defining taxonomic ranks.
    output_dir : str
        Desired output directory.

    Raises
    ------
    subprocess.CalledProcessError
        If `genometreetk outgroup` exits with a non-zero status.
    """

    # imported locally so this method does not rely on a file-level import
    import subprocess

    # get list of phyla level lineages
    tree = dendropy.Tree.get_from_path(input_tree,
                                       schema='newick',
                                       rooting='force-rooted',
                                       preserve_underscores=True)
    phyla = get_phyla_lineages(tree)
    self.logger.info('Identified %d phyla for rooting.' % len(phyla))

    self.logger.info('Reading taxonomy from tree.')
    taxonomy_file = os.path.join(output_dir, 'taxonomy.tsv')
    taxonomy = Taxonomy().read_from_tree(input_tree)
    Taxonomy().write(taxonomy, taxonomy_file)

    rd = RelativeDistance()
    overall_ranks_below_taxon = defaultdict(lambda: defaultdict(list))
    for p in phyla:
        phylum_children = Taxonomy().children(p, taxonomy)
        phylum = p.replace('p__', '')
        self.logger.info('Calculating information with rooting on %s.' % phylum)

        phylum_dir = os.path.join(output_dir, phylum)
        if not os.path.exists(phylum_dir):
            os.makedirs(phylum_dir)

        # An argument list (no shell) keeps paths and phylum names that
        # contain spaces or shell metacharacters intact; check_call stops
        # here on failure instead of reading a missing or stale tree below.
        output_tree = os.path.join(phylum_dir, 'rerooted.tree')
        subprocess.check_call(['genometreetk', 'outgroup',
                               input_tree, taxonomy_file, p, output_tree])

        # calculate relative distance for all nodes
        cur_tree = dendropy.Tree.get_from_path(output_tree,
                                               schema='newick',
                                               rooting='force-rooted',
                                               preserve_underscores=True)
        rd.decorate_rel_dist(cur_tree)

        # determine ranks; the root of the *rerooted* tree is skipped since
        # it has no parent node to compare against
        for node in cur_tree.postorder_node_iter(
                lambda nd: nd != cur_tree.seed_node):
            ranks = []
            for rank_prefix, threshold in rd_thresholds.items():
                # a rank is assigned where the threshold is crossed between
                # the parent node and this node
                if (node.rel_dist >= threshold
                        and node.parent_node.rel_dist < threshold):
                    ranks.append(rank_prefix.capitalize() + '__')

            if ranks:
                annotation = '|%s [rd=%.2f]' % (';'.join(ranks), node.rel_dist)
                if not node.label:
                    node.label = annotation
                else:
                    node.label += annotation

        cur_tree.write_to_path(os.path.join(phylum_dir, 'rd_ranks.tree'),
                               schema='newick',
                               suppress_rooting=True,
                               unquoted_underscores=True)

        # determine number of ranks below root and all named nodes
        ranks_below_taxon = defaultdict(lambda: defaultdict(int))
        for cur_node in cur_tree.postorder_node_iter():
            if cur_node == cur_tree.seed_node:
                cur_taxon = 'root'
            elif cur_node.label:
                _support, cur_taxon, _auxiliary_info = parse_label(cur_node.label)
                if not cur_taxon or cur_taxon.strip() == '':
                    continue
            else:
                continue

            for desc in cur_node.postorder_iter():
                if not desc.label:
                    continue

                _support, _taxon, auxiliary_info = parse_label(desc.label)
                if auxiliary_info:
                    # drop the trailing '[rd=...]' portion, leaving the
                    # ';'-separated rank prefixes
                    rank_str = auxiliary_info[0:auxiliary_info.rfind('[')]
                    for r in [r.strip() for r in rank_str.split(';')]:
                        ranks_below_taxon[cur_taxon][r] += 1

        for taxon in ranks_below_taxon:
            if taxon == p or taxon in phylum_children:
                # do not record results for named groups in the lineage
                # used for rooting
                continue

            for rank, count in ranks_below_taxon[taxon].items():
                overall_ranks_below_taxon[taxon][rank].append(count)

        results_table = os.path.join(phylum_dir, 'rd_ranks.tsv')
        self.write_rank_count(ranks_below_taxon, results_table)

    results_table = os.path.join(output_dir, 'mean_rd_ranks.tsv')
    self.write_rank_count(overall_ranks_below_taxon, results_table)
def run(self, input_tree, rd_thresholds, output_dir):
    """Calculate number of taxa for specified relative divergence thresholds.

    The input tree is rerooted on each phylum-level lineage in turn using
    the external `genometreetk outgroup` command. For every rooting, nodes
    where the relative divergence crosses a rank threshold are labelled
    with that rank, and the number of such rank labels below the root and
    below each named node is tabulated.

    Files written:
      <output_dir>/taxonomy.tsv
      <output_dir>/<phylum>/rerooted.tree
      <output_dir>/<phylum>/rd_ranks.tree
      <output_dir>/<phylum>/rd_ranks.tsv
      <output_dir>/mean_rd_ranks.tsv

    Parameters
    ----------
    input_tree : str
        Name of input tree.
    rd_thresholds : d[rank] -> threshold
        Relative divergence threshold for defining taxonomic ranks.
    output_dir : str
        Desired output directory.

    Raises
    ------
    subprocess.CalledProcessError
        If `genometreetk outgroup` exits with a non-zero status.
    """

    # imported locally so this method does not rely on a file-level import
    import subprocess

    # get list of phyla level lineages
    tree = TreeNode.read(input_tree, convert_underscores=False)
    phyla = get_phyla_lineages(tree)
    self.logger.info('Identified %d phyla for rooting.' % len(phyla))

    self.logger.info('Reading taxonomy from tree.')
    taxonomy_file = os.path.join(output_dir, 'taxonomy.tsv')
    taxonomy = Taxonomy().read_from_tree(input_tree)
    Taxonomy().write(taxonomy, taxonomy_file)

    rd = RelativeDistance()
    overall_ranks_below_taxon = defaultdict(lambda: defaultdict(list))
    for p in phyla:
        phylum_children = Taxonomy().children(p, taxonomy)
        phylum = p.replace('p__', '')
        self.logger.info('Calculating information with rooting on %s.' % phylum)

        phylum_dir = os.path.join(output_dir, phylum)
        if not os.path.exists(phylum_dir):
            os.makedirs(phylum_dir)

        # An argument list (no shell) keeps paths and phylum names that
        # contain spaces or shell metacharacters intact; check_call stops
        # here on failure instead of reading a missing or stale tree below.
        output_tree = os.path.join(phylum_dir, 'rerooted.tree')
        subprocess.check_call(['genometreetk', 'outgroup',
                               input_tree, taxonomy_file, p, output_tree])

        # calculate relative distance for all nodes
        cur_tree = dendropy.Tree.get_from_path(output_tree,
                                               schema='newick',
                                               rooting='force-rooted',
                                               preserve_underscores=True)
        rd.decorate_rel_dist(cur_tree)

        # determine ranks; the root of the *rerooted* tree is skipped since
        # it has no parent node to compare against
        for node in cur_tree.postorder_node_iter(
                lambda nd: nd != cur_tree.seed_node):
            ranks = []
            # .items() works on both Python 2 and Python 3
            for rank_prefix, threshold in rd_thresholds.items():
                # a rank is assigned where the threshold is crossed between
                # the parent node and this node
                if (node.rel_dist >= threshold
                        and node.parent_node.rel_dist < threshold):
                    ranks.append(rank_prefix.capitalize() + '__')

            if ranks:
                annotation = '|%s [rd=%.2f]' % (';'.join(ranks), node.rel_dist)
                if not node.label:
                    node.label = annotation
                else:
                    node.label += annotation

        cur_tree.write_to_path(os.path.join(phylum_dir, 'rd_ranks.tree'),
                               schema='newick',
                               suppress_rooting=True,
                               unquoted_underscores=True)

        # determine number of ranks below root and all named nodes
        ranks_below_taxon = defaultdict(lambda: defaultdict(int))
        for cur_node in cur_tree.postorder_node_iter():
            if cur_node == cur_tree.seed_node:
                cur_taxon = 'root'
            elif cur_node.label:
                _support, cur_taxon, _auxiliary_info = parse_label(cur_node.label)
                if not cur_taxon or cur_taxon.strip() == '':
                    continue
            else:
                continue

            for desc in cur_node.postorder_iter():
                if not desc.label:
                    continue

                _support, _taxon, auxiliary_info = parse_label(desc.label)
                if auxiliary_info:
                    # drop the trailing '[rd=...]' portion, leaving the
                    # ';'-separated rank prefixes
                    rank_str = auxiliary_info[0:auxiliary_info.rfind('[')]
                    for r in [r.strip() for r in rank_str.split(';')]:
                        ranks_below_taxon[cur_taxon][r] += 1

        for taxon in ranks_below_taxon:
            if taxon == p or taxon in phylum_children:
                # do not record results for named groups in the lineage
                # used for rooting
                continue

            for rank, count in ranks_below_taxon[taxon].items():
                overall_ranks_below_taxon[taxon][rank].append(count)

        results_table = os.path.join(phylum_dir, 'rd_ranks.tsv')
        self.write_rank_count(ranks_below_taxon, results_table)

    results_table = os.path.join(output_dir, 'mean_rd_ranks.tsv')
    self.write_rank_count(overall_ranks_below_taxon, results_table)
def median_rd_over_phyla(self, tree, taxa_for_dist_inference, taxonomy):
    """Calculate the median relative divergence over all phyla rootings.

    The tree is rooted on each usable phylum in turn and the relative
    divergence values for that rooting are collected. This method only
    gathers the per-rooting values; no median is computed here.

    Every node of `tree` is given an integer `id` attribute (preorder
    index) as a side effect.

    Parameters
    ----------
    tree : Tree
        Dendropy tree.
    taxa_for_dist_inference : set
        Taxa to use for inference relative divergence distributions.
    taxonomy : d[taxon_id] -> [d__, p__, ..., s__]
        Taxonomy of extant taxa.

    Returns
    -------
    d[phylum] -> rel_dists
        Result of `rel_dist_to_named_clades` for each rooting, with key 0
        and all named groups in the outgroup removed. Keys are the phylum
        name without 'p__', spaces replaced by underscores, lower-cased.
    d[node_id] -> list
        Relative divergence of each ingroup node, one entry per rooting
        in which the node fell in the ingroup.
    """

    # get list of phyla level lineages
    all_phyla = get_phyla_lineages(tree)
    self.logger.info('Identified %d phyla.' % len(all_phyla))

    phyla = [p for p in all_phyla if p in taxa_for_dist_inference]
    self.logger.info('Using %d phyla as rootings for inferring distributions.' % len(phyla))
    if len(phyla) < 2:
        self.logger.error('Rescaling requires at least 2 valid phyla.')
        sys.exit(-1)

    # give each node a unique id
    for i, n in enumerate(tree.preorder_node_iter()):
        n.id = i

    # calculate relative divergence for tree rooted on each phylum
    phylum_rel_dists = {}
    rel_node_dists = defaultdict(list)
    rd = RelativeDistance()
    for p in phyla:
        phylum = p.replace('p__', '').replace(' ', '_').lower()
        self.logger.info('Calculating information with rooting on %s.' % phylum.capitalize())

        cur_tree = self.root_with_outgroup(tree, taxonomy, p)

        # calculate relative distance to taxa
        rel_dists = rd.rel_dist_to_named_clades(cur_tree)
        rel_dists.pop(0, None)  # remove results for Domain

        # remove named groups in outgroup (the phylum itself and every
        # taxon below it) from each rank in a single pass
        outgroup_taxa = [p]
        outgroup_taxa.extend(Taxonomy().children(p, taxonomy))
        for taxa_at_rank in rel_dists.values():
            for t in outgroup_taxa:
                taxa_at_rank.pop(t, None)

        phylum_rel_dists[phylum] = rel_dists

        # calculate relative distance to all nodes
        rd.decorate_rel_dist(cur_tree)

        # determine which lineage represents the 'ingroup': the first child
        # of the root that is not labelled with the outgroup phylum
        ingroup_subtree = None
        for c in cur_tree.seed_node.child_node_iter():
            _support, taxon_name, _auxiliary_info = parse_label(c.label)
            if not taxon_name or p not in taxon_name:
                ingroup_subtree = c
                break

        if ingroup_subtree is None:
            # fail with a clear message rather than an AttributeError on None
            self.logger.error('Unable to identify ingroup when rooting on %s.' % phylum.capitalize())
            sys.exit(-1)

        # do a preorder traversal of 'ingroup' and record relative divergence to nodes
        for n in ingroup_subtree.preorder_iter():
            rel_node_dists[n.id].append(n.rel_dist)

    return phylum_rel_dists, rel_node_dists
def median_rd_over_phyla(self, tree, taxa_for_dist_inference, taxonomy):
    """Calculate the median relative divergence over all phyla rootings.

    The tree is rooted on each usable phylum in turn and the relative
    divergence values for that rooting are collected. This method only
    gathers the per-rooting values; no median is computed here.

    Every node of `tree` is given an integer `id` attribute (preorder
    index) as a side effect.

    Parameters
    ----------
    tree : Tree
        Dendropy tree.
    taxa_for_dist_inference : set
        Taxa to use for inference relative divergence distributions.
    taxonomy : d[taxon_id] -> [d__, p__, ..., s__]
        Taxonomy of extant taxa.

    Returns
    -------
    d[phylum] -> rel_dists
        Result of `rel_dist_to_named_clades` for each rooting, with key 0
        and all named groups in the outgroup removed. Keys are the phylum
        name without 'p__', spaces replaced by underscores, lower-cased.
    d[node_id] -> list
        Relative divergence of each ingroup node, one entry per rooting
        in which the node fell in the ingroup.
    """

    # get list of phyla level lineages
    all_phyla = get_phyla_lineages(tree)
    self.logger.info('Identified %d phyla.' % len(all_phyla))

    phyla = [p for p in all_phyla if p in taxa_for_dist_inference]
    self.logger.info('Using %d phyla as rootings for inferring distributions.' % len(phyla))
    if len(phyla) < 2:
        self.logger.error('Rescaling requires at least 2 valid phyla.')
        sys.exit(-1)

    # give each node a unique id
    for i, n in enumerate(tree.preorder_node_iter()):
        n.id = i

    # calculate relative divergence for tree rooted on each phylum
    phylum_rel_dists = {}
    rel_node_dists = defaultdict(list)
    rd = RelativeDistance()
    for p in phyla:
        phylum = p.replace('p__', '').replace(' ', '_').lower()
        self.logger.info('Calculating information with rooting on %s.' % phylum.capitalize())

        cur_tree = self.root_with_outgroup(tree, taxonomy, p)

        # calculate relative distance to taxa
        rel_dists = rd.rel_dist_to_named_clades(cur_tree)
        rel_dists.pop(0, None)  # remove results for Domain

        # remove named groups in outgroup (the phylum itself and every
        # taxon below it) from each rank in a single pass
        outgroup_taxa = [p]
        outgroup_taxa.extend(Taxonomy().children(p, taxonomy))
        for taxa_at_rank in rel_dists.values():
            for t in outgroup_taxa:
                taxa_at_rank.pop(t, None)

        phylum_rel_dists[phylum] = rel_dists

        # calculate relative distance to all nodes
        rd.decorate_rel_dist(cur_tree)

        # determine which lineage represents the 'ingroup': the first child
        # of the root that is not labelled with the outgroup phylum
        ingroup_subtree = None
        for c in cur_tree.seed_node.child_node_iter():
            _support, taxon_name, _auxiliary_info = parse_label(c.label)
            if not taxon_name or p not in taxon_name:
                ingroup_subtree = c
                break

        if ingroup_subtree is None:
            # fail with a clear message rather than an AttributeError on None
            self.logger.error('Unable to identify ingroup when rooting on %s.' % phylum.capitalize())
            sys.exit(-1)

        # do a preorder traversal of 'ingroup' and record relative divergence to nodes
        for n in ingroup_subtree.preorder_iter():
            rel_node_dists[n.id].append(n.rel_dist)

    return phylum_rel_dists, rel_node_dists