def _shared_support(self, tree1, tree2, min_support, max_depth):
    """Count supported internal splits that occur in both trees.

    Parameters
    ----------
    tree1 : Tree
        Tree whose internal nodes are traversed.
    tree2 : Tree
        Tree searched for the matching bipartitions; must share the
        taxon namespace object of `tree1`.
    min_support : float
        Smallest support value regarded as supported.
    max_depth : float
        Largest `rel_dist` a node may have to be considered.

    Returns
    -------
    tuple
        (number of shared supported splits, summed length of the two
        edges defining each shared split).
    """
    assert tree1.taxon_namespace is tree2.taxon_namespace

    def _is_supported(support_value, node):
        # a node without a support value is treated as supported
        return support_value is None or (support_value >= min_support
                                         and node.rel_dist <= max_depth)

    shared_count = 0
    shared_weight = 0
    missing_support_seen = False

    for node1 in tree1.preorder_node_iter(lambda nd: not nd.is_leaf()):
        # the root does not define a split
        if not node1.parent_node:
            continue

        support1, _label1, _aux1 = parse_label(node1.label)
        if support1 is None:
            missing_support_seen = True

        if not _is_supported(support1, node1):
            continue
        if node1.bipartition not in tree2.bipartition_encoding:
            continue

        edge2 = tree2.bipartition_edge_map[node1.bipartition]
        node2 = edge2.head_node
        support2, _label2, _aux2 = parse_label(node2.label)
        if _is_supported(support2, node2):
            shared_count += 1
            shared_weight += node1.edge.length + edge2.length

    if missing_support_seen:
        self.logger.warning(
            'Some internal nodes lack support values and were treated as supported.')

    return shared_count, shared_weight
def _check_fractional_bootstraps(self, tree):
    """Rescale support values from [0, 1] to [0, 100] when appropriate.

    The tree is left untouched as soon as any node carries a support
    value above 1.0. Otherwise every support value is multiplied by 100
    and rounded half-up to an integer, and the node label is rebuilt.

    Parameters
    ----------
    tree : Tree
        Dendropy Tree; node labels are modified in place.
    """
    def _exceeds_unit(node):
        support, _label, _aux_info = parse_label(node.label)
        return support is not None and support > 1.0

    # any() stops at the first offending node, like an early break
    if any(_exceeds_unit(node) for node in tree.preorder_node_iter()):
        return

    for node in tree.preorder_node_iter():
        support, label, aux_info = parse_label(node.label)
        if support is None:
            continue
        node.label = create_label(int(support*100 + 0.5), label, aux_info)
def _reroot(self, tree, outgroup_node, max_support=100):
    """Reroot tree on the outgroup edge while keeping support values correct.

    Parameters
    ----------
    tree : Tree
        Dendropy Tree; modified in place.
    outgroup_node : Node
        Node whose subtending edge receives the new root.
    max_support : int or float
        Support given to leaves, and to the non-leaf child of the new
        root when the outgroup is a single leaf.

    Returns
    -------
    Tree
        The same tree object, rerooted.
    """
    # Step 1: remember the support attached to every bipartition.
    tree.encode_bipartitions()
    support_by_split = {}
    for node in tree:
        parsed_support, _taxon, _aux = parse_label(node.label)
        if node.is_leaf():
            support_by_split[node.bipartition] = max_support
        elif parsed_support is not None:
            support_by_split[node.bipartition] = float(parsed_support)
        else:
            support_by_split[node.bipartition] = None

    # Step 2: reseed next to the outgroup and put each support value back
    # on the node that now represents the same bipartition.
    tree.reseed_at(outgroup_node.parent_node)
    tree.encode_bipartitions()
    for node in tree:
        _, taxon, aux_info = parse_label(node.label)
        node.label = create_label(
            support_by_split.get(node.bipartition, "not_specified"),
            taxon,
            aux_info)
    tree.seed_node.edge.length = None

    # Step 3: hard re-rooting in the middle of the outgroup edge. This
    # invalidates the bipartitions encoded above, hence the separate step.
    tree.is_rooted = True
    half_length = 0.5 * outgroup_node.edge_length
    tree.reroot_at_edge(outgroup_node.edge,
                        length1=half_length,
                        length2=half_length)

    # Step 4: assign a support value to the node created by the new root.
    for child in tree.seed_node.child_node_iter():
        if outgroup_node.is_leaf():
            if child.is_leaf():
                continue
            _old_support, taxon, aux_info = parse_label(child.label)
            new_support = max_support
        elif child != outgroup_node:
            new_support, _taxon, _aux_info = parse_label(outgroup_node.label)
            _old_support, taxon, aux_info = parse_label(child.label)
        else:
            continue
        child.label = create_label(new_support, taxon, aux_info)

    return tree
def pull(self, options):
    """Create taxonomy file from a decorated tree.

    Parameters
    ----------
    options : object
        Must provide `input_tree` (path to a Newick tree),
        `output_taxonomy` (path of the taxonomy file to write) and
        `no_validation`. When `no_validation` is set, the taxonomy is
        read straight from the internal node labels above each leaf;
        otherwise `Taxonomy().read_from_tree` is used.
    """
    check_file_exists(options.input_tree)

    if options.no_validation:
        tree = dendropy.Tree.get_from_path(options.input_tree,
                                           schema='newick',
                                           rooting="force-rooted",
                                           preserve_underscores=True)

        taxonomy = {}
        for leaf in tree.leaf_node_iter():
            # walk from the leaf's parent to the root, collecting taxa from
            # most to least specific, then flip into rank order
            taxa = []
            node = leaf.parent_node
            while node:
                _support, taxon, _aux_info = parse_label(node.label)
                if taxon:
                    taxa.extend(t.strip()
                                for t in reversed(taxon.split(';')))
                node = node.parent_node

            taxonomy[leaf.taxon.label] = taxa[::-1]
    else:
        taxonomy = Taxonomy().read_from_tree(options.input_tree)

    Taxonomy().write(taxonomy, options.output_taxonomy)

    # the output is a taxonomy file, not a stripped tree
    self.logger.info('Taxonomy written to: %s' % options.output_taxonomy)
def _leaf_taxa(self, leaf):
    """Get taxonomic information for leaf node.

    Parameters
    ----------
    leaf : Node
        Node in tree.

    Returns
    -------
    list
        Taxa for leaf in rank order. Ranks below the most specific
        labelled rank are filled with the bare entries of
        `Taxonomy.rank_prefixes`; when no node on the path to the root
        carries a taxon label, every rank is filled this way.
    """
    leaf_taxa = []
    node = leaf
    while node:
        _support, taxon, _aux_info = parse_label(node.label)
        if taxon:
            # a label may hold several ranks; collect most specific first
            leaf_taxa.extend(t.strip() for t in reversed(taxon.split(';')))
        node = node.parent_node

    ordered_taxa = leaf_taxa[::-1]

    # fill in missing ranks
    if ordered_taxa:
        last_rank = ordered_taxa[-1][0:3]
        first_missing = Taxonomy.rank_prefixes.index(last_rank) + 1
    else:
        # no labelled ancestor at all: every rank is missing
        first_missing = 0
    ordered_taxa.extend(Taxonomy.rank_prefixes[first_missing:])

    return ordered_taxa
def _assign_taxon_labels(self, fmeasure_for_taxa):
    """Label nodes with every taxon that has exactly one candidate node.

    Parameters
    ----------
    fmeasure_for_taxa : d[taxon] -> [(Node, F-measure, precision, recall), ...]
      Node with highest F-measure for each taxon.

    Returns
    -------
    set
      Taxon labels placed in tree.
    """
    placed = set()

    for taxon in Taxonomy().sort_taxa(fmeasure_for_taxa.keys()):
        candidates = fmeasure_for_taxa[taxon]

        # taxa with several equally good nodes are not placed here
        if len(candidates) != 1:
            continue

        placed.add(taxon)

        node, _fmeasure, _precision, _recall = candidates[0]
        support, existing, aux_info = parse_label(node.label)

        # append to whatever taxon string is already on the node
        combined = existing + ';' + taxon if existing else taxon
        node.label = create_label(support, combined, aux_info)

    return placed
def _get_phyla_lineages(self, tree):
    """Collect phylum names decorating internal nodes of a tree.

    A node contributes when the last (most specific) taxon in its label
    starts with 'p__'.

    Parameters
    ----------
    tree : Dendropy Tree
        Phylogenetic tree.

    Returns
    -------
    list
        Phylum names in preorder traversal order.
    """
    phyla = []

    for node in tree.preorder_node_iter():
        if not node.label or node.is_leaf():
            continue

        _support, taxon_name, _aux_info = parse_label(node.label)
        if not taxon_name:
            continue

        most_specific = taxon_name.split(';')[-1].strip()
        if most_specific.startswith('p__'):
            phyla.append(most_specific)

    return phyla
def pull(self, options):
    """Create taxonomy file from a decorated tree.

    Parameters
    ----------
    options : object
        Must provide `input_tree` (path to a Newick tree),
        `output_taxonomy` (path of the taxonomy file to write) and
        `no_validation`. When `no_validation` is set, the taxonomy is
        read straight from the internal node labels above each leaf;
        otherwise `Taxonomy().read_from_tree` is used.
    """
    check_file_exists(options.input_tree)

    if options.no_validation:
        tree = dendropy.Tree.get_from_path(options.input_tree,
                                           schema='newick',
                                           rooting="force-rooted",
                                           preserve_underscores=True)

        taxonomy = {}
        for leaf in tree.leaf_node_iter():
            taxon_id = leaf.taxon.label

            node = leaf.parent_node
            taxa = []
            while node:
                _support, taxon, _aux_info = parse_label(node.label)
                if taxon:
                    # map() returns an iterator on Python 3 and cannot be
                    # sliced, so build the stripped list explicitly
                    stripped = [t.strip() for t in taxon.split(';')]
                    taxa.extend(reversed(stripped))
                node = node.parent_node

            # taxa were gathered from leaf to root; flip into rank order
            taxonomy[taxon_id] = taxa[::-1]
    else:
        taxonomy = Taxonomy().read_from_tree(options.input_tree)

    Taxonomy().write(taxonomy, options.output_taxonomy)

    # the output is a taxonomy file, not a stripped tree
    self.logger.info('Taxonomy written to: %s' % options.output_taxonomy)
def _assign_taxon_labels(self, fmeasure_for_taxa):
    """Label nodes with every taxon that has exactly one candidate node.

    Parameters
    ----------
    fmeasure_for_taxa : d[taxon] -> [(Node, F-measure, precision, recall), ...]
      Node with highest F-measure for each taxon.

    Returns
    -------
    set
      Taxon labels placed in tree.
    """
    placed = set()

    for taxon in Taxonomy().sort_taxa(list(fmeasure_for_taxa.keys())):
        candidates = fmeasure_for_taxa[taxon]

        # taxa with several equally good nodes are not placed here
        if len(candidates) != 1:
            continue

        placed.add(taxon)

        node, _fmeasure, _precision, _recall = candidates[0]
        support, existing, aux_info = parse_label(node.label)

        # append to whatever taxon string is already on the node
        combined = existing + '; ' + taxon if existing else taxon
        node.label = create_label(support, combined, aux_info)

    return placed
def _write_taxonomy(self, tree, out_taxonomy):
    """Write taxonomy decorated on tree to file.

    One line is written per leaf: the leaf's taxon label, a tab, and the
    semicolon-joined taxa after `Taxonomy().fill_missing_ranks`.

    Parameters
    ----------
    tree : Tree
      Dendropy Tree.
    out_taxonomy : str
      Output file.
    """
    # the context manager closes the file even if a label fails to parse
    with open(out_taxonomy, 'w') as fout:
        for leaf in tree.leaf_node_iter():
            leaf_taxa = []
            node = leaf
            while node:
                _support, taxon, _aux_info = parse_label(node.label)
                if taxon:
                    # labels may be joined with '; ', so strip each taxon
                    # to keep its rank prefix at the start of the string
                    leaf_taxa.extend(t.strip()
                                     for t in reversed(taxon.split(';')))
                node = node.parent_node

            ordered_taxa = leaf_taxa[::-1]
            filled_ordered_taxa = Taxonomy().fill_missing_ranks(ordered_taxa)

            fout.write('%s\t%s\n' % (leaf.taxon.label,
                                     ';'.join(filled_ordered_taxa)))
def _supported(self, ref_tree, compare_tree, min_support, max_depth):
    """Classify supported reference splits as congruent or incongruent.

    Parameters
    ----------
    ref_tree : Tree
        Reference tree whose internal, non-root nodes are examined.
    compare_tree : Tree
        Tree whose `bipartition_encoding` is searched for each split.
    min_support : float
        Smallest support value regarded as supported.
    max_depth : float
        Largest `rel_dist` a node may have to be considered.

    Returns
    -------
    tuple
        (congruent, congruent_w, incongruent, incongruent_w,
        nontrivial_splits, nontrivial_splits_w, congruent_splits,
        incongruent_splits). The `_w` values are summed edge lengths and
        the two dicts map 'leafA|leafB' to (edge length, support).
    """
    congruent = 0
    congruent_w = 0
    incongruent = 0
    incongruent_w = 0
    nontrivial_splits = 0
    nontrivial_splits_w = 0
    congruent_splits = {}
    incongruent_splits = {}

    for node in ref_tree.preorder_node_iter(lambda nd: not nd.is_leaf()):
        # the root does not define a split
        if not node.parent_node:
            continue

        edge_length = node.edge.length
        nontrivial_splits += 1
        nontrivial_splits_w += edge_length

        support, _label, _aux_info = parse_label(node.label)
        has_enough_support = support is None or (
            support >= min_support and node.rel_dist <= max_depth)
        if not has_enough_support:
            continue

        # name the split by one leaf from each of the first two children
        children = node.child_nodes()
        left_leaf = children[0].leaf_nodes()[0].taxon.label
        right_leaf = children[1].leaf_nodes()[0].taxon.label
        split_lca = left_leaf + '|' + right_leaf

        if node.bipartition in compare_tree.bipartition_encoding:
            congruent += 1
            congruent_w += edge_length
            congruent_splits[split_lca] = (edge_length, support)
        else:
            incongruent += 1
            incongruent_w += edge_length
            incongruent_splits[split_lca] = (edge_length, support)

    return (congruent, congruent_w, incongruent, incongruent_w,
            nontrivial_splits, nontrivial_splits_w, congruent_splits,
            incongruent_splits)
def _reroot(self, tree, outgroup_node, max_support=100): """Reroot tree taking proper care of bootstrap values.""" # determine support values for each bipartition tree.encode_bipartitions() support_values = {} for nd in tree: support, taxon, aux_info = parse_label(nd.label) if nd.is_leaf(): support_values[nd.bipartition] = max_support else: if support is not None: support_values[nd.bipartition] = float(support) else: support_values[nd.bipartition] = None # move support values for desired re-rooting new_root = outgroup_node.parent_node tree.reseed_at(new_root) tree.encode_bipartitions() for nd in tree: _, taxon, aux_info = parse_label(nd.label) nd.label = create_label(support_values.get(nd.bipartition, "not_specified"), taxon, aux_info) tree.seed_node.edge.length = None # do a hard re-rooting of the tree # (this invalidates the previous bipartitions, so must be done seperately) tree.is_rooted = True tree.reroot_at_edge(outgroup_node.edge, length1=0.5 * outgroup_node.edge_length, length2=0.5 * outgroup_node.edge_length) # determine bootstrap for new node for child in tree.seed_node.child_node_iter(): if outgroup_node.is_leaf(): if not child.is_leaf(): support, taxon, aux_info = parse_label(child.label) child.label = create_label(max_support, taxon, aux_info) else: if child != outgroup_node: support, _taxon, _aux_info = parse_label(outgroup_node.label) _support, taxon, aux_info = parse_label(child.label) child.label = create_label(support, taxon, aux_info) return tree
def bootstrap_support(input_tree, replicate_trees, output_tree):
    """Annotate a tree with support computed from replicate trees.

    All trees are read as unrooted Newick trees over one shared taxon
    namespace. Support is expressed as a percentage and merged into the
    existing label of each internal node.

    Parameters
    ----------
    input_tree : str
      Tree inferred from complete data.
    replicate_trees : iterable
      Files containing replicate trees.
    output_tree: str
      Name of output tree with support values.
    """
    import dendropy

    # reading options shared by the original tree and its replicates
    read_options = dict(schema='newick',
                        rooting="force-unrooted",
                        preserve_underscores=True)

    orig_tree = dendropy.Tree.get_from_path(input_tree, **read_options)
    orig_tree.bipartitions = True
    orig_tree.encode_bipartitions()
    namespace = orig_tree.taxon_namespace

    rep_trees = dendropy.TreeArray(taxon_namespace=namespace,
                                   is_rooted_trees=False,
                                   ignore_edge_lengths=True,
                                   ignore_node_ages=True,
                                   use_tree_weights=False)
    rep_trees.read_from_files(files=replicate_trees,
                              taxon_namespace=namespace,
                              **read_options)

    rep_trees.summarize_splits_on_tree(orig_tree,
                                       is_bipartitions_updated=True,
                                       add_support_as_node_attribute=True,
                                       support_as_percentages=True)

    for node in orig_tree.internal_nodes():
        if not node.label:
            # unlabelled node: the label becomes the integer support
            node.label = str(int(node.support))
            continue

        # keep taxon and auxiliary information, replace the support
        _old_support, taxon, aux_info = parse_label(node.label)
        node.label = create_label(node.support, taxon, aux_info)

    orig_tree.write_to_path(output_tree,
                            schema='newick',
                            suppress_rooting=True,
                            unquoted_underscores=True)
def _strip_taxon_labels(self, tree):
    """Remove any previous taxon labels.

    Internal nodes keep only their support value. Nodes without a
    support value have their label cleared, so a taxon label cannot
    survive on a node just because support is missing or zero.

    Parameters
    ----------
    tree : Tree
      Dendropy Tree.
    """
    for node in tree.internal_nodes():
        support, _taxon, _aux_info = parse_label(node.label)
        if support is not None:
            # explicit None test: a support value of 0 is still a value
            node.label = create_label(support, None, None)
        else:
            node.label = None
def report_missing_splits(self, ref_tree, compare_tree, min_support, taxa_list):
    """Report supported bipartitions in reference tree not in comparison tree.

    Each missing split is printed as its taxon label followed by the
    edge length, or as a comma-separated list of leaf labels when the
    node has no taxon label. Nodes without a support value are not
    reported.

    Parameters
    ----------
    ref_tree, compare_tree, taxa_list
        Passed unchanged to `self._read_trees`, which returns the two
        trees to compare.
    min_support : float
        Smallest support value a split needs to be reported.
    """
    ref_tree, compare_tree = self._read_trees(ref_tree, compare_tree,
                                              taxa_list)

    incongruent = 0
    # single-argument print(...) behaves the same on Python 2 and 3
    print('Missing splits with support >= %f:' % min_support)
    for n in ref_tree.preorder_node_iter(lambda n: not n.is_leaf()):
        support, label, _aux_info = parse_label(n.label)

        # Python 2 evaluated `None >= x` as False; Python 3 raises
        # TypeError, so the missing-support case is tested explicitly
        if support is None or not (support >= min_support):
            continue
        if n.bipartition in compare_tree.bipartition_encoding:
            continue

        incongruent += 1
        if label:
            print('%s %s' % (label, n.edge.length))
        else:
            print(','.join([t.taxon.label for t in n.leaf_iter()]))

    print('Missing splits: %d' % incongruent)
def rel_dist_to_named_clades(self, tree):
    """Map every named internal node to its relative distance.

    Parameters
    ----------
    tree : Dendropy Tree
        Phylogenetic tree.

    Returns
    -------
    dict : d[rank_index][taxon] -> relative divergence
    """
    # decorate all nodes with their relative distance
    self.decorate_rel_dist(tree)

    rel_dists = defaultdict(dict)
    for node in tree.preorder_node_iter(lambda nd: nd != tree.seed_node):
        if not node.label or node.is_leaf():
            continue

        _support, taxon_name, _aux_info = parse_label(node.label)
        if not taxon_name:
            continue

        # a node naming several ranks is filed under its most specific one
        _head, separator, tail = taxon_name.rpartition(';')
        specific_taxon = tail.strip() if separator else taxon_name

        rank_idx = Taxonomy.rank_index[specific_taxon[0:3]]
        rel_dists[rank_idx][specific_taxon] = node.rel_dist

    return rel_dists
def _resolve_ambiguous_placements(self, fmeasure_for_taxa, median_rank_rd, max_rd_diff=0.1):
    """Resolve ambiguous taxon label placements using median relative divergences.

    For every taxon with more than one candidate node, one node is
    chosen, the taxon is added to that node's label, and the taxon's
    entry in `fmeasure_for_taxa` is reduced to the chosen candidate.

    Parameters
    ----------
    fmeasure_for_taxa : d[taxon] -> [(Node, F-measure, precision, recall)]
      Node with highest F-measure for each taxon. Modified in place.
    median_rank_rd : d[rank_index] -> float
      Median relative divergence for each taxonomic rank.
    max_rd_diff : float
      Maximum difference in relative divergence for assigning a taxonomic label.
    """

    # For ambiguous nodes place them closest to median for rank
    # and within accepted relative divergence distance. Taxon labels
    # are placed in reverse taxonomic order (species to domain) and
    # this ordering used to ensure taxonomic consistency.
    for taxon in Taxonomy().sort_taxa(list(fmeasure_for_taxa.keys()), reverse=True):
        # a single candidate is not ambiguous
        if len(fmeasure_for_taxa[taxon]) == 1:
            continue

        rank_prefix = taxon[0:3]
        rank_index = Taxonomy.rank_prefixes.index(rank_prefix)
        rd = median_rank_rd[rank_index]

        # Find node closest to median distance, but making sure
        # taxon is not placed below a more specific taxon label.
        # The 'fmeasure_for_taxa' stores node information in preorder.
        closest_index = None
        closest_dist = 1e9
        closest_node = None
        for i, d in enumerate(fmeasure_for_taxa[taxon]):
            cur_node = d[0]

            # -1 marks a node without a taxon label; it can never
            # outrank the taxon being placed
            cur_rank_index = -1
            _support, cur_taxon, _aux_info = parse_label(cur_node.label)
            if cur_taxon:
                # rank of the most specific taxon already on the node
                cur_prefix = cur_taxon.split(';')[-1].strip()[0:3]
                cur_rank_index = Taxonomy.rank_prefixes.index(cur_prefix)

            if cur_rank_index > rank_index:
                # reached a node with a more specific label so
                # label should be appended to this node or
                # placed above it
                if closest_index is None:
                    closest_index = i
                    closest_node = cur_node
                break

            rd_diff = abs(rd - cur_node.rel_dist)
            if rd_diff > max_rd_diff:
                continue

            if rd_diff < closest_dist:
                closest_dist = rd_diff
                closest_index = i
                closest_node = cur_node

        if closest_index is None:
            # no node is within an acceptable relative divergence distance
            # for this label so it should be placed at the most extant node
            # in order to be conservative
            closest_index = len(fmeasure_for_taxa[taxon]) - 1
            closest_node = fmeasure_for_taxa[taxon][closest_index][0]

        # add label to node
        support, cur_taxon, aux_info = parse_label(closest_node.label)
        if not cur_taxon:
            taxa_str = taxon
        else:
            # merge with the existing taxa and re-sort them
            taxa = [t.strip() for t in cur_taxon.split(';')] + [taxon]
            taxa_str = '; '.join(Taxonomy().sort_taxa(taxa))

        closest_node.label = create_label(support, taxa_str, aux_info)

        # remove other potential node assignments
        fmeasure_for_taxa[taxon] = [
            fmeasure_for_taxa[taxon][closest_index]
        ]
def _filter_taxa_for_dist_inference(self, tree, taxonomy, trusted_taxa, min_children, min_support):
    """Determine taxa to use for inferring distribution of relative divergences.

    Parameters
    ----------
    tree : Dendropy Tree
        Phylogenetic tree.
    taxonomy : d[taxon ID] -> [d__x; p__y; ...]
        Taxonomy for each taxon.
    trusted_taxa : iterable
        Trusted taxa to consider when inferring distribution.
    min_children : int
        Only consider taxa with at least the specified number of children taxa when inferring distribution.
    min_support : float
        Only consider taxa with at least this level of support when inferring distribution.

    Returns
    -------
    set
        Taxa that passed all of the requested filters.
    """
    # determine children taxa for each named group
    taxon_children = Taxonomy().taxon_children(taxonomy)

    # get all named groups
    taxa_for_dist_inference = set()
    for taxa in taxonomy.values():
        taxa_for_dist_inference.update(taxa)

    # sanity check species names as these are a common problem
    species_index = Taxonomy.rank_index['s__']
    species = set()
    for taxon_id, taxa in taxonomy.items():
        if len(taxa) <= species_index:
            continue

        species_name = taxa[species_index]
        if species_name != 's__':
            valid, error_msg = Taxonomy().validate_species_name(
                species_name, require_full=True, require_prefix=True)
            if not valid:
                print('[Warning] Species name %s for %s is invalid: %s' %
                      (species_name, taxon_id, error_msg))
                continue

        species.add(species_name)

    # restrict taxa to those with a sufficient number of named children
    # Note: a taxonomic group with no children will not end up in the
    # taxon_children data structure so care must be taken when applying
    # this filtering criteria.
    if min_children > 0:
        valid_taxa = set()
        for taxon, children_taxa in taxon_children.items():
            if len(children_taxa) >= min_children:
                valid_taxa.add(taxon)

        taxa_for_dist_inference.intersection_update(valid_taxa)

        # explicitly add in the species since they have no
        # children and thus be absent from the taxon_child dictionary
        taxa_for_dist_inference.update(species)

    # restrict taxa used for inferring distribution to those with sufficient support
    if min_support > 0:
        for node in tree.preorder_node_iter():
            if not node.label or node.is_leaf():
                continue

            # check for support value
            support, taxon_name, _auxiliary_info = parse_label(node.label)

            if not taxon_name:
                continue

            # only None / empty string mean "no support value"; a
            # support of 0 is a real value and must be filtered on
            if support is None or support == '':
                # no support value, so inform user if they were trying to filter on this property
                print('[Error] Tree does not contain support values. '
                      'As such, --min_support should be set to 0.')
                continue

            if float(support) < min_support:
                # a node may name several ranks ('p__X; c__Y'); the set
                # holds individual taxa, so remove each one separately
                taxa_for_dist_inference.difference_update(
                    [t.strip() for t in taxon_name.split(';')])

    # restrict taxa used for inferring distribution to the trusted set
    if trusted_taxa:
        taxa_for_dist_inference = trusted_taxa.intersection(
            taxa_for_dist_inference)

    return taxa_for_dist_inference
def median_rd_over_phyla(self, tree, taxa_for_dist_inference, taxonomy):
    """Calculate the median relative divergence over all phyla rootings.

    Despite the name, no median is computed in this function: it
    gathers the per-rooting values and returns them.

    Parameters
    ----------
    tree : Tree
      Dendropy tree.
    taxa_for_dist_inference : set
      Taxa to use for inference relative divergence distributions.
    taxonomy : d[taxon_id] -> [d__, p__, ..., s__]
      Taxonomy of extant taxa.

    Returns
    -------
    tuple
      (d[phylum] -> relative distances to named clades for that rooting,
       d[node id] -> list of relative distances over the rootings in
       which the node was in the ingroup).
    """

    # get list of phyla level lineages
    all_phyla = self._get_phyla_lineages(tree)
    self.logger.info('Identified %d phyla.' % len(all_phyla))

    phyla = [p for p in all_phyla if p in taxa_for_dist_inference]
    self.logger.info(
        'Using %d phyla as rootings for inferring RED distributions.' %
        len(phyla))
    if len(phyla) < 2:
        self.logger.error('Rescaling requires at least 2 valid phyla.')
        sys.exit(-1)

    # give each node a unique id
    for i, n in enumerate(tree.preorder_node_iter()):
        n.id = i

    # calculate relative divergence for tree rooted on each phylum
    phylum_rel_dists = {}
    rel_node_dists = defaultdict(list)
    rd = RelativeDistance()
    for p in phyla:
        phylum = p.replace('p__', '').replace(' ', '_').lower()
        # NOTE(review): this literal straddled a line break in the
        # flattened source; it may originally have had more trailing
        # padding to overwrite the previous '\r' status line - confirm
        status_msg = '==> Calculating information with rooting on %s. ' % phylum.capitalize(
        )
        sys.stdout.write('%s\r' % status_msg)
        sys.stdout.flush()

        cur_tree = self.root_with_outgroup(tree, taxonomy, p)

        # calculate relative distance to taxa
        rel_dists = rd.rel_dist_to_named_clades(cur_tree)
        rel_dists.pop(0, None)  # remove results for Domain

        # remove named groups in outgroup
        children = Taxonomy().children(p, taxonomy)
        for r in rel_dists.keys():
            rel_dists[r].pop(p, None)

        for t in children:
            for r in rel_dists.keys():
                rel_dists[r].pop(t, None)

        phylum_rel_dists[phylum] = rel_dists

        # calculate relative distance to all nodes
        rd.decorate_rel_dist(cur_tree)

        # determine which lineages represents the 'ingroup'
        # (first child of the root whose label does not contain the
        # outgroup phylum)
        ingroup_subtree = None
        for c in cur_tree.seed_node.child_node_iter():
            _support, taxon_name, _auxiliary_info = parse_label(c.label)
            if not taxon_name or p not in taxon_name:
                ingroup_subtree = c
                break

        # do a preorder traversal of 'ingroup' and record relative divergence to nodes
        # NOTE(review): ingroup_subtree is still None here if every child
        # of the root carries the outgroup label - confirm this cannot occur
        for n in ingroup_subtree.preorder_iter():
            rel_node_dists[n.id].append(n.rel_dist)

    #status_msg = 'Inference of RED distribution finished'
    #sys.stdout.write('%s\r' % status_msg)
    sys.stdout.write(
        '==> Inference for RED distributions finished. '
    )
    sys.stdout.flush()
    #self.logger.info('Inference for RED distributions finished.')
    sys.stdout.write('\n')

    return phylum_rel_dists, rel_node_dists
def _select_taxa(self, tree, node_of_interest, outgroup_node, num_taxa_to_retain, keep_unclassified, genome_metadata):
    """Select genomes in named lineages on path from ingroup to outgroup.

    Parameters
    ----------
    tree : Tree
        Dendropy Tree containing both nodes.
    node_of_interest : Node
        Root of the lineage whose taxa are all retained.
    outgroup_node : Node
        Root of the outgroup lineage.
    num_taxa_to_retain : int
        Number of taxa to sample from each other named lineage.
    keep_unclassified : boolean
        Retain every leaf reached outside a named lineage.
    genome_metadata : object
        Passed unchanged to `self._derep_lineage`.

    Returns
    -------
    list
        Taxa to retain.
    """
    # get most recent common ancestor of outgroup and lineage of interest
    # (builtin next() works on both Python 2 and 3 iterators)
    outgroup_leaf_taxon = next(outgroup_node.leaf_iter()).taxon
    lineage_of_interest_taxon = next(node_of_interest.leaf_iter()).taxon
    mrca = tree.mrca(taxa=[outgroup_leaf_taxon, lineage_of_interest_taxon])

    # get taxon of lineage of interest
    taxa_of_interest = []
    parent = node_of_interest
    while parent != mrca:
        _support, taxon, _auxiliary_info = parse_label(parent.label)
        if taxon:
            taxa_of_interest.append(taxon)
        parent = parent.parent_node

    self.logger.info('Taxonomy for lineage of interest: %s' %
                     ';'.join(taxa_of_interest))

    # select taxa from named lineages by traversing tree
    # in preorder and terminating descent at named taxa
    # not in path to lineage of interest
    selected_taxa = []
    stack = list(mrca.child_node_iter())
    while stack:
        cur_node = stack.pop()

        if cur_node.is_leaf() and keep_unclassified:
            selected_taxa.append(cur_node.taxon)

        _support, taxon, _auxiliary_info = parse_label(cur_node.label)
        if taxon and taxon not in taxa_of_interest:
            # select roughly equal taxa from each child lineage to
            # ensure we retain the correct depth (and the named node)
            # for this lineage
            derep_taxa = []
            children = list(cur_node.child_node_iter())
            child_taxa_to_sample = int(
                math.ceil((1.0 / len(children)) * num_taxa_to_retain))
            for child in children:
                # never exceed the overall quota for this lineage
                taxa_to_sample = min(child_taxa_to_sample,
                                     num_taxa_to_retain - len(derep_taxa))
                derep_taxa += self._derep_lineage(child, taxa_to_sample,
                                                  genome_metadata)

            selected_taxa += derep_taxa
            self.logger.info('Selecting %d taxa from %s.' %
                             (len(derep_taxa), taxon))
        elif cur_node == node_of_interest:
            self.logger.info('Retaining all taxa in lineage of interest.')
            for leaf in node_of_interest.leaf_iter():
                selected_taxa.append(leaf.taxon)
        else:
            # unnamed node, or one on the path to the lineage of
            # interest: keep descending
            stack.extend(cur_node.child_node_iter())

    return selected_taxa
def run(self, input_tree, lineage_of_interest, outgroup, gtdb_metadata, num_taxa_to_retain, msa_file, keep_unclassified, output_dir):
    """Dereplicate tree.

    The pruned tree is written to `output_dir` as
    '<tree name>.derep<ext>', and the MSA (when given) as
    '<msa name>.derep<ext>'. The process exits with a non-zero status
    when the lineage of interest or the outgroup cannot be found.

    Parameters
    ----------
    input_tree : str
        Tree to dereplicate
    lineage_of_interest : str
        Named lineage where all taxa should be retain.
    outgroup : str
        Named lineage to use as outgroup.
    gtdb_metadata : str
        File containing metadata for taxa in tree.
    num_taxa_to_retain: int
        Taxa to retain in dereplicated lineages.
    msa_file : str
        Multiple sequence alignment to dereplicate along with tree.
    keep_unclassified : boolean
        Keep all taxa in unclassified lineages.
    output_dir:
        Output dir.
    """
    # read GTDB metadata
    self.logger.info('Reading metadata.')
    genome_metadata = read_gtdb_metadata(gtdb_metadata, [
        'checkm_completeness', 'checkm_contamination',
        'gtdb_representative'
    ])

    # read tree
    self.logger.info('Reading tree.')
    tree = dendropy.Tree.get_from_path(input_tree,
                                       schema='newick',
                                       rooting='force-rooted',
                                       preserve_underscores=True)

    # locate node of interest and outgroup node
    self.logger.info('Identifying lineage of interest and outgroup.')
    node_of_interest = None
    outgroup_node = None
    for node in tree.preorder_node_iter():
        _support, taxon_str, _auxiliary_info = parse_label(node.label)
        if not taxon_str:
            continue

        # a node label may name several ranks separated by ';'
        for taxon in [t.strip() for t in taxon_str.split(';')]:
            if taxon == lineage_of_interest:
                node_of_interest = node
            elif taxon == outgroup:
                outgroup_node = node

    # exit with a failure status so callers and pipelines see the error
    if not node_of_interest:
        self.logger.error(
            'Could not find specified lineage of interest: %s' %
            lineage_of_interest)
        sys.exit(1)

    if not outgroup_node:
        self.logger.error('Could not find outgroup: %s' % outgroup)
        sys.exit(1)

    # select taxa to retain
    self.logger.info('Selecting taxa to retain.')
    selected_taxa = self._select_taxa(tree, node_of_interest,
                                      outgroup_node, num_taxa_to_retain,
                                      keep_unclassified, genome_metadata)

    self.logger.info('Retaining %d taxa.' % len(selected_taxa))

    # prune tree
    self.logger.info('Pruning tree.')
    tree.retain_taxa(selected_taxa)

    # dereplicate MSA if requested
    if msa_file:
        self.logger.info('Dereplicating MSA.')
        msa_name, msa_ext = os.path.splitext(os.path.basename(msa_file))
        output_msa = os.path.join(output_dir, msa_name + '.derep' + msa_ext)
        self._derep_msa(msa_file, selected_taxa, output_msa)

    # write out results
    tree_name, tree_ext = os.path.splitext(os.path.basename(input_tree))
    output_tree = os.path.join(output_dir, tree_name + '.derep' + tree_ext)
    tree.write_to_path(output_tree,
                       schema='newick',
                       suppress_rooting=True,
                       unquoted_underscores=True)
def run(self, bac120_tree, ar122_tree):
    """Generate tree and iTOL files for producing iTOL tree image.

    For each domain, three files are written below `self.output_dir`:
    the relabelled tree, a TREE_COLORS file and a DATASET_TEXT file.
    Colors are taken from `self.colors`, wrapping around when there are
    more phyla than colors.

    Parameters
    ----------
    bac120_tree : str
        Newick tree file for the bac120 reference tree.
    ar122_tree : str
        Newick tree file for the ar122 reference tree.
    """
    self.logger.info('Creating trees with iTOL labels.')

    for tree_file, prefix in [(bac120_tree, 'bac120'),
                              (ar122_tree, 'ar122')]:
        output_tree = f'{prefix}_r{self.release_number}.itol.tree'
        itol_colors = f'{prefix}_r{self.release_number}.itol_phyla_colors.txt'
        itol_labels = f'{prefix}_r{self.release_number}.itol_phyla_labels.txt'

        self.logger.info(f'Reading {tree_file} reference tree.')
        domain_tree = dendropy.Tree.get_from_path(
            tree_file,
            schema='newick',
            rooting='force-rooted',
            preserve_underscores=True)
        self.logger.info(' ...tree contains {:,} genomes.'.format(
            sum(1 for _leaf in domain_tree.leaf_node_iter())))

        phyla = set()
        for node in domain_tree.preorder_node_iter():
            if node.is_leaf():
                continue

            _support, taxon, _auxiliary_info = parse_label(node.label)
            if taxon:
                # only the first taxon of the label is considered
                taxa = taxon.split(';')[0]
                if taxa.startswith('p__'):
                    node.label = taxa[3:]
                    phyla.add(taxa[3:])
                else:
                    # NOTE(review): the flattened source does not show
                    # whether this `else` belongs to `if taxon:` or to the
                    # `startswith` test; the inner pairing is assumed -
                    # confirm against the original file
                    node.label = None

        domain_tree.write_to_path(self.output_dir / output_tree,
                                  schema='newick',
                                  suppress_rooting=True,
                                  unquoted_underscores=True)

        self.logger.info('Identified {:,} phyla in {}.'.format(
            len(phyla), tree_file))

        # the set is not modified below, so both files see the phyla in
        # the same order and get matching colors
        num_colors = len(self.colors)

        # create iTOL metadata for coloring phyla
        self.logger.info(
            f'Creating iTOL metadata for coloring phyla: {itol_colors}.')
        with open(self.output_dir / itol_colors, 'w') as fout:
            fout.write('TREE_COLORS\n')
            fout.write('SEPARATOR TAB\n')
            fout.write('DATA\n')
            for index, phylum in enumerate(phyla):
                fout.write('{}\trange\t{}\t{}\n'.format(
                    phylum, self.colors[index % num_colors], phylum))

        # create iTOL metadata for phylum labels
        self.logger.info(
            f'Creating iTOL metadata for phyla labels: {itol_labels}.')
        with open(self.output_dir / itol_labels, 'w') as fout:
            fout.write('DATASET_TEXT\n')
            fout.write('SEPARATOR TAB\n')
            fout.write('DATASET_LABEL\tPhylum labels\n')
            fout.write('COLOR\t#000000\n')
            fout.write('MARGIN\t0\n')
            fout.write('SHOW_INTERNAL\t1\n')
            fout.write('LABEL_ROTATION\t0\n')
            fout.write('STRAIGHT_LABELS\t0\n')
            fout.write('ALIGN_TO_TREE\t0\n')
            fout.write('SIZE_FACTOR\t1\n')
            fout.write('DATA\n')
            for index, phylum in enumerate(phyla):
                fout.write('{}\t{}\t-1\t{}\tnormal\t1\t0\n'.format(
                    phylum, phylum, self.colors[index % num_colors]))
def run(self, genomes, align_dir, out_dir, prefix, debugopt=False):
    """Classify genomes based on position in reference tree.

    For each marker set ('bac120', 'ar122') whose user MSA file
    ``<prefix>.<marker_set>.user_msa.fasta`` exists in `align_dir`, the
    genomes are placed in the reference tree and classified either by
    FastANI (ANI at or above Config.FASTANI_SPECIES_THRESHOLD against the
    single reference genome sharing the clade) or by relative evolutionary
    divergence (RED) / tree topology.

    Files written to `out_dir` for each processed marker set:
      <prefix>.<marker_set>.classification.tsv
      <prefix>.<marker_set>.fastani_results.tsv
      <prefix>.<marker_set>.summary.tsv
      <prefix>.<marker_set>.red_dictionary.tsv
      <prefix>.<marker_set>.classification_pplacer.tsv
      <prefix>.<marker_set>.debug_file.tsv (only when `debugopt` is set)

    Parameters
    ----------
    genomes : object
        Passed unchanged to self._fastaniWorker.
        NOTE(review): presumably maps genome ids to genome files - confirm
        against the caller.
    align_dir : str
        Directory containing the user MSA files.
    out_dir : str
        Directory receiving all output files.
    prefix : str
        Prefix of all input and output file names.
    debugopt : bool
        When true, also write the per-genome debug table.

    Raises
    ------
    SystemExit
        With status -1 when any step fails; the underlying error is logged
        before exiting.
    """
    try:
        for marker_set_id in ('bac120', 'ar122'):
            user_msa_file = os.path.join(
                align_dir, prefix + '.%s.user_msa.fasta' % marker_set_id)
            if not os.path.exists(user_msa_file):
                # file will not exist if there are no User genomes
                # from a given domain
                continue

            classify_tree = self.place_genomes(user_msa_file,
                                               marker_set_id,
                                               out_dir,
                                               prefix)

            # get taxonomic classification of each user genome
            tree = dendropy.Tree.get_from_path(classify_tree,
                                               schema='newick',
                                               rooting='force-rooted',
                                               preserve_underscores=True)

            gtdb_taxonomy = Taxonomy().read(self.taxonomy_file)

            fout = open(
                os.path.join(
                    out_dir,
                    prefix + '.%s.classification.tsv' % marker_set_id), 'w')
            fastaniout = open(
                os.path.join(
                    out_dir,
                    prefix + '.%s.fastani_results.tsv' % marker_set_id), 'w')
            redfout = open(
                os.path.join(
                    out_dir,
                    prefix + '.%s.summary.tsv' % marker_set_id), 'w')
            if debugopt:
                parchiinfo = open(
                    os.path.join(
                        out_dir,
                        prefix + '.%s.debug_file.tsv' % marker_set_id), 'w')

            # median RED value expected for each rank of this domain
            marker_dict = {}
            if marker_set_id == 'bac120':
                marker_dict = Config.RED_DIST_BAC_DICT
            elif marker_set_id == 'ar122':
                marker_dict = Config.RED_DIST_ARC_DICT

            red_dictionary_file = os.path.join(
                out_dir, prefix + '.%s.red_dictionary.tsv' % marker_set_id)
            with open(red_dictionary_file, 'w') as reddictfile:
                reddictfile.write('Phylum\t{0}\n'.format(
                    marker_dict.get('p__')))
                reddictfile.write('Class\t{0}\n'.format(
                    marker_dict.get('c__')))
                reddictfile.write('Order\t{0}\n'.format(
                    marker_dict.get('o__')))
                reddictfile.write('Family\t{0}\n'.format(
                    marker_dict.get('f__')))
                reddictfile.write('Genus\t{0}\n'.format(
                    marker_dict.get('g__')))

            fastaniout.write("User genome\tReference genome\tANI\n")
            redfout.write(
                "user_genome\tclassification_method\tred_value\n")
            if debugopt:
                parchiinfo.write(
                    "User genome\tHigher rank\tHigher value\tLower rank\tLower value\tcase\tclosest_rank\n"
                )

            # Genomes can be classified by using ANI or RED values.
            # We go through all nodes of the tree and look at all the
            # leaves below each node. If a node spans exactly one
            # reference genome (GB, RS or UBA) together with user genomes,
            # the ANI between the user genomes and that reference genome
            # is calculated.
            analysed_nodes = set()
            fastani_dict = {}
            all_fastani_dict = {}
            fastani_list = []

            # some genomes of Case C are handled here, if the ANI
            # is close enough
            self.logger.info(
                'Calculating Average Nucleotide Identity using FastANI.')
            for nd in tree.preorder_node_iter():
                list_subnode = [
                    subnd.taxon.label.replace("'", '')
                    for subnd in nd.leaf_iter()
                ]
                # We store the prefix of each leaf to check if it
                # starts with GB_, RS_ or UBA
                list_subnode_initials = [
                    name[0:3] for name in list_subnode
                ]

                # if only one genome is a reference genome
                reference_count = (list_subnode_initials.count('RS_') +
                                   list_subnode_initials.count('GB_') +
                                   list_subnode_initials.count('UBA'))
                if (reference_count == 1 and
                        len(list_subnode_initials) > 1 and
                        list_subnode[0] not in analysed_nodes):
                    fastani_list.append(list_subnode)
                    analysed_nodes.update(list_subnode)

            manager = multiprocessing.Manager()
            out_q = manager.dict()
            procs = []
            nprocs = self.cpus

            if len(fastani_list) > 0:
                for item in splitchunks_list(fastani_list, nprocs):
                    p = multiprocessing.Process(
                        target=self._fastaniWorker,
                        args=(item, genomes, out_q))
                    procs.append(p)
                    p.start()

                # Wait for all worker processes to finish
                for p in procs:
                    p.join()
                    if p.exitcode == 1:
                        # surfaced through the handler at the end of
                        # this method
                        raise ValueError(
                            'A FastANI worker process terminated with an error.')

                # Collect all results into a single result dict
                all_fastani_dict = dict(out_q)

            for k, v in all_fastani_dict.items():
                fastaniout.write("{0}\t{1}\t{2}\n".format(
                    k, v.get("ref_genome"), v.get("ani")))
                if Config.FASTANI_SPECIES_THRESHOLD <= v.get("ani"):
                    suffixed_name = add_ncbi_prefix(v.get("ref_genome"))
                    taxa_str = ";".join(gtdb_taxonomy.get(suffixed_name))
                    if taxa_str.endswith("s__"):
                        taxa_str = taxa_str + v.get("ref_genome")
                    fout.write('%s\t%s\n' % (k, taxa_str))
                    fastani_dict[k] = v
                    redfout.write("{0}\tani\tNone\n".format(k))
            fastaniout.close()

            self.logger.info(
                '{0} genomes have been classify with FastANI.'.format(
                    len(fastani_dict)))

            scaled_tree = self._calculate_red_distances(
                classify_tree, out_dir)

            # genomes already classified with FastANI are not reprocessed
            user_genome_ids = set(read_fasta(user_msa_file).keys())
            user_genome_ids = user_genome_ids.difference(
                set(fastani_dict.keys()))

            # For all other cases we use the RED value of the node the user
            # genome is attached to. This RED value tells us the rank level
            # that can be associated with a User genome.
            # As an example, if the RED value is close to the order level,
            # the user genome will take the order level of the Reference
            # genome under the same parent node.
            # If there are multiple orders under the parent node, the user
            # genome is considered as a new order.
            for leaf in scaled_tree.leaf_node_iter():
                if leaf.taxon.label in user_genome_ids:
                    # In some cases, pplacer can associate 2 user genomes
                    # on the same parent node so we need to go up the tree
                    # to find a node with a reference genome as leaf.
                    cur_node = leaf.parent_node
                    list_subnode_initials = [
                        subnd.taxon.label.replace("'", '')[0:3]
                        for subnd in cur_node.leaf_iter()
                    ]
                    while ('RS_' not in list_subnode_initials and
                           'GB_' not in list_subnode_initials and
                           'UBA' not in list_subnode_initials):
                        cur_node = cur_node.parent_node
                        list_subnode_initials = [
                            subnd.taxon.label.replace("'", '')[0:3]
                            for subnd in cur_node.leaf_iter()
                        ]

                    current_rel_list = cur_node.rel_dist

                    # closest ancestor carrying a taxon label
                    parent_taxon_node = cur_node.parent_node
                    _support, parent_taxon, _aux_info = parse_label(
                        parent_taxon_node.label)
                    while parent_taxon_node is not None and not parent_taxon:
                        parent_taxon_node = parent_taxon_node.parent_node
                        _support, parent_taxon, _aux_info = parse_label(
                            parent_taxon_node.label)

                    parent_rank = parent_taxon.split(";")[-1][0:3]
                    parent_rel_dist = parent_taxon_node.rel_dist

                    # row of the debug table; columns after the genome id
                    # follow the header written to the debug file
                    genome_parent_child = [
                        leaf.taxon.label, parent_rank, parent_rel_dist,
                        '', '', '', ''
                    ]

                    child_taxons = []
                    closest_rank = None
                    detection = "RED"
                    # if the genome is placed between the genus and species
                    # ranks, it is associated with the genus
                    if parent_rank != 'g__':
                        child_rk = self.order_rank[
                            self.order_rank.index(parent_rank) + 1]
                        list_subnode = [
                            childnd.taxon.label.replace("'", '')
                            for childnd in cur_node.leaf_iter()
                            if (childnd.taxon.label.startswith('RS_') or
                                childnd.taxon.label.startswith('GB_'))
                        ]
                        list_ranks = [
                            gtdb_taxonomy.get(name)[
                                self.order_rank.index(child_rk)]
                            for name in list_subnode
                        ]
                        if len(set(list_ranks)) == 1:
                            # first internal node below cur_node labelled
                            # with the rank just below the parent rank
                            for subranknd in cur_node.preorder_iter():
                                _support, subranknd_taxon, _aux_info = parse_label(
                                    subranknd.label)
                                if (subranknd.is_internal() and
                                        subranknd_taxon is not None and
                                        subranknd_taxon.startswith(child_rk)):
                                    child_taxons = subranknd_taxon.split(";")
                                    child_taxon_node = subranknd
                                    child_rel_dist = child_taxon_node.rel_dist
                                    break
                        else:
                            # case 2a and 2b
                            closest_rank = parent_rank
                            detection = "Topology"
                    else:
                        # case 1a
                        closest_rank = parent_rank
                        detection = "Topology"

                    # case 1b
                    if len(child_taxons) == 0 and closest_rank is None:
                        list_leaves = [
                            childnd.taxon.label.replace("'", '')
                            for childnd in cur_node.leaf_iter()
                            if (childnd.taxon.label.startswith('RS_') or
                                childnd.taxon.label.startswith('GB_'))
                        ]
                        if len(list_leaves) != 1:
                            self.logger.error(
                                'There should be only one leaf.')
                            sys.exit(-1)
                        list_leaf_ranks = gtdb_taxonomy.get(list_leaves[0])[
                            self.order_rank.index(child_rk):-1]

                        # from the most specific rank up, pick the rank
                        # whose expected RED is closest to the observed one
                        for leaf_taxon in reversed(list_leaf_ranks):
                            if leaf_taxon == list_leaf_ranks[0]:
                                if (abs(current_rel_list -
                                        marker_dict.get(leaf_taxon[:3])) <
                                        abs((current_rel_list) -
                                            marker_dict.get(parent_rank))):
                                    closest_rank = leaf_taxon[:3]
                                    genome_parent_child[3] = leaf_taxon
                                    genome_parent_child[5] = 'case 1b - III'
                                    break
                            else:
                                pchildrank = list_leaf_ranks[
                                    list_leaf_ranks.index(leaf_taxon) - 1]
                                if (abs(current_rel_list -
                                        marker_dict.get(leaf_taxon[:3])) <
                                        abs(current_rel_list -
                                            marker_dict.get(pchildrank[:3]))):
                                    closest_rank = leaf_taxon[:3]
                                    genome_parent_child[1] = pchildrank
                                    genome_parent_child[2] = 1.0
                                    genome_parent_child[3] = leaf_taxon
                                    genome_parent_child[5] = 'case 1b - II'
                                    break
                        if closest_rank is None:
                            closest_rank = parent_rank
                            genome_parent_child[3] = list_leaf_ranks[0]
                            genome_parent_child[5] = 'case 1b - IV'

                    # if there are multiple ranks on the child node (i.e.
                    # genome between p__Nitrospirae and
                    # c__Nitrospiria;o__Nitrospirales;f__Nitropiraceae)
                    # we loop through the list of ranks from f_ to c_ rank
                    for child_taxon in reversed(child_taxons):
                        # if lower rank is c__Nitropiria
                        if child_taxon == child_taxons[0]:
                            if (abs(current_rel_list -
                                    marker_dict.get(child_taxon[:3])) <
                                    abs(child_rel_dist -
                                        marker_dict.get(child_taxon[:3])) and
                                    abs(current_rel_list -
                                        marker_dict.get(child_taxon[:3])) <
                                    abs(current_rel_list -
                                        marker_dict.get(parent_rank))):
                                genome_parent_child[3] = ';'.join(
                                    child_taxons)
                                genome_parent_child[4] = child_rel_dist
                                genome_parent_child[5] = 'case 3b - II'
                                closest_rank = child_taxon[:3]
                            elif closest_rank is None:
                                closest_rank = parent_rank
                                genome_parent_child[3] = ';'.join(
                                    child_taxons)
                                genome_parent_child[4] = child_rel_dist
                                genome_parent_child[5] = 'case 3b - III'
                        else:
                            pchildrank = child_taxons[
                                child_taxons.index(child_taxon) - 1]
                            if (abs(current_rel_list -
                                    marker_dict.get(child_taxon[:3])) <
                                    abs(current_rel_list -
                                        marker_dict.get(pchildrank[:3])) and
                                    abs(current_rel_list -
                                        marker_dict.get(child_taxon[:3])) <
                                    abs(child_rel_dist -
                                        marker_dict.get(child_taxon[:3]))):
                                # NOTE(review): every other branch stores
                                # the 3-character rank prefix; this one
                                # stores the full taxon name - confirm
                                # against _get_redtax that this is intended.
                                closest_rank = child_taxon
                                genome_parent_child[3] = ';'.join(
                                    child_taxons)
                                genome_parent_child[4] = child_rel_dist
                                genome_parent_child[5] = 'case 3b - I'
                                break

                    if closest_rank is None:
                        print("IT SHOULDN'T HAPPEN!!!")

                    genome_parent_child[6] = closest_rank

                    list_subnode = [
                        subnd.taxon.label.replace("'", '')
                        for subnd in cur_node.leaf_iter()
                    ]
                    red_taxonomy = self._get_redtax(
                        list_subnode, closest_rank, gtdb_taxonomy)

                    fout.write('{0}\t{1}\n'.format(leaf.taxon.label,
                                                   red_taxonomy))
                    # the genome id is written separately below
                    del genome_parent_child[0]
                    redfout.write("{0}\t{1}\t{2}\n".format(
                        leaf.taxon.label, detection, current_rel_list))
                    if debugopt:
                        parchiinfo.write('{0}\t{1}\t{2}\t{3}\n'.format(
                            leaf.taxon.label, current_rel_list,
                            '\t'.join(str(x) for x in genome_parent_child),
                            detection))

            redfout.close()
            fout.close()
            if debugopt:
                parchiinfo.close()

            # We get the pplacer taxonomy for comparison
            pplacer_file = os.path.join(
                out_dir,
                prefix + '.%s.classification_pplacer.tsv' % marker_set_id)
            user_genome_ids = set(read_fasta(user_msa_file).keys())
            with open(pplacer_file, 'w') as pplaceout:
                for leaf in tree.leaf_node_iter():
                    if leaf.taxon.label in user_genome_ids:
                        # collect taxon labels from the leaf up to (but
                        # excluding) the root, then restore
                        # domain-to-species order
                        taxa = []
                        cur_node = leaf
                        while cur_node.parent_node:
                            _support, taxon, _aux_info = parse_label(
                                cur_node.label)
                            if taxon:
                                for t in taxon.split(';')[::-1]:
                                    taxa.append(t.strip())
                            cur_node = cur_node.parent_node
                        taxa_str = ';'.join(taxa[::-1])
                        pplaceout.write('%s\t%s\n' % (leaf.taxon.label,
                                                      taxa_str))
    except Exception as error:
        # Top-level boundary of the classify step: report what actually
        # went wrong before exiting instead of discarding the exception.
        self.logger.error('%s: %s' % (type(error).__name__, error))
        print("GTDB-Tk has stopped before finishing")
        sys.exit(-1)
def check_tree(self, options):
    """Validate taxonomy of decorated tree and check for polyphyletic groups.

    The taxonomy is read from `options.taxonomy_file` when one is given,
    otherwise from the labels of `options.decorated_tree`. Any polyphyletic
    groups found are printed to standard output.
    """
    check_file_exists(options.decorated_tree)

    # validate taxonomy
    taxonomy = Taxonomy()
    if options.taxonomy_file:
        genome_taxonomy = taxonomy.read(options.taxonomy_file)
    else:
        genome_taxonomy = taxonomy.read_from_tree(options.decorated_tree)

    taxonomy.validate(genome_taxonomy,
                      check_prefixes=True,
                      check_ranks=True,
                      check_hierarchy=True,
                      check_species=True,
                      check_group_names=True,
                      check_duplicate_names=True,
                      report_errors=True)

    # check for polyphyletic groups
    polyphyletic_groups = set()
    tree = dendropy.Tree.get_from_path(options.decorated_tree,
                                       schema='newick',
                                       rooting="force-rooted",
                                       preserve_underscores=True)

    if options.taxonomy_file:
        # restrict the taxonomy to genomes present in the tree and
        # remember the Taxon object behind each leaf label
        reduced_taxonomy = {}
        taxon_map = {}
        for leaf_node in tree.leaf_node_iter():
            reduced_taxonomy[leaf_node.taxon.label] = genome_taxonomy[
                leaf_node.taxon.label]
            taxon_map[leaf_node.taxon.label] = leaf_node.taxon

        # a group whose MRCA spans more leaves than the group has
        # members is reported as polyphyletic
        for rank_label in Taxonomy.rank_labels[1:]:
            extant_taxa = taxonomy.extant_taxa_for_rank(rank_label,
                                                        reduced_taxonomy)
            for group, member_ids in extant_taxa.items():
                members = [taxon_map[gid] for gid in member_ids]
                mrca = tree.mrca(taxa=members)
                leaves_under_mrca = sum(1 for _ in mrca.leaf_iter())
                if leaves_under_mrca != len(member_ids):
                    polyphyletic_groups.add(group)
    else:
        # without a taxonomy file, a taxon label appearing on more than
        # one internal node is reported as polyphyletic
        seen_taxa = set()
        for node in tree.preorder_node_iter(lambda n: not n.is_leaf()):
            _support, taxon_label, _aux_info = parse_label(node.label)
            if not taxon_label:
                continue

            for raw_name in taxon_label.split(';'):
                name = raw_name.strip()
                if name in seen_taxa:
                    polyphyletic_groups.add(name)
                seen_taxa.add(name)

    if len(polyphyletic_groups):
        print('')
        print('Tree contains polyphyletic groups:')
        for taxon in polyphyletic_groups:
            print('%s' % (taxon))

    self.logger.info('Finished performing validation tests.')
def _resolve_ambiguous_placements(self, tree, fmeasure_for_taxa, median_rank_rd):
    """Resolve ambiguous taxon label placements using median relative divergences.

    For every taxon with more than one candidate node, a single node is
    selected, the taxon is added to that node's label, and the taxon's
    entry in `fmeasure_for_taxa` is reduced to the selected candidate.

    Parameters
    ----------
    tree : Tree
      Dendropy tree (not read by this method).
    fmeasure_for_taxa : d[taxon] -> [(Node, F-measure, precision, recall)]
      Node with highest F-measure for each taxon. Modified in place.
    median_rank_rd : d[rank_index] -> float
      Median relative divergence for each taxonomic rank.

    Raises
    ------
    SystemExit
      With a non-zero status if no candidate is acceptable and the last
      candidate is not a leaf node.
    """

    # For ambiguous nodes place them closest to median for rank
    # and within an accepted relative divergence distance. Taxon labels
    # are placed in reverse taxonomic order (species to domain) and
    # this ordering is used to ensure taxonomic consistency.
    for taxon in Taxonomy().sort_taxa(fmeasure_for_taxa.keys(), reverse=True):
        candidates = fmeasure_for_taxa[taxon]
        if len(candidates) == 1:
            continue

        rank_prefix = taxon[0:3]
        rank_index = Taxonomy.rank_prefixes.index(rank_prefix)
        rd = median_rank_rd[rank_index]

        # Find node closest to median distance, but making sure
        # taxon is not placed below a more specific taxon label.
        # The 'fmeasure_for_taxa' stores node information in preorder.
        closest_index = None
        closest_dist = 1e9
        closest_node = None
        for i, d in enumerate(candidates):
            cur_node = d[0]

            # rank of the most specific label already on this node;
            # -1 when the node is unlabelled
            cur_rank_index = -1
            _support, cur_taxon, _aux_info = parse_label(cur_node.label)
            if cur_taxon:
                cur_prefix = cur_taxon.split(';')[-1][0:3]
                cur_rank_index = Taxonomy.rank_prefixes.index(cur_prefix)

            if cur_rank_index > rank_index:
                # reached a node with a more specific label so
                # label should be appended to this node or
                # placed above it
                if closest_index is None:
                    closest_index = i
                    closest_node = cur_node
                break

            rd_diff = abs(rd - cur_node.rel_dist)
            if rd_diff < 0.1 and rd_diff < closest_dist:
                closest_dist = rd_diff
                closest_index = i
                closest_node = cur_node

        if closest_index is None:
            # no node is within an acceptable relative divergence distance
            # for this label so it should be placed at the most extant node
            # which should be a leaf node
            closest_index = len(candidates) - 1
            closest_node = candidates[closest_index][0]
            if not closest_node.is_leaf():
                self.logger.error('Leaf node expected!')
                # exit with a failure status: a bare sys.exit() would
                # report success to the calling process
                sys.exit(-1)

        # add label to node
        support, cur_taxon, aux_info = parse_label(closest_node.label)
        if not cur_taxon:
            taxa_str = taxon
        else:
            taxa = cur_taxon.split(';') + [taxon]
            taxa_str = ';'.join(Taxonomy().sort_taxa(taxa))

        closest_node.label = create_label(support, taxa_str, aux_info)

        # remove other potential node assignments
        fmeasure_for_taxa[taxon] = [candidates[closest_index]]