Beispiel #1
0
    def _shared_support(self, tree1, tree2, min_support, max_depth):
        """Count supported internal splits of tree1 that also occur in tree2.

        Parameters
        ----------
        tree1 : Tree
          Tree whose internal nodes (excluding the parentless node) are examined.
        tree2 : Tree
          Tree searched for the same bipartitions; it must use the very
          same taxon namespace object as tree1.
        min_support : float
          Minimum support a node with a support value needs to be counted.
        max_depth : float
          Largest 'rel_dist' a node with a support value may have.

        Returns
        -------
        tuple
          (number of shared supported splits, summed edge lengths of
          those splits taken from both trees).
        """

        assert tree1.taxon_namespace is tree2.taxon_namespace

        def passes_filter(node_support, node):
            # nodes without a support value are always accepted
            return node_support is None or (node_support >= min_support
                                            and node.rel_dist <= max_depth)

        shared_count = 0
        shared_length = 0
        saw_missing_support = False
        for node1 in tree1.preorder_node_iter(lambda nd: not nd.is_leaf()):
            if not node1.parent_node:
                # node without a parent carries no split of its own
                continue

            support1, _label1, _aux1 = parse_label(node1.label)
            if support1 is None:
                saw_missing_support = True

            if not passes_filter(support1, node1):
                continue
            if node1.bipartition not in tree2.bipartition_encoding:
                continue

            # the same split exists in tree2; apply the filter there as well
            edge2 = tree2.bipartition_edge_map[node1.bipartition]
            node2 = edge2.head_node
            support2, _label2, _aux2 = parse_label(node2.label)
            if passes_filter(support2, node2):
                shared_count += 1
                shared_length += node1.edge.length + edge2.length

        if saw_missing_support:
            self.logger.warning(
                'Some internal nodes lack support values and were treated as supported.'
            )

        return shared_count, shared_length
Beispiel #2
0
 def _check_fractional_bootstraps(self, tree):
     """Rescale support values from the [0, 1] range to [0, 100].

     The tree is left untouched as soon as any node has a support value
     above 1. Otherwise every support value is multiplied by 100 and
     rounded to an integer, and the node label is rebuilt.

     Parameters
     ----------
     tree : Tree
       Tree whose node labels are inspected and possibly rewritten.
     """

     # a single value above 1 means supports are not fractional
     for node in tree.preorder_node_iter():
         support = parse_label(node.label)[0]
         if support is not None and support > 1.0:
             return

     for node in tree.preorder_node_iter():
         support, taxon, aux_info = parse_label(node.label)
         if support is None:
             continue
         # adding 0.5 before truncation rounds to the nearest integer
         node.label = create_label(int(support*100 + 0.5), taxon, aux_info)
Beispiel #3
0
    def _reroot(self, tree, outgroup_node, max_support=100):
        """Reroot tree taking proper care of bootstrap values.

        Support values are read from node labels and recorded per
        bipartition before the tree is reseeded, then written back to the
        node that carries each bipartition afterwards. The tree is modified
        in place.

        Parameters
        ----------
        tree : Tree
          Tree to reroot (presumably a Dendropy Tree, as elsewhere in this file).
        outgroup_node : Node
          Node on whose incoming edge the new root is placed.
        max_support : int
          Support recorded for leaf bipartitions and, when the outgroup is
          a leaf, assigned to the non-leaf children of the new root.

        Returns
        -------
        Tree
          The tree object that was passed in.
        """

        # determine support values for each bipartition
        tree.encode_bipartitions()
        support_values = {}
        for nd in tree:
            support, taxon, aux_info = parse_label(nd.label)
            if nd.is_leaf():
                support_values[nd.bipartition] = max_support
            else:
                if support is not None:
                    support_values[nd.bipartition] = float(support)
                else:
                    support_values[nd.bipartition] = None

        # move support values for desired re-rooting
        new_root = outgroup_node.parent_node
        tree.reseed_at(new_root)
        tree.encode_bipartitions()
        for nd in tree:
            _, taxon, aux_info = parse_label(nd.label)
            # bipartitions absent from the original encoding get the
            # placeholder string "not_specified" as their support
            nd.label = create_label(
                support_values.get(nd.bipartition, "not_specified"), taxon,
                aux_info)
        tree.seed_node.edge.length = None

        # do a hard re-rooting of the tree
        # (this invalidates the previous bipartitions, so must be done separately)
        tree.is_rooted = True
        # the outgroup edge is split evenly on either side of the new root
        tree.reroot_at_edge(outgroup_node.edge,
                            length1=0.5 * outgroup_node.edge_length,
                            length2=0.5 * outgroup_node.edge_length)

        # determine bootstrap for new node
        for child in tree.seed_node.child_node_iter():
            if outgroup_node.is_leaf():
                # leaf outgroup: non-leaf children of the root get max_support
                if not child.is_leaf():
                    support, taxon, aux_info = parse_label(child.label)
                    child.label = create_label(max_support, taxon, aux_info)
            else:
                # internal outgroup: the other children of the root take
                # the support value found on the outgroup node
                if child != outgroup_node:
                    support, _taxon, _aux_info = parse_label(
                        outgroup_node.label)
                    _support, taxon, aux_info = parse_label(child.label)
                    child.label = create_label(support, taxon, aux_info)

        return tree
Beispiel #4
0
    def pull(self, options):
        """Create taxonomy file from a decorated tree.

        Parameters
        ----------
        options : object
          Must provide 'input_tree' (path of the Newick tree),
          'output_taxonomy' (path of the file to write) and
          'no_validation' (when true the tree is parsed here directly
          instead of through Taxonomy().read_from_tree()).
        """

        check_file_exists(options.input_tree)

        if options.no_validation:
            tree = dendropy.Tree.get_from_path(options.input_tree,
                                               schema='newick',
                                               rooting="force-rooted",
                                               preserve_underscores=True)

            taxonomy = {}
            for leaf in tree.leaf_node_iter():
                # gather taxa while walking from the leaf's parent to the root
                taxa = []
                node = leaf.parent_node
                while node:
                    _support, taxon, _aux_info = parse_label(node.label)
                    if taxon:
                        # a label may hold several ';'-separated taxa; add
                        # them last-to-first so the final reversal restores
                        # their order within the label
                        taxa.extend(t.strip()
                                    for t in reversed(taxon.split(';')))
                    node = node.parent_node

                # flip into root-to-leaf order
                taxonomy[leaf.taxon.label] = taxa[::-1]
        else:
            taxonomy = Taxonomy().read_from_tree(options.input_tree)

        Taxonomy().write(taxonomy, options.output_taxonomy)

        # this method writes a taxonomy file, not a stripped tree
        self.logger.info('Taxonomy written to: %s' %
                         options.output_taxonomy)
Beispiel #5
0
    def _leaf_taxa(self, leaf):
        """Get taxonomic information for leaf node.

        Taxon labels are collected from the leaf and all of its ancestors.
        Ranks after the last taxon found are filled with the bare rank
        prefixes from Taxonomy.rank_prefixes; when no taxon is found at all,
        every rank is filled with its bare prefix.

        Parameters
        ----------
        leaf : Node
          Node in tree.

        Returns
        -------
        list
          Taxa for leaf in rank order.
        """

        leaf_taxa = []

        parent = leaf
        while parent:
            _support, taxon, _aux_info = parse_label(parent.label)

            if taxon:
                # labels may hold several ';'-separated taxa; add them
                # last-to-first so the reversal below restores their order
                leaf_taxa.extend(t.strip()
                                 for t in reversed(taxon.split(';')))

            parent = parent.parent_node

        ordered_taxa = leaf_taxa[::-1]

        # fill in missing ranks
        if ordered_taxa:
            last_rank = ordered_taxa[-1][0:3]
            first_missing = Taxonomy.rank_prefixes.index(last_rank) + 1
        else:
            # no taxon on the path to the root: every rank is missing
            first_missing = 0
        ordered_taxa.extend(Taxonomy.rank_prefixes[first_missing:])

        return ordered_taxa
Beispiel #6
0
    def _assign_taxon_labels(self, fmeasure_for_taxa):
        """Write taxon names onto nodes that are their only candidate.

        Taxa with more than one candidate node are skipped. A taxon is
        appended to any taxon label already on the node, separated by ';'.

        Parameters
        ----------
        fmeasure_for_taxa : d[taxon] -> [(Node, F-measure, precision, recall), ...]
          Candidate nodes for each taxon.

        Returns
        -------
        set
            Taxon labels placed in tree.
        """

        placed_taxon = set()
        for taxon in Taxonomy().sort_taxa(fmeasure_for_taxa.keys()):
            candidates = fmeasure_for_taxa[taxon]
            if len(candidates) != 1:
                continue

            placed_taxon.add(taxon)
            node, _fmeasure, _precision, _recall = candidates[0]

            support, existing_label, aux_info = parse_label(node.label)
            new_label = existing_label + ';' + taxon if existing_label else taxon
            node.label = create_label(support, new_label, aux_info)

        return placed_taxon
Beispiel #7
0
    def _get_phyla_lineages(self, tree):
        """Collect phylum names from labelled internal nodes.

        A node contributes when the last ';'-separated entry of its taxon
        label starts with 'p__'.

        Parameters
        ----------
        tree : Dendropy Tree
            Phylogenetic tree.

        Returns
        -------
        list
            Phylum names in preorder of the nodes carrying them.
        """
        phyla = []
        for node in tree.preorder_node_iter():
            if node.label and not node.is_leaf():
                _support, taxon_name, _aux_info = parse_label(node.label)
                if taxon_name:
                    last_taxon = taxon_name.split(';')[-1].strip()
                    if last_taxon.startswith('p__'):
                        phyla.append(last_taxon)

        return phyla
Beispiel #8
0
    def pull(self, options):
        """Create taxonomy file from a decorated tree.

        Parameters
        ----------
        options : object
          Must provide 'input_tree' (path of the Newick tree),
          'output_taxonomy' (path of the file to write) and
          'no_validation' (when true the tree is parsed here directly
          instead of through Taxonomy().read_from_tree()).
        """

        check_file_exists(options.input_tree)

        if options.no_validation:
            tree = dendropy.Tree.get_from_path(options.input_tree,
                                                schema='newick',
                                                rooting="force-rooted",
                                                preserve_underscores=True)

            taxonomy = {}
            for leaf in tree.leaf_node_iter():
                taxon_id = leaf.taxon.label

                # gather taxa while walking from the leaf's parent to the root
                node = leaf.parent_node
                taxa = []
                while node:
                    _support, taxon, _aux_info = parse_label(node.label)
                    if taxon:
                        # a comprehension is used because map() returns a
                        # non-subscriptable iterator on Python 3
                        stripped = [t.strip() for t in taxon.split(';')]
                        taxa.extend(reversed(stripped))
                    node = node.parent_node

                # flip into root-to-leaf order
                taxonomy[taxon_id] = taxa[::-1]
        else:
            taxonomy = Taxonomy().read_from_tree(options.input_tree)

        Taxonomy().write(taxonomy, options.output_taxonomy)

        # this method writes a taxonomy file, not a stripped tree
        self.logger.info('Taxonomy written to: %s' % options.output_taxonomy)
Beispiel #9
0
    def _assign_taxon_labels(self, fmeasure_for_taxa):
        """Label nodes with taxa that have exactly one candidate node.

        Taxa with several candidate nodes are left alone. When the node
        already has a taxon label, the new taxon is appended after '; '.

        Parameters
        ----------
        fmeasure_for_taxa : d[taxon] -> [(Node, F-measure, precision, recall), ...]
          Candidate nodes for each taxon.

        Returns
        -------
        set
            Taxon labels placed in tree.
        """

        placed_taxon = set()
        sorted_taxa = Taxonomy().sort_taxa(list(fmeasure_for_taxa))
        for taxon in sorted_taxa:
            entries = fmeasure_for_taxa[taxon]
            if len(entries) == 1:
                placed_taxon.add(taxon)
                node, _fmeasure, _precision, _recall = entries[0]

                support, current_label, aux_info = parse_label(node.label)
                if not current_label:
                    combined_label = taxon
                else:
                    combined_label = current_label + '; ' + taxon
                node.label = create_label(support, combined_label, aux_info)

        return placed_taxon
Beispiel #10
0
 def _write_taxonomy(self, tree, out_taxonomy):
     """Write taxonomy decorated on tree to file.

     One line is written per leaf: the leaf's taxon label, a tab, and the
     ';'-joined taxa gathered from the leaf and its ancestors after
     Taxonomy().fill_missing_ranks() has been applied.

     Parameters
     ----------
     tree : Tree
       Dendropy Tree.
     out_taxonomy : str
       Output file.
     """

     # the context manager closes the file even if a lookup below raises
     with open(out_taxonomy, 'w') as fout:
         for leaf in tree.leaf_node_iter():
             leaf_taxa = []

             parent = leaf
             while parent:
                 _support, taxon, _aux_info = parse_label(parent.label)

                 if taxon:
                     # labels may be joined with '; ', so strip each taxon;
                     # add them last-to-first so the reversal below
                     # restores their order within the label
                     leaf_taxa.extend(t.strip()
                                      for t in reversed(taxon.split(';')))

                 parent = parent.parent_node

             ordered_taxa = leaf_taxa[::-1]
             filled_ordered_taxa = Taxonomy().fill_missing_ranks(ordered_taxa)

             fout.write('%s\t%s\n' % (leaf.taxon.label, ';'.join(filled_ordered_taxa)))
Beispiel #11
0
    def _supported(self, ref_tree, compare_tree, min_support, max_depth):
        """Classify supported reference splits by presence in the comparison tree.

        Every internal node of ref_tree that has a parent counts as a
        non-trivial split. Of these, nodes without a support value, or with
        support >= min_support and rel_dist <= max_depth, are classified as
        congruent (bipartition present in compare_tree) or incongruent.

        Returns
        -------
        tuple
          (congruent, congruent_w, incongruent, incongruent_w,
          nontrivial_splits, nontrivial_splits_w, congruent_splits,
          incongruent_splits). The '_w' values are summed edge lengths,
          and the two dictionaries map 'leafA|leafB' keys to
          (edge length, support).
        """

        congruent = 0
        congruent_w = 0
        incongruent = 0
        incongruent_w = 0
        nontrivial_splits = 0
        nontrivial_splits_w = 0
        congruent_splits = {}
        incongruent_splits = {}

        for node in ref_tree.preorder_node_iter(lambda nd: not nd.is_leaf()):
            if not node.parent_node:
                continue

            nontrivial_splits += 1
            nontrivial_splits_w += node.edge.length

            support, _label, _aux_info = parse_label(node.label)
            if support is not None and not (support >= min_support
                                            and node.rel_dist <= max_depth):
                continue

            # key the split by the first leaf under each of its first two children
            children = node.child_nodes()
            split_lca = '|'.join(children[i].leaf_nodes()[0].taxon.label
                                 for i in (0, 1))

            split_info = (node.edge.length, support)
            if node.bipartition in compare_tree.bipartition_encoding:
                congruent += 1
                congruent_w += node.edge.length
                congruent_splits[split_lca] = split_info
            else:
                incongruent += 1
                incongruent_w += node.edge.length
                incongruent_splits[split_lca] = split_info

        return (congruent, congruent_w, incongruent, incongruent_w,
                nontrivial_splits, nontrivial_splits_w, congruent_splits,
                incongruent_splits)
Beispiel #12
0
    def _reroot(self, tree, outgroup_node, max_support=100):
        """Move the root onto the outgroup edge while keeping support values.

        Support is recorded per bipartition before the tree is reseeded and
        then written back to the node carrying that bipartition. The tree
        is modified in place and also returned.

        Parameters
        ----------
        tree : Tree
          Tree to reroot.
        outgroup_node : Node
          Node on whose incoming edge the new root is placed.
        max_support : int
          Support used for leaf bipartitions and, when the outgroup is a
          leaf, for the non-leaf children of the new root.
        """

        # remember the support attached to every bipartition
        tree.encode_bipartitions()
        support_by_split = {}
        for node in tree:
            node_support, _taxon, _aux_info = parse_label(node.label)
            if node.is_leaf():
                split_support = max_support
            elif node_support is not None:
                split_support = float(node_support)
            else:
                split_support = None
            support_by_split[node.bipartition] = split_support

        # reseed at the outgroup's parent and relabel nodes by bipartition
        tree.reseed_at(outgroup_node.parent_node)
        tree.encode_bipartitions()
        for node in tree:
            _old_support, taxon, aux_info = parse_label(node.label)
            moved_support = support_by_split.get(node.bipartition,
                                                 "not_specified")
            node.label = create_label(moved_support, taxon, aux_info)
        tree.seed_node.edge.length = None

        # hard re-rooting; kept as a separate step because it invalidates
        # the bipartitions encoded above
        tree.is_rooted = True
        half_length = 0.5 * outgroup_node.edge_length
        tree.reroot_at_edge(outgroup_node.edge,
                            length1=half_length,
                            length2=half_length)

        # set the support on the children of the new root
        for child in tree.seed_node.child_node_iter():
            if outgroup_node.is_leaf():
                if child.is_leaf():
                    continue
                _support, taxon, aux_info = parse_label(child.label)
                child.label = create_label(max_support, taxon, aux_info)
            elif child != outgroup_node:
                outgroup_support, _taxon, _aux_info = parse_label(
                    outgroup_node.label)
                _support, taxon, aux_info = parse_label(child.label)
                child.label = create_label(outgroup_support, taxon, aux_info)

        return tree
Beispiel #13
0
def bootstrap_support(input_tree, replicate_trees, output_tree):
    """Annotate a tree with support derived from replicate trees.

    The input tree and all replicates are read as unrooted Newick trees
    over one shared taxon namespace. Support is written into the label of
    every internal node and the result is saved to a new file.

    Parameters
    ----------
    input_tree : str
      Tree inferred from complete data.
    replicate_trees : iterable
      Files containing replicate trees.
    output_tree: str
      Name of output tree with support values.
    """

    import dendropy

    # reading options shared by the reference tree and its replicates
    read_options = {
        'schema': 'newick',
        'rooting': "force-unrooted",
        'preserve_underscores': True,
    }

    orig_tree = dendropy.Tree.get_from_path(input_tree, **read_options)
    orig_tree.bipartitions = True
    orig_tree.encode_bipartitions()

    namespace = orig_tree.taxon_namespace
    rep_trees = dendropy.TreeArray(taxon_namespace=namespace,
                                   is_rooted_trees=False,
                                   ignore_edge_lengths=True,
                                   ignore_node_ages=True,
                                   use_tree_weights=False)
    rep_trees.read_from_files(files=replicate_trees,
                              taxon_namespace=namespace,
                              **read_options)

    # stores the support of each split as a percentage on 'node.support'
    rep_trees.summarize_splits_on_tree(orig_tree,
                                       is_bipartitions_updated=True,
                                       add_support_as_node_attribute=True,
                                       support_as_percentages=True)

    for node in orig_tree.internal_nodes():
        if not node.label:
            # unlabelled node: the label becomes the integer support
            node.label = str(int(node.support))
            continue

        # keep any taxon and auxiliary information already on the label
        _old_support, taxon, aux_info = parse_label(node.label)
        node.label = create_label(node.support, taxon, aux_info)

    orig_tree.write_to_path(output_tree,
                            schema='newick',
                            suppress_rooting=True,
                            unquoted_underscores=True)
Beispiel #14
0
 def _strip_taxon_labels(self, tree):
     """Remove any previous taxon labels.

     Every internal node that has a support value is relabelled with that
     support value only.

     Parameters
     ----------
     tree : Tree
       Dendropy Tree.
     """

     for node in tree.internal_nodes():
         support, _taxon, _aux_info = parse_label(node.label)
         # compare against None: a support of 0 is a real value and its
         # node must be stripped as well
         if support is not None:
             node.label = create_label(support, None, None)
         # NOTE(review): nodes without a support value keep their label
         # unchanged, including any taxon it holds - confirm this is intended
Beispiel #15
0
    def _strip_taxon_labels(self, tree):
        """Remove any previous taxon labels.

        Every internal node that has a support value is relabelled with
        that support value only.

        Parameters
        ----------
        tree : Tree
          Dendropy Tree.
        """

        for node in tree.internal_nodes():
            support, _taxon, _aux_info = parse_label(node.label)
            # compare against None: a support of 0 is a real value and its
            # node must be stripped as well
            if support is not None:
                node.label = create_label(support, None, None)
            # NOTE(review): nodes without a support value keep their label
            # unchanged, including any taxon it holds - confirm this is intended
Beispiel #16
0
 def report_missing_splits(self, ref_tree, compare_tree, min_support, taxa_list):
     """Report supported bipartitions in reference tree not in comparison tree.

     For each missing split, prints the node's taxon label and edge length
     when a label exists, otherwise the comma-separated labels of the
     leaves below the node. Ends with the total number of missing splits.

     Parameters
     ----------
     ref_tree, compare_tree, taxa_list
       Passed unchanged to self._read_trees(), which returns the two trees
       that are compared.
     min_support : float
       Minimum support an internal node needs to be reported.
     """

     ref_tree, compare_tree = self._read_trees(ref_tree, compare_tree, taxa_list)

     incongruent = 0
     # print(...) with a single argument behaves the same on Python 2 and 3
     print('Missing splits with support >= %f:' % min_support)
     for n in ref_tree.preorder_node_iter(lambda n: not n.is_leaf()):
         support, label, _aux_info = parse_label(n.label)

         # nodes without a support value cannot meet the threshold;
         # comparing None with a number raises TypeError on Python 3
         if support is None or not (support >= min_support):
             continue

         if n.bipartition in compare_tree.bipartition_encoding:
             continue

         incongruent += 1
         if label:
             print('%s %s' % (label, n.edge.length))
         else:
             print(','.join(t.taxon.label for t in n.leaf_iter()))

     print('Missing splits: %d' % incongruent)
Beispiel #17
0
    def rel_dist_to_named_clades(self, tree):
        """Collect the relative divergence of every named internal node.

        Parameters
        ----------
        tree : Dendropy Tree
            Phylogenetic tree.

        Returns
        -------
        dict : d[rank_index][taxon] -> relative divergence
        """

        # makes 'rel_dist' available on the nodes read below
        self.decorate_rel_dist(tree)

        rel_dists = defaultdict(dict)
        for node in tree.preorder_node_iter(lambda n: n != tree.seed_node):
            if node.label and not node.is_leaf():
                _support, name, _aux_info = parse_label(node.label)
                if not name:
                    # label holds no taxon name
                    continue

                # a label naming several ranks is filed under its last entry
                if ';' in name:
                    name = name.rpartition(';')[2].strip()

                rank = Taxonomy.rank_index[name[0:3]]
                rel_dists[rank][name] = node.rel_dist

        return rel_dists
Beispiel #18
0
    def _resolve_ambiguous_placements(self,
                                      fmeasure_for_taxa,
                                      median_rank_rd,
                                      max_rd_diff=0.1):
        """Resolve ambiguous taxon label placements using median relative divergences.

        For every taxon with more than one candidate node, a single node is
        chosen, the taxon is added to that node's label, and the taxon's
        entry in 'fmeasure_for_taxa' is reduced to the chosen candidate.
        Both the node labels and 'fmeasure_for_taxa' are modified in place;
        nothing is returned.

        Parameters
        ----------
        fmeasure_for_taxa : d[taxon] -> [(Node, F-measure, precision, recall)]
          Node with highest F-measure for each taxon.
        median_rank_rd : d[rank_index] -> float
          Median relative divergence for each taxonomic rank.
        max_rd_diff : float
          Maximum difference in relative divergence for assigning a taxonomic label.
        """

        # For ambiguous nodes place them closest to median for rank
        # and within accepted relative divergence distance. Taxon labels
        # are placed in reverse taxonomic order (species to domain) and
        # this ordering used to ensure taxonomic consistency.
        for taxon in Taxonomy().sort_taxa(list(fmeasure_for_taxa.keys()),
                                          reverse=True):
            # a single candidate is not ambiguous
            if len(fmeasure_for_taxa[taxon]) == 1:
                continue

            # the first three characters of a taxon are its rank prefix
            rank_prefix = taxon[0:3]
            rank_index = Taxonomy.rank_prefixes.index(rank_prefix)
            rd = median_rank_rd[rank_index]

            # Find node closest to median distance, but making sure
            # taxon is not placed below a more specific taxon label.
            # The 'fmeasure_for_taxa' stores node information in preorder.
            closest_index = None
            closest_dist = 1e9
            closest_node = None
            for i, d in enumerate(fmeasure_for_taxa[taxon]):
                cur_node = d[0]

                # rank of the last taxon already on this node; -1 when the
                # node carries no taxon label
                cur_rank_index = -1
                _support, cur_taxon, _aux_info = parse_label(cur_node.label)
                if cur_taxon:
                    cur_prefix = cur_taxon.split(';')[-1].strip()[0:3]
                    cur_rank_index = Taxonomy.rank_prefixes.index(cur_prefix)

                if cur_rank_index > rank_index:
                    # reached a node with a more specific label so
                    # label should be appended to this node or
                    # placed above it
                    if closest_index is None:
                        closest_index = i
                        closest_node = cur_node
                    break

                # candidates too far from the rank's median are ignored
                rd_diff = abs(rd - cur_node.rel_dist)
                if rd_diff > max_rd_diff:
                    continue

                if rd_diff < closest_dist:
                    closest_dist = rd_diff
                    closest_index = i
                    closest_node = cur_node

            if closest_index is None:
                # no node is within an acceptable relative divergence distance
                # for this label so it should be placed at the most extant node
                # in order to be conservative
                closest_index = len(fmeasure_for_taxa[taxon]) - 1
                closest_node = fmeasure_for_taxa[taxon][closest_index][0]

            # add label to node
            support, cur_taxon, aux_info = parse_label(closest_node.label)
            if not cur_taxon:
                taxa_str = taxon
            else:
                # merge with the taxa already present and rejoin them in
                # the order given by Taxonomy().sort_taxa()
                taxa = [t.strip() for t in cur_taxon.split(';')] + [taxon]
                taxa_str = '; '.join(Taxonomy().sort_taxa(taxa))

            closest_node.label = create_label(support, taxa_str, aux_info)

            # remove other potential node assignments
            fmeasure_for_taxa[taxon] = [
                fmeasure_for_taxa[taxon][closest_index]
            ]
Beispiel #19
0
    def _filter_taxa_for_dist_inference(self, tree, taxonomy, trusted_taxa,
                                        min_children, min_support):
        """Determine taxa to use for inferring distribution of relative divergences.

        Parameters
        ----------
        tree : Dendropy Tree
            Phylogenetic tree.
        taxonomy : d[taxon ID] -> [d__x; p__y; ...]
            Taxonomy for each taxon.
        trusted_taxa : iterable
            Trusted taxa to consider when inferring distribution.
        min_children : int
            Only consider taxa with at least the specified number of children taxa when inferring distribution.
        min_support : float
            Only consider taxa with at least this level of support when inferring distribution.

        Returns
        -------
        set
            Taxa remaining after all filters have been applied.
        """

        # determine children taxa for each named group
        taxon_children = Taxonomy().taxon_children(taxonomy)

        # get all named groups
        taxa_for_dist_inference = set()
        for taxa in taxonomy.values():
            taxa_for_dist_inference.update(taxa)

        # sanity check species names as these are a common problem
        species_index = Taxonomy.rank_index['s__']
        species = set()
        for taxon_id, taxa in taxonomy.items():
            if len(taxa) <= species_index:
                continue

            species_name = taxa[species_index]
            if species_name != 's__':
                valid, error_msg = Taxonomy().validate_species_name(
                    species_name, require_full=True, require_prefix=True)
                if not valid:
                    print('[Warning] Species name %s for %s is invalid: %s' % (
                        species_name, taxon_id, error_msg))
                    continue

            species.add(species_name)

        # restrict taxa to those with a sufficient number of named children
        # Note: a taxonomic group with no children will not end up in the
        # taxon_children data structure so care must be taken when applying
        # this filtering criteria.
        if min_children > 0:
            valid_taxa = set()
            for taxon, children_taxa in taxon_children.items():
                if len(children_taxa) >= min_children:
                    valid_taxa.add(taxon)

            taxa_for_dist_inference.intersection_update(valid_taxa)

            # explicitly add in the species since they have no
            # children and thus are absent from the taxon_children dictionary
            taxa_for_dist_inference.update(species)

        # restrict taxa used for inferring distribution to those with sufficient support
        if min_support > 0:
            missing_support = False
            for node in tree.preorder_node_iter():
                if not node.label or node.is_leaf():
                    continue

                # check for support value
                support, taxon_name, _auxiliary_info = parse_label(node.label)

                if not taxon_name:
                    continue

                if support is None:
                    # compare against None so that a support of 0 is
                    # treated as a (low) support value and not as missing
                    missing_support = True
                    continue

                if float(support) < min_support:
                    # a label may name several ';'-separated taxa; each of
                    # them is dropped individually
                    taxa_for_dist_inference.difference_update(
                        t.strip() for t in taxon_name.split(';'))

            if missing_support:
                # no support value, so inform user (once) that they were
                # trying to filter on a property the tree does not have
                print('[Error] Tree does not contain support values. As such, --min_support should be set to 0.')

        # restrict taxa used for inferring distribution to the trusted set;
        # called on the local set so trusted_taxa may be any iterable
        if trusted_taxa:
            taxa_for_dist_inference = taxa_for_dist_inference.intersection(
                trusted_taxa)

        return taxa_for_dist_inference
Beispiel #20
0
    def median_rd_over_phyla(self, tree, taxa_for_dist_inference, taxonomy):
        """Gather relative divergences for the tree rooted on each usable phylum.

        Exits the program with status -1 when fewer than two phyla are usable.

        Parameters
        ----------
        tree : Tree
          Dendropy tree.
        taxa_for_dist_inference : set
          Taxa to use for inference relative divergence distributions.
        taxonomy : d[taxon_id] -> [d__, p__, ..., s__]
          Taxonomy of extant taxa.

        Returns
        -------
        tuple
          (d[phylum] -> relative divergence of named clades for that
          rooting, d[node id] -> list of relative divergences observed
          for that node across rootings).
        """

        # phyla found on the tree, narrowed to those usable for inference
        all_phyla = self._get_phyla_lineages(tree)
        self.logger.info('Identified %d phyla.' % len(all_phyla))

        phyla = []
        for lineage in all_phyla:
            if lineage in taxa_for_dist_inference:
                phyla.append(lineage)
        self.logger.info(
            'Using %d phyla as rootings for inferring RED distributions.' %
            len(phyla))
        if len(phyla) < 2:
            self.logger.error('Rescaling requires at least 2 valid phyla.')
            sys.exit(-1)

        # tag every node with its preorder position as an identifier
        for node_id, node in enumerate(tree.preorder_node_iter()):
            node.id = node_id

        phylum_rel_dists = {}
        rel_node_dists = defaultdict(list)
        rd = RelativeDistance()
        for p in phyla:
            phylum = p.replace('p__', '').replace(' ', '_').lower()

            # progress line, overwritten in place through the carriage return
            status_msg = '==> Calculating information with rooting on %s.              ' % phylum.capitalize()
            sys.stdout.write('%s\r' % status_msg)
            sys.stdout.flush()

            cur_tree = self.root_with_outgroup(tree, taxonomy, p)

            # relative divergence of named clades, without rank index 0 (Domain)
            rel_dists = rd.rel_dist_to_named_clades(cur_tree)
            rel_dists.pop(0, None)

            # drop the outgroup phylum and its children taxa from the results
            children = Taxonomy().children(p, taxonomy)
            for rank_dists in rel_dists.values():
                rank_dists.pop(p, None)

            for t in children:
                for rank_dists in rel_dists.values():
                    rank_dists.pop(t, None)

            phylum_rel_dists[phylum] = rel_dists

            # calculate relative distance to all nodes
            rd.decorate_rel_dist(cur_tree)

            # the ingroup is the first child of the root whose label does
            # not mention the outgroup phylum
            ingroup_subtree = None
            for c in cur_tree.seed_node.child_node_iter():
                _support, taxon_name, _auxiliary_info = parse_label(c.label)
                if taxon_name and p in taxon_name:
                    continue
                ingroup_subtree = c
                break

            # record the relative divergence of every ingroup node
            for n in ingroup_subtree.preorder_iter():
                rel_node_dists[n.id].append(n.rel_dist)

        sys.stdout.write(
            '==> Inference for RED distributions finished.                         '
        )
        sys.stdout.flush()
        sys.stdout.write('\n')

        return phylum_rel_dists, rel_node_dists
Beispiel #21
0
    def _select_taxa(self, tree, node_of_interest, outgroup_node,
                     num_taxa_to_retain, keep_unclassified, genome_metadata):
        """Select genomes in named lineages on path from ingroup to outgroup.

        Parameters
        ----------
        tree : Tree
          Tree containing both nodes.
        node_of_interest : Node
          Root of the lineage whose leaves are all retained.
        outgroup_node : Node
          Root of the outgroup lineage.
        num_taxa_to_retain : int
          Upper bound used when sampling taxa from each other named lineage.
        keep_unclassified : bool
          When true, leaves reached outside of named lineages are retained.
        genome_metadata
          Passed unchanged to self._derep_lineage().

        Returns
        -------
        list
          Selected taxa.
        """

        # get most recent common ancestor of outgroup and lineage of interest
        # (next() works on both Python 2 and 3, unlike the .next() method)
        outgroup_leaf_taxon = next(outgroup_node.leaf_iter()).taxon
        lineage_of_interest_taxon = next(node_of_interest.leaf_iter()).taxon
        mrca = tree.mrca(taxa=[outgroup_leaf_taxon, lineage_of_interest_taxon])

        # get taxon of lineage of interest
        taxa_of_interest = []
        parent = node_of_interest
        while parent != mrca:
            _support, taxon, _auxiliary_info = parse_label(parent.label)
            if taxon:
                taxa_of_interest.append(taxon)
            parent = parent.parent_node

        self.logger.info('Taxonomy for lineage of interest: %s' %
                         ';'.join(taxa_of_interest))

        # select taxa from named lineages by traversing tree
        # in preorder and terminating descent at named taxa
        # not in path to lineage of interest
        selected_taxa = []

        stack = list(mrca.child_node_iter())
        while stack:
            cur_node = stack.pop()

            if cur_node.is_leaf() and keep_unclassified:
                selected_taxa.append(cur_node.taxon)

            _support, taxon, _auxiliary_info = parse_label(cur_node.label)

            if taxon and taxon not in taxa_of_interest:
                # select roughly equal taxa from each child lineage to
                # ensure we retain the correct depth (and the named node)
                # for this lineage
                derep_taxa = []
                num_children = sum(1 for _ in cur_node.child_node_iter())
                child_taxa_to_sample = int(
                    math.ceil((1.0 / num_children) * num_taxa_to_retain))
                for c in cur_node.child_node_iter():
                    # never request more than what is left of the budget
                    taxa_to_sample = min(child_taxa_to_sample,
                                         num_taxa_to_retain - len(derep_taxa))
                    derep_taxa += self._derep_lineage(c, taxa_to_sample,
                                                      genome_metadata)

                selected_taxa += derep_taxa
                self.logger.info('Selecting %d taxa from %s.' %
                                 (len(derep_taxa), taxon))
            elif cur_node == node_of_interest:
                self.logger.info('Retaining all taxa in lineage of interest.')
                selected_taxa.extend(
                    leaf.taxon for leaf in node_of_interest.leaf_iter())
            else:
                # unnamed node, or a named node on the path to the lineage
                # of interest: keep descending
                stack.extend(cur_node.child_node_iter())

        return selected_taxa
Beispiel #22
0
    def run(self, input_tree, lineage_of_interest, outgroup, gtdb_metadata,
            num_taxa_to_retain, msa_file, keep_unclassified, output_dir):
        """Dereplicate tree.

        The dereplicated tree is written to `output_dir` using the name of
        the input tree with '.derep' inserted before the extension. If an
        MSA is given, it is dereplicated to the same set of taxa and written
        with the same naming scheme.

        Parameters
        ----------
        input_tree : str
            Tree to dereplicate
        lineage_of_interest : str
            Named lineage where all taxa should be retained.
        outgroup : str
            Named lineage to use as outgroup.
        gtdb_metadata : str
            File containing metadata for taxa in tree.
        num_taxa_to_retain: int
            Taxa to retain in dereplicated lineages.
        msa_file : str
            Multiple sequence alignment to dereplicate along with tree.
        keep_unclassified : boolean
            Keep all taxa in unclassified lineages.
        output_dir : str
            Output dir.

        Notes
        -----
        The process exits with a non-zero status if either the lineage of
        interest or the outgroup cannot be found among the node labels.
        """

        # read GTDB metadata
        self.logger.info('Reading metadata.')
        genome_metadata = read_gtdb_metadata(gtdb_metadata, [
            'checkm_completeness', 'checkm_contamination',
            'gtdb_representative'
        ])

        # read tree
        self.logger.info('Reading tree.')
        tree = dendropy.Tree.get_from_path(input_tree,
                                           schema='newick',
                                           rooting='force-rooted',
                                           preserve_underscores=True)

        # locate node of interest and outgroup node; if a name labels
        # several nodes, the last one in preorder is used
        self.logger.info('Identifying lineage of interest and outgroup.')
        node_of_interest = None
        outgroup_node = None
        for node in tree.preorder_node_iter():
            _support, taxon_str, _auxiliary_info = parse_label(node.label)

            if not taxon_str:
                continue

            for taxon in [t.strip() for t in taxon_str.split(';')]:
                if taxon == lineage_of_interest:
                    node_of_interest = node
                elif taxon == outgroup:
                    outgroup_node = node

        # a missing lineage is a failure, so report it through the exit status
        if node_of_interest is None:
            self.logger.error(
                'Could not find specified lineage of interest: %s' %
                lineage_of_interest)
            sys.exit(1)

        if outgroup_node is None:
            self.logger.error('Could not find outgroup: %s' % outgroup)
            sys.exit(1)

        # select taxa to retain
        self.logger.info('Selecting taxa to retain.')
        selected_taxa = self._select_taxa(tree, node_of_interest,
                                          outgroup_node, num_taxa_to_retain,
                                          keep_unclassified, genome_metadata)

        self.logger.info('Retaining %d taxa.' % len(selected_taxa))

        # prune tree
        self.logger.info('Pruning tree.')
        tree.retain_taxa(selected_taxa)

        # dereplicate MSA if requested
        if msa_file:
            self.logger.info('Dereplicating MSA.')
            msa_name, msa_ext = os.path.splitext(os.path.basename(msa_file))
            output_msa = os.path.join(output_dir,
                                      msa_name + '.derep' + msa_ext)
            self._derep_msa(msa_file, selected_taxa, output_msa)

        # write out results
        tree_name, tree_ext = os.path.splitext(os.path.basename(input_tree))
        output_tree = os.path.join(output_dir, tree_name + '.derep' + tree_ext)
        tree.write_to_path(output_tree,
                           schema='newick',
                           suppress_rooting=True,
                           unquoted_underscores=True)
Beispiel #23
0
    def run(self, bac120_tree, ar122_tree):
        """Generate tree and iTOL files for producing iTOL tree image.

        For each domain tree, three files are written to `self.output_dir`:
        a copy of the tree in which named internal nodes are relabelled
        (phylum name without the 'p__' prefix, or no label when the first
        taxon of the node is not a phylum), an iTOL TREE_COLORS file, and an
        iTOL DATASET_TEXT file with phylum labels.

        Parameters
        ----------
        bac120_tree : str
            Path to the bacterial reference tree in newick format.
        ar122_tree : str
            Path to the archaeal reference tree in newick format.
        """

        self.logger.info('Creating trees with iTOL labels.')
        for tree_file, output_tree, itol_colors, itol_labels in [
            (bac120_tree, f'bac120_r{self.release_number}.itol.tree',
             f'bac120_r{self.release_number}.itol_phyla_colors.txt',
             f'bac120_r{self.release_number}.itol_phyla_labels.txt'),
            (ar122_tree, f'ar122_r{self.release_number}.itol.tree',
             f'ar122_r{self.release_number}.itol_phyla_colors.txt',
             f'ar122_r{self.release_number}.itol_phyla_labels.txt')
        ]:

            self.logger.info(f'Reading {tree_file} reference tree.')
            domain_tree = dendropy.Tree.get_from_path(
                tree_file,
                schema='newick',
                rooting='force-rooted',
                preserve_underscores=True)
            self.logger.info(' ...tree contains {:,} genomes.'.format(
                sum(1 for _ in domain_tree.leaf_node_iter())))

            # relabel named internal nodes; nodes without a taxon label
            # are left untouched
            phyla = set()
            for node in domain_tree.preorder_node_iter():
                if node.is_leaf():
                    continue

                _support, taxon, _auxiliary_info = parse_label(node.label)
                if not taxon:
                    continue

                first_taxon = taxon.split(';')[0]
                if first_taxon.startswith('p__'):
                    node.label = first_taxon[3:]
                    phyla.add(first_taxon[3:])
                else:
                    node.label = None

            domain_tree.write_to_path(self.output_dir / output_tree,
                                      schema='newick',
                                      suppress_rooting=True,
                                      unquoted_underscores=True)

            self.logger.info('Identified {:,} phyla in {}.'.format(
                len(phyla), tree_file))

            # pair each phylum with a color, cycling through the palette;
            # phyla are sorted so the assignment is reproducible between runs
            num_colors = len(self.colors)
            phylum_colors = [(phylum, self.colors[idx % num_colors])
                             for idx, phylum in enumerate(sorted(phyla))]

            # create iTOL metadata for coloring phyla
            self.logger.info(
                f'Creating iTOL metadata for coloring phyla: {itol_colors}.')
            with open(self.output_dir / itol_colors, 'w') as fout:
                fout.write('TREE_COLORS\n')
                fout.write('SEPARATOR TAB\n')
                fout.write('DATA\n')

                for phylum, color in phylum_colors:
                    fout.write('{}\trange\t{}\t{}\n'.format(
                        phylum, color, phylum))

            # create iTOL metadata for phylum labels
            self.logger.info(
                f'Creating iTOL metadata for phyla labels: {itol_labels}.')
            with open(self.output_dir / itol_labels, 'w') as fout:
                fout.write('DATASET_TEXT\n')
                fout.write('SEPARATOR TAB\n')
                fout.write('DATASET_LABEL\tPhylum labels\n')
                fout.write('COLOR\t#000000\n')
                fout.write('MARGIN\t0\n')
                fout.write('SHOW_INTERNAL\t1\n')
                fout.write('LABEL_ROTATION\t0\n')
                fout.write('STRAIGHT_LABELS\t0\n')
                fout.write('ALIGN_TO_TREE\t0\n')
                fout.write('SIZE_FACTOR\t1\n')
                fout.write('DATA\n')

                for phylum, color in phylum_colors:
                    fout.write('{}\t{}\t-1\t{}\tnormal\t1\t0\n'.format(
                        phylum, phylum, color))
Beispiel #24
0
    def run(self, genomes, align_dir, out_dir, prefix, debugopt=False):
        # Classify user genomes for each marker set ('bac120', 'ar122').
        #
        # For every marker set that has a user MSA in `align_dir`, the genomes
        # are placed with self.place_genomes(), classified with FastANI where a
        # user genome clusters with exactly one reference genome, and otherwise
        # classified from relative evolutionary divergence (RED) and topology.
        #
        # Parameters
        # ----------
        # genomes
        #     Passed unchanged to self._fastaniWorker.
        # align_dir : str
        #     Directory searched for '<prefix>.<marker set>.user_msa.fasta'.
        # out_dir : str
        #     Directory receiving all '<prefix>.<marker set>.*.tsv' result files.
        # prefix : str
        #     Prefix of all input and output file names.
        # debugopt : boolean
        #     Also write '<prefix>.<marker set>.debug_file.tsv'.
        #
        # NOTE(review): this method uses Python 2 only syntax (print statements,
        # dict.iteritems) and will not compile under Python 3.
        # NOTE(review): any exception raised below is reduced to a generic
        # message followed by sys.exit(-1); output files opened before the
        # failure are not explicitly closed.
        try:
            # NOTE(review): this string sits inside the try block, so it is a
            # plain expression statement and not the method's docstring.
            """Classify genomes based on position in reference tree."""

            for marker_set_id in ('bac120', 'ar122'):
                user_msa_file = os.path.join(
                    align_dir, prefix + '.%s.user_msa.fasta' % marker_set_id)
                if not os.path.exists(user_msa_file):
                    # file will not exist if there are no User genomes from a given domain
                    continue

                # classify_tree is used below as the path of a newick tree
                classify_tree = self.place_genomes(user_msa_file,
                                                   marker_set_id, out_dir,
                                                   prefix)

                # get taxonomic classification of each user genome
                tree = dendropy.Tree.get_from_path(classify_tree,
                                                   schema='newick',
                                                   rooting='force-rooted',
                                                   preserve_underscores=True)

                gtdb_taxonomy = Taxonomy().read(self.taxonomy_file)

                # open all result files for this marker set
                fout = open(
                    os.path.join(
                        out_dir,
                        prefix + '.%s.classification.tsv' % marker_set_id),
                    'w')
                fastaniout = open(
                    os.path.join(
                        out_dir,
                        prefix + '.%s.fastani_results.tsv' % marker_set_id),
                    'w')
                redfout = open(
                    os.path.join(out_dir,
                                 prefix + '.%s.summary.tsv' % marker_set_id),
                    'w')
                if debugopt:
                    parchiinfo = open(
                        os.path.join(
                            out_dir,
                            prefix + '.%s.debug_file.tsv' % marker_set_id),
                        'w')

                reddictfile = open(
                    os.path.join(
                        out_dir,
                        prefix + '.%s.red_dictionary.tsv' % marker_set_id),
                    'w')

                # record the per-rank RED values from Config that are used
                # for this marker set (looked up by rank prefix)
                marker_dict = {}
                if marker_set_id == 'bac120':
                    marker_dict = Config.RED_DIST_BAC_DICT
                elif marker_set_id == 'ar122':
                    marker_dict = Config.RED_DIST_ARC_DICT
                reddictfile.write('Phylum\t{0}\n'.format(
                    marker_dict.get('p__')))
                reddictfile.write('Class\t{0}\n'.format(
                    marker_dict.get('c__')))
                reddictfile.write('Order\t{0}\n'.format(
                    marker_dict.get('o__')))
                reddictfile.write('Family\t{0}\n'.format(
                    marker_dict.get('f__')))
                reddictfile.write('Genus\t{0}\n'.format(
                    marker_dict.get('g__')))
                reddictfile.close()

                # header rows
                fastaniout.write("User genome\tReference genome\tANI\n")
                redfout.write(
                    "user_genome\tclassification_method\tred_value\n")
                if debugopt:
                    parchiinfo.write(
                        "User genome\tHigher rank\tHigher value\tLower rank\tLower value\tcase\tclosest_rank\n"
                    )

                # Genomes can be classified by using Mash or RED values.
                # We go through all nodes of the tree and look at all the
                # leaves below each node. If a node has more than one leaf and
                # exactly one of them is a reference genome (label starting
                # with RS_, GB_ or UBA), the leaves are queued so the distance
                # between the user genomes and the reference genome can be
                # calculated.
                analysed_nodes = []
                fastani_dict = {}
                all_fastani_dict = {}

                fastani_list = []
                # some genomes of Case C are handled here, if Mash distance is close enough
                self.logger.info(
                    'Calculating Average Nucleotide Identity using FastANI.')

                for nd in tree.preorder_node_iter():
                    # We store the 3-character prefix of each leaf label to
                    # check if it starts with RS_, GB_ or UBA
                    list_subnode_initials = [
                        subnd.taxon.label.replace("'", '')[0:3]
                        for subnd in nd.leaf_iter()
                    ]
                    list_subnode = [
                        subnd.taxon.label.replace("'", '')
                        for subnd in nd.leaf_iter()
                    ]
                    # if only one genome is a reference genome, and the first
                    # leaf was not already queued as part of another node
                    if (list_subnode_initials.count('RS_') +
                            list_subnode_initials.count('GB_') +
                            list_subnode_initials.count('UBA')) == 1 and len(
                                list_subnode_initials
                            ) > 1 and list_subnode[0] not in analysed_nodes:
                        fastani_list.append(list_subnode)
                        analysed_nodes.extend(list_subnode)

                # run the FastANI comparisons in worker processes that
                # report their results through a shared manager dict
                manager = multiprocessing.Manager()
                out_q = manager.dict()
                procs = []
                nprocs = self.cpus
                if len(fastani_list) > 0:
                    for item in splitchunks_list(fastani_list, nprocs):
                        p = multiprocessing.Process(target=self._fastaniWorker,
                                                    args=(item, genomes,
                                                          out_q))
                        procs.append(p)
                        p.start()

                    # Collect all results into a single result dict. We know how many dicts
                    # with results to expect.
                    #while out_q.empty():
                    #    time.sleep(1)

                    # Wait for all worker processes to finish; a worker exit
                    # code of 1 aborts the run via the ValueError handler below
                    for p in procs:
                        p.join()
                        if p.exitcode == 1:
                            raise ValueError("Stop!!")

                    all_fastani_dict = dict(out_q)

                # report every FastANI result; genomes meeting the species
                # threshold take the taxonomy of their reference genome
                for k, v in all_fastani_dict.iteritems():
                    fastaniout.write("{0}\t{1}\t{2}\n".format(
                        k, v.get("ref_genome"), v.get("ani")))
                    if Config.FASTANI_SPECIES_THRESHOLD <= v.get("ani"):
                        suffixed_name = add_ncbi_prefix(v.get("ref_genome"))
                        taxa_str = ";".join(gtdb_taxonomy.get(suffixed_name))
                        # species without a name get the reference genome id appended
                        if taxa_str.endswith("s__"):
                            taxa_str = taxa_str + v.get("ref_genome")
                        fout.write('%s\t%s\n' % (k, taxa_str))
                        fastani_dict[k] = v
                        redfout.write("{0}\tani\tNone\n".format(k))
                fastaniout.close()

                self.logger.info(
                    '{0} genomes have been classify with FastANI.'.format(
                        len(fastani_dict)))

                scaled_tree = self._calculate_red_distances(
                    classify_tree, out_dir)

                # only genomes not already classified by FastANI remain
                user_genome_ids = set(read_fasta(user_msa_file).keys())
                user_genome_ids = user_genome_ids.difference(
                    set(fastani_dict.keys()))
                # For all other cases we measure the RED distance between a
                # leaf and a parent node ( RED = 1-edge_length). This RED value
                # will tell us the rank level that can be associated with a
                # User genome.
                # As an example if the RED value is close to the order level,
                # the user genome will take the order level of the Reference
                # genome under the same parent node.
                # If there are multiple orders under the parent node, the user
                # genome is considered as a new order.
                for leaf in scaled_tree.leaf_node_iter():
                    if leaf.taxon.label in user_genome_ids:
                        # NOTE(review): 'taxa' is not used in this loop
                        taxa = []
                        # In some cases, pplacer can associate 2 user genomes
                        # on the same parent node so we need to go up the tree
                        # to find a node with a reference genome as leaf.
                        cur_node = leaf.parent_node
                        list_subnode_initials = [
                            subnd.taxon.label.replace("'", '')[0:3]
                            for subnd in cur_node.leaf_iter()
                        ]
                        while 'RS_' not in list_subnode_initials and 'GB_' not in list_subnode_initials and 'UBA' not in list_subnode_initials:
                            cur_node = cur_node.parent_node
                            list_subnode_initials = [
                                subnd.taxon.label.replace("'", '')[0:3]
                                for subnd in cur_node.leaf_iter()
                            ]

                        # RED of the node under which the genome is classified
                        current_rel_list = cur_node.rel_dist

                        # find the closest ancestor carrying a taxon label
                        parent_taxon_node = cur_node.parent_node
                        _support, parent_taxon, _aux_info = parse_label(
                            parent_taxon_node.label)

                        # NOTE(review): the None check happens before stepping
                        # to the next parent, so reaching the root without a
                        # taxon label would dereference None on '.label'
                        while parent_taxon_node is not None and not parent_taxon:
                            parent_taxon_node = parent_taxon_node.parent_node
                            _support, parent_taxon, _aux_info = parse_label(
                                parent_taxon_node.label)

                        # rank prefix of the most specific taxon on that label
                        parent_rank = parent_taxon.split(";")[-1][0:3]
                        parent_rel_dist = parent_taxon_node.rel_dist

                        # debug record: genome, higher rank, higher value,
                        # lower rank, lower value, case, closest rank
                        genome_parent_child = [
                            leaf.taxon.label, parent_rank, parent_rel_dist, '',
                            '', '', ''
                        ]

                        child_taxons = []
                        closest_rank = None
                        detection = "RED"
                        # if the genome is placed between the genus and species ranks, it will be associated with the genus when _get_closest_red_rank is called
                        if parent_rank != 'g__':
                            # rank immediately below the parent rank
                            child_rk = self.order_rank[
                                self.order_rank.index(parent_rank) + 1]
                            # NOTE(review): only RS_ and GB_ labels are
                            # considered here, while the search above also
                            # accepts the UBA prefix - confirm this is intended
                            list_subnode = [
                                childnd.taxon.label.replace("'", '')
                                for childnd in cur_node.leaf_iter()
                                if (childnd.taxon.label.startswith('RS_')
                                    or childnd.taxon.label.startswith('GB_'))
                            ]
                            list_ranks = [
                                gtdb_taxonomy.get(name)[self.order_rank.index(
                                    child_rk)] for name in list_subnode
                            ]
                            if len(set(list_ranks)) == 1:
                                # all reference genomes agree at the child
                                # rank: look for the first internal node below
                                # cur_node whose label starts with that rank
                                for subranknd in cur_node.preorder_iter():
                                    _support, subranknd_taxon, _aux_info = parse_label(
                                        subranknd.label)
                                    if subranknd.is_internal(
                                    ) and subranknd_taxon is not None and subranknd_taxon.startswith(
                                            child_rk):
                                        child_taxons = subranknd_taxon.split(
                                            ";")
                                        child_taxon_node = subranknd
                                        child_rel_dist = child_taxon_node.rel_dist
                                        break
                            else:
                                #case 2a and 2b
                                closest_rank = parent_rank
                                detection = "Topology"
                        else:
                            #case 1a
                            closest_rank = parent_rank
                            detection = "Topology"

                        #case 1b: no labelled child node was found, so the
                        #ranks of the single reference leaf are compared
                        #against the per-rank RED values
                        if len(child_taxons) == 0 and closest_rank is None:
                            list_leaves = [
                                childnd.taxon.label.replace("'", '')
                                for childnd in cur_node.leaf_iter()
                                if (childnd.taxon.label.startswith('RS_')
                                    or childnd.taxon.label.startswith('GB_'))
                            ]
                            if len(list_leaves) != 1:
                                self.logger.error(
                                    'There should be only one leaf.')
                                sys.exit(-1)
                            # ranks of the reference leaf from the child rank
                            # up to, but excluding, the last rank
                            list_leaf_ranks = gtdb_taxonomy.get(
                                list_leaves[0])[self.order_rank.index(child_rk
                                                                      ):-1]
                            # walk from the most specific rank upwards
                            for leaf_taxon in reversed(list_leaf_ranks):
                                if leaf_taxon == list_leaf_ranks[0]:
                                    if abs(current_rel_list - marker_dict.get(
                                            leaf_taxon[:3])) < abs(
                                                (current_rel_list) -
                                                marker_dict.get(parent_rank)):
                                        #and current_rel_list - marker_dict.get(leaf_taxon[:3]) > 0 ):
                                        closest_rank = leaf_taxon[:3]
                                        genome_parent_child[3] = leaf_taxon
                                        genome_parent_child[
                                            5] = 'case 1b - III'
                                        break
                                else:
                                    # rank directly above leaf_taxon in the list
                                    pchildrank = list_leaf_ranks[
                                        list_leaf_ranks.index(leaf_taxon) - 1]
                                    if abs(
                                            current_rel_list -
                                            marker_dict.get(leaf_taxon[:3])
                                    ) < abs(current_rel_list -
                                            marker_dict.get(pchildrank[:3])):
                                        #and current_rel_list - marker_dict.get(leaf_taxon[:3]) > 0 ) :
                                        closest_rank = leaf_taxon[:3]
                                        genome_parent_child[1] = pchildrank
                                        genome_parent_child[2] = 1.0
                                        genome_parent_child[3] = leaf_taxon
                                        genome_parent_child[5] = 'case 1b - II'
                                        break
                            if closest_rank is None:
                                closest_rank = parent_rank
                                genome_parent_child[3] = list_leaf_ranks[0]
                                genome_parent_child[5] = 'case 1b - IV'

                        #if there is multiple ranks on the child node (i.e genome between p__Nitrospirae and c__Nitrospiria;o__Nitrospirales;f__Nitropiraceae)
                        #we loop through the list of rank from f_ to c_ rank
                        for child_taxon in reversed(child_taxons):
                            # if lower rank is c__Nitropiria
                            if child_taxon == child_taxons[0]:
                                if (abs(current_rel_list -
                                        marker_dict.get(child_taxon[:3])) <
                                        abs(child_rel_dist -
                                            marker_dict.get(child_taxon[:3]))
                                        and
                                        abs(current_rel_list -
                                            marker_dict.get(child_taxon[:3])) <
                                        abs(current_rel_list -
                                            marker_dict.get(parent_rank))):
                                    genome_parent_child[3] = ';'.join(
                                        child_taxons)
                                    genome_parent_child[4] = child_rel_dist
                                    genome_parent_child[5] = 'case 3b - II'
                                    closest_rank = child_taxon[:3]
                                elif closest_rank is None:
                                    closest_rank = parent_rank
                                    genome_parent_child[3] = ';'.join(
                                        child_taxons)
                                    genome_parent_child[4] = child_rel_dist
                                    genome_parent_child[5] = 'case 3b - III'
                            else:
                                pchildrank = child_taxons[
                                    child_taxons.index(child_taxon) - 1]
                                if (abs(current_rel_list -
                                        marker_dict.get(child_taxon[:3])) <
                                        abs(current_rel_list -
                                            marker_dict.get(pchildrank[:3]))
                                        and
                                        abs(current_rel_list -
                                            marker_dict.get(child_taxon[:3])) <
                                        abs(child_rel_dist -
                                            marker_dict.get(child_taxon[:3]))):
                                    # NOTE(review): every other branch stores
                                    # the 3-character rank prefix; this one
                                    # stores the full taxon - confirm intended
                                    closest_rank = child_taxon
                                    genome_parent_child[3] = ';'.join(
                                        child_taxons)
                                    genome_parent_child[4] = child_rel_dist
                                    genome_parent_child[5] = 'case 3b - I'
                                    break

                        # case 1b
                        # NOTE(review): only a message is printed here;
                        # processing continues with closest_rank set to None
                        if closest_rank is None:
                            print "IT SHOULDN'T HAPPEN!!!"

                        genome_parent_child[6] = closest_rank

                        list_subnode = [
                            subnd.taxon.label.replace("'", '')
                            for subnd in cur_node.leaf_iter()
                        ]
                        red_taxonomy = self._get_redtax(
                            list_subnode, closest_rank, gtdb_taxonomy)

                        fout.write('{0}\t{1}\n'.format(leaf.taxon.label,
                                                       red_taxonomy))
                        # drop the genome label as it is written explicitly
                        # as the first column of the debug row
                        del genome_parent_child[0]
                        redfout.write("{0}\t{1}\t{2}\n".format(
                            leaf.taxon.label, detection, current_rel_list))
                        # NOTE(review): the debug row has nine tab-separated
                        # fields (label, RED, six debug values, detection)
                        # while the header written above has seven
                        if debugopt:
                            parchiinfo.write('{0}\t{1}\t{2}\t{3}\n'.format(
                                leaf.taxon.label, current_rel_list,
                                '\t'.join(str(x) for x in genome_parent_child),
                                detection))

                redfout.close()
                fout.close()
                if debugopt:
                    parchiinfo.close()

                pplaceout = open(
                    os.path.join(
                        out_dir, prefix +
                        '.%s.classification_pplacer.tsv' % marker_set_id), 'w')

                # We get the pplacer taxonomy for comparison: for every user
                # genome, the taxon labels found on the way from the leaf to
                # the root are joined from the highest to the lowest rank
                user_genome_ids = set(read_fasta(user_msa_file).keys())
                for leaf in tree.leaf_node_iter():
                    if leaf.taxon.label in user_genome_ids:
                        taxa = []
                        cur_node = leaf
                        # the root itself is not inspected as the loop stops
                        # at the node without a parent
                        while cur_node.parent_node:
                            _support, taxon, _aux_info = parse_label(
                                cur_node.label)
                            if taxon:
                                for t in taxon.split(';')[::-1]:
                                    taxa.append(t.strip())
                            cur_node = cur_node.parent_node
                        taxa_str = ';'.join(taxa[::-1])
                        pplaceout.write('%s\t%s\n' %
                                        (leaf.taxon.label, taxa_str))
                pplaceout.close()
        # both handlers behave identically and discard the caught error
        except ValueError as error:
            print "GTDB-Tk has stopped before finishing"
            sys.exit(-1)
        except Exception as error:
            print "GTDB-Tk has stopped before finishing"
            sys.exit(-1)
Beispiel #25
0
    def check_tree(self, options):
        """Validate the taxonomy of a decorated tree and report polyphyletic groups.

        The taxonomy is read from `options.taxonomy_file` when given and is
        otherwise read from the node labels of `options.decorated_tree`.
        With a taxonomy file, a taxon is reported as polyphyletic when the
        MRCA of its genomes spans additional leaves. Without one, a taxon is
        reported when its label occurs on more than one internal node.

        Parameters
        ----------
        options
            Object with the attributes `decorated_tree` (path to a newick
            tree) and `taxonomy_file` (path to a taxonomy file, or a false
            value to use the taxonomy of the tree).
        """

        check_file_exists(options.decorated_tree)

        # validate taxonomy
        taxonomy = Taxonomy()
        full_taxonomy = (taxonomy.read(options.taxonomy_file)
                         if options.taxonomy_file else
                         taxonomy.read_from_tree(options.decorated_tree))

        taxonomy.validate(full_taxonomy,
                          check_prefixes=True,
                          check_ranks=True,
                          check_hierarchy=True,
                          check_species=True,
                          check_group_names=True,
                          check_duplicate_names=True,
                          report_errors=True)

        # check for polyphyletic groups
        polyphyletic_groups = set()
        tree = dendropy.Tree.get_from_path(options.decorated_tree,
                                           schema='newick',
                                           rooting="force-rooted",
                                           preserve_underscores=True)

        if options.taxonomy_file:
            # restrict the taxonomy to the genomes in the tree and
            # remember the Taxon object behind each leaf label
            reduced_taxonomy = {}
            taxon_map = {}
            for leaf in tree.leaf_node_iter():
                reduced_taxonomy[leaf.taxon.label] = full_taxonomy[
                    leaf.taxon.label]
                taxon_map[leaf.taxon.label] = leaf.taxon

            # a taxon is polyphyletic if the MRCA of its genomes
            # has more leaves than the taxon has genomes
            for rank_label in Taxonomy.rank_labels[1:]:
                extant_taxa = taxonomy.extant_taxa_for_rank(
                    rank_label, reduced_taxonomy)
                for taxon, taxa_ids in extant_taxa.items():
                    member_taxa = [taxon_map[gid] for gid in taxa_ids]
                    mrca = tree.mrca(taxa=member_taxa)
                    mrca_leaf_count = sum(1 for _ in mrca.leaf_iter())
                    if mrca_leaf_count != len(taxa_ids):
                        polyphyletic_groups.add(taxon)
        else:
            # a taxon label seen on more than one internal node is a duplicate
            seen_taxa = set()

            for node in tree.preorder_node_iter(
                    lambda nd: not nd.is_leaf()):
                _support, taxon_label, _aux_info = parse_label(node.label)
                if not taxon_label:
                    continue

                for raw_taxon in taxon_label.split(';'):
                    taxon = raw_taxon.strip()
                    if taxon in seen_taxa:
                        polyphyletic_groups.add(taxon)

                    seen_taxa.add(taxon)

        if polyphyletic_groups:
            print('')
            print('Tree contains polyphyletic groups:')
            for taxon in polyphyletic_groups:
                print('%s' % taxon)

        self.logger.info('Finished performing validation tests.')
Beispiel #26
0
    def _resolve_ambiguous_placements(self, tree, fmeasure_for_taxa, median_rank_rd):
        """Resolve ambiguous taxon label placements using median relative divergences.

        A taxon is ambiguous when it has more than one candidate node in
        `fmeasure_for_taxa`. For each such taxon a single node is chosen, the
        taxon is added to that node's label, and `fmeasure_for_taxa[taxon]` is
        reduced in place to the chosen entry.

        Parameters
        ----------
        tree : Tree
          Dendropy tree. Not referenced directly by this method; the nodes
          modified are those stored in `fmeasure_for_taxa`.
        fmeasure_for_taxa : d[taxon] -> [(Node, F-measure, precision, recall)]
          Node with highest F-measure for each taxon.
        median_rank_rd : d[rank_index] -> float
          Median relative divergence for each taxonomic rank.

        Raises
        ------
        SystemExit
          With a non-zero exit status if no candidate is acceptable and the
          last candidate for a taxon is not a leaf node.
        """

        # For ambiguous nodes, place them closest to the median for their rank
        # and within an acceptable relative divergence distance. Taxon labels
        # are placed in reverse taxonomic order (species to domain) and
        # this ordering is used to ensure taxonomic consistency.
        for taxon in Taxonomy().sort_taxa(fmeasure_for_taxa.keys(), reverse=True):
            candidates = fmeasure_for_taxa[taxon]
            if len(candidates) == 1:
                # placement is already unambiguous
                continue

            # the first three characters of the taxon are looked up as its
            # rank prefix
            rank_prefix = taxon[0:3]
            rank_index = Taxonomy.rank_prefixes.index(rank_prefix)
            rd = median_rank_rd[rank_index]

            # Find node closest to median distance, but making sure
            # taxon is not placed below a more specific taxon label.
            # The 'fmeasure_for_taxa' stores node information in preorder.
            closest_index = None
            closest_dist = 1e9
            closest_node = None
            for i, d in enumerate(candidates):
                cur_node = d[0]

                # -1 marks a node without any taxon label, which can never
                # be more specific than the taxon being placed
                cur_rank_index = -1
                _support, cur_taxon, _aux_info = parse_label(cur_node.label)
                if cur_taxon:
                    # rank of the last taxon in the node's existing label
                    cur_prefix = cur_taxon.split(';')[-1][0:3]
                    cur_rank_index = Taxonomy.rank_prefixes.index(cur_prefix)

                if cur_rank_index > rank_index:
                    # reached a node with a more specific label so
                    # label should be appended to this node or
                    # placed above it
                    if closest_index is None:
                        closest_index = i
                        closest_node = cur_node
                    break

                # only nodes within 0.1 of the median relative divergence
                # are acceptable; keep the closest of those
                rd_diff = abs(rd - cur_node.rel_dist)
                if rd_diff < 0.1 and rd_diff < closest_dist:
                    closest_dist = rd_diff
                    closest_index = i
                    closest_node = cur_node

            if closest_index is None:
                # no node is within an acceptable relative divergence distance
                # for this label so it should be placed at the most extant node
                # which should be a leaf node
                closest_index = len(candidates) - 1
                closest_node = candidates[closest_index][0]

                if not closest_node.is_leaf():
                    self.logger.error('Leaf node expected!')
                    # exit with a failure status so callers can detect the error
                    sys.exit(1)

            # add label to node, keeping any existing taxa in sorted order
            support, cur_taxon, aux_info = parse_label(closest_node.label)
            if not cur_taxon:
                taxa_str = taxon
            else:
                taxa = cur_taxon.split(';') + [taxon]
                taxa_str = ';'.join(Taxonomy().sort_taxa(taxa))

            closest_node.label = create_label(support, taxa_str, aux_info)

            # remove other potential node assignments
            fmeasure_for_taxa[taxon] = [candidates[closest_index]]