Esempio n. 1
0
 def _ancestor_multiple_taxa_at_rank(self, node, rank_prefix):
     """Find first ancestor that contains multiple named lineages at the specified rank."""
     
     parent = node.parent_node
     while True:
         taxa = []
         
         for node in parent.levelorder_iter():
             if node.label:
                 support, taxon_name, _auxiliary_info = parse_label(node.label)
                 
                 if taxon_name:
                     for taxon in [x.strip() for x in taxon_name.split(';')]:
                         if taxon.startswith(rank_prefix):
                             taxa.append(taxon)
                         
             if len(taxa) >= 2:
                 break
                 
         if len(taxa) >= 2:
             break  
     
         parent = parent.parent_node
         
     return parent
Esempio n. 2
0
def get_phyla_lineages(tree):
    """Collect the phylum-level lineages named on internal nodes.

    Parameters
    ----------
    tree : Dendropy Tree
        Phylogenetic tree.

    Returns
    -------
    list
        Most-specific taxon of each labelled internal node whose label
        ends with a 'p__' taxon, in preorder.
    """
    phylum_names = []
    for cur_node in tree.preorder_node_iter():
        # only labelled internal nodes can name a lineage
        if cur_node.label and not cur_node.is_leaf():
            _, lineage, _ = parse_label(cur_node.label)
            if lineage:
                # the last ';'-separated entry is the most specific taxon
                most_specific = lineage.split(';')[-1].strip()
                if most_specific.startswith('p__'):
                    phylum_names.append(most_specific)

    return phylum_names
Esempio n. 3
0
    def rel_dist_to_named_clades(self, tree, mblet=False):
        """Tabulate the relative divergence of named internal nodes.

        Parameters
        ----------
        tree : Dendropy Tree
            Phylogenetic tree.
        mblet : bool
            Passed through unchanged to decorate_rel_dist.

        Returns
        -------
        dict : d[rank_index][taxon] -> relative divergence
        """

        # annotate every node with its relative distance
        self.decorate_rel_dist(tree, mblet)

        def _below_root(candidate):
            return candidate != tree.seed_node

        # record values for labelled internal nodes, keyed by rank
        divergence_by_rank = defaultdict(dict)
        for cur_node in tree.preorder_node_iter(_below_root):
            if cur_node.label and not cur_node.is_leaf():
                _, lineage, _ = parse_label(cur_node.label)
                if lineage:
                    # a node naming several ranks is filed under its most specific one
                    _, separator, tail = lineage.rpartition(';')
                    taxon = tail.strip() if separator else lineage

                    rank_prefix = taxon[0:3]
                    divergence_by_rank[Taxonomy.rank_index[rank_prefix]][taxon] = cur_node.rel_dist

        return divergence_by_rank
Esempio n. 4
0
def get_phyla_lineages(tree):
    """Return the phylum-level lineages labelled in a tree.

    Parameters
    ----------
    tree : Dendropy Tree
        Phylogenetic tree.

    Returns
    -------
    list
        For each labelled internal node (in preorder) whose most-specific
        taxon starts with 'p__', that taxon.
    """

    def _most_specific_taxa():
        # yield the last taxon named on each labelled internal node
        for node in tree.preorder_node_iter():
            if not node.label or node.is_leaf():
                continue

            _support, taxon_name, _auxiliary_info = parse_label(node.label)
            if taxon_name:
                yield taxon_name.split(';')[-1].strip()

    return [taxon for taxon in _most_specific_taxa() if taxon.startswith('p__')]
Esempio n. 5
0
def translate_viral_tree(tree):
    """Translate prefixes of viral taxonomy in tree to prokaryotic prefixes.

    Labels of internal nodes are rewritten in place using the
    VIRAL_PREFIX_TRANSLATION table. The process exits with status 1 if a
    taxon carries a prefix missing from that table.

    Parameters
    ----------
    tree : Dendropy Tree or str
        Tree to translate, or path to a Newick file that is read as a
        force-rooted tree.

    Returns
    -------
    Dendropy Tree
        The translated tree: the object passed in, or the tree read from
        file when a path was given.
    """

    if isinstance(tree, str):
        tree = dendropy.Tree.get_from_path(tree,
                                           schema='newick',
                                           rooting="force-rooted",
                                           preserve_underscores=True)

    for node in tree.preorder_node_iter():
        if not node.label or node.is_leaf():
            continue

        support, taxa, auxiliary_info = parse_label(node.label)
        if not taxa:
            continue

        translated_taxa = []
        for taxon in [t.strip() for t in taxa.split(';')]:
            prefix = taxon[0:3]
            if prefix not in VIRAL_PREFIX_TRANSLATION:
                print('Unrecognized viral prefix for {}: {}'.format(
                    taxon, prefix))
                sys.exit(1)

            # swap only the leading prefix; str.replace would also rewrite
            # any later occurrence of the same text inside the taxon name
            translated_taxa.append(
                VIRAL_PREFIX_TRANSLATION[prefix] + taxon[len(prefix):])

        taxa_str = ';'.join(translated_taxa)
        node.label = create_label(support, taxa_str, auxiliary_info)

    # hand the tree back so a tree loaded from a path is not lost
    return tree
Esempio n. 6
0
    def _ancestor_multiple_taxa_at_rank(self, node, rank_prefix):
        """Find first ancestor that contains multiple named lineages at the specified rank."""

        parent = node.parent_node
        while True:
            taxa = []

            for node in parent.levelorder_iter():
                if node.label:
                    support, taxon_name, _auxiliary_info = parse_label(
                        node.label)

                    if taxon_name:
                        for taxon in [
                                x.strip() for x in taxon_name.split(';')
                        ]:
                            if taxon.startswith(rank_prefix):
                                taxa.append(taxon)

                if len(taxa) >= 2:
                    break

            if len(taxa) >= 2:
                break

            parent = parent.parent_node

        return parent
Esempio n. 7
0
    def rel_dist_to_named_clades(self, tree):
        """Tabulate the relative divergence of named internal nodes.

        Parameters
        ----------
        tree : Dendropy Tree
            Phylogenetic tree.

        Returns
        -------
        dict : d[rank_index][taxon] -> relative divergence
        """

        # annotate every node with its relative distance
        self.decorate_rel_dist(tree)

        def _below_root(candidate):
            return candidate != tree.seed_node

        # record values for labelled internal nodes, keyed by rank
        divergence_by_rank = defaultdict(dict)
        for cur_node in tree.preorder_node_iter(_below_root):
            if not cur_node.label:
                continue
            if cur_node.is_leaf():
                continue

            _, lineage, _ = parse_label(cur_node.label)
            if not lineage:
                continue

            # a node naming several ranks is filed under its most specific one
            _, separator, tail = lineage.rpartition(';')
            taxon = tail.strip() if separator else lineage

            rank_prefix = taxon[0:3]
            divergence_by_rank[Taxonomy.rank_index[rank_prefix]][taxon] = cur_node.rel_dist

        return divergence_by_rank
Esempio n. 8
0
    def decorate(self, input_tree, taxonomy_file, threshold, rank,
                 retain_named_lineages, keep_labels, prune, output_tree):
        """Label lineages of a tree using a mean branch length criterion.

        Internal nodes are visited from the root downwards. A node is
        labelled, and its descendants are not visited, once the mean
        distance from the node to its leaves is at most the threshold;
        otherwise its children are examined. Labels have the form
        '<lineage name> <n>' (prefixed with '<support>:' when the node had a
        support value), where n counts the lineages given that name so far.

        Parameters
        ----------
        input_tree : str
            Name of input tree.
        taxonomy_file : str
            File with taxonomic information for each taxon.
        threshold : float
            Branch length threshold.
        rank : int
            Rank of labels to retain on tree.
        retain_named_lineages : bool
            Retain existing named lineages at the specified rank.
        keep_labels : bool
            Keep existing labels on tree.
        prune : bool
            Prune tree to preserve only the shallowest and deepest taxa in each lineage.
        output_tree : str
            Name of output tree.
        """

        # read taxonomy
        taxonomy = Taxonomy().read(taxonomy_file)

        # read tree
        self.logger.info('Reading tree.')
        tree = dendropy.Tree.get_from_path(input_tree,
                                           schema='newick',
                                           rooting='force-rooted',
                                           preserve_underscores=True)

        # decorate tree
        rank_prefix = Taxonomy.rank_prefixes[rank]
        new_name_number = defaultdict(int)
        labeled_nodes = set()

        stack = [tree.seed_node]
        while stack:
            node = stack.pop()

            # leaves are never labelled
            if node.is_leaf():
                continue

            # check if the node or one of its ancestors already has a label
            # at this rank (the walk starts at the node itself)
            p = node
            parent_taxon = None
            while p and not parent_taxon:
                if p.label:
                    _support, taxon_name, _auxiliary_info = parse_label(p.label)

                    if taxon_name:
                        for taxon in [
                                x.strip() for x in taxon_name.split(';')
                        ]:
                            if taxon.startswith(rank_prefix):
                                parent_taxon = taxon

                p = p.parent_node

            if retain_named_lineages and parent_taxon:
                stack.extend(node.child_node_iter())
                continue

            # check if descendant node already has a label at this rank
            children_taxon = []
            for c in node.preorder_internal_node_iter():
                if c.label:
                    _support, taxon_name, _auxiliary_info = parse_label(c.label)

                    if taxon_name:
                        for taxon in [
                                x.strip() for x in taxon_name.split(';')
                        ]:
                            if taxon.startswith(rank_prefix):
                                children_taxon.append(taxon)

            if retain_named_lineages and children_taxon:
                stack.extend(node.child_node_iter())
                continue

            # check if node meets mean branch length criterion
            dists_to_tips = [
                self._dist_to_ancestor(t, node) for t in node.leaf_iter()
            ]

            if np_mean(dists_to_tips) > threshold:
                # lineage is too deep; try its children instead
                stack.extend(node.child_node_iter())
                continue

            # gather names at the target rank from the taxonomy of each leaf
            taxa_labels = set()
            for leaf in node.leaf_iter():
                leaf_taxonomy = taxonomy[leaf.taxon.label]
                taxon = leaf_taxonomy[rank][3:].replace('Candidatus ', '')
                if taxon:
                    taxa_labels.add(taxon)

            if parent_taxon:
                taxa_labels.add(parent_taxon[3:].replace('Candidatus ', ''))
            elif children_taxon:
                for c in children_taxon:
                    taxa_labels.add(c[3:].replace('Candidatus ', ''))

            # name lineage based on position to existing named lineages
            if taxa_labels:
                lineage_name = ', '.join(sorted(taxa_labels))
            else:
                lineage_name = 'Unclassified lineage'

            support = None
            if node.label:  # preserve support information
                support, _taxon_name, _auxiliary_info = parse_label(node.label)

            new_name_number[lineage_name] += 1

            if support:
                node.label = '%d:%s %d' % (support, lineage_name,
                                           new_name_number[lineage_name])
            else:
                node.label = '%s %d' % (lineage_name,
                                        new_name_number[lineage_name])

            labeled_nodes.add(node)

        # strip previous labels
        if not keep_labels:
            for node in tree.preorder_internal_node_iter():
                if node in labeled_nodes:
                    continue

                if node.label:  # preserve support information
                    support, _taxon_name, _auxiliary_info = parse_label(
                        node.label)
                    node.label = support

        # prune tree to shallowest and deepest taxa in each named lineage
        if prune:
            nodes_to_prune = set()
            for node in labeled_nodes:
                for c in node.child_node_iter():
                    dists = []
                    for t in c.leaf_iter():
                        d = self._dist_to_ancestor(t, node)
                        dists.append((d, t))

                    # order by distance only so that ties never fall back
                    # to comparing node objects
                    dists.sort(key=lambda dist_leaf: dist_leaf[0])

                    # select taxa at the 10th and 90th percentiles to
                    # give a good sense of the range of depths; indices are
                    # clamped so a deep taxon is kept even for small clades
                    last_index = len(dists) - 1
                    perc_10th_index = min(int(0.1 * len(dists) + 0.5),
                                          last_index)
                    perc_90th_index = min(int(0.9 * len(dists) + 0.5),
                                          last_index)
                    for i, (d, t) in enumerate(dists):
                        if i != perc_10th_index and i != perc_90th_index:
                            nodes_to_prune.add(t.taxon)

            self.logger.info('Taxa before pruning: %d' %
                             sum(1 for _ in tree.leaf_node_iter()))
            tree.prune_taxa(nodes_to_prune)
            self.logger.info('Taxa after pruning: %d' %
                             sum(1 for _ in tree.leaf_node_iter()))

        self.logger.info('Decorated %d internal nodes.' %
                         sum(new_name_number.values()))

        tree.write_to_path(output_tree,
                           schema='newick',
                           suppress_rooting=True,
                           unquoted_underscores=True)
Esempio n. 9
0
    def run(self, tree1_file, tree2_file, output_dir, min_support, min_taxa, named_only):
        """Calculate supported topological differences between trees.

        Both trees are pruned to their common taxa. Named, supported nodes
        are then classified as congruent (same leaves in both trees),
        incongruent (supported in both trees with different leaves) or
        unresolved (supported in only one tree). Results are written to
        incongruent_taxa.tsv, taxon_classification.tsv and
        tree_diff_stats.tsv in the output directory.

        Parameters
        ----------
        tree1_file : str
            File with tree in Newick format.
        tree2_file : str
            File with tree in Newick format.
        output_dir : str
            Output directory.
        min_support : float
            Minimum value to consider a lineage well supported.
        min_taxa : int
            Only consider lineage with sufficient number of taxa.
        named_only : boolean
            Only consider named lineages.
        """

        if not named_only:
            self.logger.error("This command currently assumes the 'named_only' flag will be thrown.")
            sys.exit()

        tree1_name = os.path.splitext(os.path.basename(tree1_file))[0]
        tree2_name = os.path.splitext(os.path.basename(tree2_file))[0]

        tree1 = dendropy.Tree.get_from_path(tree1_file,
                                            schema='newick',
                                            rooting='force-rooted',
                                            preserve_underscores=True)

        tree2 = dendropy.Tree.get_from_path(tree2_file,
                                            schema='newick',
                                            rooting='force-rooted',
                                            preserve_underscores=True)

        # prune both trees to the set of common taxa
        taxa1 = set(t.taxon.label for t in tree1.leaf_node_iter())
        taxa2 = set(t.taxon.label for t in tree2.leaf_node_iter())

        taxa_in_common = taxa1.intersection(taxa2)
        self.logger.info('Tree 1 contains %d taxa.' % len(taxa1))
        self.logger.info('Tree 2 contains %d taxa.' % len(taxa2))
        self.logger.info('Pruning trees to the %d taxa in common.' % len(taxa_in_common))

        tree1.retain_taxa_with_labels(taxa_in_common)
        tree2.retain_taxa_with_labels(taxa_in_common)

        # identify nodes meeting specified criteria
        tree1_nodes = {}
        tree2_nodes = {}
        node_support1 = {}
        node_support2 = {}
        for tree, tree_nodes, support_values in ((tree1, tree1_nodes, node_support1),
                                                 (tree2, tree2_nodes, node_support2)):
            for n in tree.preorder_internal_node_iter():
                support, taxon_name, _auxiliary_info = parse_label(n.label)
                if named_only and not taxon_name:
                    continue

                if not support:
                    continue

                support = int(support)
                support_values[taxon_name] = support

                num_taxa = sum(1 for _ in n.leaf_iter())
                if support >= min_support and num_taxa >= min_taxa:
                    tree_nodes[taxon_name] = [support, num_taxa, n]

        self.logger.info('Tree 1 has %d supported nodes.' % len(tree1_nodes))
        self.logger.info('Tree 2 has %d supported nodes.' % len(tree2_nodes))

        def rank_index_of(taxon):
            # rank is taken from the prefix of the most specific taxon in the label
            most_specific_taxon = taxon.split(';')[-1].strip()
            return Taxonomy.rank_prefixes.index(most_specific_taxon[0:3])

        # compare supported nodes between the two trees
        diffs = {}
        congruent_taxa = defaultdict(list)       # same node bootstrap supported in both trees
        incongruent_taxa = defaultdict(list)     # node supported in both trees, but have different extant taxa
        unresolved_taxa = defaultdict(list)      # supported node in one tree is not present and/or well support in the other tree

        for taxon, data1 in tree1_nodes.items():
            rank_index = rank_index_of(taxon)
            support1, _num_taxa1, node1 = data1

            if taxon in tree2_nodes:
                support2, _num_taxa2, node2 = tree2_nodes[taxon]

                leaves1 = set(t.taxon.label for t in node1.leaf_iter())
                leaves2 = set(t.taxon.label for t in node2.leaf_iter())

                diff_taxa = leaves1.symmetric_difference(leaves2)

                if len(diff_taxa) > 0:
                    diffs[taxon] = [len(diff_taxa), ','.join(leaves1 - leaves2), ','.join(leaves2 - leaves1)]
                    incongruent_taxa[rank_index].append((taxon, len(diff_taxa)))
                else:
                    congruent_taxa[rank_index].append((taxon, support1, support2))
            else:
                unresolved_taxa[rank_index].append((taxon, tree1_name, support1, tree2_name, node_support2.get(taxon, -1)))

        # identify unresolved taxa in tree 2; support and rank must come
        # from the tree 2 entry itself, not from the last tree 1 iteration
        for taxon, data2 in tree2_nodes.items():
            if taxon in tree1_nodes:
                continue

            support2 = data2[0]
            unresolved_taxa[rank_index_of(taxon)].append((taxon, tree2_name, support2, tree1_name, node_support1.get(taxon, -1)))

        # write out difference in extant taxa for incongruent taxa
        tax_diff_file = os.path.join(output_dir, 'incongruent_taxa.tsv')
        with open(tax_diff_file, 'w') as fout:
            fout.write('Taxon\tNo. Incongruent Taxa\tTree1 - Tree2\tTree2 - Tree1\n')
            for taxon in Taxonomy().sort_taxa(diffs.keys()):
                num_diffs, t12_diff_str, t21_diff_str = diffs[taxon]
                fout.write('%s\t%d\t%s\t%s\n' % (taxon,
                                                 num_diffs,
                                                 t12_diff_str,
                                                 t21_diff_str))

        # write out classification of each node along with per-rank counts
        classification_file = os.path.join(output_dir, 'taxon_classification.tsv')
        stats_file = os.path.join(output_dir, 'tree_diff_stats.tsv')
        with open(classification_file, 'w') as fout_classification, open(stats_file, 'w') as fout_stats:
            fout_classification.write('Rank\tTaxon\tClassification\tDescription\n')
            fout_stats.write('Rank\tCongruent\tIncongruent\tUnresolved for %s\tUnresolved for %s\n' % (tree1_name, tree2_name))

            for rank, rank_label in enumerate(Taxonomy.rank_labels):
                for taxon, support1, support2 in congruent_taxa[rank]:
                    desc = 'Taxon is congruent with %d and %d support.' % (support1, support2)
                    fout_classification.write('%s\t%s\t%s\t%s\n' % (rank_label, taxon, 'congruent', desc))

                for taxon, num_diff_taxa in incongruent_taxa[rank]:
                    desc = 'Taxon has %d extant taxa in disagreement.' % num_diff_taxa
                    fout_classification.write('%s\t%s\t%s\t%s\n' % (rank_label, taxon, 'incongruent', desc))

                unresolved1 = 0
                unresolved2 = 0
                for info in unresolved_taxa[rank]:
                    taxon, supported_tree_name, support1, unsupported_tree_name, support2 = info
                    desc = 'Taxon is supported in %s (%d), but not in %s (%d)' % (supported_tree_name, support1, unsupported_tree_name, support2)
                    # NOTE(review): unresolved taxa are reported with the
                    # 'incongruent' classification; confirm this is intended
                    fout_classification.write('%s\t%s\t%s\t%s\n' % (rank_label, taxon, 'incongruent', desc))

                    if supported_tree_name == tree1_name:
                        unresolved1 += 1
                    else:
                        unresolved2 += 1

                fout_stats.write('%s\t%d\t%d\t%s\t%s\n' % (rank_label,
                                                           len(congruent_taxa[rank]),
                                                           len(incongruent_taxa[rank]),
                                                           unresolved1,
                                                           unresolved2))
Esempio n. 10
0
    def median_rd_over_phyla(self,
                             tree,
                             taxa_for_dist_inference,
                             taxonomy):
        """Gather relative divergence values over all phyla rootings.

        Parameters
        ----------
        tree : Tree
          Dendropy tree.
        taxa_for_dist_inference : set
          Taxa to use for inference relative divergence distributions.
        taxonomy : d[taxon_id] -> [d__, p__, ..., s__]
          Taxonomy of extant taxa.

        Returns
        -------
        dict : d[phylum][rank_index][taxon] -> relative divergence
          Values for named clades under each rooting, excluding the
          rooting phylum and its children.
        dict : d[node id] -> list of relative divergences
          One value per rooting in which the node fell in the ingroup.
        """

        # phylum-level lineages named in the tree
        phyla_in_tree = get_phyla_lineages(tree)
        self.logger.info('Identified %d phyla.' % len(phyla_in_tree))

        rooting_phyla = [
            phylum for phylum in phyla_in_tree
            if phylum in taxa_for_dist_inference
        ]
        self.logger.info('Using %d phyla as rootings for inferring distributions.' % len(rooting_phyla))
        if len(rooting_phyla) < 2:
            self.logger.error('Rescaling requires at least 2 valid phyla.')
            sys.exit(-1)

        # number the nodes in preorder; the ids key the per-node lists below
        for index, cur_node in enumerate(tree.preorder_node_iter()):
            cur_node.id = index

        # collect relative divergences with the tree rooted on each phylum
        dists_by_phylum = {}
        dists_by_node = defaultdict(list)
        rel_dist_calc = RelativeDistance()
        for outgroup in rooting_phyla:
            phylum_key = outgroup.replace('p__', '').replace(' ', '_').lower()
            self.logger.info('Calculating information with rooting on %s.' % phylum_key.capitalize())

            rooted_tree = self.root_with_outgroup(tree, taxonomy, outgroup)

            # relative distance to named clades, without the Domain entry
            named_dists = rel_dist_calc.rel_dist_to_named_clades(rooted_tree)
            named_dists.pop(0, None)

            # drop the outgroup phylum and every group named within it
            outgroup_children = Taxonomy().children(outgroup, taxonomy)
            for taxa_at_rank in named_dists.values():
                taxa_at_rank.pop(outgroup, None)

            for child_taxon in outgroup_children:
                for taxa_at_rank in named_dists.values():
                    taxa_at_rank.pop(child_taxon, None)

            dists_by_phylum[phylum_key] = named_dists

            # relative distance to all nodes
            rel_dist_calc.decorate_rel_dist(rooted_tree)

            # the ingroup is the first child of the root whose label does
            # not mention the outgroup phylum
            ingroup_root = None
            for child in rooted_tree.seed_node.child_node_iter():
                _, child_taxa, _ = parse_label(child.label)
                if child_taxa and outgroup in child_taxa:
                    continue

                ingroup_root = child
                break

            # record the relative divergence of every ingroup node
            for ingroup_node in ingroup_root.preorder_iter():
                dists_by_node[ingroup_node.id].append(ingroup_node.rel_dist)

        return dists_by_phylum, dists_by_node
Esempio n. 11
0
def filter_taxa_for_dist_inference(tree, taxonomy, trusted_taxa, min_children, min_support):
    """Determine taxa to use for inferring distribution of relative divergences.

    Parameters
    ----------
    tree : Dendropy Tree
        Phylogenetic tree.
    taxonomy : d[taxon ID] -> [d__x; p__y; ...]
        Taxonomy for each taxon.
    trusted_taxa : iterable
        Trusted taxa to consider when inferring distribution.
    min_children : int
        Only consider taxa with at least the specified number of children taxa when inferring distribution.
    min_support : float
        Only consider taxa with at least this level of support when inferring distribution.

    Returns
    -------
    set
        Taxa passing all of the requested filters.
    """

    # determine children taxa for each named group
    taxon_children = Taxonomy().taxon_children(taxonomy)

    # get all named groups
    taxa_for_dist_inference = set()
    for taxa in taxonomy.values():
        taxa_for_dist_inference.update(taxa)

    # sanity check species names as these are a common problem
    species = set()
    species_index = Taxonomy.rank_index['s__']
    for taxon_id, taxa in taxonomy.items():
        if len(taxa) <= species_index:
            continue

        species_name = taxa[species_index]
        if species_name != 's__':
            valid, error_msg = Taxonomy().validate_species_name(species_name, require_full=True, require_prefix=True)
            if not valid:
                # single-argument print() is valid on Python 2 and 3
                print('[Warning] Species name %s for %s is invalid: %s' % (species_name, taxon_id, error_msg))
                continue

        species.add(species_name)

    # restrict taxa to those with a sufficient number of named children
    # Note: a taxonomic group with no children will not end up in the
    # taxon_children data structure so care must be taken when applying
    # this filtering criteria.
    if min_children > 0:
        valid_taxa = set()
        for taxon, children_taxa in taxon_children.items():
            if len(children_taxa) >= min_children:
                valid_taxa.add(taxon)

        taxa_for_dist_inference.intersection_update(valid_taxa)

        # explicitly add in the species since they have no
        # children and thus be absent from the taxon_child dictionary
        taxa_for_dist_inference.update(species)

    # restrict taxa used for inferring distribution to those with sufficient support
    if min_support > 0:
        warned_missing_support = False
        for node in tree.preorder_node_iter():
            if not node.label or node.is_leaf():
                continue

            # check for support value
            support, taxon_name, _auxiliary_info = parse_label(node.label)

            if not taxon_name:
                continue

            if support:
                if float(support) < min_support:
                    taxa_for_dist_inference.discard(taxon_name)
            elif not warned_missing_support:
                # no support value, so inform user (once) that they were
                # trying to filter on a property the tree does not have
                print('[Error] Tree does not contain support values. As such, --min_support should be set to 0.')
                warned_missing_support = True

    # restrict taxa used for inferring distribution to the trusted set;
    # intersection_update accepts any iterable, not only sets
    if trusted_taxa:
        taxa_for_dist_inference.intersection_update(trusted_taxa)

    return taxa_for_dist_inference
Esempio n. 12
0
    def rank_res(self, options):
        """Calculate taxonomic resolution at each rank.

        Parameters
        ----------
        options : object
            Provides the attributes read here: input_tree and taxonomy_file
            (input paths), output_file (path of the results table) and
            taxa_file (optional path for a per-taxon listing; skipped when
            falsy).
        """

        check_file_exists(options.input_tree)
        check_file_exists(options.taxonomy_file)

        resolution = defaultdict(lambda: defaultdict(int))

        taxa_out = open(options.taxa_file, 'w') if options.taxa_file else None
        try:
            if taxa_out:
                taxa_out.write('Rank\tLowest Rank\tTaxon\n')

            # determine taxonomic resolution of named groups
            tree = dendropy.Tree.get_from_path(options.input_tree,
                                               schema='newick',
                                               rooting='force-rooted',
                                               preserve_underscores=True)

            for node in tree.preorder_node_iter(lambda n: n != tree.seed_node):
                if not node.label or node.is_leaf():
                    continue

                _support, taxon_name, _auxiliary_info = parse_label(node.label)
                if not taxon_name:
                    continue

                lowest_rank = taxon_name.split(';')[-1].strip()[0:3]
                for rank_prefix in Taxonomy.rank_prefixes:
                    if rank_prefix in taxon_name:
                        resolution[rank_prefix][lowest_rank] += 1
                        if taxa_out:
                            rank_prefix_name = Taxonomy.rank_labels[Taxonomy.rank_index[rank_prefix]]
                            lowest_rank_name = Taxonomy.rank_labels[Taxonomy.rank_index[lowest_rank]]
                            taxa_out.write('%s\t%s\t%s\n' % (rank_prefix_name, lowest_rank_name, taxon_name))

            # identify any singleton taxa which are treated as having species level resolution
            with open(options.taxonomy_file) as taxonomy_in:
                for line in taxonomy_in:
                    if not line.strip():
                        continue

                    line_split = line.split('\t')
                    genome_id = line_split[0]

                    # strip each entry so a trailing newline or a space
                    # after ';' does not hide an undefined rank
                    taxonomy = [t.strip() for t in line_split[1].split(';')]

                    for i, rank_prefix in enumerate(Taxonomy.rank_prefixes):
                        if taxonomy[i] == rank_prefix:
                            # this taxa is undefined at the specified rank so
                            # must be the sole representative; e.g., a p__
                            # indicates a taxon that represents a novel phyla
                            resolution[rank_prefix]['s__'] += 1
                            if taxa_out:
                                rank_prefix_name = Taxonomy.rank_labels[Taxonomy.rank_index[rank_prefix]]
                                taxa_out.write('%s\t%s\t%s (%s)\n' % (rank_prefix_name, 'species', taxonomy[i], genome_id))
        finally:
            if taxa_out:
                taxa_out.close()

        # write out results
        with open(options.output_file, 'w') as fout:
            fout.write('Category')
            for rank in Taxonomy.rank_labels[1:]:
                fout.write('\t' + rank)
            fout.write('\n')

            for i, rank_prefix in enumerate(Taxonomy.rank_prefixes[1:]):
                fout.write(Taxonomy.rank_labels[i + 1])

                for j, r in enumerate(Taxonomy.rank_prefixes[1:]):
                    if i >= j:
                        fout.write('\t' + str(resolution[r].get(rank_prefix, 0)))
                    else:
                        fout.write('\t-')
                fout.write('\n')

        self.logger.info('Done.')
Esempio n. 13
0
    def optimal(self, input_tree, rank, min_dist, max_dist, step_size,
                output_table):
        """Determine branch length for best congruency with existing taxonomy.

        Each internal node labelled with a taxon at `rank` that also spans at
        least two labelled taxa of the next rank is summarised by its mean
        distance to terminal taxa (MDTT) and by the MDTT of its first ancestor
        spanning multiple taxa at `rank`. Every candidate threshold is then
        scored: a taxon is counted as correct when its own MDTT is at or below
        the threshold while the ancestor's MDTT is above it.

        Per-taxon distances are written to 'bl_optimal_taxa_dists.tsv' in the
        current working directory; the per-threshold summary is written to
        `output_table` and echoed to stdout.

        Parameters
        ----------
        input_tree : str
            Name of input tree.
        rank : int
            Taxonomic rank to consider (1=Phylum, ..., 6=Species). Both
            `rank` and `rank + 1` are used to index Taxonomy.rank_prefixes.
        min_dist : float
            Smallest threshold of the regular grid to evaluate.
        max_dist : float
            Upper end of the regular grid of thresholds to evaluate.
        step_size : float
            Spacing of the regular grid of thresholds.
        output_table : str
            Name of output table.

        Returns
        -------
        tuple
            (threshold, correct, incorrect) for the threshold with the highest
            precision; thresholds are scanned from largest to smallest and the
            first one reaching the best precision wins. The threshold is None
            (with zero counts) when no threshold has a precision above zero.
        """

        # read tree
        self.logger.info('Reading tree.')
        tree = dendropy.Tree.get_from_path(input_tree,
                                           schema='newick',
                                           rooting='force-rooted',
                                           preserve_underscores=True)

        # get mean distance to terminal taxa for each node along with
        # other stats needed to determine classification
        self.logger.info('Determining MDTT for each node.')
        rank_prefix = Taxonomy.rank_prefixes[rank]
        child_rank_prefix = Taxonomy.rank_prefixes[rank + 1]
        rank_info = []
        rank_dists = set()
        for node in tree.seed_node.preorder_internal_node_iter():
            if node == tree.seed_node:
                continue

            # check if node is at the specified rank; when several taxa in
            # the label match, the last one is used
            node_taxon = None
            if node.label:
                _support, taxon_name, _auxiliary_info = parse_label(node.label)

                if taxon_name:
                    for taxon in [x.strip() for x in taxon_name.split(';')]:
                        if taxon.startswith(rank_prefix):
                            node_taxon = taxon

            if not node_taxon:
                continue

            # check that node has two descendants at the next rank; the
            # scan stops as soon as two have been seen
            child_rank_taxa = []
            for c in node.levelorder_iter():
                if c.label:
                    _support, taxon_name, _auxiliary_info = parse_label(c.label)

                    if taxon_name:
                        for taxon in [x.strip() for x in taxon_name.split(';')]:
                            if taxon.startswith(child_rank_prefix):
                                child_rank_taxa.append(taxon)

                if len(child_rank_taxa) >= 2:
                    break

            if len(child_rank_taxa) < 2:
                continue

            # get mean branch length to terminal taxa
            node_dist = np_mean([self._dist_to_ancestor(t, node)
                                 for t in node.leaf_iter()])

            # get mean branch length to terminal taxa for first ancestor
            # spanning multiple taxa at the specified rank
            ancestor = self._ancestor_multiple_taxa_at_rank(node, rank_prefix)
            ancestor_dist = np_mean([self._dist_to_ancestor(t, ancestor)
                                     for t in ancestor.leaf_iter()])

            rank_info.append([node_dist, ancestor_dist, node_taxon])
            rank_dists.add(node_dist)

        self.logger.info(
            'Calculating threshold from %d taxa with specified rank resolution.'
            % len(rank_info))

        with open('bl_optimal_taxa_dists.tsv', 'w') as fout:
            fout.write('Taxon\tNode MDTT\tMulti-phyla Ancestor MDTT\n')
            for node_dist, ancestor_dist, node_taxon in rank_info:
                fout.write('%s\t%.3f\t%.3f\n' %
                           (node_taxon, node_dist, ancestor_dist))

        # candidate thresholds are the observed node distances plus a
        # regular grid spanning the requested range
        for d in np_arange(min_dist, max_dist + step_size, step_size):
            rank_dists.add(d)

        # initialised up front so the return below is well defined even when
        # no threshold achieves a precision above zero
        top_threshold = None
        top_correct = 0
        top_incorrect = 0
        top_precision = 0

        # report number of correct and incorrect taxa for each threshold
        header = 'Threshold\tCorrect\tIncorrect\tPrecision\tNo. Lineages\tNo. Multiple Taxa Lineages\tNo. Terminal Lineages'
        with open(output_table, 'w') as fout:
            fout.write(header + '\n')
            print(header)

            for dist_threshold in sorted(rank_dists, reverse=True):
                correct = 0
                incorrect = 0
                for node_dist, ancestor_dist, _node_taxon in rank_info:
                    # check if node/edge would be collapsed at the given threshold
                    if node_dist <= dist_threshold and ancestor_dist > dist_threshold:
                        correct += 1
                    else:
                        # either the node itself is above the threshold or
                        # the threshold is above an ancestor with multiple taxa
                        incorrect += 1

                denominator = correct + incorrect
                if denominator:
                    precision = float(correct) / denominator
                else:
                    precision = 0

                num_lineages, num_terminal_lineages = self._num_lineages(
                    tree, dist_threshold)

                row = '%f\t%d\t%d\t%.3f\t%d\t%d\t%d' % (
                    dist_threshold, correct, incorrect, precision,
                    num_lineages + num_terminal_lineages,
                    num_lineages, num_terminal_lineages)

                fout.write(row + '\n')
                print(row)

                if precision > top_precision:
                    top_correct = correct
                    top_incorrect = incorrect
                    top_precision = precision
                    top_threshold = dist_threshold

        return top_threshold, top_correct, top_incorrect
Esempio n. 14
0
    def decorate(self,
                    input_tree,
                    taxonomy_file,
                    threshold,
                    rank,
                    retain_named_lineages,
                    keep_labels,
                    prune,
                    output_tree):
        """Label lineages whose mean branch length to tips is within a threshold.

        The tree is traversed from the root. The first node on each path whose
        mean distance to its leaves is at or below `threshold` receives a new
        label built from the taxa of its leaves at the given rank (or
        'Unclassified lineage' when none are named) followed by a running
        number; its descendants are not visited.

        Parameters
        ----------
        input_tree : str
            Name of input tree.
        taxonomy_file : str
            File with taxonomic information for each taxon.
        threshold : float
            Branch length threshold.
        rank : int
            Rank of labels to retain on tree. Used as an index into
            Taxonomy.rank_prefixes and into each taxon's taxonomy list.
        retain_named_lineages : bool
            Retain existing named lineages at the specified rank: nodes with
            such a label on themselves, on an ancestor, or anywhere in their
            subtree are not relabelled.
        keep_labels : bool
            Keep existing labels on tree. When false, labels of internal
            nodes that were not newly decorated are reduced to their support
            value.
        prune : bool
            Prune tree to preserve only the shallowest and deepest taxa in each lineage.
        output_tree : str
            Name of output tree.
        """

        # read taxonomy
        taxonomy = Taxonomy().read(taxonomy_file)

        # read tree
        self.logger.info('Reading tree.')
        tree = dendropy.Tree.get_from_path(input_tree,
                                            schema='newick',
                                            rooting='force-rooted',
                                            preserve_underscores=True)

        # decorate tree
        rank_prefix = Taxonomy.rank_prefixes[rank]
        new_name_number = defaultdict(int)

        def taxa_at_rank(label):
            """Return the taxa named in a node label that carry the rank prefix."""
            if not label:
                return []
            _support, taxon_name, _auxiliary_info = parse_label(label)
            if not taxon_name:
                return []
            return [taxon
                    for taxon in (x.strip() for x in taxon_name.split(';'))
                    if taxon.startswith(rank_prefix)]

        labeled_nodes = set()

        stack = [tree.seed_node]
        while stack:
            node = stack.pop()

            # check if node is a leaf
            if node.is_leaf():
                continue

            # check if the node or one of its ancestors already has a label
            # at this rank; the last matching taxon of the closest label wins
            p = node
            parent_taxon = None
            while p and not parent_taxon:
                matches = taxa_at_rank(p.label)
                if matches:
                    parent_taxon = matches[-1]
                p = p.parent_node

            if retain_named_lineages and parent_taxon:
                stack.extend(node.child_node_iter())
                continue

            # check if descendant node already has a label at this rank
            children_taxon = []
            for c in node.preorder_internal_node_iter():
                children_taxon.extend(taxa_at_rank(c.label))

            if retain_named_lineages and children_taxon:
                stack.extend(node.child_node_iter())
                continue

            # check if node meets mean branch length criterion
            dists_to_tips = [self._dist_to_ancestor(t, node)
                             for t in node.leaf_iter()]
            if np_mean(dists_to_tips) > threshold:
                stack.extend(node.child_node_iter())
                continue

            # collect the names at this rank of all taxa below the node
            taxa_labels = set()
            for leaf in node.leaf_iter():
                leaf_taxonomy = taxonomy[leaf.taxon.label]
                # drop the 3 character rank prefix (e.g. 'p__')
                taxon = leaf_taxonomy[rank][3:].replace('Candidatus ', '')
                if taxon:
                    taxa_labels.add(taxon)

            if parent_taxon:
                taxa_labels.add(parent_taxon[3:].replace('Candidatus ', ''))
            elif children_taxon:
                for c in children_taxon:
                    taxa_labels.add(c[3:].replace('Candidatus ', ''))

            # name lineage based on position to existing named lineages
            if taxa_labels:
                lineage_name = ', '.join(sorted(taxa_labels))
            else:
                lineage_name = 'Unclassified lineage'

            support = None
            if node.label:  # preserve support information
                support, _taxon_name, _auxiliary_info = parse_label(node.label)

            new_name_number[lineage_name] += 1

            if support:
                node.label = '%d:%s %d' % (support, lineage_name, new_name_number[lineage_name])
            else:
                node.label = '%s %d' % (lineage_name, new_name_number[lineage_name])

            labeled_nodes.add(node)

        # strip previous labels
        if not keep_labels:
            for node in tree.preorder_internal_node_iter():
                if node in labeled_nodes:
                    continue

                if node.label:  # preserve support information
                    support, _taxon_name, _auxiliary_info = parse_label(node.label)
                    node.label = support

        # prune tree to shallowest and deepest taxa in each named lineage
        if prune:
            nodes_to_prune = set()
            for node in labeled_nodes:
                for c in node.child_node_iter():
                    dists = [(self._dist_to_ancestor(t, node), t)
                             for t in c.leaf_iter()]

                    # order by distance only so ties never fall back to
                    # comparing node objects
                    dists.sort(key=lambda pair: pair[0])

                    # select taxa at the 10th and 90th percentiles to
                    # give a good sense of the range of depths; indices are
                    # clamped as rounding otherwise points one past the end
                    # for small clades and the deepest taxon would be lost
                    last_index = len(dists) - 1
                    perc_10th_index = min(int(0.1 * len(dists) + 0.5), last_index)
                    perc_90th_index = min(int(0.9 * len(dists) + 0.5), last_index)
                    for i, (_d, t) in enumerate(dists):
                        if i != perc_10th_index and i != perc_90th_index:
                            nodes_to_prune.add(t.taxon)

            print('before prune %d' % sum(1 for _ in tree.leaf_node_iter()))
            tree.prune_taxa(nodes_to_prune)
            print('after prune %d' % sum(1 for _ in tree.leaf_node_iter()))

        self.logger.info('Decorated %d internal nodes.' % sum(new_name_number.values()))

        tree.write_to_path(output_tree, schema='newick', suppress_rooting=True, unquoted_underscores=True)
Esempio n. 15
0
    def run(self, input_tree,
                    taxonomy_file,
                    output_dir,
                    plot_taxa_file,
                    plot_dist_taxa_only,
                    plot_domain,
                    highlight_polyphyly,
                    highlight_taxa_file,
                    trusted_taxa_file,
                    fixed_root,
                    min_children,
                    min_support,
                    mblet,
                    fmeasure_table,
                    min_fmeasure,
                    fmeasure_mono,
                    verbose_table):
        """Determine distribution of taxa at each taxonomic rank.

        Writes distribution tables, plots and outlier tables into
        `output_dir` (which is expected to exist), followed by the relative
        divergence of each node ('<tree>.node_rd.tsv') and the tree itself
        ('<tree>.scaled.tree').

        Parameters
        ----------
        input_tree : str
          Name of input tree.
        taxonomy_file : str
          File with taxonomy strings for each taxa.
        output_dir : str
          Desired output directory.
        plot_taxa_file : str
          File specifying taxa to plot. Set to None to consider all taxa.
        plot_dist_taxa_only : boolean
          Only plot the taxa used to infer distribution.
        plot_domain : boolean
          Plot domain rank. NOTE(review): not referenced by this method.
        highlight_polyphyly : boolean
          Forwarded to the plotting helpers; presumably highlights
          polyphyletic taxa - confirm against those helpers.
        highlight_taxa_file : str
          File whose first tab-separated column lists taxa to highlight.
          Set to None to highlight nothing.
        trusted_taxa_file : str
          File specifying trusted taxa to consider when inferring distribution. Set to None to consider all taxa.
        fixed_root : boolean
          Use a single fixed root to infer outliers.
        min_children : int
          Only consider taxa with at least the specified number of children taxa when inferring distribution.
        min_support : float
          Only consider taxa with at least this level of support when inferring distribution.
        mblet : boolean
          Infer distances with self.mblet() instead of self.rd_fixed_root();
          implies the single fixed root style of output.
        fmeasure_table : str
          File read with self.read_fmeasure(). Set to None to skip.
        min_fmeasure : float
          Forwarded to filter_taxa_for_dist_inference() together with the
          F-measure table.
        fmeasure_mono : float
          Forwarded to the plotting helpers.
        verbose_table : boolean
          Print additional columns in output table.
        """

        # read tree
        self.logger.info('Reading tree.')
        tree = dendropy.Tree.get_from_path(input_tree,
                                            schema='newick',
                                            rooting='force-rooted',
                                            preserve_underscores=True)

        input_tree_name = os.path.splitext(os.path.basename(input_tree))[0]

        # pull taxonomy from tree and file
        self.logger.info('Reading taxonomy.')
        taxonomy = Taxonomy().read(taxonomy_file)
        tree_taxonomy = Taxonomy().read_from_tree(input_tree,
                                                    warnings=False)

        gtdb_parent_ranks = Taxonomy().parents(tree_taxonomy)

        # read trusted taxa
        trusted_taxa = None
        if trusted_taxa_file:
            trusted_taxa = read_taxa_file(trusted_taxa_file)

        # read F-measure for taxa
        fmeasure = None
        if fmeasure_table:
            fmeasure = self.read_fmeasure(fmeasure_table)

        # determine taxa to be used for inferring distribution
        taxa_for_dist_inference = filter_taxa_for_dist_inference(tree,
                                                                    taxonomy,
                                                                    trusted_taxa,
                                                                    min_children,
                                                                    min_support,
                                                                    fmeasure,
                                                                    min_fmeasure)

        # limit plotted taxa
        if plot_dist_taxa_only:
            taxa_to_plot = taxa_for_dist_inference
        elif plot_taxa_file:
            taxa_to_plot = read_taxa_file(plot_taxa_file)
        else:
            # plot every taxon defined in tree
            taxa_to_plot = set()
            for node in tree.preorder_node_iter():
                if not node.label:
                    continue

                _support, taxon, _auxiliary_info = parse_label(node.label)
                if taxon:
                    # get most specific taxon from compound names
                    # (e.g. p__Armatimonadetes; c__Chthonomonadetes)
                    taxa_to_plot.add(taxon.split(';')[-1].strip())

        # built once as it is needed for every rank of every phylum below
        plot_set = set(taxa_to_plot) if taxa_to_plot else None

        # highlight taxa
        highlight_taxa = set()
        if highlight_taxa_file:
            with open(highlight_taxa_file) as highlight_in:
                for line in highlight_in:
                    highlight_taxa.add(line.strip().split('\t')[0])

        # check if a single fixed root should be used
        if fixed_root or mblet:
            self.logger.info('Using single fixed rooting for inferring distributions.')
            if not mblet:
                rel_dists = self.rd_fixed_root(tree, taxa_for_dist_inference)
            else:
                rel_dists = self.mblet(tree, taxa_for_dist_inference)

            # create fixed rooting style tables and plots
            distribution_table = os.path.join(output_dir, '%s.rank_distribution.tsv' % input_tree_name)
            plot_file = os.path.join(output_dir, '%s.png' % input_tree_name)
            self._distribution_plot(rel_dists,
                                        taxa_for_dist_inference,
                                        highlight_polyphyly,
                                        highlight_taxa,
                                        distribution_table,
                                        fmeasure,
                                        fmeasure_mono,
                                        plot_file)

            median_outlier_table = os.path.join(output_dir, '%s.tsv' % input_tree_name)
            self._median_outlier_file(rel_dists,
                                        taxa_for_dist_inference,
                                        gtdb_parent_ranks,
                                        median_outlier_table)
        else:
            # calculate relative distance to taxa
            rd = RelativeDistance()
            rel_dists = rd.rel_dist_to_named_clades(tree)

            # restrict to taxa of interest
            if plot_set:
                for r in rel_dists:
                    for k in set(rel_dists[r].keys()) - plot_set:
                        del rel_dists[r][k]

            # report number of taxa at each rank
            print('')
            print('Rank\tTaxa to Plot\tTaxa for Inference')
            for rank, taxa in rel_dists.items():
                taxa_for_inference = [x for x in taxa if x in taxa_for_dist_inference]
                print('%s\t%d\t%d' % (Taxonomy.rank_labels[rank], len(taxa), len(taxa_for_inference)))
            print('')

            # determine relative distances under each phylum-level rooting
            phylum_rel_dists, rel_node_dists = self.median_rd_over_phyla(tree,
                                                                            taxa_for_dist_inference)

            # set edge lengths to median value over all rootings
            tree.seed_node.rel_dist = 0.0
            for n in tree.preorder_node_iter(lambda n: n != tree.seed_node):
                n.rel_dist = np_median(rel_node_dists[n.id])
                rd_to_parent = n.rel_dist - n.parent_node.rel_dist
                if rd_to_parent < 0:
                    self.logger.warning('Not all branches are positive after scaling.')
                n.edge_length = rd_to_parent

            for phylum, phylum_rd in phylum_rel_dists.items():
                phylum_dir = os.path.join(output_dir, phylum)
                if not os.path.exists(phylum_dir):
                    os.makedirs(phylum_dir)

                # restrict to taxa of interest; this edits the dictionaries
                # held by phylum_rel_dists in place, so the summary plot and
                # table below see the restricted taxa as well
                if plot_set:
                    for r in phylum_rd:
                        for k in set(phylum_rd[r].keys()) - plot_set:
                            del phylum_rd[r][k]

                # create distribution plot
                distribution_table = os.path.join(phylum_dir, '%s.rank_distribution.tsv' % phylum)
                plot_file = os.path.join(phylum_dir, '%s.rank_distribution.png' % phylum)
                self._distribution_plot(phylum_rd,
                                        taxa_for_dist_inference,
                                        highlight_polyphyly,
                                        highlight_taxa,
                                        distribution_table,
                                        fmeasure,
                                        fmeasure_mono,
                                        plot_file)

                median_outlier_table = os.path.join(phylum_dir, '%s.median_outlier.tsv' % phylum)
                self._median_outlier_file(phylum_rd,
                                            taxa_for_dist_inference,
                                            gtdb_parent_ranks,
                                            median_outlier_table)

            plot_file = os.path.join(output_dir, '%s.png' % input_tree_name)
            self._distribution_summary_plot(phylum_rel_dists,
                                            taxa_for_dist_inference,
                                            highlight_polyphyly,
                                            highlight_taxa,
                                            fmeasure,
                                            fmeasure_mono,
                                            plot_file)

            median_outlier_table = os.path.join(output_dir, '%s.tsv' % input_tree_name)
            median_rank_file = os.path.join(output_dir, '%s.dict' % input_tree_name)
            self._median_summary_outlier_file(phylum_rel_dists,
                                                taxa_for_dist_inference,
                                                gtdb_parent_ranks,
                                                median_outlier_table,
                                                median_rank_file,
                                                verbose_table)

        output_rd_file = os.path.join(output_dir, '%s.node_rd.tsv' % input_tree_name)
        self._write_rd(tree, output_rd_file)

        output_tree = os.path.join(output_dir, '%s.scaled.tree' % input_tree_name)
        tree.write_to_path(output_tree,
                            schema='newick',
                            suppress_rooting=True,
                            unquoted_underscores=True)
Esempio n. 16
0
def filter_taxa_for_dist_inference(tree,
                                   taxonomy,
                                   trusted_taxa,
                                   min_children,
                                   min_support,
                                   fmeasure=None,
                                   min_fmeasure=None):
    """Determine taxa to use for inferring distribution of relative divergences.

    Species names in `taxonomy` are validated first and a warning is printed
    for each invalid one; this does not influence the returned taxa.

    Parameters
    ----------
    tree : Dendropy Tree
        Phylogenetic tree.
    taxonomy : d[taxon ID] -> [d__x; p__y; ...]
        Taxonomy for each taxon.
    trusted_taxa : iterable
        Trusted taxa to consider when inferring distribution. Set to None
        (or leave empty) to consider all taxa.
    min_children : int
        Only consider taxa with at least the specified number of children taxa when inferring distribution.
    min_support : float
        Only consider taxa with at least this level of support when inferring distribution.
    fmeasure : d[taxon] -> F-measure
        F-measure of each taxon. Set to None to skip this filter.
    min_fmeasure : float
        Only consider taxa with at least this F-measure; only used when
        `fmeasure` is given.

    Returns
    -------
    set
        Most specific taxon name of each labelled internal node passing all
        filters.

    Raises
    ------
    SystemExit
        With a non-zero status if `min_support` is positive and a labelled
        node has no support value.
    """

    # sanity check species names as these are a common problem
    for taxon_id, taxa in taxonomy.items():
        if len(taxa) <= Taxonomy.rank_index['s__']:
            continue

        species_name = taxa[Taxonomy.rank_index['s__']]
        if species_name == 's__':
            # undefined species, nothing to validate
            continue

        valid, error_msg = Taxonomy().validate_species_name(
            species_name, require_full=True, require_prefix=True)
        if not valid:
            print('[Warning] Species name %s for %s is invalid: %s' % (
                species_name, taxon_id, error_msg))

    # restrict taxa to those with a sufficient number
    # of named children and sufficient support
    taxa_for_dist_inference = set()
    for node in tree.preorder_node_iter():
        if not node.label or node.is_leaf():
            continue

        support, taxon, _auxiliary_info = parse_label(node.label)
        if not taxon:
            continue

        # get most specific taxon from compound names
        # (e.g. p__Armatimonadetes; c__Chthonomonadetes)
        taxon = taxon.split(';')[-1].strip()

        if support and min_support > 0 and support < min_support:
            continue

        if not support and min_support > 0:
            # no support value, so inform user if they were trying to filter on this property
            print('[Error] Tree does not contain support values. As such, --min_support must be set to 0.')
            # non-zero status so calling scripts can detect the failure
            sys.exit(1)

        if fmeasure and fmeasure[taxon] < min_fmeasure:
            continue

        # count number of subordinate children
        rank_prefix = taxon[0:3]
        if min_children > 0 and rank_prefix != 's__':
            child_rank_index = Taxonomy().rank_index[rank_prefix] + 1
            child_rank_prefix = Taxonomy.rank_prefixes[child_rank_index]
            subordinate_taxa = set()
            for leaf in node.leaf_iter():
                # leaves without taxonomy fall back to the bare rank
                # prefixes, which are ignored as undefined below
                taxa = taxonomy.get(leaf.taxon.label, Taxonomy.rank_prefixes)
                if len(taxa) > child_rank_index:
                    sub_taxon = taxa[child_rank_index]
                    if (sub_taxon != child_rank_prefix
                            and sub_taxon.startswith(child_rank_prefix)):
                        subordinate_taxa.add(sub_taxon)

            if len(subordinate_taxa) < min_children:
                continue

        taxa_for_dist_inference.add(taxon)

    # restrict taxa used for inferring distribution to the trusted set;
    # set.intersection accepts any iterable, so lists work as well as sets
    if trusted_taxa:
        taxa_for_dist_inference = taxa_for_dist_inference.intersection(
            trusted_taxa)

    return taxa_for_dist_inference
Esempio n. 17
0
    def run(self, tree1_file, tree2_file, output_dir, min_support, min_taxa,
            named_only):
        """Calculate supported topological differences between trees.
        
        Parameters
        ----------
        tree1_file : str
            File with tree in Newick format.
        tree2_file : str
            File with tree in Newick format.
        output_dir : str
            Output directory.
        min_support : float
            Minimum value to consider a lineage well supported.
        min_taxa : int
            Only consider lineage with sufficient number of taxa.
        named_only : boolean
            Only consider named lineages.  
        """

        if not named_only:
            self.logger.error(
                "This command currently assumes the 'named_only' flag will be thrown."
            )
            sys.exit()

        tree1_name = os.path.splitext(os.path.basename(tree1_file))[0]
        tree2_name = os.path.splitext(os.path.basename(tree2_file))[0]

        tree1 = dendropy.Tree.get_from_path(tree1_file,
                                            schema='newick',
                                            rooting='force-rooted',
                                            preserve_underscores=True)

        tree2 = dendropy.Tree.get_from_path(tree2_file,
                                            schema='newick',
                                            rooting='force-rooted',
                                            preserve_underscores=True)

        # prune both trees to the set of common taxa
        taxa1 = set()
        for t in tree1.leaf_node_iter():
            taxa1.add(t.taxon.label)

        taxa2 = set()
        for t in tree2.leaf_node_iter():
            taxa2.add(t.taxon.label)

        taxa_in_common = taxa1.intersection(taxa2)
        self.logger.info('Tree 1 contains %d taxa.' % len(taxa1))
        self.logger.info('Tree 2 contains %d taxa.' % len(taxa2))
        self.logger.info('Pruning trees to the %d taxa in common.' %
                         len(taxa_in_common))

        tree1.retain_taxa_with_labels(taxa_in_common)
        tree2.retain_taxa_with_labels(taxa_in_common)

        # identify nodes meeting specified criteria
        tree1_nodes = {}
        tree2_nodes = {}
        node_support1 = {}
        node_support2 = {}
        for tree, tree_nodes, support_values in ([
                tree1, tree1_nodes, node_support1
        ], [tree2, tree2_nodes, node_support2]):
            for n in tree.preorder_internal_node_iter():
                support, taxon_name, _auxiliary_info = parse_label(n.label)
                if named_only and not taxon_name:
                    continue

                if not support:
                    continue

                support = int(support)
                support_values[taxon_name] = support

                num_taxa = sum([1 for _ in n.leaf_iter()])
                if support >= min_support and num_taxa >= min_taxa:
                    tree_nodes[taxon_name] = [support, num_taxa, n]

        self.logger.info('Tree 1 has %d supported nodes.' % len(tree1_nodes))
        self.logger.info('Tree 2 has %d supported nodes.' % len(tree2_nodes))

        # compare supported nodes between the two trees
        diffs = {}
        congruent_taxa = defaultdict(
            list)  # same node bootstrap supported in both trees
        incongruent_taxa = defaultdict(
            list
        )  # node supported in both trees, but have different extant taxa
        unresolved_taxa = defaultdict(
            list
        )  # supported node in one tree is not present and/or well support in the other tree

        for taxon, data1 in tree1_nodes.items():
            most_specific_taxon = taxon.split(';')[-1].strip()
            rank_index = Taxonomy.rank_prefixes.index(most_specific_taxon[0:3])
            support1, num_taxa1, node1 = data1

            if taxon in tree2_nodes:
                support2, num_taxa2, node2 = tree2_nodes[taxon]

                taxa1 = set([t.taxon.label for t in node1.leaf_iter()])
                taxa2 = set([t.taxon.label for t in node2.leaf_iter()])

                diff_taxa = taxa1.symmetric_difference(taxa2)

                if len(diff_taxa) > 0:
                    diffs[taxon] = [
                        len(diff_taxa), ','.join(taxa1 - taxa2),
                        ','.join(taxa2 - taxa1)
                    ]
                    incongruent_taxa[rank_index].append(
                        (taxon, len(diff_taxa)))
                else:
                    congruent_taxa[rank_index].append(
                        (taxon, support1, support2))
            else:
                unresolved_taxa[rank_index].append(
                    (taxon, tree1_name, support1, tree2_name,
                     node_support2.get(taxon, -1)))

        # identify unresolved taxa in tree 2
        for taxon, data2 in tree2_nodes.items():
            support2, num_taxa2, node2 = data1
            if taxon not in tree1_nodes:
                unresolved_taxa[rank_index].append(
                    (taxon, tree2_name, support2, tree1_name,
                     node_support1.get(taxon, -1)))

        # write out difference in extant taxa for incongruent taxa
        tax_diff_file = os.path.join(output_dir, 'incongruent_taxa.tsv')
        fout = open(tax_diff_file, 'w')
        fout.write(
            'Taxon\tNo. Incongruent Taxa\tTree1 - Tree2\tTree2 - Tree1\n')
        for taxon in Taxonomy().sort_taxa(list(diffs.keys())):
            num_diffs, t12_diff_str, t21_diff_str = diffs[taxon]
            fout.write('%s\t%d\t%s\t%s\n' %
                       (taxon, num_diffs, t12_diff_str, t21_diff_str))

        fout.close()

        # write out classification of each node
        classification_file = os.path.join(output_dir,
                                           'taxon_classification.tsv')
        fout_classification = open(classification_file, 'w')
        fout_classification.write('Rank\tTaxon\tClassification\tDescription\n')

        stats_file = os.path.join(output_dir, 'tree_diff_stats.tsv')
        fout_stats = open(stats_file, 'w')
        fout_stats.write(
            'Rank\tCongruent\tIncongruent\tUnresolved for %s\tUnresolved for %s\n'
            % (tree1_name, tree2_name))
        for rank, rank_label in enumerate(Taxonomy.rank_labels):
            for info in congruent_taxa[rank]:
                taxon, support1, support2 = info

                desc = 'Taxon is congruent with %d and %d support.' % (
                    support1, support2)
                fout_classification.write(
                    '%s\t%s\t%s\t%s\n' %
                    (rank_label, taxon, 'congruent', desc))

            for info in incongruent_taxa[rank]:
                taxon, num_diff_taxa = info
                desc = 'Taxon has %d extant taxa in disagreement.' % num_diff_taxa
                fout_classification.write(
                    '%s\t%s\t%s\t%s\n' %
                    (rank_label, taxon, 'incongruent', desc))

            unresolved1 = 0
            unresolved2 = 0
            for info in unresolved_taxa[rank]:
                taxon, supported_tree_name, support1, unsupported_tree_name, support2 = info
                desc = 'Taxon is supported in %s (%d), but not in %s (%d)' % (
                    supported_tree_name, support1, unsupported_tree_name,
                    support2)
                fout_classification.write(
                    '%s\t%s\t%s\t%s\n' %
                    (rank_label, taxon, 'incongruent', desc))

                if supported_tree_name == tree1_name:
                    unresolved1 += 1
                else:
                    unresolved2 += 1

            fout_stats.write(
                '%s\t%d\t%d\t%s\t%s\n' %
                (rank_label, len(congruent_taxa[rank]),
                 len(incongruent_taxa[rank]), unresolved1, unresolved2))

        fout_classification.close()
        fout_stats.close()
Esempio n. 18
0
    def rank_res(self, options):
        """Calculate taxonomic resolution at each rank.

        For each named internal node other than the root, the lowest rank in
        its label is tallied against every rank prefix found in that label.
        Genomes whose taxonomy is undefined at a rank (just the bare rank
        prefix) are then counted as having species-level resolution at that
        rank. The resulting matrix is written to `options.output_file`.

        Parameters
        ----------
        options : object
            Options providing `input_tree`, `taxonomy_file`, `output_file`,
            and `taxa_file`. If `taxa_file` is set, a per-taxon table is
            also written to that path.
        """

        check_file_exists(options.input_tree)
        check_file_exists(options.taxonomy_file)

        # The optional per-taxon table is appended to while processing both
        # the tree and the taxonomy file, so it is opened up front and closed
        # in the finally clause even if processing fails.
        taxa_out = open(options.taxa_file, 'w') if options.taxa_file else None
        try:
            if taxa_out is not None:
                taxa_out.write('Rank\tLowest Rank\tTaxon\n')

            # determine taxonomic resolution of named groups
            tree = dendropy.Tree.get_from_path(options.input_tree,
                                               schema='newick',
                                               rooting='force-rooted',
                                               preserve_underscores=True)

            rank_res = defaultdict(lambda: defaultdict(int))
            for node in tree.preorder_node_iter(lambda n: n != tree.seed_node):
                if not node.label or node.is_leaf():
                    continue

                _support, taxon_name, _auxiliary_info = parse_label(node.label)
                if not taxon_name:
                    continue

                # rank prefix (e.g. 'g__') of the most specific taxon in the label
                lowest_rank = taxon_name.split(';')[-1].strip()[0:3]
                for rank_prefix in Taxonomy.rank_prefixes:
                    if rank_prefix not in taxon_name:
                        continue

                    rank_res[rank_prefix][lowest_rank] += 1
                    if taxa_out is not None:
                        rank_prefix_name = Taxonomy.rank_labels[
                            Taxonomy.rank_index[rank_prefix]]
                        lowest_rank_name = Taxonomy.rank_labels[
                            Taxonomy.rank_index[lowest_rank]]
                        taxa_out.write('%s\t%s\t%s\n' %
                                       (rank_prefix_name, lowest_rank_name,
                                        taxon_name))

            # identify any singleton taxa which are treated as having species level resolution
            with open(options.taxonomy_file) as taxonomy_in:
                for line in taxonomy_in:
                    if not line.strip():
                        continue  # tolerate blank lines

                    # Drop the line terminator; otherwise the final rank is
                    # read as e.g. 's__\n' and never equals its bare prefix.
                    line_split = line.rstrip('\r\n').split('\t')
                    genome_id = line_split[0]
                    taxonomy = line_split[1].split(';')

                    for i, rank_prefix in enumerate(Taxonomy.rank_prefixes):
                        if taxonomy[i] == rank_prefix:
                            # this taxa is undefined at the specified rank so
                            # must be the sole representative; e.g., a p__
                            # indicates a taxon that represents a novel phyla
                            rank_res[rank_prefix]['s__'] += 1
                            if taxa_out is not None:
                                rank_prefix_name = Taxonomy.rank_labels[
                                    Taxonomy.rank_index[rank_prefix]]
                                taxa_out.write('%s\t%s\t%s (%s)\n' %
                                               (rank_prefix_name, 'species',
                                                taxonomy[i], genome_id))
        finally:
            if taxa_out is not None:
                taxa_out.close()

        # write out results: rows are the lowest rank of a named group,
        # columns are the ranks it was tallied under; cells above the
        # diagonal are not applicable and written as '-'
        with open(options.output_file, 'w') as fout:
            fout.write('Category')
            for rank in Taxonomy.rank_labels[1:]:
                fout.write('\t' + rank)
            fout.write('\n')

            for i, rank_prefix in enumerate(Taxonomy.rank_prefixes[1:]):
                fout.write(Taxonomy.rank_labels[i + 1])

                for j, r in enumerate(Taxonomy.rank_prefixes[1:]):
                    if i >= j:
                        fout.write('\t' + str(rank_res[r].get(rank_prefix, 0)))
                    else:
                        fout.write('\t-')
                fout.write('\n')

        self.logger.info('Done.')
Esempio n. 19
0
    def run(self, input_tree, trusted_taxa_file, min_children, taxonomy_file,
            output_dir):
        """Calculate distribution of branch lengths at each taxonomic rank.

        Writes `<tree>.taxa_bl_dist.tsv`, `<tree>.rank_bl_dist.tsv` and
        `<tree>.node_bl_dist.tsv` to the output directory, where `<tree>` is
        the base name of the input tree. When no taxonomy file is given, the
        taxonomy read from the tree is also written to `<tree>.taxonomy.tsv`.

        Parameters
        ----------
        input_tree : str
            Name of input tree.
        trusted_taxa_file : str
            File specifying trusted taxa to consider when inferring distribution. Set to None to consider all taxa.
        min_children : int
            Only consider taxa with at least the specified number of children taxa when inferring distribution.
        taxonomy_file : str
            File containing taxonomic information for leaf nodes (if NULL, read taxonomy from tree).
        output_dir : str
            Desired output directory.
        """

        tree = dendropy.Tree.get_from_path(input_tree,
                                           schema='newick',
                                           rooting='force-rooted',
                                           preserve_underscores=True)

        input_tree_name = os.path.splitext(os.path.basename(input_tree))[0]

        # pull taxonomy from tree
        if not taxonomy_file:
            self.logger.info('Reading taxonomy from tree.')
            taxonomy_file = os.path.join(output_dir,
                                         '%s.taxonomy.tsv' % input_tree_name)
            taxonomy = Taxonomy().read_from_tree(input_tree)
            Taxonomy().write(taxonomy, taxonomy_file)
        else:
            self.logger.info('Reading taxonomy from file.')
            taxonomy = Taxonomy().read(taxonomy_file)

        # read trusted taxa
        # NOTE(review): trusted_taxa is read but never used; an empty set is
        # passed to the filter below. Confirm whether it should be passed.
        trusted_taxa = None
        if trusted_taxa_file:
            trusted_taxa = read_taxa_file(trusted_taxa_file)

        # determine taxa to be used for inferring distribution
        taxa_for_dist_inference = filter_taxa_for_dist_inference(
            tree, taxonomy, set(), min_children, -1)

        # determine branch lengths to leaves for named lineages
        rank_bl_dist = defaultdict(list)
        taxa_bl_dist = defaultdict(list)
        taxa_at_rank = defaultdict(list)
        for node in tree.postorder_node_iter():
            if node.is_leaf() or not node.label:
                continue

            _support, taxon, _auxiliary_info = parse_label(node.label)
            if not taxon:
                continue

            # get most specific rank in multi-rank taxa string
            taxa = [t.strip() for t in taxon.split(';')]
            taxon = taxa[-1]

            most_specific_rank = taxon[0:3]
            taxa_at_rank[Taxonomy.rank_index[most_specific_rank]].append(taxon)

            # every taxon named on this node receives the distance from the
            # node to each of its leaves
            for n in node.leaf_iter():
                dist_to_node = self._dist_to_ancestor(n, node)

                for t in taxa:
                    taxa_bl_dist[t].append(dist_to_node)

            rank = Taxonomy.rank_labels[
                Taxonomy.rank_index[most_specific_rank]]
            if rank != 'species' or Taxonomy().validate_species_name(taxon):
                if taxon in taxa_for_dist_inference:
                    rank_bl_dist[rank].append(np_mean(taxa_bl_dist[taxon]))

        # report number of taxa at each rank
        print('')
        print('Rank\tTaxa\tTaxa for Inference')
        for rank, taxa in taxa_at_rank.items():
            taxa_for_inference = [
                x for x in taxa if x in taxa_for_dist_inference
            ]
            print('%s\t%d\t%d' % (Taxonomy.rank_labels[rank], len(taxa),
                                  len(taxa_for_inference)))
        print('')

        # report results sorted by rank
        sorted_taxon = []
        for rank_prefix in Taxonomy.rank_prefixes:
            sorted_taxon += sorted(
                t for t in taxa_bl_dist if t.startswith(rank_prefix))

        # report results for each named group
        taxa_file = os.path.join(output_dir,
                                 '%s.taxa_bl_dist.tsv' % input_tree_name)
        with open(taxa_file, 'w') as fout:
            fout.write(
                'Taxa\tUsed for Inference\tMean\tStd\t5th\t10th\t50th\t90th\t95th\n'
            )
            for taxon in sorted_taxon:
                dist = taxa_bl_dist[taxon]

                p = np_percentile(dist, [5, 10, 50, 90, 95])
                fout.write(
                    '%s\t%s\t%g\t%g\t%g\t%g\t%g\t%g\t%g\n' %
                    (taxon, str(taxon in taxa_for_dist_inference),
                     np_mean(dist), np_std(dist), p[0], p[1], p[2], p[3],
                     p[4]))

        # report results for each taxonomic rank
        rank_file = os.path.join(output_dir,
                                 '%s.rank_bl_dist.tsv' % input_tree_name)
        with open(rank_file, 'w') as fout:
            fout.write('Rank\tMean\tStd\t5th\t10th\t50th\t90th\t95th\n')
            for rank in Taxonomy.rank_labels:
                dist = rank_bl_dist[rank]
                p = np_percentile(dist, [5, 10, 50, 90, 95])
                fout.write('%s\t%g\t%g\t%g\t%g\t%g\t%g\t%g\n' %
                           (rank, np_mean(dist), np_std(dist), p[0], p[1],
                            p[2], p[3], p[4]))

        # report results for each node
        output_bl_file = os.path.join(output_dir,
                                      '%s.node_bl_dist.tsv' % input_tree_name)
        self._write_bl_dist(tree, output_bl_file)
Esempio n. 20
0
    def median_rd_over_phyla(self, tree, taxa_for_dist_inference, taxonomy):
        """Collect relative divergence values over all phyla rootings.

        The tree is re-rooted on each usable phylum in turn. For every
        rooting, the relative divergence of named clades and of each ingroup
        node is recorded. The process exits if fewer than two phyla are
        usable.

        Parameters
        ----------
        tree : Tree
          Dendropy tree.
        taxa_for_dist_inference : set
          Taxa to use for inference relative divergence distributions.
        taxonomy : d[taxon_id] -> [d__, p__, ..., s__]
          Taxonomy of extant taxa.

        Returns
        -------
        dict : d[phylum] -> relative divergences of named clades for that rooting
        dict : d[node id] -> list of relative divergences, one per rooting
            in which the node falls within the ingroup
        """

        # phyla present in the tree, restricted to those usable for inference
        all_phyla = get_phyla_lineages(tree)
        self.logger.info('Identified %d phyla.' % len(all_phyla))

        rooting_phyla = [
            lineage for lineage in all_phyla
            if lineage in taxa_for_dist_inference
        ]
        self.logger.info('Using %d phyla as rootings for inferring distributions.' % len(rooting_phyla))
        if len(rooting_phyla) < 2:
            self.logger.error('Rescaling requires at least 2 valid phyla.')
            sys.exit(-1)

        # tag nodes with their preorder position so values gathered under
        # different rootings can be keyed by node
        node_id = 0
        for tree_node in tree.preorder_node_iter():
            tree_node.id = node_id
            node_id += 1

        phylum_rel_dists = {}
        rel_node_dists = defaultdict(list)
        rd = RelativeDistance()
        for outgroup in rooting_phyla:
            phylum = outgroup.replace('p__', '').replace(' ', '_').lower()
            self.logger.info('Calculating information with rooting on %s.' % phylum.capitalize())

            cur_tree = self.root_with_outgroup(tree, taxonomy, outgroup)

            # relative divergence of named clades; the entry for rank index 0
            # (Domain) is discarded
            rel_dists = rd.rel_dist_to_named_clades(cur_tree)
            rel_dists.pop(0, None)

            # drop the outgroup phylum and all of its child taxa at every rank
            outgroup_taxa = [outgroup]
            outgroup_taxa.extend(Taxonomy().children(outgroup, taxonomy))
            for taxon in outgroup_taxa:
                for taxa_at_rank in rel_dists.values():
                    taxa_at_rank.pop(taxon, None)

            phylum_rel_dists[phylum] = rel_dists

            # annotate every node with its relative divergence
            rd.decorate_rel_dist(cur_tree)

            # the ingroup is the first child of the root that is not
            # labelled with the outgroup phylum
            ingroup_subtree = None
            for child in cur_tree.seed_node.child_node_iter():
                _support, child_taxon, _auxiliary_info = parse_label(child.label)
                in_outgroup = bool(child_taxon) and outgroup in child_taxon
                if not in_outgroup:
                    ingroup_subtree = child
                    break

            # record relative divergence for every node of the ingroup
            for ingroup_node in ingroup_subtree.preorder_iter():
                rel_node_dists[ingroup_node.id].append(ingroup_node.rel_dist)

        return phylum_rel_dists, rel_node_dists
Esempio n. 21
0
    def run(self, input_tree, rd_thresholds, output_dir):
        """Calculate number of taxa for specified relative divergence thresholds.

        The tree is re-rooted on each phylum in turn using the external
        `genometreetk outgroup` command. Nodes where the relative divergence
        crosses a rank threshold are labelled, and the number of such
        predicted ranks below the root and below each named node is counted.
        Per-phylum results are written to `<output_dir>/<phylum>/`
        (`rerooted.tree`, `rd_ranks.tree`, `rd_ranks.tsv`). The counts
        collected over all rootings go to `<output_dir>/mean_rd_ranks.tsv`.

        Parameters
        ----------
        input_tree : str
            Name of input tree.
        rd_thresholds : d[rank] -> threshold
            Relative divergence threshold for defining taxonomic ranks.
        output_dir : str
            Desired output directory.

        Raises
        ------
        subprocess.CalledProcessError
            If the `genometreetk outgroup` command exits with a non-zero status.
        """

        # imported locally so this method does not rely on a module-level import
        import subprocess

        # get list of phyla level lineages
        tree = dendropy.Tree.get_from_path(input_tree,
                                           schema='newick',
                                           rooting='force-rooted',
                                           preserve_underscores=True)
        phyla = get_phyla_lineages(tree)
        self.logger.info('Identified %d phyla for rooting.' % len(phyla))

        self.logger.info('Reading taxonomy from tree.')
        taxonomy_file = os.path.join(output_dir, 'taxonomy.tsv')
        taxonomy = Taxonomy().read_from_tree(input_tree)
        Taxonomy().write(taxonomy, taxonomy_file)

        rd = RelativeDistance()
        overall_ranks_below_taxon = defaultdict(lambda: defaultdict(list))
        for p in phyla:
            phylum_children = Taxonomy().children(p, taxonomy)
            phylum = p.replace('p__', '')
            self.logger.info('Calculating information with rooting on %s.' %
                             phylum)

            phylum_dir = os.path.join(output_dir, phylum)
            if not os.path.exists(phylum_dir):
                os.makedirs(phylum_dir)

            # Re-root with the external tool. Arguments are passed as a list
            # (no shell) so paths or taxon names containing spaces or shell
            # metacharacters are handed over intact, and a failure is raised
            # instead of silently reading a missing or stale tree.
            output_tree = os.path.join(phylum_dir, 'rerooted.tree')
            subprocess.check_call([
                'genometreetk', 'outgroup', input_tree, taxonomy_file, p,
                output_tree
            ])

            # calculate relative distance for all nodes
            cur_tree = dendropy.Tree.get_from_path(output_tree,
                                                   schema='newick',
                                                   rooting='force-rooted',
                                                   preserve_underscores=True)
            rd.decorate_rel_dist(cur_tree)

            # Determine ranks. A node is assigned a rank when the threshold
            # falls on the branch leading to it. The root of the re-rooted
            # tree has no parent and must be skipped.
            for n in cur_tree.postorder_node_iter(
                    lambda node: node != cur_tree.seed_node):
                ranks = []
                for rank_prefix, threshold in rd_thresholds.items():
                    if n.rel_dist >= threshold and n.parent_node.rel_dist < threshold:
                        ranks.append(rank_prefix.capitalize() + '__')

                if ranks:
                    if not n.label:
                        n.label = '|%s [rd=%.2f]' % (';'.join(ranks),
                                                     n.rel_dist)
                    else:
                        n.label += '|%s [rd=%.2f]' % (';'.join(ranks),
                                                      n.rel_dist)

            cur_tree.write_to_path(os.path.join(phylum_dir, 'rd_ranks.tree'),
                                   schema='newick',
                                   suppress_rooting=True,
                                   unquoted_underscores=True)

            # determine number of ranks below root and all named nodes
            ranks_below_taxon = defaultdict(lambda: defaultdict(int))
            for cur_node in cur_tree.postorder_node_iter():
                if cur_node == cur_tree.seed_node:
                    cur_taxon = 'root'
                elif cur_node.label:
                    _support, cur_taxon, _auxiliary_info = parse_label(
                        cur_node.label)
                    if not cur_taxon or cur_taxon.strip() == '':
                        continue
                else:
                    continue

                for n in cur_node.postorder_iter():
                    if not n.label:
                        continue

                    _support, _taxon, auxiliary_info = parse_label(n.label)
                    if auxiliary_info:
                        # auxiliary info has the form '<ranks> [rd=x.xx]' as
                        # written above; keep only the rank list
                        ranks = auxiliary_info[0:auxiliary_info.rfind('[')]
                        ranks = [r.strip() for r in ranks.split(';')]

                        for r in ranks:
                            ranks_below_taxon[cur_taxon][r] += 1

            for taxon in ranks_below_taxon:
                if taxon == p or taxon in phylum_children:
                    # do not record results for named groups in the lineage
                    # used for rooting
                    continue

                for rank, count in ranks_below_taxon[taxon].items():
                    overall_ranks_below_taxon[taxon][rank].append(count)

            results_table = os.path.join(phylum_dir, 'rd_ranks.tsv')
            self.write_rank_count(ranks_below_taxon, results_table)

        results_table = os.path.join(output_dir, 'mean_rd_ranks.tsv')
        self.write_rank_count(overall_ranks_below_taxon, results_table)
Esempio n. 22
0
    def run(self, input_tree, trusted_taxa_file, min_children, taxonomy_file, output_dir):
        """Calculate distribution of branch lengths at each taxonomic rank.

        Writes `taxa_bl_dist.tsv` and `rank_bl_dist.tsv` to the output
        directory. When no taxonomy file is given, the taxonomy read from the
        tree is also written to `taxonomy.tsv`.

        Parameters
        ----------
        input_tree : str
            Name of input tree.
        trusted_taxa_file : str
            File specifying trusted taxa to consider when inferring distribution. Set to None to consider all taxa.
        min_children : int
            Only consider taxa with at least the specified number of children taxa when inferring distribution.
        taxonomy_file : str
            File containing taxonomic information for leaf nodes (if NULL, read taxonomy from tree).
        output_dir : str
            Desired output directory.
        """

        tree = dendropy.Tree.get_from_path(input_tree,
                                           schema='newick',
                                           rooting='force-rooted',
                                           preserve_underscores=True)

        # pull taxonomy from tree
        if not taxonomy_file:
            self.logger.info('Reading taxonomy from tree.')
            taxonomy_file = os.path.join(output_dir, 'taxonomy.tsv')
            taxonomy = Taxonomy().read_from_tree(input_tree)
            Taxonomy().write(taxonomy, taxonomy_file)
        else:
            self.logger.info('Reading taxonomy from file.')
            taxonomy = Taxonomy().read(taxonomy_file)

        # read trusted taxa
        # NOTE(review): trusted_taxa is read but never used; an empty set is
        # passed to the filter below. Confirm whether it should be passed.
        trusted_taxa = None
        if trusted_taxa_file:
            trusted_taxa = read_taxa_file(trusted_taxa_file)

        # determine taxa to be used for inferring distribution
        taxa_for_dist_inference = filter_taxa_for_dist_inference(tree, taxonomy, set(), min_children, -1)

        # determine branch lengths to leaves for named lineages
        rank_bl_dist = defaultdict(list)
        taxa_bl_dist = defaultdict(list)
        taxa_at_rank = defaultdict(list)
        for node in tree.postorder_node_iter():
            if node.is_leaf() or not node.label:
                continue

            _support, taxon, _auxiliary_info = parse_label(node.label)
            if not taxon:
                continue

            # get most specific rank in multi-rank taxa string
            taxa = [t.strip() for t in taxon.split(';')]
            taxon = taxa[-1]

            most_specific_rank = taxon[0:3]
            taxa_at_rank[Taxonomy.rank_index[most_specific_rank]].append(taxon)

            for n in node.leaf_iter():
                # sum edge lengths walking up from the leaf to this node
                dist_to_node = 0
                while n != node:
                    dist_to_node += n.edge_length
                    n = n.parent_node

                for t in taxa:
                    taxa_bl_dist[t].append(dist_to_node)

            rank = Taxonomy.rank_labels[Taxonomy.rank_index[most_specific_rank]]
            if rank != 'species' or Taxonomy().validate_species_name(taxon):
                if taxon in taxa_for_dist_inference:
                    rank_bl_dist[rank].append(np_mean(taxa_bl_dist[taxon]))

        # report number of taxa at each rank
        # (single-argument print(...) and .items() behave the same on
        # Python 2 and Python 3)
        print('')
        print('Rank\tTaxa\tTaxa for Inference')
        for rank, taxa in taxa_at_rank.items():
            taxa_for_inference = [x for x in taxa if x in taxa_for_dist_inference]
            print('%s\t%d\t%d' % (Taxonomy.rank_labels[rank], len(taxa), len(taxa_for_inference)))
        print('')

        # report results sorted by rank
        sorted_taxon = []
        for rank_prefix in Taxonomy.rank_prefixes:
            sorted_taxon += sorted(t for t in taxa_bl_dist if t.startswith(rank_prefix))

        # report results for each named group
        taxa_file = os.path.join(output_dir, 'taxa_bl_dist.tsv')
        with open(taxa_file, 'w') as fout:
            fout.write('Taxa\tUsed for Inference\tMean\tStd\t5th\t10th\t50th\t90th\t95th\n')
            for taxon in sorted_taxon:
                dist = taxa_bl_dist[taxon]

                p = np_percentile(dist, [5, 10, 50, 90, 95])
                fout.write('%s\t%s\t%g\t%g\t%g\t%g\t%g\t%g\t%g\n' % (taxon,
                                                                    str(taxon in taxa_for_dist_inference),
                                                                    np_mean(dist),
                                                                    np_std(dist),
                                                                    p[0], p[1], p[2], p[3], p[4]))

        # report results for each taxonomic rank
        rank_file = os.path.join(output_dir, 'rank_bl_dist.tsv')
        with open(rank_file, 'w') as fout:
            fout.write('Rank\tMean\tStd\t5th\t10th\t50th\t90th\t95th\n')
            for rank in Taxonomy.rank_labels:
                dist = rank_bl_dist[rank]
                p = np_percentile(dist, [5, 10, 50, 90, 95])
                fout.write('%s\t%g\t%g\t%g\t%g\t%g\t%g\t%g\n' % (rank,
                                                                np_mean(dist),
                                                                np_std(dist),
                                                                p[0], p[1], p[2], p[3], p[4]))

Esempio n. 23
0
    def run(self, input_tree, output_tree, min_support, only_named_clades,
            min_length, show_percentiles, show_relative_divergence,
            show_prediction, thresholds):
        """Decorate internal nodes with predicted rank and relative divergence.

        Each internal node passing the filters is assigned the rank whose
        relative divergence threshold is closest to the node's own value.
        The decorated tree is written to `output_tree`, a per-taxon table to
        `output_tree + '.info'`, and a per-rank tally of correct predictions
        is logged.

        Parameters
        ----------
        input_tree : str
            Name of input tree.
        output_tree : str
            Name of output tree.
        min_support : int
            Only decorate nodes above specified support value.
        only_named_clades : boolean
            Only decorate nodes with existing labels.
        min_length : float
            Only decorate nodes above specified length.
        show_percentiles : bool
            Flag indicating if percentiles should be placed on nodes
            (currently not used by this method).
        show_relative_divergence : bool
            Flag indicating if relative divergences should be placed on nodes.
        show_prediction : bool
            Flag indicating if predicted ranks should be placed on nodes.
        thresholds : d[rank] -> threshold
            Relative divergence threshold for defining taxonomic ranks.
        """

        tree = dendropy.Tree.get_from_path(input_tree,
                                           schema='newick',
                                           rooting='force-rooted',
                                           preserve_underscores=True)

        # calculate relative distance for all nodes
        rd = RelativeDistance()
        rd.decorate_rel_dist(tree)

        # decorate nodes based on specified criteria
        self.logger.info('')
        self.logger.info('  %s\t%s' % ('Rank', 'Prediction results'))

        correct = defaultdict(int)
        incorrect = defaultdict(int)

        with open(output_tree + '.info', 'w') as fout:
            # NOTE(review): the header names five columns but only the first
            # three are written for each row below.
            fout.write(
                'Taxon name\tPredicted rank\tRelative divergence\tCurrent rank percentile\tPredicted rank percentile\n'
            )
            for n in tree.preorder_node_iter():
                if n.is_leaf():
                    continue

                # A node may have no branch length (e.g. a root without one);
                # such nodes are skipped rather than compared against None.
                if n.edge_length is None or n.edge_length < min_length:
                    continue

                # parse taxon name and support value from node label
                if n.label:
                    support, taxon_name, _auxiliary_info = parse_label(n.label)
                    n.label += '|'
                else:
                    support = 100
                    taxon_name = None
                    n.label = ''

                if support and float(support) < min_support:
                    continue

                if only_named_clades and not taxon_name:
                    continue

                # Predict the rank whose threshold is closest to the node's
                # relative divergence. This is always computed, since the
                # info table and the tally below need it; show_prediction
                # only controls whether it is placed on the node.
                min_dist = 1e6
                predicted_rank = None
                for rank, threshold in thresholds.items():
                    d = abs(n.rel_dist - threshold)
                    if d < min_dist:
                        min_dist = d
                        rank_index = self.rank_designators.index(rank)
                        predicted_rank = self.rank_prefixes[rank_index]

                if show_prediction and predicted_rank is not None:
                    n.label += predicted_rank

                if show_relative_divergence:
                    n.label += '[rd=%.2f]' % n.rel_dist

                if not taxon_name:
                    continue

                if (predicted_rank is not None
                        and predicted_rank != self.highly_basal_designator):
                    # tabulate number of correct and incorrect predictions;
                    # strip so '; '-separated labels still yield 'x__'
                    named_rank = taxon_name.split(';')[-1].strip()[0:3]
                    if named_rank == predicted_rank.lower():
                        correct[named_rank] += 1
                    else:
                        incorrect[named_rank] += 1

                fout.write('%s\t%s\t%.3f\n' %
                           (taxon_name, predicted_rank, n.rel_dist))

        tree.write_to_path(output_tree,
                           schema='newick',
                           suppress_rooting=True,
                           unquoted_underscores=True)

        for rank_prefix in self.rank_prefixes[1:7]:
            correct_taxa = correct[rank_prefix.lower()]
            incorrect_taxa = incorrect[rank_prefix.lower()]
            # avoid division by zero for ranks without any named taxa
            total_taxa = max(correct_taxa + incorrect_taxa, 1)
            self.logger.info('  %s\t%d of %d (%.2f%%)' %
                             (rank_prefix, correct_taxa, total_taxa,
                              correct_taxa * 100.0 / total_taxa))
Esempio n. 24
0
    def optimal(self, input_tree,
                rank,
                min_dist,
                max_dist,
                step_size,
                output_table):
        """Determine branch length for best congruency with existing taxonomy.

        Every internal node labelled with a taxon at the requested rank, and
        having at least two labelled descendants at the next rank, is
        characterised by its mean distance to terminal taxa (MDTT) and by the
        MDTT of its first ancestor spanning multiple taxa at that rank. Each
        candidate threshold is then scored by how many of these nodes satisfy
        ``node MDTT <= threshold < ancestor MDTT``.

        Side effects: writes 'bl_optimal_taxa_dists.tsv' to the current
        working directory, writes one row per threshold to `output_table`,
        and echoes the same rows to stdout.

        Parameters
        ----------
        input_tree : str
            Name of input tree.
        rank : int
            Taxonomic rank to consider (1=Phylum, ..., 6=Species). Used as an
            index into Taxonomy.rank_prefixes; rank + 1 must also be a valid
            index.
        min_dist : float
            Start of the range of distance thresholds to evaluate.
        max_dist : float
            End of the range of distance thresholds to evaluate.
        step_size : float
            Increment between successive thresholds; the range is generated
            as np_arange(min_dist, max_dist + step_size, step_size).
        output_table : str
            Name of output table.

        Returns
        -------
        tuple
            (threshold, correct, incorrect) for the threshold with the highest
            precision. Thresholds are scanned from largest to smallest and
            ties keep the first one seen. The threshold is None when no
            threshold reaches a precision above zero.
        """

        # read tree
        self.logger.info('Reading tree.')
        tree = dendropy.Tree.get_from_path(input_tree,
                                            schema='newick',
                                            rooting='force-rooted',
                                            preserve_underscores=True)

        # get mean distance to terminal taxa for each node along with
        # other stats needed to determine classification
        self.logger.info('Determining MDTT for each node.')
        rank_prefix = Taxonomy.rank_prefixes[rank]
        child_rank_prefix = Taxonomy.rank_prefixes[rank + 1]
        rank_info = []
        rank_dists = set()
        for node in tree.seed_node.preorder_internal_node_iter():
            if node == tree.seed_node:
                continue

            # check if node is at the specified rank; when several taxa in
            # the label carry the prefix, the last one is kept
            node_taxon = None
            if node.label:
                _support, taxon_name, _auxiliary_info = parse_label(node.label)

                if taxon_name:
                    for taxon in [x.strip() for x in taxon_name.split(';')]:
                        if taxon.startswith(rank_prefix):
                            node_taxon = taxon

            if not node_taxon:
                continue

            # check that node has two descendants at the next rank
            child_rank_taxa = []
            for c in node.levelorder_iter():
                if c.label:
                    _support, taxon_name, _auxiliary_info = parse_label(c.label)

                    if taxon_name:
                        for taxon in [x.strip() for x in taxon_name.split(';')]:
                            if taxon.startswith(child_rank_prefix):
                                child_rank_taxa.append(taxon)

                if len(child_rank_taxa) >= 2:
                    break

            if len(child_rank_taxa) < 2:
                continue

            # get mean branch length to terminal taxa
            node_dist = np_mean([self._dist_to_ancestor(t, node)
                                 for t in node.leaf_iter()])

            # get mean branch length to terminal taxa for first ancestor
            # spanning multiple taxa at the specified rank
            ancestor = self._ancestor_multiple_taxa_at_rank(node, rank_prefix)
            ancestor_dist = np_mean([self._dist_to_ancestor(t, ancestor)
                                     for t in ancestor.leaf_iter()])

            rank_info.append([node_dist, ancestor_dist, node_taxon])
            rank_dists.add(node_dist)

        self.logger.info('Calculating threshold from %d taxa with specified rank resolution.' % len(rank_info))

        with open('bl_optimal_taxa_dists.tsv', 'w') as fout:
            fout.write('Taxon\tNode MDTT\tMulti-phyla Ancestor MDTT\n')
            for node_dist, ancestor_dist, node_taxon in rank_info:
                fout.write('%s\t%.3f\t%.3f\n' % (node_taxon, node_dist, ancestor_dist))

        # candidate thresholds are the observed node distances plus a
        # regular grid over the requested range
        for d in np_arange(min_dist, max_dist + step_size, step_size):
            rank_dists.add(d)

        # report number of correct and incorrect taxa for each threshold
        header = 'Threshold\tCorrect\tIncorrect\tPrecision\tNo. Lineages\tNo. Multiple Taxa Lineages\tNo. Terminal Lineages'

        # initialised up front so the return below is always well defined
        top_threshold = None
        top_correct = 0
        top_incorrect = 0
        top_precision = 0

        # the context manager guarantees the table is flushed and closed,
        # even if scoring a threshold raises
        with open(output_table, 'w') as fout:
            fout.write(header + '\n')
            print(header)

            for dist_threshold in sorted(rank_dists, reverse=True):
                correct = 0
                incorrect = 0
                for node_dist, ancestor_dist, _node_taxon in rank_info:
                    # a node is correct only if it would be collapsed at this
                    # threshold while its multi-taxa ancestor would not
                    if node_dist <= dist_threshold and ancestor_dist > dist_threshold:
                        correct += 1
                    else:
                        # either the node is deeper than the threshold or the
                        # threshold lies above the multi-taxa ancestor
                        incorrect += 1

                denominator = correct + incorrect
                if denominator:
                    precision = float(correct) / denominator
                else:
                    precision = 0

                num_lineages, num_terminal_lineages = self._num_lineages(tree, dist_threshold)

                row = '%f\t%d\t%d\t%.3f\t%d\t%d\t%d' % (dist_threshold,
                                                        correct,
                                                        incorrect,
                                                        precision,
                                                        num_lineages + num_terminal_lineages,
                                                        num_lineages,
                                                        num_terminal_lineages)

                fout.write(row + '\n')
                print(row)

                if precision > top_precision:
                    top_correct = correct
                    top_incorrect = incorrect
                    top_precision = precision
                    top_threshold = dist_threshold

        return top_threshold, top_correct, top_incorrect
Esempio n. 25
0
def filter_taxa_for_dist_inference(tree, taxonomy, trusted_taxa, min_children, min_support):
    """Determine taxa to use for inferring distribution of relative divergences.

    Parameters
    ----------
    tree : Dendropy Tree
        Phylogenetic tree.
    taxonomy : d[taxon ID] -> [d__x; p__y; ...]
        Taxonomy for each taxon.
    trusted_taxa : iterable
        Trusted taxa to consider when inferring distribution. An empty or
        None value disables this filter.
    min_children : int
        Only consider taxa with at least the specified number of children taxa when inferring distribution.
    min_support : float
        Only consider taxa with at least this level of support when inferring distribution.

    Returns
    -------
    set
        Names of the taxa that pass all enabled filters.
    """

    # determine children taxa for each named group
    taxon_children = Taxonomy().taxon_children(taxonomy)

    # get all named groups
    taxa_for_dist_inference = set()
    for taxa in taxonomy.values():
        taxa_for_dist_inference.update(taxa)

    # sanity check species names as these are a common problem
    species_index = Taxonomy.rank_index['s__']
    species = set()
    for taxon_id, taxa in taxonomy.items():
        if len(taxa) <= species_index:
            continue

        species_name = taxa[species_index]
        if species_name != 's__':
            # only named species are validated; a bare 's__' is accepted as is
            valid, error_msg = Taxonomy().validate_species_name(species_name, require_full=True, require_prefix=True)
            if not valid:
                print('[Warning] Species name %s for %s is invalid: %s' % (species_name, taxon_id, error_msg))
                continue

        species.add(species_name)

    # restrict taxa to those with a sufficient number of named children
    # Note: a taxonomic group with no children will not end up in the
    # taxon_children data structure so care must be taken when applying
    # this filtering criteria.
    if min_children > 0:
        valid_taxa = set()
        for taxon, children_taxa in taxon_children.items():
            if len(children_taxa) >= min_children:
                valid_taxa.add(taxon)

        taxa_for_dist_inference.intersection_update(valid_taxa)

        # explicitly add in the species since they have no
        # children and thus will be absent from the taxon_children dictionary
        taxa_for_dist_inference.update(species)

    # restrict taxa used for inferring distribution to those with sufficient support
    if min_support > 0:
        reported_missing_support = False
        for node in tree.preorder_node_iter():
            if not node.label or node.is_leaf():
                continue

            # check for support value
            support, taxon_name, _auxiliary_info = parse_label(node.label)

            if not taxon_name:
                continue

            if support:
                if float(support) < min_support:
                    taxa_for_dist_inference.discard(taxon_name)
            elif not reported_missing_support:
                # no support value, so inform user (once) if they were trying
                # to filter on this property
                print('[Error] Tree does not contain support values. As such, --min_support should be set to 0.')
                reported_missing_support = True

    # restrict taxa used for inferring distribution to the trusted set;
    # calling intersection on our own set accepts any iterable of names
    if trusted_taxa:
        taxa_for_dist_inference = taxa_for_dist_inference.intersection(trusted_taxa)

    return taxa_for_dist_inference
Esempio n. 26
0
    def run(self, input_tree, rd_thresholds, output_dir):
        """Calculate number of taxa for specified relative divergence thresholds.

        For every phylum-level lineage found in the input tree, the tree is
        rerooted on that phylum with the external 'genometreetk outgroup'
        command, nodes are decorated with their relative divergence, and each
        node whose relative divergence crosses one or more rank thresholds
        (relative to its parent) is labelled with the corresponding ranks.
        The number of such rank labels below the root and below each named
        node is then tabulated.

        Files written under `output_dir`: 'taxonomy.tsv', one directory per
        phylum holding 'rerooted.tree', 'rd_ranks.tree' and 'rd_ranks.tsv',
        and 'mean_rd_ranks.tsv' built from the counts gathered over all
        rootings.

        Parameters
        ----------
        input_tree : str
            Name of input tree.
        rd_thresholds : d[rank] -> threshold
            Relative divergence threshold for defining taxonomic ranks. Each
            key is capitalized and suffixed with '__' to form the rank label.
        output_dir : str
            Desired output directory.
        """
        # function-scope import so this method does not rely on the module's
        # import block providing subprocess
        import subprocess

        # get list of phyla level lineages
        tree = TreeNode.read(input_tree, convert_underscores=False)
        phyla = get_phyla_lineages(tree)
        self.logger.info('Identified %d phyla for rooting.' % len(phyla))

        self.logger.info('Reading taxonomy from tree.')
        taxonomy_file = os.path.join(output_dir, 'taxonomy.tsv')
        taxonomy = Taxonomy().read_from_tree(input_tree)
        Taxonomy().write(taxonomy, taxonomy_file)

        rd = RelativeDistance()
        overall_ranks_below_taxon = defaultdict(lambda: defaultdict(list))
        for p in phyla:
            phylum_children = Taxonomy().children(p, taxonomy)
            phylum = p.replace('p__', '')
            self.logger.info('Calculating information with rooting on %s.' % phylum)

            phylum_dir = os.path.join(output_dir, phylum)
            if not os.path.exists(phylum_dir):
                os.makedirs(phylum_dir)

            output_tree = os.path.join(phylum_dir, 'rerooted.tree')

            # pass arguments as a list so paths or taxon names containing
            # whitespace or shell metacharacters are handed over verbatim
            exit_code = subprocess.call(['genometreetk', 'outgroup',
                                         input_tree, taxonomy_file, p, output_tree])
            if exit_code != 0:
                self.logger.warning('genometreetk outgroup exited with status %d when rooting on %s.'
                                    % (exit_code, phylum))

            # calculate relative distance for all nodes
            cur_tree = dendropy.Tree.get_from_path(output_tree,
                                                schema='newick',
                                                rooting='force-rooted',
                                                preserve_underscores=True)
            rd.decorate_rel_dist(cur_tree)

            # determine ranks; the root of the rerooted tree is skipped
            # because it has no parent to compare against
            root_node = cur_tree.seed_node
            for n in cur_tree.postorder_node_iter(lambda n: n != root_node):
                ranks = []
                for rank_prefix, threshold in rd_thresholds.items():
                    # rank is assigned to the node where the relative
                    # divergence first reaches the threshold
                    if n.rel_dist >= threshold and n.parent_node.rel_dist < threshold:
                        ranks.append(rank_prefix.capitalize() + '__')

                if ranks:
                    if not n.label:
                        n.label = '|%s [rd=%.2f]' % (';'.join(ranks), n.rel_dist)
                    else:
                        n.label += '|%s [rd=%.2f]' % (';'.join(ranks), n.rel_dist)

            cur_tree.write_to_path(os.path.join(phylum_dir, 'rd_ranks.tree'),
                                    schema='newick',
                                    suppress_rooting=True,
                                    unquoted_underscores=True)

            # determine number of ranks below root and all named nodes
            ranks_below_taxon = defaultdict(lambda: defaultdict(int))
            for cur_node in cur_tree.postorder_node_iter():
                if cur_node == cur_tree.seed_node:
                    cur_taxon = 'root'
                elif cur_node.label:
                    _support, cur_taxon, _auxiliary_info = parse_label(cur_node.label)
                    if not cur_taxon or cur_taxon.strip() == '':
                        continue
                else:
                    continue

                for n in cur_node.postorder_iter():
                    if not n.label:
                        continue

                    _support, _taxon, auxiliary_info = parse_label(n.label)
                    if auxiliary_info:
                        # strip the trailing '[rd=...]' annotation, leaving
                        # the ';'-separated rank labels
                        ranks = auxiliary_info[0:auxiliary_info.rfind('[')]
                        ranks = [r.strip() for r in ranks.split(';')]

                        for r in ranks:
                            ranks_below_taxon[cur_taxon][r] += 1

            for taxon in ranks_below_taxon:
                if taxon == p or taxon in phylum_children:
                    # do not record results for named groups in the lineage
                    # used for rooting
                    continue

                for rank, count in ranks_below_taxon[taxon].items():
                    overall_ranks_below_taxon[taxon][rank].append(count)

            results_table = os.path.join(phylum_dir, 'rd_ranks.tsv')
            self.write_rank_count(ranks_below_taxon, results_table)

        results_table = os.path.join(output_dir, 'mean_rd_ranks.tsv')
        self.write_rank_count(overall_ranks_below_taxon, results_table)