Esempio n. 1
0
 def _tax_diff_table(self, tax1, tax2, output_table):
     """Tabulate incongruency of taxonomy strings at each rank.

     One row is written for each named lineage of `tax1` (lineages at the
     'species' rank are skipped). A row holds the lineage name, the number
     of genomes assigned to it, and one column per rank: '-' for ranks at
     or above the lineage's own rank, otherwise the percentage of the
     lineage's genomes whose taxon at that rank differs between `tax1`
     and `tax2`.

     Parameters
     ----------
     tax1 : dict
         Taxa for each genome, indexed by genome identifier and ordered
         by rank; defines the lineages that are tabulated.
     tax2 : dict
         Taxa to compare against; needs an entry for every genome that
         is looked up in `tax1`.
     output_table : str
         Path of the tab-separated table to write.
     """

     # the context manager guarantees the file is closed even if one of
     # the taxonomy lookups below raises
     with open(output_table, 'w') as fout:
         header = ['Lineage', 'No. Extent Taxa']
         header.extend('%s (%%)' % label.title()
                       for label in Taxonomy.rank_labels)
         fout.write('\t'.join(header) + '\n')

         taxonomy = Taxonomy()
         named_lineages_at_rank = taxonomy.named_lineages_at_rank(tax1)
         for rank, taxa in named_lineages_at_rank.items():
             rank_label = Taxonomy.rank_labels[rank]
             if rank_label == 'species':
                 continue

             extant_taxa_for_rank = taxonomy.extant_taxa_for_rank(rank_label, tax1)

             for taxon in taxa:
                 genome_ids = extant_taxa_for_rank[taxon]
                 fields = ['%s\t%d' % (taxon, len(genome_ids))]

                 # for every rank index, record whether each genome has
                 # the same taxon in both taxonomies
                 matches_at_rank = defaultdict(list)
                 for genome_id in genome_ids:
                     lineage1 = tax1[genome_id]
                     lineage2 = tax2[genome_id]
                     for cur_rank, (taxon1, taxon2) in enumerate(zip(lineage1, lineage2)):
                         matches_at_rank[cur_rank].append(taxon1 == taxon2)

                 # visit ranks in ascending order so the columns line up
                 # with the header regardless of dictionary ordering
                 for cur_rank, matches in sorted(matches_at_rank.items()):
                     if cur_rank <= rank:
                         # ranks at or above the lineage itself are not scored
                         fields.append('-')
                     else:
                         perc_match = sum(matches) * 100.0 / len(matches)
                         fields.append('%.1f' % (100.0 - perc_match))

                 fout.write('\t'.join(fields) + '\n')
Esempio n. 2
0
 def _tax_diff_table(self, tax1, tax2, output_table):
     """Tabulate incongruency of taxonomy strings at each rank.

     One row is written for each named lineage of `tax1` (lineages at the
     'species' rank are skipped). A row holds the lineage name, the number
     of genomes assigned to it, and one column per rank: '-' for ranks at
     or above the lineage's own rank, otherwise the percentage of the
     lineage's genomes whose taxon at that rank differs between `tax1`
     and `tax2`.

     Parameters
     ----------
     tax1 : dict
         Taxa for each genome, indexed by genome identifier and ordered
         by rank; defines the lineages that are tabulated.
     tax2 : dict
         Taxa to compare against; needs an entry for every genome that
         is looked up in `tax1`.
     output_table : str
         Path of the tab-separated table to write.
     """

     # the context manager guarantees the file is closed even if one of
     # the taxonomy lookups below raises
     with open(output_table, 'w') as fout:
         header = ['Lineage', 'No. Extent Taxa']
         header.extend('%s (%%)' % label.title()
                       for label in Taxonomy.rank_labels)
         fout.write('\t'.join(header) + '\n')

         taxonomy = Taxonomy()
         named_lineages_at_rank = taxonomy.named_lineages_at_rank(tax1)
         for rank, taxa in named_lineages_at_rank.items():
             rank_label = Taxonomy.rank_labels[rank]
             if rank_label == 'species':
                 continue

             extant_taxa_for_rank = taxonomy.extant_taxa_for_rank(rank_label, tax1)

             for taxon in taxa:
                 genome_ids = extant_taxa_for_rank[taxon]
                 fields = ['%s\t%d' % (taxon, len(genome_ids))]

                 # for every rank index, record whether each genome has
                 # the same taxon in both taxonomies
                 matches_at_rank = defaultdict(list)
                 for genome_id in genome_ids:
                     lineage1 = tax1[genome_id]
                     lineage2 = tax2[genome_id]
                     # zip is consumed directly; no intermediate list needed
                     for cur_rank, (taxon1, taxon2) in enumerate(zip(lineage1, lineage2)):
                         matches_at_rank[cur_rank].append(taxon1 == taxon2)

                 # visit ranks in ascending order so the columns line up
                 # with the header regardless of dictionary ordering
                 for cur_rank, matches in sorted(matches_at_rank.items()):
                     if cur_rank <= rank:
                         # ranks at or above the lineage itself are not scored
                         fields.append('-')
                     else:
                         perc_match = sum(matches) * 100.0 / len(matches)
                         fields.append('%.1f' % (100.0 - perc_match))

                 fout.write('\t'.join(fields) + '\n')
Esempio n. 3
0
    def tree_tax_diff(self, tree1_file, tree2_file, output_dir):
        """Tabulate differences between two taxonomies on a tree.

        Both trees are pruned to the leaf labels they have in common. Each
        named taxon of the first tree (every rank except the last entry of
        `Taxonomy.rank_labels`) is classified relative to the second tree
        as 'retained', 'corrected', 'more general', 'more specific' or
        'new'; taxa of the second tree that were never matched are reported
        as 'deprecated'. The classifications are written to
        '<tree1 basename>.taxa_diff.tsv' in `output_dir`, and a table of
        per-lineage percentage differences is written to
        '<tree1 basename>.perc_diff.tsv' by `_tax_diff_table`.

        Parameters
        ----------
        tree1_file : str
            File with tree in Newick format.
        tree2_file : str
            File with tree in Newick format.
        output_dir : str
            Output directory.
        """

        tree1 = dendropy.Tree.get_from_path(tree1_file,
                                            schema='newick',
                                            rooting='force-rooted',
                                            preserve_underscores=True)

        tree2 = dendropy.Tree.get_from_path(tree2_file,
                                            schema='newick',
                                            rooting='force-rooted',
                                            preserve_underscores=True)

        # prune both trees to a set of common taxa
        taxa1 = {leaf.taxon.label for leaf in tree1.leaf_node_iter()}
        taxa2 = {leaf.taxon.label for leaf in tree2.leaf_node_iter()}

        taxa_in_common = taxa1.intersection(taxa2)
        self.logger.info('Tree 1 contains %d taxa.' % len(taxa1))
        self.logger.info('Tree 2 contains %d taxa.' % len(taxa2))
        self.logger.info('Pruning trees to the %d taxa in common.' % len(taxa_in_common))

        tree1.retain_taxa_with_labels(taxa_in_common)
        tree2.retain_taxa_with_labels(taxa_in_common)

        # get named lineages at each taxonomic rank
        taxonomy = Taxonomy()
        tax1 = taxonomy.read_from_tree(tree1)
        tax2 = taxonomy.read_from_tree(tree2)

        taxa_at_rank1 = taxonomy.named_lineages_at_rank(tax1)
        taxa_at_rank2 = taxonomy.named_lineages_at_rank(tax2)

        tax_file_name = os.path.splitext(os.path.basename(tree1_file))[0]
        output_file = os.path.join(output_dir, '%s.taxa_diff.tsv' % tax_file_name)

        # every data row has the same four tab-separated columns
        row = '%s\t%s\t%s\t%s\n'

        # the context manager guarantees the file is closed even if one of
        # the helper calls below raises
        with open(output_file, 'w') as fout:
            fout.write('Rank\tClassification\tTaxonomy 1\tTaxonomy 2\n')

            # taxa of the second taxonomy matched to a taxon of the first,
            # keyed by the rank at which they occur in the second taxonomy
            taxon2_accounted_for = defaultdict(set)
            for rank, rank_label in enumerate(Taxonomy.rank_labels[0:-1]):
                for taxon in taxa_at_rank1[rank]:
                    # check if taxon has been retained
                    if taxon in taxa_at_rank2[rank]:
                        fout.write(row % (rank_label, 'retained', taxon, taxon))
                        taxon2_accounted_for[rank].add(taxon)
                        continue

                    # check if name was simply corrected by changing suffix
                    old_taxon = self._change_suffix(taxon, rank, taxa_at_rank2)
                    if old_taxon:
                        fout.write(row % (rank_label, 'corrected', taxon, old_taxon))
                        taxon2_accounted_for[rank].add(old_taxon)
                        continue

                    # check if taxon has been moved up or down in rank
                    old_taxon, old_rank = self._renamed(taxon, rank, taxa_at_rank2)
                    if old_taxon:
                        if rank < old_rank:
                            classification = 'more general'
                        elif rank == old_rank:
                            classification = 'corrected'
                        else:
                            classification = 'more specific'

                        fout.write(row % (rank_label, classification, taxon, old_taxon))
                        # record under the rank the name held in taxonomy 2
                        taxon2_accounted_for[old_rank].add(old_taxon)
                        continue

                    # otherwise, the taxon appears to be new
                    fout.write(row % (rank_label, 'new', taxon, 'NA'))

            # report deprecated taxa
            for rank, rank_label in enumerate(Taxonomy.rank_labels[0:-1]):
                for taxon in taxa_at_rank2[rank]:
                    if taxon not in taxon2_accounted_for[rank]:
                        fout.write(row % (rank_label, 'deprecated', 'NA', taxon))

        # tabulate congruence of taxonomy strings
        output_table = os.path.join(output_dir, '%s.perc_diff.tsv' % tax_file_name)
        self._tax_diff_table(tax1, tax2, output_table)
Esempio n. 4
0
    def tree_tax_diff(self, tree1_file, tree2_file, output_dir):
        """Tabulate differences between two taxonomies on a tree.

        Both trees are pruned to the leaf labels they have in common. Each
        named taxon of the first tree (every rank except the last entry of
        `Taxonomy.rank_labels`) is classified relative to the second tree
        as 'retained', 'corrected', 'more general', 'more specific' or
        'new'; taxa of the second tree that were never matched are reported
        as 'deprecated'. The classifications are written to
        '<tree1 basename>.taxa_diff.tsv' in `output_dir`, and a table of
        per-lineage percentage differences is written to
        '<tree1 basename>.perc_diff.tsv' by `_tax_diff_table`.

        Parameters
        ----------
        tree1_file : str
            File with tree in Newick format.
        tree2_file : str
            File with tree in Newick format.
        output_dir : str
            Output directory.
        """

        tree1 = dendropy.Tree.get_from_path(tree1_file,
                                            schema='newick',
                                            rooting='force-rooted',
                                            preserve_underscores=True)

        tree2 = dendropy.Tree.get_from_path(tree2_file,
                                            schema='newick',
                                            rooting='force-rooted',
                                            preserve_underscores=True)

        # prune both trees to a set of common taxa
        taxa1 = {leaf.taxon.label for leaf in tree1.leaf_node_iter()}
        taxa2 = {leaf.taxon.label for leaf in tree2.leaf_node_iter()}

        taxa_in_common = taxa1.intersection(taxa2)
        self.logger.info('Tree 1 contains %d taxa.' % len(taxa1))
        self.logger.info('Tree 2 contains %d taxa.' % len(taxa2))
        self.logger.info('Pruning trees to the %d taxa in common.' % len(taxa_in_common))

        tree1.retain_taxa_with_labels(taxa_in_common)
        tree2.retain_taxa_with_labels(taxa_in_common)

        # get named lineages at each taxonomic rank
        taxonomy = Taxonomy()
        tax1 = taxonomy.read_from_tree(tree1)
        tax2 = taxonomy.read_from_tree(tree2)

        taxa_at_rank1 = taxonomy.named_lineages_at_rank(tax1)
        taxa_at_rank2 = taxonomy.named_lineages_at_rank(tax2)

        tax_file_name = os.path.splitext(os.path.basename(tree1_file))[0]
        output_file = os.path.join(output_dir, '%s.taxa_diff.tsv' % tax_file_name)

        # every data row has the same four tab-separated columns
        row = '%s\t%s\t%s\t%s\n'

        # the context manager guarantees the file is closed even if one of
        # the helper calls below raises
        with open(output_file, 'w') as fout:
            fout.write('Rank\tClassification\tTaxonomy 1\tTaxonomy 2\n')

            # taxa of the second taxonomy matched to a taxon of the first,
            # keyed by the rank at which they occur in the second taxonomy
            taxon2_accounted_for = defaultdict(set)
            for rank, rank_label in enumerate(Taxonomy.rank_labels[0:-1]):
                for taxon in taxa_at_rank1[rank]:
                    # check if taxon has been retained
                    if taxon in taxa_at_rank2[rank]:
                        fout.write(row % (rank_label, 'retained', taxon, taxon))
                        taxon2_accounted_for[rank].add(taxon)
                        continue

                    # check if name was simply corrected by changing suffix
                    old_taxon = self._change_suffix(taxon, rank, taxa_at_rank2)
                    if old_taxon:
                        fout.write(row % (rank_label, 'corrected', taxon, old_taxon))
                        taxon2_accounted_for[rank].add(old_taxon)
                        continue

                    # check if taxon has been moved up or down in rank
                    old_taxon, old_rank = self._renamed(taxon, rank, taxa_at_rank2)
                    if old_taxon:
                        if rank < old_rank:
                            classification = 'more general'
                        elif rank == old_rank:
                            classification = 'corrected'
                        else:
                            classification = 'more specific'

                        fout.write(row % (rank_label, classification, taxon, old_taxon))
                        # record under the rank the name held in taxonomy 2
                        taxon2_accounted_for[old_rank].add(old_taxon)
                        continue

                    # otherwise, the taxon appears to be new
                    fout.write(row % (rank_label, 'new', taxon, 'NA'))

            # report deprecated taxa
            for rank, rank_label in enumerate(Taxonomy.rank_labels[0:-1]):
                for taxon in taxa_at_rank2[rank]:
                    if taxon not in taxon2_accounted_for[rank]:
                        fout.write(row % (rank_label, 'deprecated', 'NA', taxon))

        # tabulate congruence of taxonomy strings
        output_table = os.path.join(output_dir, '%s.perc_diff.tsv' % tax_file_name)
        self._tax_diff_table(tax1, tax2, output_table)