def binomial(self, options):
    """Ensure species are designated using binomial nomenclature.

    Reads the taxonomy in ``options.input_taxonomy`` and, for every genome
    whose species name is non-empty and does not already contain the genus
    name, rewrites the species as ``s__<genus> <species>``. The result is
    written to ``options.output_taxonomy`` as one ``genome_id<TAB>taxonomy``
    line per genome.

    The process exits with status -1 as soon as ``Taxonomy.check_full``
    rejects a taxonomy string.
    """
    check_file_exists(options.input_taxonomy)

    taxonomy = Taxonomy()
    # Load the input before opening the output so the output file is not
    # truncated until the taxonomy has been read.
    t = taxonomy.read(options.input_taxonomy)

    # The context manager closes the file even when sys.exit() is raised.
    with open(options.output_taxonomy, 'w') as fout:
        for genome_id, taxon_list in t.items():
            taxonomy_str = ';'.join(taxon_list)
            if not taxonomy.check_full(taxonomy_str):
                sys.exit(-1)

            # strip the 3-character rank prefixes (e.g. 's__') from the
            # sixth and seventh entries, which are used as genus and species
            genus = taxon_list[5][3:]
            species = taxon_list[6][3:]
            if species and genus not in species:
                taxon_list[6] = 's__' + genus + ' ' + species
                taxonomy_str = ';'.join(taxon_list)

            fout.write('%s\t%s\n' % (genome_id, taxonomy_str))

    self.logger.info('Revised taxonomy written to: %s' % options.output_taxonomy)
def validate(self, options):
    """Validate the taxonomy file and log the number of errors per category.

    The prefix, rank, hierarchy and species checks are each disabled by the
    corresponding ``options.no_*`` flag; group-name and duplicate-name
    checks are always requested.
    """
    check_file_exists(options.taxonomy_file)

    taxonomy = Taxonomy()
    taxa = taxonomy.read(options.taxonomy_file)

    errors = taxonomy.validate(
        taxa,
        check_prefixes=not options.no_prefix,
        check_ranks=not options.no_all_ranks,
        check_hierarchy=not options.no_hierarhcy,
        check_species=not options.no_species,
        check_group_names=True,
        check_duplicate_names=True,
        report_errors=True)

    # validate() is expected to return exactly these five collections
    (invalid_ranks,
     invalid_prefixes,
     invalid_species_name,
     invalid_hierarchies,
     invalid_group_name) = errors

    error_total = sum(len(category) for category in errors)
    if error_total == 0:
        self.logger.info('No errors identified in taxonomy file.')
        return

    # one summary line per error category, in a fixed order
    summary = (
        ('Identified %d incomplete taxonomy strings.', invalid_ranks),
        ('Identified %d rank prefix errors.', invalid_prefixes),
        ('Identified %d invalid species names.', invalid_species_name),
        ('Identified %d taxa with multiple parents.', invalid_hierarchies),
        ('Identified %d invalid group names.', invalid_group_name),
    )
    for message, offenders in summary:
        self.logger.info(message % len(offenders))
def validate(self, options):
    """Check taxonomy file is formatted as expected.

    Runs every validation check on ``options.input_taxonomy`` with error
    reporting switched on; the value returned by the validator is ignored.
    """
    check_file_exists(options.input_taxonomy)

    # every available check is enabled
    enabled_checks = {
        'check_prefixes': True,
        'check_ranks': True,
        'check_hierarchy': True,
        'check_species': True,
        'check_group_names': True,
        'check_duplicate_names': True,
        'report_errors': True,
    }

    taxonomy = Taxonomy()
    taxa = taxonomy.read(options.input_taxonomy)
    taxonomy.validate(taxa, **enabled_checks)

    self.logger.info('Finished performing validation tests.')
def fill_ranks(self, options):
    """Ensure taxonomy strings contain all 7 canonical ranks.

    Reads the taxonomy in ``options.input_taxonomy``, passes each genome's
    taxon list through ``Taxonomy.fill_missing_ranks`` and writes the result
    to ``options.output_taxonomy`` as one ``genome_id<TAB>taxonomy`` line
    per genome.

    The process exits with status -1 as soon as ``Taxonomy.check_full``
    rejects a filled taxonomy string.
    """
    check_file_exists(options.input_taxonomy)

    taxonomy = Taxonomy()
    # Load the input before opening the output so the output file is not
    # truncated until the taxonomy has been read.
    t = taxonomy.read(options.input_taxonomy)

    # The context manager closes the file even when sys.exit() is raised.
    with open(options.output_taxonomy, 'w') as fout:
        # items() works on both Python 2 and 3; iteritems() is Python 2 only
        for genome_id, taxon_list in t.items():
            full_taxon_list = taxonomy.fill_missing_ranks(taxon_list)

            taxonomy_str = ';'.join(full_taxon_list)
            if not taxonomy.check_full(taxonomy_str):
                sys.exit(-1)

            fout.write('%s\t%s\n' % (genome_id, taxonomy_str))

    self.logger.info('Revised taxonomy written to: %s' % options.output_taxonomy)
def fill_ranks(self, options):
    """Ensure taxonomy strings contain all 7 canonical ranks.

    Reads the taxonomy in ``options.input_taxonomy``, passes each genome's
    taxon list through ``Taxonomy.fill_missing_ranks`` and writes the result
    to ``options.output_taxonomy`` as one ``genome_id<TAB>taxonomy`` line
    per genome.

    The process exits with status -1 as soon as ``Taxonomy.check_full``
    rejects a filled taxonomy string.
    """
    check_file_exists(options.input_taxonomy)

    taxonomy = Taxonomy()
    # Load the input before opening the output so the output file is not
    # truncated until the taxonomy has been read.
    t = taxonomy.read(options.input_taxonomy)

    # The context manager closes the file even when sys.exit() is raised.
    with open(options.output_taxonomy, 'w') as fout:
        for genome_id, taxon_list in t.items():
            full_taxon_list = taxonomy.fill_missing_ranks(taxon_list)

            taxonomy_str = ';'.join(full_taxon_list)
            if not taxonomy.check_full(taxonomy_str):
                sys.exit(-1)

            fout.write('%s\t%s\n' % (genome_id, taxonomy_str))

    self.logger.info('Revised taxonomy written to: %s' % options.output_taxonomy)
def validate(self, options):
    """Validate the taxonomy file and log the number of errors per category.

    The four boolean check switches are passed to the validator
    positionally, each being the negation of the matching ``options.no_*``
    flag, followed by a literal ``True``.
    """
    check_file_exists(options.taxonomy_file)

    taxonomy = Taxonomy()
    taxa = taxonomy.read(options.taxonomy_file)

    errors = taxonomy.validate(
        taxa,
        not options.no_prefix,
        not options.no_all_ranks,
        not options.no_hierarhcy,
        not options.no_species,
        True)

    # validate() is expected to return exactly these four collections
    (invalid_ranks,
     invalid_prefixes,
     invalid_species_name,
     invalid_hierarchies) = errors

    error_total = sum(len(category) for category in errors)
    if error_total == 0:
        self.logger.info('No errors identified in taxonomy file.')
        return

    # one summary line per error category, in a fixed order
    summary = (
        ('Identified %d incomplete taxonomy strings.', invalid_ranks),
        ('Identified %d rank prefix errors.', invalid_prefixes),
        ('Identified %d invalid species names.', invalid_species_name),
        ('Identified %d taxa with multiple parents.', invalid_hierarchies),
    )
    for message, offenders in summary:
        self.logger.info(message % len(offenders))
def propagate(self, options):
    """Propagate labels to all genomes in a cluster.

    Reads the taxonomy in ``options.input_taxonomy`` and the
    ``gtdb_representative`` / ``gtdb_clustered_genomes`` metadata fields from
    ``options.metadata_file``. For every genome with a non-empty
    ``;``-separated list of clustered genomes, a single taxonomy string is
    derived and assigned to that genome and to every genome in its cluster.
    Genomes without clustered genomes keep their own taxonomy unless one was
    already assigned through a representative.

    The cluster taxonomy starts from the representative's. Ranks the
    representative leaves unspecified are filled from clustered genomes;
    a clustered genome that contradicts a rank the representative does
    specify makes the whole cluster fall back to the representative's
    taxonomy.

    The result is written to ``options.output_taxonomy`` as one
    ``genome_id<TAB>taxonomy`` line per genome.
    """
    check_file_exists(options.input_taxonomy)
    check_file_exists(options.metadata_file)

    # get representative genome information
    rep_metadata = read_gtdb_metadata(options.metadata_file,
                                      ['gtdb_representative',
                                       'gtdb_clustered_genomes'])

    taxonomy = Taxonomy()
    explict_tax = taxonomy.read(options.input_taxonomy)
    rank_prefixes = Taxonomy.rank_prefixes

    expanded_taxonomy = {}
    incongruent_count = 0
    for genome_id, taxon_list in explict_tax.items():
        taxonomy_str = ';'.join(taxon_list)

        # Propagate taxonomy strings if genome is a representative. Also, determine
        # if genomes clustered together have compatible taxonomies. Note that a genome
        # may not have metadata as it is possible a User has removed a genome that is
        # in the provided taxonomy file.
        _rep_genome, clustered_genomes = rep_metadata.get(genome_id, (None, None))

        if not clustered_genomes:
            # genome is a singleton unless it has already been assigned a
            # taxonomy based on its representative
            if genome_id not in expanded_taxonomy:
                expanded_taxonomy[genome_id] = taxonomy_str
            continue

        # genome is a representative
        clustered_genome_ids = clustered_genomes.split(';')

        # get taxonomy of all genomes in cluster with a specified taxonomy
        clustered_genome_tax = {}
        for cluster_genome_id in clustered_genome_ids:
            if cluster_genome_id == genome_id:
                continue

            if cluster_genome_id not in rep_metadata:
                continue  # genome is no longer in the GTDB so ignore it

            if cluster_genome_id in explict_tax:
                clustered_genome_tax[cluster_genome_id] = explict_tax[cluster_genome_id]

        # determine if representative and clustered genome taxonomy strings are congruent
        working_cluster_taxonomy = list(taxon_list)
        for cluster_genome_id, cluster_tax in clustered_genome_tax.items():
            incongruent_with_rep = False
            for r, rank_prefix in enumerate(rank_prefixes):
                if cluster_tax[r] == rank_prefix:
                    break  # no more taxonomy information to consider

                if cluster_tax[r] == taxon_list[r]:
                    continue

                if taxon_list[r] == rank_prefix:
                    # clustered genome has a more specific taxonomy string which
                    # should be propagated to the representative if all clustered
                    # genomes are in agreement
                    if working_cluster_taxonomy[r] == rank_prefix:
                        # make taxonomy more specific based on genomes in cluster
                        working_cluster_taxonomy[r] = cluster_tax[r]
                    elif working_cluster_taxonomy[r] != cluster_tax[r]:
                        # not all genomes agree on the assignment of this rank so leave it unspecified
                        # NOTE(review): a later clustered genome can re-fill this
                        # rank, and more specific ranks set by earlier genomes are
                        # left in place - confirm whether that is intended.
                        working_cluster_taxonomy[r] = rank_prefix
                        break
                else:
                    # genomes in cluster have incongruent taxonomies so defer to representative
                    self.logger.warning("Genomes in cluster have incongruent taxonomies.")
                    self.logger.warning("Representative %s: %s" % (genome_id, taxonomy_str))
                    self.logger.warning("Clustered genome %s: %s" % (cluster_genome_id, ';'.join(cluster_tax)))
                    self.logger.warning("Deferring to taxonomy specified for representative.")
                    incongruent_count += 1
                    incongruent_with_rep = True
                    break

            if incongruent_with_rep:
                # Default to the representative's taxonomy. The reset is done
                # here, right after the conflict is found, so it also takes
                # effect when the conflicting genome is the last one examined.
                working_cluster_taxonomy = list(taxon_list)
                break

        cluster_taxonomy_str = ';'.join(working_cluster_taxonomy)

        # assign taxonomy to representative and all genomes in the cluster
        expanded_taxonomy[genome_id] = cluster_taxonomy_str
        for cluster_genome_id in clustered_genome_ids:
            expanded_taxonomy[cluster_genome_id] = cluster_taxonomy_str

    self.logger.info('Identified %d clusters with incongruent taxonomies.' % incongruent_count)

    # the context manager closes the file even if a write fails
    with open(options.output_taxonomy, 'w') as fout:
        for genome_id, taxonomy_str in expanded_taxonomy.items():
            fout.write('%s\t%s\n' % (genome_id, taxonomy_str))

    self.logger.info('Taxonomy written to: %s' % options.output_taxonomy)
def propagate(self, options):
    """Propagate labels to all genomes in a cluster.

    Reads the taxonomy in ``options.input_taxonomy`` and the
    ``gtdb_representative`` / ``gtdb_clustered_genomes`` metadata fields from
    ``options.metadata_file``. For every genome with a non-empty
    ``;``-separated list of clustered genomes, a single taxonomy string is
    derived and assigned to that genome and to every genome in its cluster.
    Genomes without clustered genomes keep their own taxonomy unless one was
    already assigned through a representative.

    The cluster taxonomy starts from the representative's. Ranks the
    representative leaves unspecified are filled from clustered genomes;
    a clustered genome that contradicts a rank the representative does
    specify makes the whole cluster fall back to the representative's
    taxonomy.

    The result is written to ``options.output_taxonomy`` as one
    ``genome_id<TAB>taxonomy`` line per genome.
    """
    check_file_exists(options.input_taxonomy)
    check_file_exists(options.metadata_file)

    # get representative genome information
    rep_metadata = read_gtdb_metadata(
        options.metadata_file,
        ['gtdb_representative', 'gtdb_clustered_genomes'])

    taxonomy = Taxonomy()
    explict_tax = taxonomy.read(options.input_taxonomy)
    rank_prefixes = Taxonomy.rank_prefixes

    expanded_taxonomy = {}
    incongruent_count = 0
    for genome_id, taxon_list in explict_tax.items():
        taxonomy_str = ';'.join(taxon_list)

        # Propagate taxonomy strings if genome is a representative. Also, determine
        # if genomes clustered together have compatible taxonomies. Note that a genome
        # may not have metadata as it is possible a User has removed a genome that is
        # in the provided taxonomy file.
        _rep_genome, clustered_genomes = rep_metadata.get(
            genome_id, (None, None))

        if not clustered_genomes:
            # genome is a singleton unless it has already been assigned a
            # taxonomy based on its representative
            if genome_id not in expanded_taxonomy:
                expanded_taxonomy[genome_id] = taxonomy_str
            continue

        # genome is a representative
        clustered_genome_ids = clustered_genomes.split(';')

        # get taxonomy of all genomes in cluster with a specified taxonomy
        clustered_genome_tax = {}
        for cluster_genome_id in clustered_genome_ids:
            if cluster_genome_id == genome_id:
                continue

            if cluster_genome_id not in rep_metadata:
                continue  # genome is no longer in the GTDB so ignore it

            if cluster_genome_id in explict_tax:
                clustered_genome_tax[cluster_genome_id] = explict_tax[
                    cluster_genome_id]

        # determine if representative and clustered genome taxonomy strings are congruent
        working_cluster_taxonomy = list(taxon_list)
        for cluster_genome_id, cluster_tax in clustered_genome_tax.items():
            incongruent_with_rep = False
            for r, rank_prefix in enumerate(rank_prefixes):
                if cluster_tax[r] == rank_prefix:
                    break  # no more taxonomy information to consider

                if cluster_tax[r] == taxon_list[r]:
                    continue

                if taxon_list[r] == rank_prefix:
                    # clustered genome has a more specific taxonomy string which
                    # should be propagated to the representative if all clustered
                    # genomes are in agreement
                    if working_cluster_taxonomy[r] == rank_prefix:
                        # make taxonomy more specific based on genomes in cluster
                        working_cluster_taxonomy[r] = cluster_tax[r]
                    elif working_cluster_taxonomy[r] != cluster_tax[r]:
                        # not all genomes agree on the assignment of this rank so leave it unspecified
                        # NOTE(review): a later clustered genome can re-fill this
                        # rank, and more specific ranks set by earlier genomes are
                        # left in place - confirm whether that is intended.
                        working_cluster_taxonomy[r] = rank_prefix
                        break
                else:
                    # genomes in cluster have incongruent taxonomies so defer to representative
                    self.logger.warning(
                        "Genomes in cluster have incongruent taxonomies.")
                    self.logger.warning("Representative %s: %s" %
                                        (genome_id, taxonomy_str))
                    self.logger.warning(
                        "Clustered genome %s: %s" %
                        (cluster_genome_id, ';'.join(cluster_tax)))
                    self.logger.warning(
                        "Deferring to taxonomy specified for representative.")
                    incongruent_count += 1
                    incongruent_with_rep = True
                    break

            if incongruent_with_rep:
                # Default to the representative's taxonomy. The reset is done
                # here, right after the conflict is found, so it also takes
                # effect when the conflicting genome is the last one examined.
                working_cluster_taxonomy = list(taxon_list)
                break

        cluster_taxonomy_str = ';'.join(working_cluster_taxonomy)

        # assign taxonomy to representative and all genomes in the cluster
        expanded_taxonomy[genome_id] = cluster_taxonomy_str
        for cluster_genome_id in clustered_genome_ids:
            expanded_taxonomy[cluster_genome_id] = cluster_taxonomy_str

    self.logger.info(
        'Identified %d clusters with incongruent taxonomies.' %
        incongruent_count)

    # the context manager closes the file even if a write fails
    with open(options.output_taxonomy, 'w') as fout:
        for genome_id, taxonomy_str in expanded_taxonomy.items():
            fout.write('%s\t%s\n' % (genome_id, taxonomy_str))

    self.logger.info('Taxonomy written to: %s' % options.output_taxonomy)
def check_tree(self, options):
    """Validate taxonomy of decorated tree and check for polyphyletic groups.

    The taxonomy is read from ``options.taxonomy_file`` when one is given,
    otherwise from the labels of ``options.decorated_tree``. After running
    every validation check, the names of any polyphyletic groups found in
    the tree are printed to standard output.
    """
    check_file_exists(options.decorated_tree)

    # validate taxonomy
    taxonomy = Taxonomy()
    if options.taxonomy_file:
        genome_taxonomy = taxonomy.read(options.taxonomy_file)
    else:
        genome_taxonomy = taxonomy.read_from_tree(options.decorated_tree)

    taxonomy.validate(genome_taxonomy,
                      check_prefixes=True,
                      check_ranks=True,
                      check_hierarchy=True,
                      check_species=True,
                      check_group_names=True,
                      check_duplicate_names=True,
                      report_errors=True)

    # check for polyphyletic groups
    polyphyletic_groups = set()

    tree = dendropy.Tree.get_from_path(options.decorated_tree,
                                       schema='newick',
                                       rooting="force-rooted",
                                       preserve_underscores=True)

    if options.taxonomy_file:
        # reduce taxonomy to taxa in tree and map taxon labels to Taxon objects
        reduced_taxonomy = {}
        taxon_map = {}
        for leaf in tree.leaf_node_iter():
            leaf_label = leaf.taxon.label
            reduced_taxonomy[leaf_label] = genome_taxonomy[leaf_label]
            taxon_map[leaf_label] = leaf.taxon

        # find taxa with an MRCA spanning additional taxa
        for rank_label in Taxonomy.rank_labels[1:]:
            extant_taxa = taxonomy.extant_taxa_for_rank(rank_label,
                                                        reduced_taxonomy)
            for taxon, taxa_ids in extant_taxa.items():
                member_taxa = [taxon_map[taxon_id] for taxon_id in taxa_ids]
                mrca = tree.mrca(taxa=member_taxa)

                # a group is flagged when the leaf count under its MRCA
                # differs from the number of genomes assigned to the group
                mrca_leaf_count = sum(1 for _leaf in mrca.leaf_iter())
                if mrca_leaf_count != len(taxa_ids):
                    polyphyletic_groups.add(taxon)
    else:
        # find duplicate taxon labels in tree
        seen_taxa = set()
        for node in tree.preorder_node_iter(lambda n: not n.is_leaf()):
            _support, taxon_label, _aux_info = parse_label(node.label)
            if not taxon_label:
                continue

            # an internal node label may hold several ';'-separated taxa
            for raw_taxon in taxon_label.split(';'):
                taxon = raw_taxon.strip()
                if taxon in seen_taxa:
                    polyphyletic_groups.add(taxon)
                seen_taxa.add(taxon)

    if polyphyletic_groups:
        print('')
        print('Tree contains polyphyletic groups:')
        for taxon in polyphyletic_groups:
            print('%s' % taxon)

    self.logger.info('Finished performing validation tests.')