def pull(self, options):
    """Write a taxonomy file derived from the labels of a decorated tree.

    When ``options.no_validation`` is set, the tree is parsed directly and
    the lineage of each leaf is assembled from the taxon labels found on
    its ancestors. Otherwise the tree is handed to
    ``Taxonomy.read_from_tree``. In both cases the result is written to
    ``options.output_taxonomy``.
    """
    check_file_exists(options.input_tree)

    if not options.no_validation:
        taxonomy = Taxonomy().read_from_tree(options.input_tree)
    else:
        tree = dendropy.Tree.get_from_path(options.input_tree,
                                           schema='newick',
                                           rooting="force-rooted",
                                           preserve_underscores=True)

        taxonomy = {}
        for leaf in tree.leaf_node_iter():
            # labels are gathered while walking towards the root (most
            # specific first) and flipped once at the end
            lineage = []
            ancestor = leaf.parent_node
            while ancestor:
                _support, taxon, _aux_info = parse_label(ancestor.label)
                if taxon:
                    names = [name.strip() for name in taxon.split(';')]
                    lineage.extend(reversed(names))
                ancestor = ancestor.parent_node

            taxonomy[leaf.taxon.label] = lineage[::-1]

    Taxonomy().write(taxonomy, options.output_taxonomy)

    self.logger.info('Stripped tree written to: %s' % options.output_taxonomy)
def validate(self, options):
    """Validate a taxonomy file and log the number of errors of each kind.

    Prefix, rank, hierarchy and species checks can each be switched off
    through the corresponding ``options.no_*`` flag; group-name and
    duplicate-name checks are always requested.
    """
    check_file_exists(options.taxonomy_file)

    taxonomy = Taxonomy()
    taxa_by_genome = taxonomy.read(options.taxonomy_file)

    errors = taxonomy.validate(taxa_by_genome,
                               check_prefixes=not options.no_prefix,
                               check_ranks=not options.no_all_ranks,
                               check_hierarchy=not options.no_hierarhcy,
                               check_species=not options.no_species,
                               check_group_names=True,
                               check_duplicate_names=True,
                               report_errors=True)

    (invalid_ranks,
     invalid_prefixes,
     invalid_species_name,
     invalid_hierarchies,
     invalid_group_name) = errors

    if not any(len(e) for e in errors):
        self.logger.info('No errors identified in taxonomy file.')
        return

    report = (
        ('Identified %d incomplete taxonomy strings.', invalid_ranks),
        ('Identified %d rank prefix errors.', invalid_prefixes),
        ('Identified %d invalid species names.', invalid_species_name),
        ('Identified %d taxa with multiple parents.', invalid_hierarchies),
        ('Identified %d invalid group names.', invalid_group_name),
    )
    for message, offenders in report:
        self.logger.info(message % len(offenders))
def append(self, options):
    """Append the taxonomy string of each leaf to its label.

    Every leaf label in ``options.input_tree`` is rewritten as
    ``<label>|<taxon>; <taxon>; ...`` using the entry stored under that
    label in ``options.input_taxonomy``. The decorated tree is written to
    ``options.output_tree`` in Newick format.

    The process exits with status -1 as soon as a leaf without an entry in
    the taxonomy file is encountered.
    """
    check_file_exists(options.input_tree)
    check_file_exists(options.input_taxonomy)

    taxonomy = Taxonomy().read(options.input_taxonomy)

    tree = dendropy.Tree.get_from_path(options.input_tree,
                                       schema='newick',
                                       rooting='force-rooted',
                                       preserve_underscores=True)

    for leaf in tree.leaf_node_iter():
        leaf_id = leaf.taxon.label
        taxa = taxonomy.get(leaf_id)
        if taxa is None:
            # report the taxon label that was actually looked up; the
            # node's own `label` attribute is a different field
            self.logger.error(
                'Taxonomy file does not contain an entry for %s.' % leaf_id)
            sys.exit(-1)

        leaf.taxon.label = leaf_id + '|' + '; '.join(taxa)

    tree.write_to_path(options.output_tree,
                       schema='newick',
                       suppress_rooting=True,
                       unquoted_underscores=True)

    self.logger.info('Decorated tree written to: %s' % options.output_tree)
def run(self, taxonomy_file, genome_list_file):
    """Add taxonomy to database.

    For every rank in ``Taxonomy.rank_labels`` a temporary two-column
    (genome id, taxon) file is written and imported into the
    ``gtdb_<rank>`` field of the ``metadata_taxonomy`` table through the
    ``gtdb`` command-line tool.

    Parameters
    ----------
    taxonomy_file : str
        Taxonomy file read with ``Taxonomy().read``.
    genome_list_file : str
        Optional file whose first column (tab- or comma-separated) lists
        the genomes to import. When falsy, every genome in the taxonomy
        file is imported.
    """
    genome_list = set()
    if genome_list_file:
        with open(genome_list_file) as f:
            for line in f:
                delimiter = '\t' if '\t' in line else ','
                genome_list.add(line.rstrip().split(delimiter)[0])

    # read taxonomy file
    taxonomy = Taxonomy().read(taxonomy_file)

    # add each taxonomic rank to database
    for i, rank in enumerate(Taxonomy.rank_labels):
        # text mode so that str can be written under Python 2 and 3 alike
        temp_file = tempfile.NamedTemporaryFile(mode='w', delete=False)
        try:
            for genome_id, taxa in taxonomy.items():
                if genome_list_file and genome_id not in genome_list:
                    continue

                temp_file.write('%s\t%s\n' % (genome_id, taxa[i]))

            # the file must be flushed and closed before gtdb reads it
            temp_file.close()

            cmd = 'gtdb -r metadata import --table %s --field %s --type %s --metadatafile %s' % (
                'metadata_taxonomy', 'gtdb_' + rank, 'TEXT', temp_file.name)
            print(cmd)
            os.system(cmd)
        finally:
            # never leave the temporary file behind, even on failure
            temp_file.close()
            os.remove(temp_file.name)
def append(self, options):
    """Append the taxonomy string of each leaf to its label.

    Every leaf label in ``options.input_tree`` is rewritten as
    ``<label>|<taxon>; <taxon>; ...`` using the entry stored under that
    label in ``options.input_taxonomy``. The decorated tree is written to
    ``options.output_tree`` in Newick format.

    The process exits with status -1 as soon as a leaf without an entry in
    the taxonomy file is encountered.
    """
    check_file_exists(options.input_tree)
    check_file_exists(options.input_taxonomy)

    taxonomy = Taxonomy().read(options.input_taxonomy)

    tree = dendropy.Tree.get_from_path(options.input_tree,
                                       schema='newick',
                                       rooting='force-rooted',
                                       preserve_underscores=True)

    for leaf in tree.leaf_node_iter():
        leaf_id = leaf.taxon.label
        taxa = taxonomy.get(leaf_id)
        if taxa is None:
            # report the taxon label that was actually looked up; the
            # node's own `label` attribute is a different field
            self.logger.error(
                'Taxonomy file does not contain an entry for %s.' % leaf_id)
            sys.exit(-1)

        leaf.taxon.label = leaf_id + '|' + '; '.join(taxa)

    tree.write_to_path(options.output_tree,
                       schema='newick',
                       suppress_rooting=True,
                       unquoted_underscores=True)

    self.logger.info('Decorated tree written to: %s' % options.output_tree)
def run(self, taxonomy_file, genome_list):
    """Add taxonomy to database.

    The full taxonomy string of each genome is written to a temporary
    two-column file and imported into the ``ncbi_taxonomy_unfiltered``
    field of the ``metadata_taxonomy`` table through the ``gtdb``
    command-line tool. Genome ids starting with ``GCA_`` / ``GCF_`` are
    prefixed with ``GB_`` / ``RS_`` before being filtered and written.

    Parameters
    ----------
    taxonomy_file : str
        Taxonomy file read with ``Taxonomy().read``.
    genome_list : str
        Optional tab-separated file whose first column lists the genomes
        to import; lines starting with '#' are skipped. When falsy, every
        genome in the taxonomy file is imported.
    """
    genomes_to_process = set()
    if genome_list:
        with open(genome_list) as f:
            for line in f:
                if line.startswith('#'):
                    continue

                genomes_to_process.add(line.rstrip().split('\t')[0])

    # read taxonomy file
    taxonomy = Taxonomy().read(taxonomy_file)

    # add full taxonomy string to database
    # (text mode so that str can be written under Python 2 and 3 alike)
    temp_file = tempfile.NamedTemporaryFile(mode='w', delete=False)
    try:
        for genome_id, taxa in taxonomy.items():
            if genome_id.startswith('GCA_'):
                genome_id = 'GB_' + genome_id
            elif genome_id.startswith('GCF_'):
                genome_id = 'RS_' + genome_id

            if not genome_list or genome_id in genomes_to_process:
                temp_file.write('%s\t%s\n' % (genome_id, ';'.join(taxa)))

        # the file must be flushed and closed before gtdb reads it
        temp_file.close()

        cmd = 'gtdb -r metadata import --table %s --field %s --type %s --metadatafile %s' % (
            'metadata_taxonomy', 'ncbi_taxonomy_unfiltered', 'TEXT', temp_file.name)
        print(cmd)
        os.system(cmd)
    finally:
        # never leave the temporary file behind, even on failure
        temp_file.close()
        os.remove(temp_file.name)
def binomial(self, options):
    """Ensure species are designated using binomial nomenclature.

    Each taxonomy string in ``options.input_taxonomy`` must pass
    ``Taxonomy.check_full``; otherwise the process exits with status -1.
    When the species name (index 6) is non-empty and does not contain the
    genus name (index 5), the genus is prepended to it. The revised
    strings are written to ``options.output_taxonomy``.
    """
    check_file_exists(options.input_taxonomy)

    taxonomy = Taxonomy()
    # read the input before the output is opened for writing, so that a
    # failed read (or identical input/output paths) cannot leave a
    # truncated file behind
    t = taxonomy.read(options.input_taxonomy)

    with open(options.output_taxonomy, 'w') as fout:
        for genome_id, taxon_list in t.items():
            taxonomy_str = ';'.join(taxon_list)
            if not taxonomy.check_full(taxonomy_str):
                sys.exit(-1)

            # drop the 3-character rank prefix (e.g. 'g__', 's__')
            genus = taxon_list[5][3:]
            species = taxon_list[6][3:]
            if species and genus not in species:
                taxon_list[6] = 's__' + genus + ' ' + species
                taxonomy_str = ';'.join(taxon_list)

            fout.write('%s\t%s\n' % (genome_id, taxonomy_str))

    self.logger.info('Revised taxonomy written to: %s' % options.output_taxonomy)
def manual_species(self, init_taxonomy, manually_curated_tree):
    """Identify species names manually set by curators.

    Compares the specific epithet each genome has in the initial taxonomy
    with the one derived from the manually curated tree and writes every
    genome whose epithet differs to ``manual_species_names.tsv`` in
    ``self.output_dir``. Genome ids starting with 'D-' are ignored.

    Parameters
    ----------
    init_taxonomy : str
        Taxonomy file with the initial species assignments.
    manually_curated_tree : str
        Newick tree decorated with the manually curated taxonomy.
    """
    # read initial and manually curated taxonomy
    self.logger.info('Reading initial species names.')
    init_taxonomy = Taxonomy().read(init_taxonomy, use_canonical_gid=True)
    init_num_gids = sum(1 for gid in init_taxonomy
                        if not gid.startswith('D-'))
    self.logger.info(
        ' - read taxonomy for {:,} genomes.'.format(init_num_gids))

    self.logger.info('Reading manually-curated species names from tree.')
    mc_tree = dendropy.Tree.get_from_path(manually_curated_tree,
                                          schema='newick',
                                          rooting='force-rooted',
                                          preserve_underscores=True)
    mc_taxonomy = Taxonomy().read_from_tree(mc_tree)

    mc_specific = {}
    for gid, taxa in mc_taxonomy.items():
        if gid.startswith('D-'):
            continue

        mc_sp = taxa[-1]
        if not mc_sp.startswith('s__') or mc_sp == 's__':
            # the most specific label is not a named species; report the
            # genome and leave it out of the comparison
            self.logger.error(
                'Most specific classification for {} is {}.'.format(
                    gid, taxa))
            continue

        mc_specific[gid] = specific_epithet(mc_sp)

    self.logger.info(' - read taxonomy for {:,} genomes.'.format(
        len(mc_specific)))

    # report genomes with modified specific name assignment
    self.logger.info(
        'Identifying genomes with manually-curated species names.')

    num_mc = 0
    out_path = os.path.join(self.output_dir, 'manual_species_names.tsv')
    with open(out_path, 'w') as fout:
        fout.write('Genome ID\tInitial species\tManually-curated species\n')

        for gid, mc_sp in mc_specific.items():
            init_species = init_taxonomy[gid][Taxonomy.SPECIES_INDEX]
            if specific_epithet(init_species) == mc_sp:
                continue

            mc_generic = mc_taxonomy[gid][Taxonomy.GENUS_INDEX].replace(
                'g__', '')
            mc_species = 's__{} {}'.format(mc_generic, mc_sp)
            num_mc += 1

            fout.write('{}\t{}\t{}\n'.format(gid, init_species, mc_species))

    self.logger.info(
        ' - identified {:,} manually-curated species names.'.format(
            num_mc))
def pull(self, options):
    """Extract the taxonomy encoded in a decorated tree into a file.

    The tree at ``options.input_tree`` is read with
    ``Taxonomy.read_from_tree`` and the resulting taxonomy is written to
    ``options.output_taxonomy``.
    """
    tree_path = options.input_tree
    check_file_exists(tree_path)

    tree_taxonomy = Taxonomy().read_from_tree(tree_path)

    writer = Taxonomy()
    writer.write(tree_taxonomy, options.output_taxonomy)

    message = 'Stripped tree written to: %s' % options.output_taxonomy
    self.logger.info(message)
def species_label(gtdb_taxonomy, ncbi_taxonomy, ncbi_organism_name):
    """Determine 'best' species label for each genome.

    Currently, this is just being set to the species label in the GTDB
    taxonomy. In theory, the NCBI taxonomy and organism name could also
    be consulted. However, since the GTDB taxonomy redefines some species
    this might be problematic so isn't currently being done; the two NCBI
    arguments are accepted for interface compatibility and are not read.

    Parameters
    ----------
    gtdb_taxonomy : d[assembly_accession] -> [d__, ..., s__]
        GTDB taxonomy of each genome.
    ncbi_taxonomy : d[assembly_accession] -> [d__, ..., s__]
        NCBI taxonomy of each genome (unused).
    ncbi_organism_name : d[assembly_accession] -> name
        NCBI organism name of each genome (unused).

    Return
    ------
    dict : d[assembly_accession] -> species name
        Species name of each genome; genomes whose GTDB species is the
        bare prefix 's__' are left out.
    """
    species_index = Taxonomy.rank_index['s__']

    return {genome_id: taxa[species_index]
            for genome_id, taxa in gtdb_taxonomy.items()
            if taxa[species_index] != 's__'}
def pull(self, options):
    """Pull taxonomy strings from a decorated tree into a taxonomy file.

    The taxonomy is read from ``options.input_tree``. Unless
    ``options.no_rank_fill`` is set, each entry is passed through
    ``Taxonomy.fill_missing_ranks`` before the result is written to
    ``options.output_file``.
    """
    check_file_exists(options.input_tree)

    # one instance serves reading, rank filling and writing
    taxonomy = Taxonomy()
    t = taxonomy.read_from_tree(options.input_tree)

    if not options.no_rank_fill:
        # values of existing keys are replaced in place; the set of keys
        # does not change, so iterating while assigning is safe
        for taxon_id, taxa in t.items():
            t[taxon_id] = taxonomy.fill_missing_ranks(taxa)

    taxonomy.write(t, options.output_file)

    self.logger.info('Taxonomy strings written to: %s' % options.output_file)
def clean_ftp(self, new_list_genomes, ftp_genome_dir_file, ftp_genome_dir, report_dir, taxonomy_file=None):
    """Delete genomes absent from the new release and report the changes.

    Genomes listed in ``ftp_genome_dir_file`` but in none of the new
    genome lists have their directory removed with ``shutil.rmtree``; the
    parent directory is then handed to ``self.delete_empty_directory``.
    Two reports are written to ``report_dir``: ``deleted_genomes.tsv``
    and ``added_genomes.tsv``.

    Parameters
    ----------
    new_list_genomes : str
        Comma-separated paths of tab-separated files whose first column
        holds the genome ids of the new release.
    ftp_genome_dir_file : str
        Tab-separated file mapping genome id (column 1) to its directory
        (column 2).
    ftp_genome_dir : str
        Not read by this method.
    report_dir : str
        Directory receiving the two report files; created if needed.
    taxonomy_file : str, optional
        Taxonomy file; the taxon at index 6 is reported for each added
        genome, 'N/A' when the genome has no entry.
    """
    list_of_files = new_list_genomes.split(',')
    make_sure_path_exists(report_dir)

    genome_in_new_rel = set()
    for new_genome_file in list_of_files:
        with open(new_genome_file, 'r') as ngf:
            for line in ngf:
                genome_id = line.strip().split('\t')[0]
                if genome_id:  # blank lines carry no genome id
                    genome_in_new_rel.add(genome_id)

    # read taxonomy file
    taxonomy = {}
    if taxonomy_file is not None:
        taxonomy = Taxonomy().read(taxonomy_file)

    current_ftp_genomes = {}
    with open(ftp_genome_dir_file) as fgdf:
        for line in fgdf:
            infos = line.strip().split('\t')
            if len(infos) < 2:
                # blank or malformed line: no directory to map to
                continue
            current_ftp_genomes[infos[0]] = infos[1]

    deleted_genomes = list(set(current_ftp_genomes) - genome_in_new_rel)
    added_genomes = list(genome_in_new_rel - set(current_ftp_genomes))

    deleted_path = os.path.join(report_dir, 'deleted_genomes.tsv')
    added_path = os.path.join(report_dir, 'added_genomes.tsv')
    with open(deleted_path, 'w') as deleted_genome_file, \
            open(added_path, 'w') as added_genome_file:
        print('{} genomes have been deleted in the release'.format(
            len(deleted_genomes)))
        print('{} genomes have been added in the release'.format(
            len(added_genomes)))

        for idx, deleted_genome in enumerate(deleted_genomes):
            print("{}/{} genomes deleted".format(idx, len(deleted_genomes)),
                  end="\r")
            deleted_genome_file.write('{}\n'.format(deleted_genome))

            genome_dir = current_ftp_genomes.get(deleted_genome)
            shutil.rmtree(genome_dir)
            self.delete_empty_directory(os.path.dirname(genome_dir))

        for added_genome in added_genomes:
            added_genome_file.write('{}\t{}\n'.format(
                added_genome,
                taxonomy.get(added_genome, ['N/A'] * 7)[6]))
def pull(self, options):
    """Pull taxonomy strings from a decorated tree into a taxonomy file.

    The taxonomy is read from ``options.input_tree``. Unless
    ``options.no_rank_fill`` is set, each entry is passed through
    ``Taxonomy.fill_missing_ranks`` before the result is written to
    ``options.output_file``.
    """
    check_file_exists(options.input_tree)

    # one instance serves reading, rank filling and writing instead of
    # constructing a new Taxonomy for every entry
    taxonomy = Taxonomy()
    t = taxonomy.read_from_tree(options.input_tree)

    if not options.no_rank_fill:
        # values of existing keys are replaced in place; the set of keys
        # does not change, so iterating while assigning is safe
        for taxon_id, taxa in t.items():
            t[taxon_id] = taxonomy.fill_missing_ranks(taxa)

    taxonomy.write(t, options.output_file)

    self.logger.info('Taxonomy strings written to: %s' % options.output_file)
def run(self, taxonomy_file, genome_list_file):
    """Add taxonomy to database.

    The full taxonomy string of each genome is imported into the
    ``gtdb_taxonomy`` field and each individual rank into the
    ``gtdb_<rank>`` field of the ``metadata_taxonomy`` table, using the
    ``gtdb`` command-line tool and one temporary file per field.

    Parameters
    ----------
    taxonomy_file : str
        Taxonomy file read with ``Taxonomy().read``.
    genome_list_file : str
        Optional file whose first column (tab- or comma-separated) lists
        the genomes to import. When falsy, every genome in the taxonomy
        file is imported.
    """

    def _import_field(field, rows):
        """Write (genome id, value) rows to a temp file and import them."""
        # text mode so that str can be written under Python 2 and 3 alike
        temp_file = tempfile.NamedTemporaryFile(mode='w', delete=False)
        try:
            for genome_id, value in rows:
                temp_file.write('%s\t%s\n' % (genome_id, value))

            # the file must be flushed and closed before gtdb reads it
            temp_file.close()

            cmd = 'gtdb -r metadata import --table %s --field %s --type %s --metadatafile %s' % (
                'metadata_taxonomy', field, 'TEXT', temp_file.name)
            print(cmd)
            os.system(cmd)
        finally:
            # never leave the temporary file behind, even on failure
            temp_file.close()
            os.remove(temp_file.name)

    genome_list = set()
    if genome_list_file:
        with open(genome_list_file) as f:
            for line in f:
                delimiter = '\t' if '\t' in line else ','
                genome_list.add(line.rstrip().split(delimiter)[0])

    # read taxonomy file
    taxonomy = Taxonomy().read(taxonomy_file)
    selected = [(genome_id, taxa)
                for genome_id, taxa in taxonomy.items()
                if not genome_list_file or genome_id in genome_list]

    # add full taxonomy string to database
    _import_field('gtdb_taxonomy',
                  [(genome_id, ';'.join(taxa)) for genome_id, taxa in selected])

    # add each taxonomic rank to database
    for i, rank in enumerate(Taxonomy.rank_labels):
        rows = []
        for genome_id, taxa in selected:
            rank_str = taxa[i]
            if rank == 'species':
                # ensure species name includes genus; an unnamed species
                # ('s__') is left untouched rather than turned into
                # 's__<genus> ' with a dangling space
                genus = taxa[i - 1][3:]
                specific = taxa[i][3:]
                if specific and genus not in taxa[i]:
                    rank_str = 's__' + genus + ' ' + specific

            rows.append((genome_id, rank_str))

        _import_field('gtdb_' + rank, rows)
def add_sp_label(self, options):
    """Write a copy of the tree whose leaf labels carry the species name.

    Each leaf label in ``options.tree_file`` is extended with
    `` | <species>``, where the species comes from the GTDB taxonomy in
    ``options.taxonomy_file`` with the ``s__`` prefix removed. The result
    is written to ``options.output_tree``.
    """
    check_file_exists(options.taxonomy_file)
    check_file_exists(options.tree_file)

    self.logger.info('Reading GTDB taxonomy.')
    gtdb_taxonomy = Taxonomy().read(options.taxonomy_file)

    self.logger.info('Reading input tree.')
    tree = dendropy.Tree.get_from_path(options.tree_file,
                                       schema='newick',
                                       rooting='force-rooted',
                                       preserve_underscores=True)

    self.logger.info('Appending species labels.')
    for node in tree.postorder_node_iter():
        if not node.is_leaf():
            continue

        taxon = node.taxon
        gtdb_species = gtdb_taxonomy[taxon.label][Taxonomy.SPECIES_INDEX]
        suffix = ' | {}'.format(gtdb_species.replace('s__', ''))
        taxon.label = taxon.label + suffix

    self.logger.info('Writing output tree.')
    tree.write_to_path(options.output_tree,
                       schema='newick',
                       suppress_rooting=True,
                       unquoted_underscores=True)

    self.logger.info('Done.')
def _assign_taxon_labels(self, fmeasure_for_taxa):
    """Place each unambiguously positioned taxon label on its node.

    Taxa are visited in the order given by ``Taxonomy.sort_taxa``. A taxon
    with exactly one candidate placement is appended to the taxon label
    of that node (separated by '; ' from any label already present);
    taxa with any other number of placements are skipped.

    Parameters
    ----------
    fmeasure_for_taxa : d[taxon] -> [(Node, F-measure, precision, recall), ...]
        Node with highest F-measure for each taxon.

    Returns
    -------
    set
        Taxon labels placed in tree.
    """
    placed_taxon = set()

    ordered_taxa = Taxonomy().sort_taxa(list(fmeasure_for_taxa.keys()))
    for taxon in ordered_taxa:
        placements = fmeasure_for_taxa[taxon]
        if len(placements) != 1:
            continue

        placed_taxon.add(taxon)

        node, _fmeasure, _precision, _recall = placements[0]

        support, existing_label, aux_info = parse_label(node.label)
        combined = existing_label + '; ' + taxon if existing_label else taxon
        node.label = create_label(support, combined, aux_info)

    return placed_taxon
def classify(self, seq_file, db, taxonomy_file, evalue_threshold, output_dir):
    """Classify rRNA genes.

    Sequences are searched against the database with blastn and the
    first hit reported for each query is written, together with the
    taxonomy of its subject, to ``<rna_name>.taxonomy.tsv`` in
    ``output_dir``.

    Parameters
    ----------
    seq_file : str
        Name of fasta file containing rRNA sequences.
    db : str
        BLAST database of rRNA genes.
    taxonomy_file : str
        Taxonomy file for genes in the rRNA database.
    evalue_threshold : float
        E-value threshold for defining valid hits.
    output_dir : str
        Output directory.
    """
    # blast sequences against rRNA database
    blast = Blast(self.cpus)
    blast_file = os.path.join(output_dir, '%s.blastn.tsv' % self.rna_name)
    blast.blastn(seq_file,
                 db,
                 blast_file,
                 evalue=evalue_threshold,
                 max_matches=5,
                 output_fmt='custom')

    # read taxonomy file
    taxonomy = Taxonomy().read(taxonomy_file)

    # write out classification file
    classification_file = os.path.join(
        output_dir, '%s.taxonomy.tsv' % self.rna_name)

    processed_query_ids = set()
    with open(classification_file, 'w') as fout, open(blast_file) as fin:
        fout.write(
            'query_id\ttaxonomy\tlength\tblast_subject_id\tblast_evalue\tblast_bitscore\tblast_align_len\tblast_perc_identity\n')

        for line in fin:
            line_split = [x.strip() for x in line.split('\t')]
            query_id = line_split[0]

            if query_id in processed_query_ids:
                # A query may have multiple hits to different genes or
                # sections of a gene. Blast results are organized by
                # bitscore so only the first hit is considered.
                continue

            processed_query_ids.add(query_id)

            query_len = int(line_split[1])
            subject_id = line_split[2]
            align_len = line_split[5]
            perc_identity = line_split[6]
            evalue = line_split[7]
            bitscore = line_split[8]

            taxonomy_str = ';'.join(taxonomy[subject_id])

            fout.write('%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n' %
                       (query_id, taxonomy_str, query_len, subject_id,
                        evalue, bitscore, align_len, perc_identity))
def validate(self, options):
    """Run every available validation check on a taxonomy file.

    The taxonomy in ``options.input_taxonomy`` is validated with all
    checks enabled and error reporting switched on; the return value of
    the validation is not inspected here.
    """
    check_file_exists(options.input_taxonomy)

    taxonomy = Taxonomy()
    taxa_by_genome = taxonomy.read(options.input_taxonomy)

    checks = {
        'check_prefixes': True,
        'check_ranks': True,
        'check_hierarchy': True,
        'check_species': True,
        'check_group_names': True,
        'check_duplicate_names': True,
        'report_errors': True,
    }
    taxonomy.validate(taxa_by_genome, **checks)

    self.logger.info('Finished performing validation tests.')
def run(self, gtdb_bac_taxonomy_file, gtdb_ar_taxonomy_file, silva_ssu_ref, silva_lsu_ref, ssu_blast_table, lsu_blast_table, output_dir):
    """Create tables assigning GTDB taxonomy to SILVA accessions.

    The bacterial and archaeal GTDB taxonomies are merged, the SILVA
    annotation of every SSU and LSU reference sequence is read, and both
    BLAST tables are handed to ``self._parse_blast_table``, which is
    given ``ssu_silva.tsv`` / ``lsu_silva.tsv`` in ``output_dir`` as
    output paths. The output directory is created when missing.
    """
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    # read GTDB taxonomy
    print('Reading GTDB taxonomy.')
    gtdb_bac_taxonomy = Taxonomy().read(gtdb_bac_taxonomy_file)
    gtdb_ar_taxonomy = Taxonomy().read(gtdb_ar_taxonomy_file)

    # archaeal entries take precedence when an id occurs in both files
    gtdb_taxonomy = gtdb_bac_taxonomy.copy()
    gtdb_taxonomy.update(gtdb_ar_taxonomy)

    for message, group in (
            ('Identified %d bacterial genomes to process.', gtdb_bac_taxonomy),
            ('Identified %d archaeal genomes to process.', gtdb_ar_taxonomy),
            ('Identified %d genomes to process.', gtdb_taxonomy)):
        print(message % len(group))

    # read SILVA taxonomy
    print('Reading SILVA 16S and 23S rRNA taxonomies.')
    silva_ssu_taxonomy = {
        seq_id: annotation
        for seq_id, _seq, annotation in seq_io.read_seq(silva_ssu_ref,
                                                        keep_annotation=True)
    }
    silva_lsu_taxonomy = {
        seq_id: annotation
        for seq_id, _seq, annotation in seq_io.read_seq(silva_lsu_ref,
                                                        keep_annotation=True)
    }

    # parse BLAST tables
    print('Parsing BLAST tables.')
    ssu_table = os.path.join(output_dir, 'ssu_silva.tsv')
    self._parse_blast_table(ssu_blast_table,
                            gtdb_taxonomy,
                            silva_ssu_taxonomy,
                            self.min_ssu_len,
                            ssu_table)

    lsu_table = os.path.join(output_dir, 'lsu_silva.tsv')
    self._parse_blast_table(lsu_blast_table,
                            gtdb_taxonomy,
                            silva_lsu_taxonomy,
                            self.min_lsu_len,
                            lsu_table)
def __init__(self, genome_id, taxonomy):
    """Set up empty hit and fragment bookkeeping for one genome.

    Parameters
    ----------
    genome_id : str
        Unique id of genome
    taxonomy : dict[ref_genome_id] -> [domain, phylum, ..., species]
        Taxonomic assignment of each reference genome.
    """
    self.genome_id = genome_id
    self.taxonomy = taxonomy

    # NOTE(review): presumably the minimum fraction of fragments needed
    # to classify a sequence - confirm against the code that reads it
    self.percent_to_classify = 0.2
    self.unclassified = 'unclassified'

    self.rank_prefixes = Taxonomy().rank_prefixes
    self.rank_labels = Taxonomy().rank_labels

    # record types for per-taxon summaries and individual hits
    self.TaxaInfo = namedtuple(
        'TaxaInfo',
        'evalue perc_identity aln_length num_seqs num_basepairs')
    self.HitInfo = namedtuple('HitInfo', 'evalue perc_identity aln_length')

    # track hits at each rank: dict[contig_id][rank][taxa] -> [HitInfo, ...]
    self.hits = defaultdict(lambda: defaultdict(lambda: defaultdict(list)))

    # total fragments from genome
    self.total_fragments = 0

    # length of contigs
    self.seq_len = {}

    # number of fragments from each sequence
    self.fragments_from_seq = {}
def __init__(self):
    """Record the highest alphabetic suffix used for each taxon so far.

    Every ``.tsv`` file whose name contains 'gtdb' in the previous
    taxonomy directory is read and grouped by the first two
    underscore-separated fields of its file name. For each taxon name
    carrying a ``_<suffix>``, ``self.taxon_suffix`` maps the suffix-free
    name to the suffix with the largest ``self._suffix_value`` seen.
    """
    self.logger = logging.getLogger('timestamp')

    self.prev_taxonomy_dir = '/srv/projects/gtdb/data/taxonomy_gtdb'

    # get all previous taxonomy files
    self.logger.info('Reading previous GTDB taxonomy files in {}:'.format(
        self.prev_taxonomy_dir))
    taxonomies = defaultdict(dict)
    for file_name in os.listdir(self.prev_taxonomy_dir):
        if not (file_name.endswith('.tsv') and 'gtdb' in file_name):
            continue

        self.logger.info(' %s' % file_name)

        release_id = '_'.join(file_name.split('_')[0:2])
        file_path = os.path.join(self.prev_taxonomy_dir, file_name)
        taxonomies[release_id].update(Taxonomy().read(file_path))

    self.logger.info(
        'Considering taxonomy from {:,} previous releases.'.format(
            len(taxonomies)))

    # get highest alphabetic suffix for each taxon
    self.logger.info(
        'Determining highest polyphyletic alphabetic suffix for each taxon.'
    )

    self.taxon_suffix = {}
    for release in taxonomies.values():
        for lineage in release.values():
            for taxon in lineage:
                rank_prefix, name = taxon[0:3], taxon[3:]
                if '_' not in name:
                    continue

                if rank_prefix == 's__':
                    # for species, only a suffix on the specific name counts
                    generic_name, specific_name = name.split()
                    if '_' not in specific_name:
                        continue

                    specific_name, suffix = specific_name.rsplit('_', 1)
                    name = '{} {}'.format(generic_name, specific_name)
                else:
                    name, suffix = name.rsplit('_', 1)

                canonical_taxon = '{}{}'.format(rank_prefix, name)
                best_so_far = self.taxon_suffix.get(canonical_taxon, 'A')

                if self._suffix_value(suffix) >= self._suffix_value(best_so_far):
                    self.taxon_suffix[canonical_taxon] = suffix
def root(self, options):
    """Root tree using outgroup.

    The outgroup consists of every genome in ``Config.TAXONOMY_FILE``
    whose taxonomy contains ``options.outgroup_taxon``. The rerooted
    version of ``options.input_tree`` is written to
    ``options.output_tree`` by ``RerootTree.root_with_outgroup``.
    """
    self.logger.warning("Tree rooting is still under development!")

    check_file_exists(options.input_tree)

    gtdb_taxonomy = Taxonomy().read(Config.TAXONOMY_FILE)

    self.logger.info('Identifying genomes from the specified outgroup.')
    # .items() iterates the same pairs under Python 2 and Python 3
    outgroup = set(genome_id
                   for genome_id, taxa in gtdb_taxonomy.items()
                   if options.outgroup_taxon in taxa)

    reroot = RerootTree()
    reroot.root_with_outgroup(options.input_tree,
                              options.output_tree,
                              outgroup)

    self.logger.info('Done.')
def taxon_stats(self, options):
    """Write the number of descendants of each taxon at every lower rank.

    One row is written to ``options.output_file`` per taxon found in
    ``Taxonomy.taxon_children``, grouped by rank prefix and sorted by
    name within a rank. Columns above the taxon's own rank contain '-';
    the remaining columns hold the size of each successive generation of
    children.
    """
    check_file_exists(options.taxonomy_file)

    taxonomy = Taxonomy().read(options.taxonomy_file)
    taxon_children = Taxonomy().taxon_children(taxonomy)

    species_index = Taxonomy.rank_index['s__']

    with open(options.output_file, 'w') as fout:
        fout.write('Taxa')
        for rank in Taxonomy.rank_labels[1:]:
            fout.write('\t# named %s' % rank)
        fout.write('\t# extant taxon with complete taxonomy')
        fout.write('\n')

        for rank_prefix in Taxonomy.rank_prefixes:
            rank_idx = Taxonomy.rank_index[rank_prefix]

            # find taxon at the specified rank
            cur_taxa = sorted(taxon for taxon in taxon_children
                              if taxon.startswith(rank_prefix))

            for taxon in cur_taxa:
                fout.write(taxon)
                fout.write('\t-' * rank_idx)

                # descend one generation per remaining column
                next_taxa = [taxon]
                for _ in range(rank_idx, species_index + 1):
                    children_taxa = set()
                    for t in next_taxa:
                        children_taxa.update(taxon_children[t])

                    fout.write('\t%d' % len(children_taxa))
                    next_taxa = children_taxa

                fout.write('\n')

    self.logger.info('Summary statistics written to: %s' % options.output_file)
def _write_summary_table(self, fmeasure_for_taxa, taxonomy, summary_table):
    """Write table containing statistics for each taxonomic rank.

    A taxon counts as monophyletic when its F-measure is exactly 1.0, as
    operationally monophyletic when it is at least 0.95, and as
    polyphyletic otherwise. The process exits if any taxon has more than
    one placement.

    Parameters
    ----------
    fmeasure_for_taxa : d[taxon] -> [(Node, F-measure, precision, recall)]
      Node with highest F-measure for each taxon.
    taxonomy : d[unique_id] -> [d__<taxon>; ...; s__<taxon>]
      Taxonomic information for taxa in tree of interest (not read here).
    summary_table : str
      Output table to write statistics for assigned labels.
    """
    # get number of monophyletic, operationally monophyletic, and
    # polyphyletic taxa at each taxonomic rank
    taxon_count = defaultdict(int)
    mono = defaultdict(int)
    op_mono = defaultdict(int)
    poly = defaultdict(int)
    for taxon in Taxonomy().sort_taxa(fmeasure_for_taxa.keys()):
        if len(fmeasure_for_taxa[taxon]) != 1:
            self.logger.error(
                'Multiple positions specified for taxon label.')
            sys.exit()

        rank_prefix = taxon[0:3]
        taxon_count[rank_prefix] += 1

        stat_table = fmeasure_for_taxa[taxon][0]
        if stat_table.fmeasure == 1.0:
            mono[rank_prefix] += 1
        elif stat_table.fmeasure >= 0.95:
            op_mono[rank_prefix] += 1
        else:
            poly[rank_prefix] += 1

    with open(summary_table, 'w') as fout:
        fout.write('Rank\tNo. taxon')
        fout.write(
            '\tNo. monophyletic\tNo. operationally monophyletic\tNo. polyphyletic'
        )
        fout.write(
            '\tMonophyletic (%)\tOperationally monophyletic (%)\tPolyphyletic (%)\n'
        )

        for idx, rank_prefix in enumerate(Taxonomy.rank_prefixes):
            total = taxon_count[rank_prefix]
            counts = (mono[rank_prefix],
                      op_mono[rank_prefix],
                      poly[rank_prefix])

            # a rank without any labelled taxon would otherwise divide by
            # zero; report 0% for each category instead
            percentages = [c * 100.0 / total if total else 0.0
                           for c in counts]

            fout.write('{}\t{}'.format(Taxonomy.rank_labels[idx], total))
            fout.write('\t{}\t{}\t{}'.format(*counts))
            fout.write('\t{:.3f}\t{:.3f}\t{:.3f}\n'.format(*percentages))
def __init__(self, cpus, output_dir):
    """Store run settings and start with an empty set of profiles.

    Parameters
    ----------
    cpus : int
        Number of cpus to use.
    output_dir : str
        Directory to store results.
    """
    self.logger = logging.getLogger()

    self.cpus, self.output_dir = cpus, output_dir

    # rank metadata taken from the shared taxonomy definition
    self.rank_prefixes = Taxonomy().rank_prefixes
    self.rank_labels = Taxonomy().rank_labels

    # profile for each genome
    self.profiles = dict()
def fill_ranks(self, options):
    """Ensure taxonomy strings contain all 7 canonical ranks.

    Each entry of ``options.input_taxonomy`` is passed through
    ``Taxonomy.fill_missing_ranks`` and written to
    ``options.output_taxonomy``. The process exits with status -1 as soon
    as a filled string fails ``Taxonomy.check_full``.
    """
    check_file_exists(options.input_taxonomy)

    taxonomy = Taxonomy()
    # read the input before the output is opened for writing, so that a
    # failed read (or identical input/output paths) cannot leave a
    # truncated file behind
    t = taxonomy.read(options.input_taxonomy)

    with open(options.output_taxonomy, 'w') as fout:
        # .items() iterates the same pairs under Python 2 and Python 3
        for genome_id, taxon_list in t.items():
            full_taxon_list = taxonomy.fill_missing_ranks(taxon_list)

            taxonomy_str = ';'.join(full_taxon_list)
            if not taxonomy.check_full(taxonomy_str):
                sys.exit(-1)

            fout.write('%s\t%s\n' % (genome_id, taxonomy_str))

    self.logger.info('Revised taxonomy written to: %s' % options.output_taxonomy)
def fill_ranks(self, options):
    """Ensure taxonomy strings contain all 7 canonical ranks.

    Each entry of ``options.input_taxonomy`` is passed through
    ``Taxonomy.fill_missing_ranks`` and written to
    ``options.output_taxonomy``. The process exits with status -1 as soon
    as a filled string fails ``Taxonomy.check_full``.
    """
    check_file_exists(options.input_taxonomy)

    taxonomy = Taxonomy()
    # read the input before the output is opened for writing, so that a
    # failed read (or identical input/output paths) cannot leave a
    # truncated file behind
    t = taxonomy.read(options.input_taxonomy)

    # the context manager closes the file on the early-exit path too
    with open(options.output_taxonomy, 'w') as fout:
        for genome_id, taxon_list in t.items():
            full_taxon_list = taxonomy.fill_missing_ranks(taxon_list)

            taxonomy_str = ';'.join(full_taxon_list)
            if not taxonomy.check_full(taxonomy_str):
                sys.exit(-1)

            fout.write('%s\t%s\n' % (genome_id, taxonomy_str))

    self.logger.info('Revised taxonomy written to: %s' % options.output_taxonomy)
def _write_statistics_table(self, fmeasure_for_taxa, taxonomy, out_table):
    """Write table containing statistics for each taxon.

    One row is written per taxon, in the order given by
    ``Taxonomy.sort_taxa``. The process exits if any taxon has more than
    one placement.

    Parameters
    ----------
    fmeasure_for_taxa : d[taxon] -> [(Node, F-measure, precision, recall)]
      Node with highest F-measure for each taxon.
    taxonomy : d[unique_id] -> [d__<taxon>; ...; s__<taxon>]
      Taxonomic information for taxa in tree of interest.
    out_table : str
      Output table to write statistics for assigned labels.
    """
    # get extant taxa
    extant_taxa = Taxonomy().extant_taxa(taxonomy)

    # the context manager closes the table on the early-exit path too
    with open(out_table, 'w') as fout_table:
        fout_table.write('Taxon\tNo. Expected in Tree\tF-measure\tPrecision\tRecall')
        fout_table.write('\tNo. Genomes from Taxon\tNo. Genome In Lineage')
        fout_table.write('\tRogue out\tRogue in\n')

        for taxon in Taxonomy().sort_taxa(fmeasure_for_taxa.keys()):
            if len(fmeasure_for_taxa[taxon]) != 1:
                self.logger.error('Multiple positions specified for taxon label.')
                sys.exit()

            num_genomes = len(extant_taxa[taxon])

            stat_table = fmeasure_for_taxa[taxon][0]
            fout_table.write('%s\t%d\t%.4f\t%.4f\t%.4f\t%d\t%d\t%s\t%s\n' % (
                taxon,
                num_genomes,
                stat_table.fmeasure,
                stat_table.precision,
                stat_table.recall,
                stat_table.taxa_in_lineage,
                stat_table.num_leaves_with_taxa,
                ','.join(stat_table.rogue_out),
                ','.join(stat_table.rogue_in)))
def diff(self, options):
    """Print the differences between two taxonomy files at one rank.

    For every identifier present in both files, the taxa at
    ``options.rank`` are compared and printed when they differ. A
    difference involving an empty taxon (bare rank prefix) is only
    printed when ``options.report_missing_ranks`` is set. Identifiers
    missing from one file are printed when
    ``options.report_missing_taxa`` is set.
    """
    check_file_exists(options.input_taxonomy1)
    check_file_exists(options.input_taxonomy2)

    first = Taxonomy().read(options.input_taxonomy1)
    second = Taxonomy().read(options.input_taxonomy2)

    all_taxon_ids = set(first.keys()).union(list(second.keys()))

    rank_index = Taxonomy.rank_labels.index(options.rank)
    for taxon_id in all_taxon_ids:
        in_first = taxon_id in first
        in_second = taxon_id in second

        if options.report_missing_taxa:
            if not in_first:
                print('Missing in taxonomy 1: %s' % taxon_id)
            elif not in_second:
                print('Missing in taxonomy 2: %s' % taxon_id)

        if not (in_first and in_second):
            continue

        taxon1 = first[taxon_id][rank_index]
        taxon2 = second[taxon_id][rank_index]
        if taxon1 == taxon2:
            continue

        # text after the 3-character rank prefix is empty for unnamed taxa
        if options.report_missing_ranks or (taxon1[3:] and taxon2[3:]):
            print('Different taxon for %s: %s %s' % (taxon_id, taxon1, taxon2))

    print('Done.')
def _tax_diff_table(self, tax1, tax2, output_table):
    """Tabulate incongruency of taxonomy strings at each rank.

    For every named lineage of ``tax1`` above the species rank, one row
    is written giving, for each rank below the lineage's own rank, the
    percentage of its genomes whose taxon differs between ``tax1`` and
    ``tax2``. Ranks at or above the lineage's rank are shown as '-'.

    Parameters
    ----------
    tax1 : d[genome_id] -> [taxon, ...]
        Reference taxonomy defining the lineages.
    tax2 : d[genome_id] -> [taxon, ...]
        Taxonomy compared against ``tax1``; must contain every genome of
        each tabulated lineage.
    output_table : str
        Path of the table to write.
    """
    taxonomy = Taxonomy()

    with open(output_table, 'w') as fout:
        fout.write('Lineage\tNo. Extent Taxa')
        for rank_label in Taxonomy.rank_labels:
            fout.write('\t%s (%%)' % rank_label.title())
        fout.write('\n')

        named_lineages_at_rank = taxonomy.named_lineages_at_rank(tax1)
        for rank, taxa in named_lineages_at_rank.items():
            rank_label = Taxonomy.rank_labels[rank]
            if rank_label == 'species':
                continue

            extant_taxa_for_rank = taxonomy.extant_taxa_for_rank(rank_label, tax1)

            for taxon in taxa:
                genome_ids = extant_taxa_for_rank[taxon]
                fout.write('%s\t%d' % (taxon, len(genome_ids)))

                # per rank: one True/False per genome (taxa agree or not)
                row = defaultdict(list)
                for genome_id in genome_ids:
                    paired = zip(tax1[genome_id], tax2[genome_id])
                    for cur_rank, (taxon1, taxon2) in enumerate(paired):
                        row[cur_rank].append(taxon1 == taxon2)

                # sorted so columns always follow rank order
                for cur_rank, matches in sorted(row.items()):
                    if cur_rank <= rank:
                        fout.write('\t-')
                    else:
                        perc_match = sum(matches) * 100.0 / len(matches)
                        fout.write('\t%.1f' % (100.0 - perc_match))
                fout.write('\n')
def _tax_diff_table(self, tax1, tax2, output_table):
    """Tabulate incongruency of taxonomy strings at each rank.

    For every named lineage of ``tax1`` above the species rank, one row
    is written giving, for each rank below the lineage's own rank, the
    percentage of its genomes whose taxon differs between ``tax1`` and
    ``tax2``. Ranks at or above the lineage's rank are shown as '-'.

    Parameters
    ----------
    tax1 : d[genome_id] -> [taxon, ...]
        Reference taxonomy defining the lineages.
    tax2 : d[genome_id] -> [taxon, ...]
        Taxonomy compared against ``tax1``; must contain every genome of
        each tabulated lineage.
    output_table : str
        Path of the table to write.
    """
    taxonomy = Taxonomy()

    with open(output_table, 'w') as fout:
        fout.write('Lineage\tNo. Extent Taxa')
        for rank_label in Taxonomy.rank_labels:
            fout.write('\t%s (%%)' % rank_label.title())
        fout.write('\n')

        named_lineages_at_rank = taxonomy.named_lineages_at_rank(tax1)
        # .items() iterates the same pairs under Python 2 and Python 3
        for rank, taxa in named_lineages_at_rank.items():
            rank_label = Taxonomy.rank_labels[rank]
            if rank_label == 'species':
                continue

            extant_taxa_for_rank = taxonomy.extant_taxa_for_rank(rank_label, tax1)

            for taxon in taxa:
                genome_ids = extant_taxa_for_rank[taxon]
                fout.write('%s\t%d' % (taxon, len(genome_ids)))

                # per rank: one True/False per genome (taxa agree or not)
                row = defaultdict(list)
                for genome_id in genome_ids:
                    paired = zip(tax1[genome_id], tax2[genome_id])
                    for cur_rank, (taxon1, taxon2) in enumerate(paired):
                        row[cur_rank].append(taxon1 == taxon2)

                # sorted so columns follow rank order regardless of how
                # the interpreter orders dictionary keys
                for cur_rank, matches in sorted(row.items()):
                    if cur_rank <= rank:
                        fout.write('\t-')
                    else:
                        perc_match = sum(matches) * 100.0 / len(matches)
                        fout.write('\t%.1f' % (100.0 - perc_match))
                fout.write('\n')
def classify_seqs(self):
    """Classify sequences.

    Sequences are classified using a majority vote
    over all fragments originating from the sequence
    with a valid hit. If less than 20% of fragments have
    a valid hit, the sequence is considered unclassified.
    Classification is performed from the highest (domain)
    to lowest (species) rank. If a rank is taxonomically
    inconsistent with a higher ranks classification, this
    rank and all lower ranks are set to unclassified.

    Every rank of every sequence receives an entry: as soon as a rank
    has no hits, falls below the fragment threshold, or is inconsistent
    with its parent, that rank and all lower ranks are marked
    unclassified.

    Returns
    -------
    dict : d[contig_id][rank] -> [taxa, HitInfo]
        Classification of each sequence along with summary statistics
        of hits to the specified taxa.
    """
    expected_parent = Taxonomy().taxonomic_consistency(self.taxonomy)
    num_ranks = len(self.rank_prefixes)

    # classify each sequence using a majority vote
    seq_assignments = defaultdict(lambda: defaultdict(list))
    for seq_id, rank_hits in self.hits.items():
        min_hits = self.percent_to_classify * self.fragments_from_seq[seq_id]

        parent_taxa = None
        for rank in range(num_ranks):
            taxa_hits = rank_hits[rank]

            # taxon with the most hits at this rank, if there are any
            taxa = None
            if taxa_hits:
                taxa = max(taxa_hits, key=lambda x: len(taxa_hits[x]))

            classified = (taxa is not None
                          and len(taxa_hits[taxa]) >= min_hits
                          and (rank == 0
                               or expected_parent[taxa] == parent_taxa))

            if not classified:
                # set to unclassified at this and all lower ranks
                for r in range(rank, num_ranks):
                    seq_assignments[seq_id][r] = [self.unclassified, None]
                break

            seq_assignments[seq_id][rank] = [taxa, taxa_hits[taxa]]
            parent_taxa = taxa

    # identify sequences with no hits
    for seq_id in self.seq_len:
        if seq_id not in seq_assignments:
            for rank in range(num_ranks):
                seq_assignments[seq_id][rank] = [self.unclassified, None]

    return seq_assignments
def outgroup(self, options):
    """Reroot a tree on the genomes belonging to a given taxon.

    The outgroup is every genome in ``options.taxonomy_file`` whose
    taxonomy contains ``options.outgroup_taxon``. The rerooted version of
    ``options.input_tree`` is written to ``options.output_tree`` by
    ``RerootTree.root_with_outgroup``.
    """
    check_file_exists(options.taxonomy_file)

    self.logger.info('Identifying genomes from the specified outgroup.')
    taxonomy = Taxonomy().read(options.taxonomy_file)
    outgroup = {
        genome_id
        for genome_id, taxa in taxonomy.items()
        if options.outgroup_taxon in taxa
    }
    self.logger.info('Identifying %d genomes in the outgroup.' % len(outgroup))

    RerootTree().root_with_outgroup(options.input_tree,
                                    options.output_tree,
                                    outgroup)
def validate(self, options):
    """Validate command.

    Reads options.taxonomy_file, runs Taxonomy.validate on it and logs
    either a clean bill of health or the number of problems found in each
    of the four returned error collections.
    """

    check_file_exists(options.taxonomy_file)

    validator = Taxonomy()
    parsed_taxonomy = validator.read(options.taxonomy_file)

    # Flags are passed positionally. NOTE(review): presumably prefixes,
    # ranks, hierarchy, species, then error reporting -- confirm against
    # the Taxonomy.validate signature in use.
    errors = validator.validate(parsed_taxonomy,
                                not options.no_prefix,
                                not options.no_all_ranks,
                                not options.no_hierarhcy,
                                not options.no_species,
                                True)
    invalid_ranks, invalid_prefixes, invalid_species_name, invalid_hierarchies = errors

    if not any(len(e) for e in errors):
        self.logger.info('No errors identified in taxonomy file.')
        return

    report = (
        ('Identified %d incomplete taxonomy strings.', invalid_ranks),
        ('Identified %d rank prefix errors.', invalid_prefixes),
        ('Identified %d invalid species names.', invalid_species_name),
        ('Identified %d taxa with multiple parents.', invalid_hierarchies),
    )
    for message, offenders in report:
        self.logger.info(message % len(offenders))
def create_records(self, metadata_file, msa_file, taxonomy_file, genome_list, output_file):
    """Create ARB records from GTDB metadata.

    Parameters
    ----------
    metadata_file : str
        Delimited metadata table. Parsed as tab-separated when the name
        ends in '.tsv' and as comma-separated otherwise. The first row is
        the header and the first column of each row is the genome ID.
    msa_file : str
        Optional sequence file read with seq_io.read; the sequence for a
        genome (or '' when absent) is passed along with its record.
    taxonomy_file : str
        Optional taxonomy file. When it yields a non-empty taxonomy, a
        'gtdb_taxonomy' field is appended to every record.
    genome_list : str
        Optional file with one genome ID per line. When given, only these
        genomes are written.
    output_file : str
        File the records are written to.

    Raises
    ------
    KeyError
        If a genome that is to be written has no entry in the taxonomy.
    """

    seqs = seq_io.read(msa_file) if msa_file else {}
    taxonomy = Taxonomy().read(taxonomy_file) if taxonomy_file else {}

    genomes_to_keep = set()
    if genome_list:
        with open(genome_list) as genome_fh:
            for line in genome_fh:
                genomes_to_keep.add(line.strip())

    delimiter = '\t' if metadata_file.endswith('.tsv') else ','

    # 'rb' is the mode the Python 2 csv module expects for its input
    with open(metadata_file, 'rb') as fin, open(output_file, 'w') as fout:
        reader = csv.reader(fin, delimiter=delimiter)

        header = next(reader, None)
        if header is None:
            return  # empty metadata file, so there is nothing to write

        fields = [name.lower().replace(' ', '_').replace('-', '_')
                  for name in header[1:]]
        if taxonomy:
            fields.append('gtdb_taxonomy')

        for row in reader:
            if not row:
                continue  # blank line in the metadata table

            genome_id = row[0]

            # filter first so genomes that are not written do not need
            # to be present in the taxonomy
            if genomes_to_keep and genome_id not in genomes_to_keep:
                continue

            values = row[1:]
            if taxonomy:
                values.append('; '.join(taxonomy[genome_id]))

            aligned_seq = seqs.get(genome_id, '')
            self._record(fout, genome_id, fields, values, aligned_seq)
def dump(self, genomic_file, gtdb_taxonomy, min_5S_len, min_16S_ar_len, min_16S_bac_len, min_23S_len, min_contig_len, include_user, genome_list, output_dir):
    """Dump 5S, 16S, and 23S sequences to files.

    Parameters
    ----------
    genomic_file : str
        Passed unchanged to self._dump_seqs.
    gtdb_taxonomy : str
        Taxonomy file; read with Taxonomy().read before use.
    min_5S_len, min_16S_ar_len, min_16S_bac_len, min_23S_len : int
        Minimum gene lengths forwarded to self._dump_seqs. The 5S and 23S
        thresholds are each used for both length arguments; 16S has
        separate values (presumably archaeal and bacterial -- confirm
        against _dump_seqs).
    min_contig_len : int
        Forwarded to self._dump_seqs.
    include_user : bool
        Not supported; the program exits with status -1 when set.
    genome_list : str
        Optional tab-separated file whose first column holds genome IDs
        to restrict the dump to. IDs starting with 'GCA_' or 'GCF_' are
        prefixed with 'GB_' or 'RS_', respectively.
    output_dir : str
        Forwarded to self._dump_seqs.
    """

    if include_user:
        self.logger.warning('User genomes not currently supported.')
        sys.exit(-1)

    gtdb_taxonomy = Taxonomy().read(gtdb_taxonomy)

    genomes_of_interest = set()
    if genome_list:
        with open(genome_list) as genome_fh:
            for line in genome_fh:
                gid = line.strip().split('\t')[0]
                if gid.startswith('GCA_'):
                    gid = 'GB_' + gid
                elif gid.startswith('GCF_'):
                    gid = 'RS_' + gid
                genomes_of_interest.add(gid)

        self.logger.info('Restricting gene dump to %d genomes.' % len(genomes_of_interest))

    # (label, source location, first min length, second min length, output name)
    gene_sets = (
        ('5S', 'lsu_5S/lsu_5S', min_5S_len, min_5S_len, 'lsu_5s'),
        ('16S', 'rna_silva/ssu', min_16S_ar_len, min_16S_bac_len, 'ssu'),
        ('23S', 'rna_silva/lsu_23S', min_23S_len, min_23S_len, 'lsu_23s'),
    )
    for label, location, min_len_a, min_len_b, out_name in gene_sets:
        self.logger.info('Dumping %s sequences.' % label)
        self._dump_seqs(genomic_file,
                        gtdb_taxonomy,
                        genomes_of_interest,
                        location,
                        min_len_a,
                        min_len_b,
                        min_contig_len,
                        out_name,
                        output_dir)
def propagate(self, options):
    """Propagate labels to all genomes in a cluster.

    Reads the taxonomy in options.input_taxonomy and the
    'gtdb_representative' / 'gtdb_clustered_genomes' metadata fields from
    options.metadata_file. For each representative, the taxonomy of the
    genomes clustered with it is compared rank by rank against the
    representative's taxonomy:

    - where the representative has no name at a rank and the clustered
      genomes agree on one, that name is adopted;
    - where a clustered genome names a different taxon than the
      representative, the cluster is reported as incongruent and the
      representative's taxonomy is used unchanged.

    The resulting taxonomy string is assigned to the representative and
    every genome in its cluster, and the expanded taxonomy is written to
    options.output_taxonomy as '<genome id>\\t<taxonomy string>' lines.
    """

    check_file_exists(options.input_taxonomy)
    check_file_exists(options.metadata_file)

    # get representative genome information
    rep_metadata = read_gtdb_metadata(options.metadata_file, ['gtdb_representative',
                                                              'gtdb_clustered_genomes'])

    explict_tax = Taxonomy().read(options.input_taxonomy)
    rank_prefixes = Taxonomy.rank_prefixes

    expanded_taxonomy = {}
    incongruent_count = 0
    for genome_id, taxon_list in explict_tax.iteritems():
        taxonomy_str = ';'.join(taxon_list)

        # Propagate taxonomy strings if genome is a representative. Also,
        # determine if genomes clustered together have compatible
        # taxonomies. Note that a genome may not have metadata as it is
        # possible a User has removed a genome that is in the provided
        # taxonomy file.
        _rep_genome, clustered_genomes = rep_metadata.get(genome_id, (None, None))

        if not clustered_genomes:
            # genome is a singleton unless it has already been assigned
            # a taxonomy based on its representative
            if genome_id not in expanded_taxonomy:
                expanded_taxonomy[genome_id] = taxonomy_str
            continue

        # genome is a representative
        clustered_genome_ids = clustered_genomes.split(';')

        # get taxonomy of all genomes in cluster with a specified taxonomy
        clustered_genome_tax = {}
        for cluster_genome_id in clustered_genome_ids:
            if cluster_genome_id == genome_id:
                continue

            if cluster_genome_id not in rep_metadata:
                continue  # genome is no longer in the GTDB so ignore it

            if cluster_genome_id in explict_tax:
                clustered_genome_tax[cluster_genome_id] = explict_tax[cluster_genome_id]

        # determine if representative and clustered genome taxonomy
        # strings are congruent
        working_cluster_taxonomy = list(taxon_list)
        for cluster_genome_id, cluster_tax in clustered_genome_tax.iteritems():
            incongruent_with_rep = False
            for r in xrange(0, len(rank_prefixes)):
                if cluster_tax[r] == rank_prefixes[r]:
                    break  # no more taxonomy information to consider

                if cluster_tax[r] == taxon_list[r]:
                    continue

                if taxon_list[r] == rank_prefixes[r]:
                    # clustered genome has a more specific taxonomy string
                    # which should be propagated to the representative if
                    # all clustered genomes are in agreement
                    if working_cluster_taxonomy[r] == rank_prefixes[r]:
                        # make taxonomy more specific based on genomes in cluster
                        working_cluster_taxonomy[r] = cluster_tax[r]
                    elif working_cluster_taxonomy[r] != cluster_tax[r]:
                        # not all genomes agree on the assignment of this
                        # rank so leave it unspecified
                        working_cluster_taxonomy[r] = rank_prefixes[r]
                        break
                else:
                    # genomes in cluster have incongruent taxonomies so
                    # defer to representative
                    self.logger.warning("Genomes in cluster have incongruent taxonomies.")
                    self.logger.warning("Representative %s: %s" % (genome_id, taxonomy_str))
                    self.logger.warning("Clustered genome %s: %s" % (cluster_genome_id, ';'.join(cluster_tax)))
                    self.logger.warning("Deferring to taxonomy specified for representative.")

                    incongruent_count += 1
                    incongruent_with_rep = True
                    break

            if incongruent_with_rep:
                # Discard anything merged so far and default to the
                # representative's taxonomy. This is done immediately so
                # it also takes effect when the incongruent genome is the
                # last one examined.
                working_cluster_taxonomy = list(taxon_list)
                break

        cluster_taxonomy_str = ';'.join(working_cluster_taxonomy)

        # assign taxonomy to representative and all genomes in the cluster
        expanded_taxonomy[genome_id] = cluster_taxonomy_str
        for cluster_genome_id in clustered_genome_ids:
            expanded_taxonomy[cluster_genome_id] = cluster_taxonomy_str

    self.logger.info('Identified %d clusters with incongruent taxonomies.' % incongruent_count)

    with open(options.output_taxonomy, 'w') as fout:
        for genome_id, taxonomy_str in expanded_taxonomy.iteritems():
            fout.write('%s\t%s\n' % (genome_id, taxonomy_str))

    self.logger.info('Taxonomy written to: %s' % options.output_taxonomy)
def tax_diff(self, tax1_file, tax2_file, include_user_taxa, output_dir):
    """Tabulate differences between two taxonomies.

    Two tables are written to the output directory, both named after the
    first taxonomy file: '<name>.tax_diff.tsv' lists every rank of every
    shared genome that differs, and '<name>.tax_diff_summary.tsv' gives
    per-rank counts and percentages. The per-rank counts are also
    printed to standard output.

    Parameters
    ----------
    tax1_file : str
        First taxonomy file.
    tax2_file : str
        Second taxonomy file.
    include_user_taxa : boolean
        Flag indicating if User genomes (IDs starting with 'U_') should
        be considered.
    output_dir : str
        Output directory.
    """

    tax1 = Taxonomy().read(tax1_file)
    tax2 = Taxonomy().read(tax2_file)

    if not include_user_taxa:
        tax1 = dict((gid, taxa) for gid, taxa in tax1.iteritems()
                    if not gid.startswith('U_'))
        tax2 = dict((gid, taxa) for gid, taxa in tax2.iteritems()
                    if not gid.startswith('U_'))

    common_taxa = set(tax1.keys()).intersection(tax2.keys())

    self.logger.info('First taxonomy contains %d taxa.' % len(tax1))
    self.logger.info('Second taxonomy contains %d taxa.' % len(tax2))
    self.logger.info('Taxonomies have %d taxa in common.' % len(common_taxa))

    # identify differences between taxonomies
    tax_file_name1 = os.path.splitext(os.path.basename(tax1_file))[0]
    tax_file_name2 = os.path.splitext(os.path.basename(tax2_file))[0]

    unchanged = defaultdict(int)          # T2 = g__Bob -> T1 = g__Bob, or T2 = g__ -> T1 = g__
    active_change = defaultdict(int)      # T2 = g__Bob -> T1 = g__Jane, or T2 = g__Bob -> T1 = g__Bob_A
    passive_change = defaultdict(int)     # T2 = g__??? -> T1 = g__Jane
    unresolved_change = defaultdict(int)  # T2 = g__Box -> T1 = g__???

    output_table = os.path.join(output_dir, '%s.tax_diff.tsv' % tax_file_name1)
    with open(output_table, 'w') as fout:
        fout.write('Genome ID\tChange\tRank\t%s\t%s\n' % (tax_file_name1, tax_file_name2))

        for taxa in common_taxa:
            t1 = tax1[taxa]
            t2 = tax2[taxa]

            for rank, (taxon1, taxon2) in enumerate(zip(t1, t2)):
                if taxon1 == taxon2:
                    unchanged[rank] += 1
                    continue

                rank_prefix = Taxonomy.rank_prefixes[rank]
                if taxon1 != rank_prefix and taxon2 != rank_prefix:
                    change, counts = 'active', active_change
                elif taxon2 == rank_prefix:
                    change, counts = 'passive', passive_change
                else:
                    # the taxa differ and taxon2 is named, so taxon1
                    # must be the bare rank prefix
                    change, counts = 'unresolved', unresolved_change

                counts[rank] += 1
                fout.write('%s\t%s\t%s\t%s\t%s\n' % (taxa,
                                                     change,
                                                     Taxonomy.rank_labels[rank],
                                                     ';'.join(t1),
                                                     ';'.join(t2)))

    # report results
    output_table = os.path.join(output_dir, '%s.tax_diff_summary.tsv' % tax_file_name1)
    with open(output_table, 'w') as fout:
        fout.write('Rank\tUnchanged\tUnchanged (%)\tActive\t Active (%)\tPassive\tPassive (%)\tUnresolved\tUnresolved (%)\n')
        print('Rank\tUnchanged\tActive\tPassive\tUnresolved\tTotal')
        for rank in xrange(0, len(Taxonomy.rank_prefixes)):
            total = unchanged[rank] + active_change[rank] + passive_change[rank] + unresolved_change[rank]
            if total == 0:
                continue  # no shared genome has this rank; avoid dividing by zero

            fout.write('%s\t%d\t%.1f\t%d\t%.1f\t%d\t%.1f\t%d\t%.1f\n' %
                       (Taxonomy.rank_labels[rank],
                        unchanged[rank], unchanged[rank] * 100.0 / total,
                        active_change[rank], active_change[rank] * 100.0 / total,
                        passive_change[rank], passive_change[rank] * 100.0 / total,
                        unresolved_change[rank], unresolved_change[rank] * 100.0 / total))
            print('%s\t%d\t%d\t%d\t%d\t%d' % (Taxonomy.rank_labels[rank],
                                              unchanged[rank],
                                              active_change[rank],
                                              passive_change[rank],
                                              unresolved_change[rank],
                                              total))
def run(self, input_tree, taxonomy_file, trusted_taxa_file, min_children, min_support, output_tree):
    """Decorate internal nodes with taxa labels.

    Besides the decorated tree, a statistics table
    (output_tree + '-table') and the taxonomy of the extant taxa
    (output_tree + '-taxonomy') are written; the latter is read back and
    validated before the tree itself is written.

    Parameters
    ----------
    input_tree : str
        Tree to decorate
    taxonomy_file : str
        File indicating taxonomic information for extant taxa.
    trusted_taxa_file : str
        File specifying trusted taxa to consider when inferring
        distribution. Set to None to consider all taxa.
    min_children : int
        Only consider taxa with at least the specified number of children
        taxa when inferring distribution.
    min_support : float
        Only consider taxa with at least this level of support when
        inferring distribution.
    output_tree: str
        Name of output tree.
    """

    log = self.logger.info

    # read tree
    log('Reading tree.')
    tree = dendropy.Tree.get_from_path(input_tree,
                                       schema='newick',
                                       rooting='force-rooted',
                                       preserve_underscores=True)

    # remove any previous taxon labels
    log('Removing any previous internal node labels.')
    self._strip_taxon_labels(tree)

    # read taxonomy and trim to taxa in tree; leaves without an entry
    # fall back to the bare rank prefixes
    log('Reading taxonomy.')
    full_taxonomy = Taxonomy().read(taxonomy_file)
    taxonomy = dict((leaf.taxon.label,
                     full_taxonomy.get(leaf.taxon.label, Taxonomy.rank_prefixes))
                    for leaf in tree.leaf_node_iter())

    # find best placement for each taxon based
    # on the F-measure statistic
    log('Calculating F-measure statistic for each taxa.')
    fmeasure_for_taxa = self._fmeasure(tree, taxonomy)

    # place labels with only one acceptable position and calculate
    # the relative divergence thresholds from these as a guide for
    # placing the remaining labels
    log('Placing labels with unambiguous position in tree.')
    placed_taxon = self._assign_taxon_labels(fmeasure_for_taxa)

    # median relative divergence of each rank from the placed labels
    log('Establishing median relative divergence for taxonomic ranks.')
    median_rank_rd = self._median_rank_rd(tree,
                                          placed_taxon,
                                          taxonomy,
                                          trusted_taxa_file,
                                          min_children,
                                          min_support)

    # resolve ambiguous position in tree
    log('Resolving ambiguous taxon label placements using median relative divergences.')
    self._resolve_ambiguous_placements(tree, fmeasure_for_taxa, median_rank_rd)

    # write statistics for placed taxon labels
    log('Writing out statistics for taxa.')
    self._write_statistics_table(fmeasure_for_taxa, output_tree + '-table')

    # output taxonomy of extant taxa on tree
    log('Writing out taxonomy for extant taxa.')
    out_taxonomy = output_tree + '-taxonomy'
    self._write_taxonomy(tree, out_taxonomy)

    # validate taxonomy
    log('Validating taxonomy for extant taxa.')
    Taxonomy().validate(Taxonomy().read(out_taxonomy),
                        check_prefixes=True,
                        check_ranks=True,
                        check_hierarchy=True,
                        check_species=True,
                        check_group_names=True,
                        check_duplicate_names=True,
                        report_errors=True)

    # output decorated tree
    log('Writing out decorated tree.')
    tree.write_to_path(output_tree,
                       schema='newick',
                       suppress_rooting=True,
                       unquoted_underscores=True)
def filter_taxa_for_dist_inference(tree, taxonomy, trusted_taxa, min_children, min_support):
    """Determine taxa to use for inferring distribution of relative divergences.

    Parameters
    ----------
    tree : Dendropy Tree
        Phylogenetic tree.
    taxonomy : d[taxon ID] -> [d__x; p__y; ...]
        Taxonomy for each taxon.
    trusted_taxa : iterable
        Trusted taxa to consider when inferring distribution. An empty
        or None value disables this filter.
    min_children : int
        Only consider taxa with at least the specified number of children
        taxa when inferring distribution.
    min_support : float
        Only consider taxa with at least this level of support when
        inferring distribution.

    Returns
    -------
    set
        Taxa passing all of the requested filters.
    """

    # determine children taxa for each named group
    taxon_children = Taxonomy().taxon_children(taxonomy)

    # get all named groups
    taxa_for_dist_inference = set()
    for taxa in taxonomy.itervalues():
        taxa_for_dist_inference.update(taxa)

    # sanity check species names as these are a common problem
    species_index = Taxonomy.rank_index['s__']
    species = set()
    for taxon_id, taxa in taxonomy.iteritems():
        if len(taxa) <= species_index:
            continue

        species_name = taxa[species_index]
        if species_name != 's__':
            valid, error_msg = Taxonomy().validate_species_name(species_name,
                                                                require_full=True,
                                                                require_prefix=True)
            if not valid:
                print('[Warning] Species name %s for %s is invalid: %s' % (species_name, taxon_id, error_msg))
                continue

        species.add(species_name)

    # restrict taxa to those with a sufficient number of named children
    # Note: a taxonomic group with no children will not end up in the
    # taxon_children data structure so care must be taken when applying
    # this filtering criteria.
    if min_children > 0:
        valid_taxa = set(taxon
                         for taxon, children_taxa in taxon_children.iteritems()
                         if len(children_taxa) >= min_children)
        taxa_for_dist_inference.intersection_update(valid_taxa)

        # explicitly add in the species since they have no children
        # and are thus absent from the taxon_children dictionary
        taxa_for_dist_inference.update(species)

    # restrict taxa used for inferring distribution to those with sufficient support
    if min_support > 0:
        reported_missing_support = False
        for node in tree.preorder_node_iter():
            if not node.label or node.is_leaf():
                continue

            # check for support value
            support, taxon_name, _auxiliary_info = parse_label(node.label)
            if not taxon_name:
                continue

            # Test for an absent value explicitly so that a genuine
            # support of 0 is still compared against the threshold.
            if support is None or support == '':
                # no support value, so inform user (once) that they were
                # trying to filter on a property the tree does not have
                if not reported_missing_support:
                    print('[Error] Tree does not contain support values. As such, --min_support should be set to 0.')
                    reported_missing_support = True
                continue

            if float(support) < min_support:
                # a node label may name several taxa separated by ';'
                # (e.g. 'p__X; c__Y'), so remove each of them
                taxa_for_dist_inference.difference_update(
                    t.strip() for t in taxon_name.split(';'))

    # restrict taxa used for inferring distribution to the trusted set
    if trusted_taxa:
        taxa_for_dist_inference = set(trusted_taxa).intersection(taxa_for_dist_inference)

    return taxa_for_dist_inference
def tree_tax_diff(self, tree1_file, tree2_file, output_dir):
    """Tabulate differences between two taxonomies on a tree.

    Both trees are pruned to their shared leaves. Each named lineage of
    the first tree (all ranks except the last) is then classified against
    the second tree as retained, corrected, more general, more specific
    or new, and lineages of the second tree left unaccounted for at
    their rank are reported as deprecated. Results go to
    '<tree1 name>.taxa_diff.tsv' and, via self._tax_diff_table, to
    '<tree1 name>.perc_diff.tsv' in the output directory.

    Parameters
    ----------
    tree1_file : str
        File with tree in Newick format.
    tree2_file : str
        File with tree in Newick format.
    output_dir : str
        Output directory.
    """

    tree1 = dendropy.Tree.get_from_path(tree1_file,
                                        schema='newick',
                                        rooting='force-rooted',
                                        preserve_underscores=True)
    tree2 = dendropy.Tree.get_from_path(tree2_file,
                                        schema='newick',
                                        rooting='force-rooted',
                                        preserve_underscores=True)

    # prune both trees to a set of common taxa
    taxa1 = set(leaf.taxon.label for leaf in tree1.leaf_node_iter())
    taxa2 = set(leaf.taxon.label for leaf in tree2.leaf_node_iter())
    taxa_in_common = taxa1.intersection(taxa2)

    self.logger.info('Tree 1 contains %d taxa.' % len(taxa1))
    self.logger.info('Tree 2 contains %d taxa.' % len(taxa2))
    self.logger.info('Pruning trees to the %d taxa in common.' % len(taxa_in_common))

    tree1.retain_taxa_with_labels(taxa_in_common)
    tree2.retain_taxa_with_labels(taxa_in_common)

    # get named lineages at each taxonomic rank
    taxonomy = Taxonomy()
    tax1 = taxonomy.read_from_tree(tree1)
    tax2 = taxonomy.read_from_tree(tree2)

    taxa_at_rank1 = taxonomy.named_lineages_at_rank(tax1)
    taxa_at_rank2 = taxonomy.named_lineages_at_rank(tax2)

    # identify retained taxonomic names
    tax_file_name = os.path.splitext(os.path.basename(tree1_file))[0]
    output_file = os.path.join(output_dir, '%s.taxa_diff.tsv' % tax_file_name)
    row = '%s\t%s\t%s\t%s\n'
    ranks_to_compare = Taxonomy.rank_labels[0:-1]

    fout = open(output_file, 'w')
    fout.write('Rank\tClassification\tTaxonomy 1\tTaxonomy 2\n')

    taxon2_accounted_for = defaultdict(set)
    for rank, rank_label in enumerate(ranks_to_compare):
        for taxon in taxa_at_rank1[rank]:
            # check if taxon has been retained
            if taxon in taxa_at_rank2[rank]:
                fout.write(row % (rank_label, 'retained', taxon, taxon))
                taxon2_accounted_for[rank].add(taxon)
                continue

            # check if name was simply corrected by changing suffix
            old_taxon = self._change_suffix(taxon, rank, taxa_at_rank2)
            if old_taxon:
                fout.write(row % (rank_label, 'corrected', taxon, old_taxon))
                taxon2_accounted_for[rank].add(old_taxon)
                continue

            # check if taxon has been moved up or down in rank
            old_taxon, old_rank = self._renamed(taxon, rank, taxa_at_rank2)
            if old_taxon:
                if rank < old_rank:
                    classification = 'more general'
                elif rank == old_rank:
                    classification = 'corrected'
                else:
                    classification = 'more specific'

                fout.write(row % (rank_label, classification, taxon, old_taxon))
                taxon2_accounted_for[old_rank].add(old_taxon)
                continue

            # otherwise, the taxon appears to be new
            fout.write(row % (rank_label, 'new', taxon, 'NA'))

    # report deprecated taxa
    for rank, rank_label in enumerate(ranks_to_compare):
        accounted_for = taxon2_accounted_for[rank]
        for taxon in taxa_at_rank2[rank]:
            if taxon not in accounted_for:
                fout.write(row % (rank_label, 'deprecated', 'NA', taxon))

    fout.close()

    # tabulate congruence of taxonomy strings
    output_table = os.path.join(output_dir, '%s.perc_diff.tsv' % tax_file_name)
    self._tax_diff_table(tax1, tax2, output_table)