def manual_species(self, init_taxonomy, manually_curated_tree):
    """Identify species names manually set by curators.

    The specific epithet of every genome in the manually-curated tree is
    compared with the epithet of the same genome in the initial taxonomy.
    Genomes whose epithet differs are written to
    'manual_species_names.tsv' in self.output_dir.

    Parameters
    ----------
    init_taxonomy : str
        Initial taxonomy, in the form accepted by Taxonomy().read()
        (presumably a file path).
    manually_curated_tree : str
        Path to the manually-curated tree in Newick format.
    """

    # read initial and manually curated taxonomy
    self.logger.info('Reading initial species names.')
    init_taxonomy = Taxonomy().read(init_taxonomy, use_canonical_gid=True)
    init_num_gids = sum(
        1 for gid in init_taxonomy if not gid.startswith('D-'))
    self.logger.info(
        ' - read taxonomy for {:,} genomes.'.format(init_num_gids))

    self.logger.info('Reading manually-curated species names from tree.')
    mc_tree = dendropy.Tree.get_from_path(manually_curated_tree,
                                          schema='newick',
                                          rooting='force-rooted',
                                          preserve_underscores=True)
    mc_taxonomy = Taxonomy().read_from_tree(mc_tree)

    mc_specific = {}
    for gid, taxa in mc_taxonomy.items():
        # identifiers starting with 'D-' are excluded from all counts
        # and comparisons
        if gid.startswith('D-'):
            continue

        mc_sp = taxa[-1]
        if not mc_sp.startswith('s__') or mc_sp == 's__':
            # the most specific label in the tree must be a named species
            self.logger.error(
                'Most specific classification for {} is {}.'.format(
                    gid, taxa))
            continue

        mc_specific[gid] = specific_epithet(mc_sp)

    self.logger.info(' - read taxonomy for {:,} genomes.'.format(
        len(mc_specific)))

    # report genomes with modified specific name assignment
    self.logger.info(
        'Identifying genomes with manually-curated species names.')

    num_mc = 0
    out_path = os.path.join(self.output_dir, 'manual_species_names.tsv')

    # the context manager guarantees the table is flushed and closed even
    # if a genome is missing from the initial taxonomy and a KeyError
    # propagates out of the loop
    with open(out_path, 'w') as fout:
        fout.write('Genome ID\tInitial species\tManually-curated species\n')
        for gid, mc_sp in mc_specific.items():
            init_species = init_taxonomy[gid][Taxonomy.SPECIES_INDEX]
            init_specific = specific_epithet(init_species)

            if init_specific != mc_sp:
                # build the curated binomial from the curated genus
                mc_generic = mc_taxonomy[gid][Taxonomy.GENUS_INDEX].replace(
                    'g__', '')
                mc_species = 's__{} {}'.format(mc_generic, mc_sp)
                num_mc += 1
                fout.write('{}\t{}\t{}\n'.format(gid, init_species,
                                                 mc_species))

    self.logger.info(
        ' - identified {:,} manually-curated species names.'.format(
            num_mc))
def pull(self, options):
    """Pull command.

    Reads the taxonomy encoded in options.input_tree and writes it to
    options.output_file. Unless options.no_rank_fill is set, every
    taxonomy string is first passed through
    Taxonomy().fill_missing_ranks().
    """

    check_file_exists(options.input_tree)

    tree_taxonomy = Taxonomy().read_from_tree(options.input_tree)

    fill_ranks = not options.no_rank_fill
    if fill_ranks:
        # snapshot the keys so entries can be replaced while looping
        for taxon_id in list(tree_taxonomy):
            filled = Taxonomy().fill_missing_ranks(tree_taxonomy[taxon_id])
            tree_taxonomy[taxon_id] = filled

    Taxonomy().write(tree_taxonomy, options.output_file)

    written_msg = 'Taxonomy strings written to: %s' % options.output_file
    self.logger.info(written_msg)
def filter_taxa_for_dist_inference(tree, taxonomy, trusted_taxa, min_children, min_support):
    """Determine taxa to use for inferring distribution of relative divergences.

    Parameters
    ----------
    tree : Dendropy Tree
        Phylogenetic tree.
    taxonomy : d[taxon ID] -> [d__x; p__y; ...]
        Taxonomy for each taxon.
    trusted_taxa : iterable
        Trusted taxa to consider when inferring distribution. Any iterable
        is accepted; an empty or None value disables this filter.
    min_children : int
        Only consider taxa with at least the specified number of
        children taxa when inferring distribution.
    min_support : float
        Only consider taxa with at least this level of support when
        inferring distribution.

    Returns
    -------
    set
        Names of the taxa passing all filters.
    """

    # determine children taxa for each named group
    taxon_children = Taxonomy().taxon_children(taxonomy)

    # get all named groups
    taxa_for_dist_inference = set()
    for taxa in taxonomy.values():
        taxa_for_dist_inference.update(taxa)

    # sanity check species names as these are a common problem
    species_index = Taxonomy.rank_index['s__']
    species = set()
    for taxon_id, taxa in taxonomy.items():
        if len(taxa) <= species_index:
            continue

        species_name = taxa[species_index]
        valid, error_msg = True, None
        if species_name != 's__':
            valid, error_msg = Taxonomy().validate_species_name(species_name,
                                                                require_full=True,
                                                                require_prefix=True)

        if not valid:
            print('[Warning] Species name %s for %s is invalid: %s' % (species_name, taxon_id, error_msg))
            continue

        species.add(species_name)

    # restrict taxa to those with a sufficient number of named children
    # Note: a taxonomic group with no children will not end up in the
    # taxon_children data structure so care must be taken when applying
    # this filtering criteria.
    if min_children > 0:
        valid_taxa = {taxon
                      for taxon, children_taxa in taxon_children.items()
                      if len(children_taxa) >= min_children}
        taxa_for_dist_inference.intersection_update(valid_taxa)

        # explicitly add in the species since they have no
        # children and thus would be absent from the taxon_children dictionary
        taxa_for_dist_inference.update(species)

    # restrict taxa used for inferring distribution to those with sufficient support
    if min_support > 0:
        reported_missing_support = False
        for node in tree.preorder_node_iter():
            if not node.label or node.is_leaf():
                continue

            # check for support value
            support, taxon_name, _auxiliary_info = parse_label(node.label)

            if not taxon_name:
                continue

            # A support of 0 is a legitimate (very low) value and must not
            # be confused with a missing value, so test for absence
            # explicitly instead of relying on truthiness.
            if support is None or support == '':
                # no support value, so inform user (once) if they were
                # trying to filter on this property
                if not reported_missing_support:
                    print('[Error] Tree does not contain support values. As such, --min_support should be set to 0.')
                    reported_missing_support = True
                continue

            if float(support) < min_support:
                taxa_for_dist_inference.discard(taxon_name)

    # restrict taxa used for inferring distribution to the trusted set;
    # intersection_update() accepts any iterable, matching the documented
    # contract of trusted_taxa
    if trusted_taxa:
        taxa_for_dist_inference.intersection_update(trusted_taxa)

    return taxa_for_dist_inference
def run(self,
        manual_taxonomy,
        cur_gtdb_metadata_file,
        qc_passed_file,
        ncbi_genbank_assembly_file,
        untrustworthy_type_file,
        gtdb_type_strains_ledger,
        sp_priority_ledger,
        genus_priority_ledger,
        ncbi_env_bioproject_ledger,
        lpsn_gss_file):
    """Report type species genomes with incongruent GTDB genus assignments.

    For every genome in the manually-curated taxonomy that is a GTDB type
    species, the GTDB genus is compared with the NCBI genus. When they
    differ and the GTDB genus does not have priority, a row is written to
    'type_species_incongruencies.tsv' in self.output_dir.

    Parameters
    ----------
    manual_taxonomy : str
        Manually-curated taxonomy read with Taxonomy().read().
    cur_gtdb_metadata_file, qc_passed_file, ncbi_genbank_assembly_file,
    untrustworthy_type_file, gtdb_type_strains_ledger,
    ncbi_env_bioproject_ledger
        Inputs forwarded to Genomes.load_from_metadata_file().
    sp_priority_ledger, genus_priority_ledger, lpsn_gss_file
        Inputs forwarded to SpeciesPriorityManager.
    """

    # initialize species priority manager
    sp_priority_mngr = SpeciesPriorityManager(sp_priority_ledger,
                                              genus_priority_ledger,
                                              lpsn_gss_file,
                                              self.output_dir)

    # read the manually-curated taxonomy
    self.logger.info('Parsing manually curated taxonomy.')
    mc_taxonomy = Taxonomy().read(manual_taxonomy, use_canonical_gid=True)
    self.logger.info(' - read taxonomy for {:,} genomes.'.format(
        len(mc_taxonomy)))

    # create current GTDB genome sets
    self.logger.info('Creating current GTDB genome set.')
    cur_genomes = Genomes()
    cur_genomes.load_from_metadata_file(
        cur_gtdb_metadata_file,
        gtdb_type_strains_ledger=gtdb_type_strains_ledger,
        create_sp_clusters=False,
        qc_passed_file=qc_passed_file,
        ncbi_genbank_assembly_file=ncbi_genbank_assembly_file,
        untrustworthy_type_ledger=untrustworthy_type_file,
        ncbi_env_bioproject_ledger=ncbi_env_bioproject_ledger)
    self.logger.info(
        f' - current genome set contains {len(cur_genomes):,} genomes.')

    # identify type species genomes whose GTDB and NCBI genera disagree
    self.logger.info(
        'Identifying type species genomes with incongruent GTDB genus assignments.'
    )

    num_incongruent = 0
    out_path = os.path.join(self.output_dir,
                            'type_species_incongruencies.tsv')

    # the context manager closes the table even if a genome lookup fails
    with open(out_path, 'w') as fout:
        fout.write(
            'Genome ID\tGTDB genus\tNCBI genus\tGTDB genus priority date\tNCBI genus priority date\tPriority status\tNCBI RefSeq note\n'
        )

        for rid, taxa in mc_taxonomy.items():
            if not cur_genomes[rid].is_gtdb_type_species():
                continue

            gtdb_genus = taxa[Taxonomy.GENUS_INDEX]
            ncbi_genus = cur_genomes[rid].ncbi_taxa.genus
            if gtdb_genus == ncbi_genus:
                continue

            priority_genus = sp_priority_mngr.genus_priority(
                gtdb_genus, ncbi_genus)
            if priority_genus == gtdb_genus:
                # GTDB already uses the genus name with priority
                continue

            num_incongruent += 1

            if priority_genus == ncbi_genus:
                priority_status = 'NCBI genus name has priority'
            else:
                priority_status = 'Genus with priority must be manually established'

            fout.write('{}\t{}\t{}\t{}\t{}\t{}\t{}\n'.format(
                rid, gtdb_genus, ncbi_genus,
                sp_priority_mngr.genus_priority_year(gtdb_genus),
                sp_priority_mngr.genus_priority_year(ncbi_genus),
                priority_status,
                cur_genomes[rid].excluded_from_refseq_note))

    self.logger.info(
        ' - identified {:,} genomes with incongruent genus assignments.'.
        format(num_incongruent))
def run(self,
        manual_taxonomy,
        cur_gtdb_metadata_file,
        uba_genome_paths,
        qc_passed_file,
        ncbi_genbank_assembly_file,
        untrustworthy_type_file,
        synonym_file,
        gtdb_type_strains_ledger,
        sp_priority_ledger,
        genus_priority_ledger,
        dsmz_bacnames_file):
    """Report type strain genomes with incongruent GTDB species assignments.

    For every genome in the manually-curated taxonomy that is an effective
    type strain, the specific epithet of the GTDB species is compared with
    that of the NCBI species. Genomes whose epithets are not considered the
    same are written to 'type_strains_incongruencies.tsv' in
    self.output_dir.

    Parameters
    ----------
    manual_taxonomy : str
        Manually-curated taxonomy read with Taxonomy().read().
    cur_gtdb_metadata_file, uba_genome_paths, qc_passed_file,
    ncbi_genbank_assembly_file, untrustworthy_type_file,
    gtdb_type_strains_ledger
        Inputs forwarded to Genomes.load_from_metadata_file().
    synonym_file
        Not used by this method.
    sp_priority_ledger, genus_priority_ledger, dsmz_bacnames_file
        Inputs forwarded to SpeciesPriorityManager.
    """

    # initialize species priority manager
    # NOTE(review): the manager is constructed but not queried in this
    # method - confirm whether construction has required side effects
    sp_priority_mngr = SpeciesPriorityManager(sp_priority_ledger,
                                              genus_priority_ledger,
                                              dsmz_bacnames_file)

    # read the manually-curated taxonomy
    self.logger.info('Parsing manually curated taxonomy.')
    mc_taxonomy = Taxonomy().read(manual_taxonomy, use_canonical_gid=True)
    self.logger.info(' - read taxonomy for {:,} genomes.'.format(
        len(mc_taxonomy)))

    # create current GTDB genome sets
    self.logger.info('Creating current GTDB genome set.')
    cur_genomes = Genomes()
    cur_genomes.load_from_metadata_file(
        cur_gtdb_metadata_file,
        gtdb_type_strains_ledger=gtdb_type_strains_ledger,
        create_sp_clusters=False,
        uba_genome_file=uba_genome_paths,
        qc_passed_file=qc_passed_file,
        ncbi_genbank_assembly_file=ncbi_genbank_assembly_file,
        untrustworthy_type_ledger=untrustworthy_type_file)
    self.logger.info(
        f' ... current genome set contains {len(cur_genomes):,} genomes.')

    # get all GTDB species represented by a type strain:
    gtdb_type_species = set()
    for rid in mc_taxonomy:
        if cur_genomes[rid].is_effective_type_strain():
            gtdb_type_species.add(mc_taxonomy[rid][Taxonomy.SPECIES_INDEX])

    # identify type strain genomes whose GTDB and NCBI species disagree
    self.logger.info(
        'Identifying type strain genomes with incongruent GTDB species assignments.'
    )

    num_incongruent = 0
    out_path = os.path.join(self.output_dir,
                            'type_strains_incongruencies.tsv')

    # the context manager closes the table even if a genome lookup fails
    with open(out_path, 'w') as fout:
        fout.write(
            'Genome ID\tGTDB species\tNCBI species\tGTDB type strain\tNCBI type strain\tNCBI RefSeq note\n'
        )

        for rid, taxa in mc_taxonomy.items():
            if not cur_genomes[rid].is_effective_type_strain():
                continue

            gtdb_sp = taxa[Taxonomy.SPECIES_INDEX]
            gtdb_generic = generic_name(gtdb_sp)

            ncbi_sp = cur_genomes[rid].ncbi_taxa.species
            ncbi_generic = generic_name(ncbi_sp)

            if ncbi_sp == 's__':
                # NCBI taxonomy is sometimes behind the genome annotation pages,
                # and does not have a species assignment even for type strain genomes
                continue

            # check if genome is a valid genus transfer into a genus
            # that already contains a species with the specific
            # name which results in a polyphyletic suffix being required
            # e.g. G002240355 is Prauserella marina at NCBI and is
            # transferred into Saccharomonospora under the GTDB. However,
            # Saccharomonospora marina already exists so this genome
            # needs to be S. marina_A.
            if (is_placeholder_taxon(gtdb_sp)
                    and gtdb_generic != ncbi_generic
                    and canonical_species(gtdb_sp) in gtdb_type_species):
                continue

            if not test_same_epithet(specific_epithet(gtdb_sp),
                                     specific_epithet(ncbi_sp)):
                num_incongruent += 1
                fout.write('{}\t{}\t{}\t{}\t{}\t{}\n'.format(
                    rid, gtdb_sp, ncbi_sp,
                    cur_genomes[rid].is_gtdb_type_strain(),
                    cur_genomes[rid].is_ncbi_type_strain(),
                    cur_genomes[rid].excluded_from_refseq_note))

    self.logger.info(
        ' - identified {:,} genomes with incongruent species assignments.'.
        format(num_incongruent))
def propagate(self, options):
    """Propagate labels to all genomes in a cluster.

    The taxonomy assigned to each GTDB representative in
    options.input_taxonomy is written to options.output_taxonomy for the
    representative and for every genome clustered with it according to
    options.metadata_file. Genome identifiers are normalised with
    canonical_gid() and, if options.uba_mapping_file is given, translated
    through that two-column, tab-separated mapping.

    The program exits with status -1 if a representative lacks a taxonomy
    string, or if the input taxonomy contains a genome that is not a
    representative or is absent from the metadata file.
    """

    check_file_exists(options.input_taxonomy)
    check_file_exists(options.metadata_file)

    user_to_uba = {}
    if options.uba_mapping_file:
        self.logger.info('Parsing genome ID mapping file.')
        with open(options.uba_mapping_file) as f:
            for line in f:
                tokens = line.strip().split('\t')
                if len(tokens) == 2:
                    user_to_uba[tokens[0]] = tokens[1]
        self.logger.info(' - found mappings for {:,} genomes.'.format(
            len(user_to_uba)))

    def normalize_gid(gid):
        """Return the canonical, UBA-mapped form of a genome ID."""
        gid = canonical_gid(gid)
        return user_to_uba.get(gid, gid)

    # get representative genome information
    rep_metadata = read_gtdb_metadata(
        options.metadata_file,
        ['gtdb_representative', 'gtdb_clustered_genomes'])
    rep_metadata = {
        normalize_gid(gid): values
        for gid, values in rep_metadata.items()
    }

    input_taxonomy = Taxonomy().read(options.input_taxonomy)

    self.logger.info(f' - identified {len(rep_metadata):,} genomes')

    # sanity check all representatives have a taxonomy string; taxonomy
    # IDs are normalised the same way as the metadata IDs so that both
    # sides of the comparison use the same identifier form
    taxonomy_gids = {normalize_gid(gid) for gid in input_taxonomy}
    rep_count = 0
    for gid, (is_rep_genome, _clustered_genomes) in rep_metadata.items():
        if is_rep_genome:
            rep_count += 1
            if gid not in taxonomy_gids:
                self.logger.error(
                    'Expected to find {} in input taxonomy as it is a GTDB representative.'
                    .format(gid))
                sys.exit(-1)

    self.logger.info(
        'Identified {:,} representatives in metadata file and {:,} genomes in input taxonomy file.'
        .format(rep_count, len(input_taxonomy)))

    # propagate taxonomy to genomes clustered with each representative;
    # the context manager closes the output even when sys.exit() is called
    with open(options.output_taxonomy, 'w') as fout:
        for rid, taxon_list in input_taxonomy.items():
            taxonomy_str = ';'.join(taxon_list)
            rid = normalize_gid(rid)

            if rid not in rep_metadata:
                # report the problem explicitly instead of failing with
                # an uninformative KeyError
                self.logger.error(
                    'Genome {} in input taxonomy is not in GTDB metadata file.'
                    .format(rid))
                sys.exit(-1)

            is_rep_genome, clustered_genomes = rep_metadata[rid]
            if not is_rep_genome:
                self.logger.error(
                    'Did not expected to find {} in input taxonomy as it is not a GTDB representative.'
                    .format(rid))
                sys.exit(-1)

            # assign taxonomy to representative and all genomes in the cluster
            fout.write('{}\t{}\n'.format(rid, taxonomy_str))
            for cid in (clustered_genomes or '').split(';'):
                cid = cid.strip()
                if not cid:
                    # representative without any clustered genomes
                    continue

                cid = normalize_gid(cid)
                if cid == rid:
                    continue

                if cid in rep_metadata:
                    fout.write('{}\t{}\n'.format(cid, taxonomy_str))
                else:
                    self.logger.warning(
                        'Skipping {} as it is not in GTDB metadata file.'
                        .format(cid))

    self.logger.info('Taxonomy written to: {}'.format(
        options.output_taxonomy))
def run(self,
        input_taxonomy,
        genome_path_file,
        metadata_file,
        max_genomes,
        min_comp,
        max_cont,
        min_quality,
        max_contigs,
        min_N50,
        max_ambiguous,
        max_gap_length,
        output_dir):
    """Calculate ANI for named species.

    Genomes failing the quality criteria are removed from the taxonomy,
    and every remaining named species with more than one genome is queued
    for processing by self.cpus worker processes. A single writer process
    collects the results.

    Parameters
    ----------
    input_taxonomy : str
        Taxonomy read with Taxonomy().read().
    genome_path_file : str
        Tab-separated file with a genome ID in the first column and the
        genome directory in the second.
    metadata_file : str
        Metadata passed to filter_genomes() and to each worker.
    max_genomes
        Passed unchanged to each worker.
    min_comp, max_cont, min_quality, max_contigs, min_N50,
    max_ambiguous, max_gap_length
        Filtering criteria passed to filter_genomes().
    output_dir : str
        Passed to the writer process.

    Raises
    ------
    BaseException
        Any error (including KeyboardInterrupt) raised while the child
        processes are running is re-raised after they are terminated.
    """

    # get genomes passing filtering criteria
    filtered_genome_ids = filter_genomes(metadata_file,
                                         min_comp,
                                         max_cont,
                                         min_quality,
                                         max_contigs,
                                         min_N50,
                                         max_ambiguous,
                                         max_gap_length)

    # get genomes in each named species
    taxonomy = Taxonomy().read(input_taxonomy)
    genome_ids_to_remove = set(taxonomy) - filtered_genome_ids
    for genome_id in genome_ids_to_remove:
        del taxonomy[genome_id]

    named_species = Taxonomy().extant_taxa_for_rank('species', taxonomy)

    # get path to nucleotide files
    nt_files = {}
    with open(genome_path_file) as path_file:
        for line in path_file:
            line_split = line.strip().split('\t')

            gtdb_id = line_split[0]
            genome_id = gtdb_id.replace('GB_', '').replace('RS_', '')

            genome_dir = line_split[1]

            nt_files[gtdb_id] = os.path.join(genome_dir,
                                             'prodigal',
                                             genome_id + '_protein.fna')

    # populate worker queue with data to process
    worker_queue = mp.Queue()
    writer_queue = mp.Queue()

    num_species = 0
    for species, genome_ids in named_species.items():
        if len(genome_ids) > 1:
            worker_queue.put((species, genome_ids))
            num_species += 1

    # one sentinel per worker signals the end of the queue
    for _ in range(self.cpus):
        worker_queue.put((None, None))

    # create the processes before entering the try block so the cleanup
    # code below can always refer to them
    worker_proc = [mp.Process(target=self.__worker,
                              args=(metadata_file,
                                    nt_files,
                                    max_genomes,
                                    worker_queue,
                                    writer_queue))
                   for _ in range(self.cpus)]
    write_proc = mp.Process(target=self.__writer,
                            args=(num_species, output_dir, writer_queue))

    try:
        write_proc.start()

        for p in worker_proc:
            p.start()

        for p in worker_proc:
            p.join()

        # sentinel telling the writer that all workers have finished
        writer_queue.put((None, None, None, None, None))
        write_proc.join()
    except BaseException:
        # stop any child that is still running, then let the caller see
        # the failure instead of silently returning as if it succeeded;
        # is_alive() is False for processes that were never started, for
        # which terminate() would itself fail
        for p in worker_proc + [write_proc]:
            if p.is_alive():
                p.terminate()
        raise
def replace_generic(self, manual_species_names, manual_taxonomy):
    """Replace generic names with genus assignment.

    The generic (genus) part of every species name in the manually-curated
    taxonomy is replaced with the genome's genus assignment, and the
    result is written to 'taxonomy_updated_sp.tsv' in self.output_dir.

    Parameters
    ----------
    manual_species_names : str
        Tab-separated file with a header line; the genome ID is read from
        the first column and the manually-curated species from the third.
    manual_taxonomy : str
        Manually-curated taxonomy read with Taxonomy().read().

    Raises
    ------
    RuntimeError
        If a genome has no genus assignment.
    """

    # read manually-curated species names
    self.logger.info('Reading manually-curated species names.')
    mc_species = {}
    with open(manual_species_names) as f:
        f.readline()  # skip header
        for line in f:
            tokens = line.strip().split('\t')
            mc_species[tokens[0]] = tokens[2]
    self.logger.info(
        ' - read manually-curated species for {:,} genomes.'.format(
            len(mc_species)))

    # read manual taxonomy file
    self.logger.info('Reading manually-curated taxonomy.')
    mc_taxonomy = Taxonomy().read(manual_taxonomy, use_canonical_gid=True)
    mc_num_gids = sum(
        1 for gid in mc_taxonomy if not gid.startswith('D-'))
    self.logger.info(
        ' - read taxonomy for {:,} genomes.'.format(mc_num_gids))

    # replace generic names with genus names
    self.logger.info('Creating taxonomy file with updated species names.')

    num_genomes = 0
    out_path = os.path.join(self.output_dir, 'taxonomy_updated_sp.tsv')

    # the context manager closes the output even when the missing-genus
    # error below aborts processing
    with open(out_path, 'w') as fout:
        for gid, taxa in mc_taxonomy.items():
            # identifiers starting with 'D-' are not written to the output
            if gid.startswith('D-'):
                continue

            genus = taxa[Taxonomy.GENUS_INDEX]
            generic = genus.replace('g__', '')
            if not generic:
                error_msg = 'Genome is missing genus assignment: {}'.format(
                    gid)
                self.logger.error(error_msg)
                # A bare `raise` with no active exception produces a
                # RuntimeError with an unhelpful message; raise the same
                # exception type explicitly with a useful one.
                raise RuntimeError(error_msg)

            species = taxa[Taxonomy.SPECIES_INDEX]
            if gid in mc_species:
                if generic not in species and species != 's__':
                    self.logger.error(
                        'Genus assignment does not agree with manually-curated species assignment: {} {} {}'
                        .format(gid, mc_species[gid],
                                '; '.join(mc_taxonomy[gid])))

            sp_tokens = species.split()
            if len(sp_tokens) < 2:
                self.logger.error(
                    'Species name appear to be erroneous: {} {}'.format(
                        gid, species))
                specific = '<unassigned>'
            else:
                specific = sp_tokens[-1]

            final_sp = 's__{} {}'.format(generic, specific)
            taxa[Taxonomy.SPECIES_INDEX] = final_sp

            fout.write('{}\t{}\n'.format(gid, ';'.join(taxa)))
            num_genomes += 1

    self.logger.info(' - processed {:,} genomes.'.format(num_genomes))
def tax_diff(self, tax1_file, tax2_file, include_user_taxa, output_dir):
    """Tabulate differences between two taxonomies.

    Two tables are written to output_dir, both named after the first
    taxonomy file: '<name>.tax_diff.tsv' lists every changed rank of every
    genome present in both taxonomies, and '<name>.tax_diff_summary.tsv'
    gives per-rank counts and percentages. The per-rank counts are also
    printed to standard output.

    Parameters
    ----------
    tax1_file : str
        First taxonomy file.
    tax2_file : str
        Second taxonomy file.
    include_user_taxa : boolean
        Flag indicating if User genomes should be considered.
    output_dir : str
        Output directory.
    """

    tax1 = Taxonomy().read(tax1_file)
    tax2 = Taxonomy().read(tax2_file)

    if not include_user_taxa:
        # User genomes are identified by the 'U_' prefix
        tax1 = {genome_id: taxonomy
                for genome_id, taxonomy in tax1.items()
                if not genome_id.startswith('U_')}
        tax2 = {genome_id: taxonomy
                for genome_id, taxonomy in tax2.items()
                if not genome_id.startswith('U_')}

    common_taxa = set(tax1).intersection(tax2)

    self.logger.info('First taxonomy contains %d taxa.' % len(tax1))
    self.logger.info('Second taxonomy contains %d taxa.' % len(tax2))
    self.logger.info('Taxonomies have %d taxa in common.' % len(common_taxa))

    # identify differences between taxonomies
    tax_file_name1 = os.path.splitext(os.path.basename(tax1_file))[0]
    tax_file_name2 = os.path.splitext(os.path.basename(tax2_file))[0]

    unchanged = defaultdict(int)          # T2 = g__Bob -> T1 = g__Bob, or T2 = g__ -> T1 = g__
    active_change = defaultdict(int)      # T2 = g__Bob -> T1 = g__Jane, or T2 = g__Bob -> T1 = g__Bob_A
    passive_change = defaultdict(int)     # T2 = g__??? -> T1 = g__Jane
    unresolved_change = defaultdict(int)  # T2 = g__Box -> T1 = g__???

    output_table = os.path.join(output_dir, '%s.tax_diff.tsv' % tax_file_name1)
    with open(output_table, 'w') as fout:
        fout.write('Genome ID\tChange\tRank\t%s\t%s\n' % (tax_file_name1, tax_file_name2))

        # sort the genome IDs so the table is reproducible between runs
        for genome_id in sorted(common_taxa):
            t1 = tax1[genome_id]
            t2 = tax2[genome_id]

            for rank, (taxon1, taxon2) in enumerate(zip(t1, t2)):
                empty_taxon = Taxonomy.rank_prefixes[rank]

                if taxon1 == taxon2:
                    unchanged[rank] += 1
                    continue

                if taxon1 != empty_taxon and taxon2 != empty_taxon:
                    active_change[rank] += 1
                    change = 'active'
                elif taxon2 == empty_taxon:
                    passive_change[rank] += 1
                    change = 'passive'
                elif taxon1 == empty_taxon:
                    unresolved_change[rank] += 1
                    change = 'unresolved'
                else:
                    # unreachable: the taxa differ, so at most one of
                    # them can equal the empty rank prefix
                    continue

                fout.write('%s\t%s\t%s\t%s\t%s\n' % (genome_id,
                                                     change,
                                                     Taxonomy.rank_labels[rank],
                                                     ';'.join(t1),
                                                     ';'.join(t2)))

    # report results
    output_table = os.path.join(output_dir, '%s.tax_diff_summary.tsv' % tax_file_name1)
    with open(output_table, 'w') as fout:
        fout.write('Rank\tUnchanged\tUnchanged (%)\tActive\t Active (%)\tPassive\tPassive (%)\tUnresolved\tUnresolved (%)\n')
        print('Rank\tUnchanged\tActive\tPassive\tUnresolved\tTotal')

        for rank in range(len(Taxonomy.rank_prefixes)):
            total = unchanged[rank] + active_change[rank] + passive_change[rank] + unresolved_change[rank]

            # ranks without any compared taxa are omitted, which also
            # avoids a division by zero
            if total != 0:
                fout.write('%s\t%d\t%.1f\t%d\t%.1f\t%d\t%.1f\t%d\t%.1f\n' %
                           (Taxonomy.rank_labels[rank],
                            unchanged[rank], unchanged[rank] * 100.0 / total,
                            active_change[rank], active_change[rank] * 100.0 / total,
                            passive_change[rank], passive_change[rank] * 100.0 / total,
                            unresolved_change[rank], unresolved_change[rank] * 100.0 / total))
                print('%s\t%d\t%d\t%d\t%d\t%d' % (Taxonomy.rank_labels[rank],
                                                  unchanged[rank],
                                                  active_change[rank],
                                                  passive_change[rank],
                                                  unresolved_change[rank],
                                                  total))