def add_suffixed_species(self, species):
    """Account for species names when generating further placeholder names.

    This is required as species can be transferred between genera and will
    retain their existing suffix. As such, we must track that this genera
    now has a species name with a given suffix. (e.g., Lactobacillus_G
    kunkeei_A is transferred to Apilactobacillus as A. kunkeei_A, so the
    next suffixed A. kunkeei representative must have a 'B' suffix.)

    The first time a canonical species name is seen, 'A' is recorded for it
    in `self.taxon_suffix`. On later calls the recorded suffix is replaced
    only when `self.is_higher_suffix` reports that the suffix carried by
    `species` ranks above the recorded one.

    Args:
        species: species name, optionally carrying a placeholder suffix.
    """
    canonical_species = canonical_taxon(species)

    # first sighting of this canonical name: start the suffix series at 'A'
    if canonical_species not in self.taxon_suffix:
        self.taxon_suffix[canonical_species] = 'A'
        return

    # The canonical name is already tracked at this point, so the only
    # remaining question is whether the incoming suffix supersedes it.
    # (The previous implementation re-tested membership here and carried
    # an unreachable "add new canonical taxon" branch.)
    suffix = taxon_suffix(specific_epithet(species))
    if self.is_higher_suffix(suffix, self.taxon_suffix[canonical_species]):
        self.taxon_suffix[canonical_species] = suffix
def add_suffix(self, generic, specific):
    """Build a suffixed specific epithet for a species.

    The next free suffix is requested from `self.taxon_suffix_manager` for
    the species name 's__<generic> <specific>' and appended, after an
    underscore, to the canonical form of `specific`.

    Args:
        generic: generic (genus) name used to build the species name.
        specific: specific epithet of the species.

    Returns:
        String of the form '<canonical specific epithet>_<suffix>'.
    """
    species_name = f's__{generic} {specific}'
    suffix = self.taxon_suffix_manager.next_suffix(species_name)
    return f'{canonical_taxon(specific)}_{suffix}'
def next_suffix(self, taxon):
    """Return the next suffix for a taxon and record it as the current one.

    A taxon whose canonical name has not been seen before receives 'A';
    otherwise the recorded suffix is advanced with
    `self._increment_suffix`. In both cases `self.taxon_suffix` is updated
    with the returned value.

    Args:
        taxon: taxon name; it is reduced to its canonical form for lookup.

    Returns:
        The suffix assigned to this request.
    """
    key = canonical_taxon(taxon)
    upcoming = (self._increment_suffix(self.taxon_suffix[key])
                if key in self.taxon_suffix else 'A')
    self.taxon_suffix[key] = upcoming
    return upcoming
def __init__(self, prev_genomes, cur_genomes):
    """Set up logging, suffix tracking, and per-genus epithet lookups.

    For every representative reported by
    `prev_genomes.sp_clusters.species()` the GTDB genus and specific
    epithet of that genome are indexed in three maps keyed by GTDB genus:

    * `gtdb_sp_epithets`: set of specific epithets
    * `gtdb_canonical_sp_epithets`: set of canonical specific epithets
    * `sp_epithets_rid`: specific epithet -> representative genome ID

    Args:
        prev_genomes: genome set of the previous release.
        cur_genomes: genome set of the current release.
    """
    self.logger = logging.getLogger('timestamp')
    self.taxon_suffix_manager = TaxonSuffixManager()

    self.prev_genomes = prev_genomes
    self.cur_genomes = cur_genomes

    # species assignments of the previous GTDB species clusters
    self.gtdb_sp_epithets = defaultdict(set)
    self.gtdb_canonical_sp_epithets = defaultdict(set)
    self.sp_epithets_rid = defaultdict(dict)

    for rep_id, _ in prev_genomes.sp_clusters.species():
        genus = prev_genomes[rep_id].gtdb_taxa.genus
        epithet = prev_genomes[rep_id].gtdb_taxa.specific_epithet

        self.gtdb_sp_epithets[genus].add(epithet)
        self.gtdb_canonical_sp_epithets[genus].add(canonical_taxon(epithet))
        self.sp_epithets_rid[genus][epithet] = rep_id
def resolve_gtdb_genus(self, gid_anis, ncbi_sp, type_gids, cur_genomes):
    """Resolve by identifying genomes with a conflicting GTDB genus assignment.

    Each genome in `type_gids` whose canonical GTDB genus differs from the
    genus implied by `ncbi_sp` is flagged as untrustworthy. The species is
    reported as resolved only when `self.check_strain_ani` accepts the
    flagged set, at least one genome was flagged, and at least one genome
    matched the expected genus.

    Args:
        gid_anis: ANI values between genomes, forwarded to
            `self.check_strain_ani`.
        ncbi_sp: NCBI species name the type strain genomes belong to.
        type_gids: IDs of the type strain genomes of the species.
        cur_genomes: genome set indexed by genome ID.

    Returns:
        `(True, {gid: reason})` when resolved, otherwise `(False, {})`.
    """
    expected_genus = 'g__' + generic_name(ncbi_sp)

    conflicting = {}
    num_matching = 0
    for gid in type_gids:
        gtdb_genus = cur_genomes[gid].gtdb_taxa.genus
        if canonical_taxon(gtdb_genus) == expected_genus:
            num_matching += 1
            continue
        conflicting[gid] = f'Conflicting GTDB genus assignment of {gtdb_genus}, expected {expected_genus}'

    all_similar = self.check_strain_ani(gid_anis, conflicting)
    if not (all_similar and conflicting and num_matching > 0):
        return False, {}

    return True, conflicting
def run(self, cur_gtdb_metadata_file, cur_genomic_path_file, qc_passed_file,
        ncbi_genbank_assembly_file, ltp_taxonomy_file,
        gtdb_type_strains_ledger, untrustworthy_type_ledger,
        ncbi_env_bioproject_ledger):
    """Resolve cases where a species has multiple genomes assembled from the type strain.

    For every NCBI species with more than one type strain genome the ANI
    between those genomes is calculated. When the genomes are not all
    similar, a fixed sequence of heuristics is tried until one of them
    identifies the genomes that are untrustworthy as type material.

    Five tab-separated tables are written to `self.output_dir`:
    `multi_type_strain_species.tsv`, `type_strain_genomes.tsv`,
    `unresolved_type_strain_genomes.tsv`,
    `highly_divergent_type_strain_genomes.tsv` and
    `untrustworthy_type_material.tsv`. All of them are closed when this
    method exits, including when it exits through an exception.

    Args:
        cur_gtdb_metadata_file: metadata file passed to
            `Genomes.load_from_metadata_file`.
        cur_genomic_path_file: file passed to
            `Genomes.load_genomic_file_paths`.
        qc_passed_file: forwarded to `Genomes.load_from_metadata_file`.
        ncbi_genbank_assembly_file: forwarded to
            `Genomes.load_from_metadata_file`.
        ltp_taxonomy_file: file passed to `self.ltp_defined_species`.
        gtdb_type_strains_ledger: forwarded to
            `Genomes.load_from_metadata_file`.
        untrustworthy_type_ledger: forwarded to
            `Genomes.load_from_metadata_file` and parsed with
            `self.parse_untrustworthy_type_ledger`.
        ncbi_env_bioproject_ledger: forwarded to
            `Genomes.load_from_metadata_file`.
    """
    # imported here so this method does not rely on a file-level import
    from contextlib import ExitStack

    # get species in LTP reference database
    self.logger.info(
        'Determining species defined in LTP reference database.')
    ltp_defined_species = self.ltp_defined_species(ltp_taxonomy_file)
    self.logger.info(
        f' - identified {len(ltp_defined_species):,} species.')

    # create current GTDB genome sets
    self.logger.info('Creating current GTDB genome set.')
    cur_genomes = Genomes()
    cur_genomes.load_from_metadata_file(
        cur_gtdb_metadata_file,
        gtdb_type_strains_ledger=gtdb_type_strains_ledger,
        create_sp_clusters=False,
        qc_passed_file=qc_passed_file,
        ncbi_genbank_assembly_file=ncbi_genbank_assembly_file,
        untrustworthy_type_ledger=untrustworthy_type_ledger,
        ncbi_env_bioproject_ledger=ncbi_env_bioproject_ledger)
    cur_genomes.load_genomic_file_paths(cur_genomic_path_file)

    # parsing genomes manually established to be untrustworthy as type
    self.logger.info(
        'Determining genomes manually annotated as untrustworthy as type.')
    manual_untrustworthy_types = self.parse_untrustworthy_type_ledger(
        untrustworthy_type_ledger)
    self.logger.info(
        f' - identified {len(manual_untrustworthy_types):,} genomes manually annotated as untrustworthy as type.'
    )

    # Identify NCBI species with multiple genomes assembled from type strain of species. This
    # is done using a series of heuristics that aim to ensure that the selected type strain
    # genome is reliable. More formal evaluation and a manuscript describing this selection
    # process is ultimately required. Ideally, the community will eventually adopt a
    # database that indicates a single `type genome assembly` for each species instead
    # of just indicating a type strain from which many (sometimes dissimilar) assemblies exist.
    self.logger.info(
        'Determining number of type strain genomes in each NCBI species.')
    multi_type_strains_sp = self.sp_with_mult_type_strains(cur_genomes)
    self.logger.info(
        f' - identified {len(multi_type_strains_sp):,} NCBI species with multiple assemblies indicated as being type strain genomes.'
    )

    def write_stats_row(fh, gid, ncbi_sp, ltp_metadata, gid_anis, gid_afs):
        """Write the per-genome ANI/AF summary row shared by the
        highly-divergent and unresolved tables."""
        ltp_species = self.ltp_species(gid, ltp_metadata)
        fh.write(
            '{}\t{}\t{}\t{}\t{}\t{:.2f}\t{:.3f}\t{:.3f}\t{:.4f}\t{}\t{}\t{}\n'
            .format(gid, ncbi_sp, cur_genomes[gid].gtdb_taxa.genus,
                    cur_genomes[gid].gtdb_taxa.species,
                    ' / '.join(ltp_species),
                    np_mean(list(gid_anis[gid].values())),
                    np_std(list(gid_anis[gid].values())),
                    np_mean(list(gid_afs[gid].values())),
                    np_std(list(gid_afs[gid].values())),
                    cur_genomes[gid].excluded_from_refseq_note,
                    cur_genomes[gid].ncbi_taxa,
                    cur_genomes[gid].gtdb_taxa))

    processed = 0
    num_divergent = 0
    unresolved_sp_count = 0
    ncbi_ltp_resolved = 0
    intra_ani_resolved = 0
    ncbi_type_resolved = 0
    ncbi_rep_resolved = 0
    gtdb_family_resolved = 0
    gtdb_genus_resolved = 0
    gtdb_sp_resolved = 0
    ltp_resolved = 0
    prev_gtdb_sp_conflicts = 0

    # resolve species with multiple type strain genomes; the ExitStack
    # guarantees every output table is closed even if a step below raises
    with ExitStack() as stack:

        def open_table(file_name):
            """Open an output table in `self.output_dir` for writing."""
            return stack.enter_context(
                open(os.path.join(self.output_dir, file_name), 'w'))

        fout = open_table('multi_type_strain_species.tsv')
        fout.write(
            'NCBI species\tNo. type strain genomes\t>=99% ANI\tMean ANI\tStd ANI\tMean AF\tStd AF\tResolution\tGenome IDs\n'
        )

        fout_genomes = open_table('type_strain_genomes.tsv')
        fout_genomes.write(
            'Genome ID\tUntrustworthy\tNCBI species\tGTDB genus\tGTDB species\tLTP species\tConflict with prior GTDB assignment'
        )
        fout_genomes.write(
            '\tMean ANI\tStd ANI\tMean AF\tStd AF\tExclude from RefSeq\tNCBI taxonomy\tGTDB taxonomy\tReason for GTDB untrustworthy as type\n'
        )

        fout_unresolved = open_table('unresolved_type_strain_genomes.tsv')
        fout_unresolved.write(
            'Genome ID\tNCBI species\tGTDB genus\tGTDB species\tLTP species')
        fout_unresolved.write(
            '\tMean ANI\tStd ANI\tMean AF\tStd AF\tExclude from RefSeq\tNCBI taxonomy\tGTDB taxonomy\n'
        )

        fout_high_divergence = open_table(
            'highly_divergent_type_strain_genomes.tsv')
        fout_high_divergence.write(
            'Genome ID\tNCBI species\tGTDB genus\tGTDB species\tLTP species\tMean ANI\tStd ANI\tMean AF\tStd AF\tExclude from RefSeq\tNCBI taxonomy\tGTDB taxonomy\n'
        )

        fout_untrustworthy = open_table('untrustworthy_type_material.tsv')
        fout_untrustworthy.write(
            'Genome ID\tNCBI species\tGTDB species\tLTP species\tReason for declaring untrustworthy\n'
        )

        # Manually curated genomes: one field per header column, with the
        # reason in the final "Reason for declaring untrustworthy" column.
        # (An extra 'n/a' field used to shift the reason past the header.)
        for gid in manual_untrustworthy_types:
            ncbi_sp, reason = manual_untrustworthy_types[gid]
            fout_untrustworthy.write('{}\t{}\t{}\t{}\t{}\n'.format(
                gid, ncbi_sp, cur_genomes[gid].gtdb_taxa.species,
                '<not tested>', 'Manual curation: ' + reason))

        # *** Perhaps should be an external flag, but used right now to speed up debugging
        use_pickled_results = False
        if use_pickled_results:
            self.logger.warning(
                'Using previously calculated ANI results in: {}'.format(
                    self.ani_pickle_dir))

        self.logger.info(
            'Resolving species with multiple type strain genomes:')

        # species with the fewest type strain genomes are processed first
        for ncbi_sp, type_gids in sorted(multi_type_strains_sp.items(),
                                         key=lambda kv: len(kv[1])):
            assert len(type_gids) > 1

            status_str = '-> Processing {} with {:,} type strain genomes [{:,} of {:,} ({:.2f}%)].'.format(
                ncbi_sp, len(type_gids), processed + 1,
                len(multi_type_strains_sp),
                (processed + 1) * 100.0 /
                len(multi_type_strains_sp)).ljust(128)
            sys.stdout.write('{}\r'.format(status_str))
            sys.stdout.flush()
            processed += 1

            # calculate ANI between type strain genomes
            all_similar, anis, afs, gid_anis, gid_afs = self.calculate_type_strain_ani(
                ncbi_sp, type_gids, cur_genomes, use_pickled_results)

            # read LTP metadata for genomes
            ltp_metadata = self.parse_ltp_metadata(type_gids, cur_genomes)

            untrustworthy_gids = {}
            gtdb_resolved_sp_conflict = False
            unresolved_species = False
            note = 'All type strain genomes have ANI >99% and AF >65%.'
            if not all_similar:
                note = ''

                # need to establish which genomes are untrustworthy as type
                num_divergent += 1
                unresolved_species = True

                # write out highly divergent cases for manual inspection;
                # these should be compared to the automated selection
                if np_mean(anis) < 95:
                    for gid in type_gids:
                        write_stats_row(fout_high_divergence, gid, ncbi_sp,
                                        ltp_metadata, gid_anis, gid_afs)

                # filter genomes marked as `untrustworthy as type` at NCBI and where the LTP
                # assignment also suggest the asserted type material is incorrect
                resolved, untrustworthy_gids = self.resolve_validated_untrustworthy_ncbi_genomes(
                    gid_anis, ncbi_sp, type_gids, ltp_metadata,
                    ltp_defined_species, cur_genomes)
                if resolved:
                    note = "Species resolved by removing genomes considered `untrustworthy as type` and with a LTP BLAST hit confirming the assembly is likely untrustworthy"
                    ncbi_ltp_resolved += 1

                # try to resolve by LTP 16S BLAST results
                if not resolved:
                    resolved, untrustworthy_gids = self.resolve_ltp_conflict(
                        gid_anis, ncbi_sp, type_gids, ltp_metadata, 0)
                    if resolved:
                        note = 'Species resolved by identifying conflicting or lack of LTP BLAST results'
                        ltp_resolved += 1

                # try to resolve species using intra-specific ANI test
                if not resolved:
                    resolved, untrustworthy_gids = self.resolve_by_intra_specific_ani(
                        gid_anis)
                    if resolved:
                        note = 'Species resolved by intra-specific ANI test'
                        intra_ani_resolved += 1

                # try to resolve by GTDB family assignment
                if not resolved:
                    resolved, untrustworthy_gids = self.resolve_gtdb_family(
                        gid_anis, ncbi_sp, type_gids, cur_genomes)
                    if resolved:
                        note = 'Species resolved by consulting GTDB family classifications'
                        gtdb_family_resolved += 1

                # try to resolve by GTDB genus assignment
                if not resolved:
                    resolved, untrustworthy_gids = self.resolve_gtdb_genus(
                        gid_anis, ncbi_sp, type_gids, cur_genomes)
                    if resolved:
                        note = 'Species resolved by consulting GTDB genus classifications'
                        gtdb_genus_resolved += 1

                # try to resolve by GTDB species assignment
                if not resolved:
                    resolved, untrustworthy_gids = self.resolve_gtdb_species(
                        gid_anis, ncbi_sp, type_gids, cur_genomes)
                    if resolved:
                        note = 'Species resolved by consulting GTDB species classifications'
                        gtdb_sp_resolved += 1

                # try to resolve by considering genomes annotated as type material at NCBI,
                # which includes considering if genomes are marked as untrustworthy as type
                if not resolved:
                    resolved, untrustworthy_gids = self.resolve_by_ncbi_types(
                        gid_anis, type_gids, cur_genomes)
                    if resolved:
                        note = 'Species resolved by consulting NCBI assembled from type metadata'
                        ncbi_type_resolved += 1

                # try to resolve by considering genomes annotated as representative genomes at NCBI
                if not resolved:
                    resolved, untrustworthy_gids = self.resolve_by_ncbi_reps(
                        gid_anis, type_gids, cur_genomes)
                    if resolved:
                        note = 'Species resolved by considering NCBI representative genomes'
                        ncbi_rep_resolved += 1

                if resolved:
                    unresolved_species = False

                    # check if type strain genomes marked as trusted or untrusted conflict
                    # with current GTDB species assignment
                    untrustworthy_gtdb_sp_match = False
                    trusted_gtdb_sp_match = False
                    for gid in type_gids:
                        gtdb_canonical_epithet = canonical_taxon(
                            specific_epithet(
                                cur_genomes[gid].gtdb_taxa.species))
                        if gtdb_canonical_epithet == specific_epithet(ncbi_sp):
                            if gid in untrustworthy_gids:
                                untrustworthy_gtdb_sp_match = True
                            else:
                                trusted_gtdb_sp_match = True

                    if untrustworthy_gtdb_sp_match and not trusted_gtdb_sp_match:
                        prev_gtdb_sp_conflicts += 1
                        gtdb_resolved_sp_conflict = True
                else:
                    note = 'Species is unresolved; manual curation is required!'
                    unresolved_sp_count += 1

                if unresolved_species:
                    for gid in type_gids:
                        write_stats_row(fout_unresolved, gid, ncbi_sp,
                                        ltp_metadata, gid_anis, gid_afs)

            # remove genomes marked as untrustworthy as type at NCBI if one or more potential type strain genomes remaining
            ncbi_untrustworthy_gids = {
                gid for gid in type_gids if 'untrustworthy as type' in
                cur_genomes[gid].excluded_from_refseq_note
            }
            if len(type_gids - set(untrustworthy_gids) -
                   ncbi_untrustworthy_gids) >= 1:
                for gid in ncbi_untrustworthy_gids:
                    untrustworthy_gids[
                        gid] = "Genome annotated as `untrustworthy as type` at NCBI and there are other potential type strain genomes available"

            # report cases where genomes marked as untrustworthy as type at NCBI are being retained as potential type strain genomes
            num_ncbi_untrustworthy = len(ncbi_untrustworthy_gids)
            for gid in type_gids:
                if (gid not in untrustworthy_gids
                        and 'untrustworthy as type'
                        in cur_genomes[gid].excluded_from_refseq_note):
                    self.logger.warning(
                        "Retaining genome {} from {} despite being marked as `untrustworthy as type` at NCBI [{:,} of {:,} considered untrustworthy]."
                        .format(gid, ncbi_sp, num_ncbi_untrustworthy,
                                len(type_gids)))

            # write out genomes identified as being untrustworthy
            for gid, reason in untrustworthy_gids.items():
                ltp_species = self.ltp_species(gid, ltp_metadata)

                # only the written row gets the NCBI remark; the reason
                # stored in untrustworthy_gids is left as is
                if 'untrustworthy as type' in cur_genomes[
                        gid].excluded_from_refseq_note:
                    reason += "; considered `untrustworthy as type` at NCBI"
                fout_untrustworthy.write('{}\t{}\t{}\t{}\t{}\n'.format(
                    gid, ncbi_sp, cur_genomes[gid].gtdb_taxa.species,
                    ' / '.join(ltp_species), reason))

                # Sanity check that if the untrustworthy genome has an LTP to only the
                # expected species, that all other genomes also have a hit to the
                # expected species (or potentially no hit). Otherwise, more consideration
                # should be given to the genome with the conflicting LTP hit.
                if len(ltp_species) == 1 and ncbi_sp in ltp_species:
                    other_sp = set()
                    for test_gid in type_gids:
                        # separate name so the untrustworthy genome's own
                        # LTP species are not overwritten
                        test_ltp_species = self.ltp_species(
                            test_gid, ltp_metadata)
                        if test_ltp_species and ncbi_sp not in test_ltp_species:
                            other_sp.update(test_ltp_species)

                    if other_sp:
                        self.logger.warning(
                            f'Genome {gid} marked as untrustworthy, but this conflicts with high confidence LTP 16S rRNA assignment.'
                        )

            # write out information about all type genomes
            for gid in type_gids:
                ltp_species = self.ltp_species(gid, ltp_metadata)
                fout_genomes.write(
                    '{}\t{}\t{}\t{}\t{}\t{}\t{}\t{:.2f}\t{:.3f}\t{:.3f}\t{:.4f}\t{}\t{}\t{}\t{}\n'
                    .format(gid, gid in untrustworthy_gids, ncbi_sp,
                            cur_genomes[gid].gtdb_taxa.genus,
                            cur_genomes[gid].gtdb_taxa.species,
                            ' / '.join(ltp_species),
                            gtdb_resolved_sp_conflict,
                            np_mean(list(gid_anis[gid].values())),
                            np_std(list(gid_anis[gid].values())),
                            np_mean(list(gid_afs[gid].values())),
                            np_std(list(gid_afs[gid].values())),
                            cur_genomes[gid].excluded_from_refseq_note,
                            cur_genomes[gid].ncbi_taxa,
                            cur_genomes[gid].gtdb_taxa,
                            untrustworthy_gids.get(gid, '')))

            fout.write(
                '{}\t{}\t{}\t{:.2f}\t{:.3f}\t{:.3f}\t{:.4f}\t{}\t{}\n'.format(
                    ncbi_sp, len(type_gids), all_similar, np_mean(anis),
                    np_std(anis), np_mean(afs), np_std(afs), note,
                    ', '.join(type_gids)))

        # terminate the carriage-return progress line
        sys.stdout.write('\n')

    self.logger.info(
        f'Identified {num_divergent:,} species with 1 or more divergent type strain genomes.'
    )
    self.logger.info(
        f' - resolved {ncbi_ltp_resolved:,} species by removing NCBI `untrustworthy as type` genomes with a conflicting LTP 16S rRNA classifications.'
    )
    self.logger.info(
        f' - resolved {ltp_resolved:,} species by considering conflicting LTP 16S rRNA classifications.'
    )
    self.logger.info(
        f' - resolved {intra_ani_resolved:,} species by considering intra-specific ANI values.'
    )
    self.logger.info(
        f' - resolved {gtdb_family_resolved:,} species by considering conflicting GTDB family classifications.'
    )
    self.logger.info(
        f' - resolved {gtdb_genus_resolved:,} species by considering conflicting GTDB genus classifications.'
    )
    self.logger.info(
        f' - resolved {gtdb_sp_resolved:,} species by considering conflicting GTDB species classifications.'
    )
    self.logger.info(
        f' - resolved {ncbi_type_resolved:,} species by considering type material designations at NCBI.'
    )
    self.logger.info(
        f' - resolved {ncbi_rep_resolved:,} species by considering RefSeq reference and representative designations at NCBI.'
    )

    if unresolved_sp_count > 0:
        self.logger.warning(
            f'There are {unresolved_sp_count:,} unresolved species with multiple type strain genomes.'
        )
        self.logger.warning(
            'These should be handled before proceeding with the next step of GTDB species updating.'
        )
        self.logger.warning(
            "This can be done by manual curation and adding genomes to 'untrustworthy_type_ledger'."
        )

    self.logger.info(
        f'Identified {prev_gtdb_sp_conflicts:,} cases where resolved type strain conflicts with prior GTDB assignment.'
    )
def infer_epithet_map(self, gids_of_interest, mc_species, cur_genomes,
                      cur_clusters):
    """Infer mapping of NCBI epithet to GTDB epithet which may be different due to gender of genus.

    Representatives in `cur_clusters` that are also in `gids_of_interest`
    are grouped by GTDB genus (a manually curated species in `mc_species`
    takes precedence for the genus and the species name). Within each
    genus, every NCBI epithet that `test_same_epithet` pairs with a GTDB
    epithet votes for that GTDB epithet. The most common GTDB epithet is
    stored in `self.sp_epithet_map` when it gathers at least 50% of the
    votes, and a warning is logged whenever the agreement is not 100%.
    `self.gtdb_ncbi_generic_map` additionally records the NCBI generic
    names observed for each GTDB genus / canonical epithet pair.

    Args:
        gids_of_interest: genome IDs to be considered.
        mc_species: mapping from representative ID to manually curated
            species name.
        cur_genomes: genome set indexed by genome ID.
        cur_clusters: iterable of representative genome IDs.
    """
    # **************************************
    # This should be updated so it only includes valid transfers, and not
    # results due to misclassifications at NCBI. For example, right now this
    # code reports Enterobacter cancerogenus being transferred to Pantoea, but
    # really this is just a misclassified NCBI genome.

    # group representatives by GTDB genus
    rids_by_genus = defaultdict(list)
    for rid in cur_clusters:
        if rid not in gids_of_interest:
            continue

        genus = cur_genomes[rid].gtdb_taxa.genus.replace('g__', '')
        if rid in mc_species:
            genus = generic_name(mc_species[rid])

        rids_by_genus[genus].append(rid)

    # establish epithets that are nearly identical except for a small
    # change to the suffix, which is assumed to be due to a gender change
    for gtdb_generic, rids in rids_by_genus.items():
        votes = defaultdict(list)
        for rid in rids:
            ncbi_species = cur_genomes[rid].ncbi_taxa.species
            if ncbi_species == 's__':
                continue

            ncbi_generic = generic_name(ncbi_species)
            ncbi_specific = specific_epithet(ncbi_species)

            gtdb_species = (mc_species[rid] if rid in mc_species else
                            cur_genomes[rid].gtdb_taxa.species)
            gtdb_specific = canonical_taxon(specific_epithet(gtdb_species))

            self.gtdb_ncbi_generic_map[gtdb_generic][gtdb_specific].append(
                ncbi_generic)

            if test_same_epithet(ncbi_specific, gtdb_specific):
                votes[ncbi_specific].append(gtdb_specific)

        for ncbi_specific, gtdb_candidates in votes.items():
            winner, count = Counter(gtdb_candidates).most_common(1)[0]
            map_perc = count * 100.0 / len(gtdb_candidates)

            if map_perc >= 50:
                self.sp_epithet_map[gtdb_generic][ncbi_specific] = winner

            if map_perc != 100:
                self.logger.warning(
                    'Imperfect suffix mapping between from {} {} to {} at {:.1f}%.'
                    .format(gtdb_generic, winner, ncbi_specific, map_perc))
def update(self, orig_rid, new_rid):
    """Determine if species name can be updated.

    Compares the previous GTDB species of `orig_rid` with the NCBI
    specific epithet of the new representative `new_rid` and decides what
    should happen to the GTDB species name.

    Args:
        orig_rid: representative genome ID in `self.prev_genomes`.
        new_rid: representative genome ID in `self.cur_genomes`.

    Returns:
        Non-empty list of action tuples of the form
        `(action, rid, new rid, species name, conflicting rid or None,
        ani, af)` where `action` is one of 'UNCHANGED', 'UPDATED',
        'CONFLICT' or 'MANUAL_CURATION'.
    """
    prev_genome = self.prev_genomes[orig_rid]
    new_genome = self.cur_genomes[new_rid]

    prev_genus = prev_genome.gtdb_taxa.genus
    prev_species = prev_genome.gtdb_taxa.species
    prev_epithet = prev_genome.gtdb_taxa.specific_epithet
    # read as in the original implementation; not used further below
    _prev_ncbi_epithet = prev_genome.ncbi_taxa.specific_epithet

    new_epithet = new_genome.ncbi_taxa.specific_epithet
    # prev_genus[3:] drops the three-character rank prefix of the genus
    proposed_species = f's__{prev_genus[3:]} {new_epithet}'
    new_is_type_strain = new_genome.is_effective_type_strain()

    unchanged = ('UNCHANGED', orig_rid, new_rid, prev_species, None, 0, 0)
    renamed = ('UPDATED', orig_rid, new_rid, proposed_species, None, 0, 0)

    # species epithet has not changed, so the GTDB species name stays
    if new_epithet == prev_epithet:
        return [unchanged]

    if new_is_type_strain:
        # new representative is a type strain with a new species epithet
        # so the GTDB species name needs to be updated
        genus_epithets = self.sp_epithets_rid[prev_genus]
        if new_epithet not in genus_epithets:
            # new species name doesn't exist in GTDB so the cluster can
            # simply take it
            return [renamed]

        # the new species name already exists in the GTDB
        conflicting_rid = genus_epithets[new_epithet]
        conflicting_genome = self.cur_genomes[conflicting_rid]
        ani, af = self.fastani.symmetric_ani_cached(
            new_rid, conflicting_rid, new_genome.genomic_file,
            conflicting_genome.genomic_file)

        if conflicting_genome.is_gtdb_type_strain():
            # two GTDB clusters claim to be represented by a type strain
            # genome, which is flagged as a conflict
            return [('CONFLICT', orig_rid, new_rid,
                     proposed_species + '-CONFLICT', conflicting_rid, ani,
                     af)]

        # the cluster currently holding this name is not based on the type
        # strain, so it gives the name up and receives a non-type epithet
        demoted_epithet = self._nontype_sp_epithet(new_epithet, prev_genus)
        return [
            ('UPDATED', orig_rid, new_rid, proposed_species,
             conflicting_rid, ani, af),
            ('UPDATED', conflicting_rid, conflicting_rid,
             f's__{prev_genus[3:]} {demoted_epithet}', None, 0, 0),
        ]

    # Not a type strain: only update the name of a GTDB species cluster
    # when necessary, i.e. when the cluster does not properly reflect a
    # valid or effectively published species name.
    if not new_epithet or new_epithet == canonical_taxon(prev_epithet):
        # epithet of new representative is already reflected by the
        # current GTDB species name (or there is no epithet at all)
        return [unchanged]

    if new_epithet not in self.gtdb_canonical_sp_epithets[prev_genus]:
        # first occurrence of species in genus so the cluster should
        # reflect this species name
        return [renamed]

    if is_placeholder_sp_epithet(prev_epithet):
        # cluster has a placeholder name, which does not conflict with
        # the epithet of the new representative
        return [unchanged]

    # cluster reflects a valid or effectively published species name that
    # is in conflict with the new representative; how can the correct name
    # for the species cluster be determined?
    assert prev_epithet != new_epithet
    return [('MANUAL_CURATION', orig_rid, new_rid, prev_species, None, 0,
             0)]
def run(self,
        cur_gtdb_metadata_file,
        cur_genomic_path_file,
        qc_passed_file,
        ncbi_genbank_assembly_file,
        ltp_taxonomy_file,
        gtdb_type_strains_ledger,
        untrustworthy_type_ledger):
    """Resolve cases where a species has multiple genomes assembled from the type strain.

    For every NCBI species with more than one effective type strain genome,
    pairwise ANI/AF values are calculated. If any pair has ANI < 99 or
    AF < 0.65, a cascade of resolution tests is applied, in order, until one
    succeeds: validated NCBI `untrustworthy as type` genomes, LTP conflicts,
    intra-specific ANI, GTDB family, GTDB genus, GTDB species, and finally
    NCBI type material metadata.

    Parameters
    ----------
    cur_gtdb_metadata_file, qc_passed_file, ncbi_genbank_assembly_file,
    gtdb_type_strains_ledger
        Forwarded unchanged to `Genomes.load_from_metadata_file`.
    cur_genomic_path_file
        Forwarded to `Genomes.load_genomic_file_paths`.
    ltp_taxonomy_file
        Forwarded to `self.ltp_defined_species`.
    untrustworthy_type_ledger
        Forwarded to `Genomes.load_from_metadata_file` and also parsed here
        as a tab-separated file whose header contains the columns
        'NCBI species' and 'Reason for declaring untrustworthy'; the first
        column of each row is a genome ID.

    Side effects
    ------------
    Writes five TSV files into `self.output_dir`, writes (or, when the local
    `use_pickled_results` toggle is set, reads) one pickle file per species
    in `self.ani_pickle_dir`, reports progress on stdout and logs a summary.
    """

    # function-scope import so this block does not depend on the file's
    # top-level import section
    from contextlib import ExitStack

    def ani_af_stats(gid_anis, gid_afs, gid):
        """Return (mean ANI, std ANI, mean AF, std AF) of `gid` to the other type strain genomes."""
        ani_values = list(gid_anis[gid].values())
        af_values = list(gid_afs[gid].values())
        return (np_mean(ani_values),
                np_std(ani_values),
                np_mean(af_values),
                np_std(af_values))

    # get species in LTP reference database
    self.logger.info('Determining species defined in LTP reference database.')
    ltp_defined_species = self.ltp_defined_species(ltp_taxonomy_file)
    self.logger.info(f' ... identified {len(ltp_defined_species):,} species.')

    # create current GTDB genome sets
    self.logger.info('Creating current GTDB genome set.')
    cur_genomes = Genomes()
    cur_genomes.load_from_metadata_file(cur_gtdb_metadata_file,
                                        gtdb_type_strains_ledger=gtdb_type_strains_ledger,
                                        create_sp_clusters=False,
                                        uba_genome_file=None,
                                        qc_passed_file=qc_passed_file,
                                        ncbi_genbank_assembly_file=ncbi_genbank_assembly_file,
                                        untrustworthy_type_ledger=untrustworthy_type_ledger)
    cur_genomes.load_genomic_file_paths(cur_genomic_path_file)
    self.logger.info(f' ... current genome set contains {len(cur_genomes):,} genomes.')

    # update current genomes with GTDB-Tk classifications
    # NOTE(review): `gtdbtk_classify_file` and `prev_genomes` are neither
    # parameters nor locals of this method; unless they are defined at module
    # level this call raises NameError. Left unchanged because the intended
    # source of these values cannot be determined from here.
    self.logger.info('Updating current genomes with GTDB-Tk classifications.')
    num_updated, num_ncbi_sp = cur_genomes.set_gtdbtk_classification(gtdbtk_classify_file,
                                                                     prev_genomes)
    self.logger.info(f' ... set GTDB taxa for {num_updated:,} genomes with {num_ncbi_sp:,} genomes using NCBI genus and species name.')

    # parsing genomes manually established to be untrustworthy as type
    self.logger.info('Determining genomes manually annotated as untrustworthy as type.')
    manual_untrustworthy_types = {}
    with open(untrustworthy_type_ledger) as f:
        header = f.readline().strip().split('\t')

        ncbi_sp_index = header.index('NCBI species')
        reason_index = header.index('Reason for declaring untrustworthy')

        for line in f:
            tokens = line.strip().split('\t')

            gid = canonical_gid(tokens[0])
            manual_untrustworthy_types[gid] = (tokens[ncbi_sp_index],
                                               tokens[reason_index])
    self.logger.info(f' ... identified {len(manual_untrustworthy_types):,} genomes manually annotated as untrustworthy as type.')

    # identify NCBI species with multiple genomes assembled from type strain of species
    self.logger.info('Determining number of type strain genomes in each NCBI species.')
    sp_type_strain_genomes = defaultdict(set)
    for gid in cur_genomes:
        if cur_genomes[gid].is_effective_type_strain():
            ncbi_sp = cur_genomes[gid].ncbi_taxa.species
            if ncbi_sp != 's__':
                # yes, NCBI has genomes marked as assembled from type material
                # that do not actually have a binomial species name
                sp_type_strain_genomes[ncbi_sp].add(gid)

    multi_type_strains_sp = [ncbi_sp
                             for ncbi_sp, gids in sp_type_strain_genomes.items()
                             if len(gids) > 1]
    self.logger.info(f' ... identified {len(multi_type_strains_sp):,} NCBI species with multiple assemblies indicated as being type strain genomes.')

    # sort by number of genome assemblies
    self.logger.info('Calculating ANI between type strain genomes in each species.')

    processed = 0
    num_divergent = 0
    unresolved_sp_count = 0

    ncbi_ltp_resolved = 0
    intra_ani_resolved = 0
    ncbi_type_resolved = 0
    gtdb_family_resolved = 0
    gtdb_genus_resolved = 0
    gtdb_sp_resolved = 0
    ltp_resolved = 0

    prev_gtdb_sp_conflicts = 0

    # the ExitStack guarantees all output tables are closed even if one of
    # the resolution steps raises part-way through
    with ExitStack() as stack:
        def open_output(filename):
            """Open a table in the output directory and register it for closing."""
            return stack.enter_context(
                open(os.path.join(self.output_dir, filename), 'w'))

        fout = open_output('multi_type_strain_species.tsv')
        fout.write('NCBI species\tNo. type strain genomes\t>=99% ANI\tMean ANI\tStd ANI\tMean AF\tStd AF\tResolution\tGenome IDs\n')

        fout_genomes = open_output('type_strain_genomes.tsv')
        fout_genomes.write('Genome ID\tUntrustworthy\tNCBI species\tGTDB genus\tGTDB species\tLTP species\tConflict with prior GTDB assignment')
        fout_genomes.write('\tMean ANI\tStd ANI\tMean AF\tStd AF\tExclude from RefSeq\tNCBI taxonomy\tGTDB taxonomy\n')

        fout_unresolved = open_output('unresolved_type_strain_genomes.tsv')
        fout_unresolved.write('Genome ID\tNCBI species\tGTDB genus\tGTDB species\tLTP species')
        fout_unresolved.write('\tMean ANI\tStd ANI\tMean AF\tStd AF\tExclude from RefSeq\tNCBI taxonomy\tGTDB taxonomy\n')

        fout_high_divergence = open_output('highly_divergent_type_strain_genomes.tsv')
        fout_high_divergence.write('Genome ID\tNCBI species\tGTDB genus\tGTDB species\tLTP species\tMean ANI\tStd ANI\tMean AF\tStd AF\tExclude from RefSeq\tNCBI taxonomy\tGTDB taxonomy\n')

        fout_untrustworthy = open_output('untrustworthy_type_material.tsv')
        fout_untrustworthy.write('Genome ID\tNCBI species\tGTDB species\tLTP species\tReason for declaring untrustworthy\n')

        # Manually curated genomes: the table has five columns, so exactly
        # five values are supplied. (A sixth, surplus argument would be
        # silently ignored by str.format and the curation reason lost.)
        for gid, (ncbi_sp, reason) in manual_untrustworthy_types.items():
            fout_untrustworthy.write('{}\t{}\t{}\t{}\t{}\n'.format(
                gid,
                ncbi_sp,
                cur_genomes[gid].gtdb_taxa.species,
                '<not tested>',
                'Manual curation: ' + reason))

        # developer toggle: reuse ANI results pickled by an earlier run
        use_pickled_results = False  # ***
        if use_pickled_results:
            self.logger.warning('Using previously calculated ANI results in: {}'.format(self.ani_pickle_dir))

        # process species with the fewest type strain genomes first
        for ncbi_sp, type_gids in sorted(sp_type_strain_genomes.items(),
                                         key=lambda kv: len(kv[1])):
            if len(type_gids) == 1:
                continue

            status_str = '-> Processing {} with {:,} type strain genomes [{:,} of {:,} ({:.2f}%)].'.format(
                ncbi_sp,
                len(type_gids),
                processed+1,
                len(multi_type_strains_sp),
                (processed+1)*100.0/len(multi_type_strains_sp)).ljust(128)
            sys.stdout.write('{}\r'.format(status_str))
            sys.stdout.flush()
            processed += 1

            # calculate ANI between type strain genomes
            ncbi_sp_str = ncbi_sp[3:].lower().replace(' ', '_')
            ani_pickle_file = os.path.join(self.ani_pickle_dir,
                                           f'{ncbi_sp_str}.pkl')
            if not use_pickled_results:  # ***
                ani_af = self.fastani.pairwise(type_gids,
                                               cur_genomes.genomic_files)
                with open(ani_pickle_file, 'wb') as pickle_out:
                    pickle.dump(ani_af, pickle_out)
            else:
                # NOTE: unpickling can execute arbitrary code; the pickle
                # directory must only contain files written by this program
                with open(ani_pickle_file, 'rb') as pickle_in:
                    ani_af = pickle.load(pickle_in)

            anis = []
            afs = []
            gid_anis = defaultdict(lambda: {})
            gid_afs = defaultdict(lambda: {})
            all_similar = True
            for gid1, gid2 in combinations(type_gids, 2):
                ani, af = symmetric_ani(ani_af, gid1, gid2)
                if ani < 99 or af < 0.65:
                    all_similar = False

                anis.append(ani)
                afs.append(af)

                gid_anis[gid1][gid2] = ani
                gid_anis[gid2][gid1] = ani

                gid_afs[gid1][gid2] = af
                gid_afs[gid2][gid1] = af

            note = 'All type strain genomes have ANI >99% and AF >65%.'
            unresolved_species = False

            # read LTP metadata for genomes
            ltp_metadata = self.parse_ltp_metadata(type_gids, cur_genomes)

            untrustworthy_gids = {}
            gtdb_resolved_sp_conflict = False
            if not all_similar:
                # need to establish which genomes are untrustworthy as type
                num_divergent += 1
                unresolved_species = True

                # write out highly divergent cases for manual inspection;
                # these should be compared to the automated selection
                if np_mean(anis) < 95:
                    for gid in type_gids:
                        ltp_species = self.ltp_species(gid, ltp_metadata)
                        fout_high_divergence.write('{}\t{}\t{}\t{}\t{}\t{:.2f}\t{:.3f}\t{:.3f}\t{:.4f}\t{}\t{}\t{}\n'.format(
                            gid,
                            ncbi_sp,
                            cur_genomes[gid].gtdb_taxa.genus,
                            cur_genomes[gid].gtdb_taxa.species,
                            ' / '.join(ltp_species),
                            *ani_af_stats(gid_anis, gid_afs, gid),
                            cur_genomes[gid].excluded_from_refseq_note,
                            cur_genomes[gid].ncbi_taxa,
                            cur_genomes[gid].gtdb_taxa))

                # filter genomes marked as `untrustworthy as type` at NCBI and where the LTP
                # assignment also suggest the asserted type material is incorrect
                resolved, untrustworthy_gids = self.resolve_validated_untrustworthy_ncbi_genomes(
                    gid_anis,
                    ncbi_sp,
                    type_gids,
                    ltp_metadata,
                    ltp_defined_species,
                    cur_genomes)
                if resolved:
                    note = "Species resolved by removing genomes considered `untrustworthy as type` and with a LTP BLAST hit confirming the assembly is likely untrustworthy"
                    ncbi_ltp_resolved += 1

                # try to resolve by LTP 16S BLAST results
                if not resolved:
                    resolved, untrustworthy_gids = self.resolve_ltp_conflict(
                        gid_anis, ncbi_sp, type_gids, ltp_metadata, 0)
                    if resolved:
                        note = 'Species resolved by identifying conflicting or lack of LTP BLAST results'
                        ltp_resolved += 1

                # try to resolve species using intra-specific ANI test
                if not resolved:
                    resolved, untrustworthy_gids = self.resolve_by_intra_specific_ani(gid_anis)
                    if resolved:
                        note = 'Species resolved by intra-specific ANI test'
                        intra_ani_resolved += 1

                # try to resolve by GTDB family assignment
                if not resolved:
                    resolved, untrustworthy_gids = self.resolve_gtdb_family(
                        gid_anis, ncbi_sp, type_gids, cur_genomes)
                    if resolved:
                        note = 'Species resolved by consulting GTDB family classifications'
                        gtdb_family_resolved += 1

                # try to resolve by GTDB genus assignment
                if not resolved:
                    resolved, untrustworthy_gids = self.resolve_gtdb_genus(
                        gid_anis, ncbi_sp, type_gids, cur_genomes)
                    if resolved:
                        note = 'Species resolved by consulting GTDB genus classifications'
                        gtdb_genus_resolved += 1

                # try to resolve by GTDB species assignment
                if not resolved:
                    resolved, untrustworthy_gids = self.resolve_gtdb_species(
                        gid_anis, ncbi_sp, type_gids, cur_genomes)
                    if resolved:
                        note = 'Species resolved by consulting GTDB species classifications'
                        gtdb_sp_resolved += 1

                # try to resolve by considering genomes annotated as type material at NCBI,
                # which includes considering if genomes are marked as untrustworthy as type
                if not resolved:
                    resolved, untrustworthy_gids = self.resolve_by_ncbi_types(
                        gid_anis, type_gids, cur_genomes)
                    if resolved:
                        note = 'Species resolved by consulting NCBI assembled from type metadata'
                        ncbi_type_resolved += 1

                if resolved:
                    unresolved_species = False

                    # check if type strain genomes marked as trusted or untrusted conflict
                    # with current GTDB species assignment
                    untrustworthy_gtdb_sp_match = False
                    trusted_gtdb_sp_match = False
                    for gid in type_gids:
                        gtdb_canonical_epithet = canonical_taxon(
                            specific_epithet(cur_genomes[gid].gtdb_taxa.species))
                        if gtdb_canonical_epithet == specific_epithet(ncbi_sp):
                            if gid in untrustworthy_gids:
                                untrustworthy_gtdb_sp_match = True
                            else:
                                trusted_gtdb_sp_match = True

                    if untrustworthy_gtdb_sp_match and not trusted_gtdb_sp_match:
                        prev_gtdb_sp_conflicts += 1
                        gtdb_resolved_sp_conflict = True

                    # write results to file
                    for gid, reason in untrustworthy_gids.items():
                        ltp_species = self.ltp_species(gid, ltp_metadata)

                        if 'untrustworthy as type' in cur_genomes[gid].excluded_from_refseq_note:
                            reason += "; considered `untrustworthy as type` at NCBI"
                        fout_untrustworthy.write('{}\t{}\t{}\t{}\t{}\n'.format(
                            gid,
                            ncbi_sp,
                            cur_genomes[gid].gtdb_taxa.species,
                            ' / '.join(ltp_species),
                            reason))

                        # Sanity check that if the untrustworthy genome has an LTP to only the
                        # expected species, that all other genomes also have a hit to the
                        # expected species (or potentially no hit). Otherwise, more consideration
                        # should be given to the genome with the conflicting LTP hit.
                        if len(ltp_species) == 1 and ncbi_sp in ltp_species:
                            other_sp = set()
                            for test_gid in type_gids:
                                test_ltp_species = self.ltp_species(test_gid,
                                                                    ltp_metadata)
                                if test_ltp_species and ncbi_sp not in test_ltp_species:
                                    other_sp.update(test_ltp_species)

                            if other_sp:
                                self.logger.warning(f'Genome {gid} marked as untrustworthy, but this conflicts with high confidence LTP 16S rRNA assignment.')

                    # report genomes kept as trusted even though NCBI flags them
                    num_ncbi_untrustworthy = sum(
                        1 for gid in type_gids
                        if 'untrustworthy as type' in cur_genomes[gid].excluded_from_refseq_note)
                    if num_ncbi_untrustworthy != len(type_gids):
                        for gid in type_gids:
                            if (gid not in untrustworthy_gids
                                    and 'untrustworthy as type' in cur_genomes[gid].excluded_from_refseq_note):
                                self.logger.warning("Retaining genome {} from {} despite being marked as `untrustworthy as type` at NCBI [{:,} of {:,} considered untrustworthy].".format(
                                    gid,
                                    ncbi_sp,
                                    num_ncbi_untrustworthy,
                                    len(type_gids)))
                else:
                    note = 'Species is unresolved; manual curation is required!'
                    unresolved_sp_count += 1

                if unresolved_species:
                    for gid in type_gids:
                        ltp_species = self.ltp_species(gid, ltp_metadata)
                        fout_unresolved.write('{}\t{}\t{}\t{}\t{}\t{:.2f}\t{:.3f}\t{:.3f}\t{:.4f}\t{}\t{}\t{}\n'.format(
                            gid,
                            ncbi_sp,
                            cur_genomes[gid].gtdb_taxa.genus,
                            cur_genomes[gid].gtdb_taxa.species,
                            ' / '.join(ltp_species),
                            *ani_af_stats(gid_anis, gid_afs, gid),
                            cur_genomes[gid].excluded_from_refseq_note,
                            cur_genomes[gid].ncbi_taxa,
                            cur_genomes[gid].gtdb_taxa))

            # per-genome summary, written for every multi-type-strain species
            for gid in type_gids:
                ltp_species = self.ltp_species(gid, ltp_metadata)
                fout_genomes.write('{}\t{}\t{}\t{}\t{}\t{}\t{}\t{:.2f}\t{:.3f}\t{:.3f}\t{:.4f}\t{}\t{}\t{}\n'.format(
                    gid,
                    gid in untrustworthy_gids,
                    ncbi_sp,
                    cur_genomes[gid].gtdb_taxa.genus,
                    cur_genomes[gid].gtdb_taxa.species,
                    ' / '.join(ltp_species),
                    gtdb_resolved_sp_conflict,
                    *ani_af_stats(gid_anis, gid_afs, gid),
                    cur_genomes[gid].excluded_from_refseq_note,
                    cur_genomes[gid].ncbi_taxa,
                    cur_genomes[gid].gtdb_taxa))

            # per-species summary
            fout.write('{}\t{}\t{}\t{:.2f}\t{:.3f}\t{:.3f}\t{:.4f}\t{}\t{}\n'.format(
                ncbi_sp,
                len(type_gids),
                all_similar,
                np_mean(anis),
                np_std(anis),
                np_mean(afs),
                np_std(afs),
                note,
                ', '.join(type_gids)))

        # terminate the carriage-return progress line
        sys.stdout.write('\n')

    self.logger.info(f'Identified {num_divergent:,} species with 1 or more divergent type strain genomes.')
    self.logger.info(f' ... resolved {ncbi_ltp_resolved:,} species by removing NCBI `untrustworthy as type` genomes with a conflicting LTP 16S rRNA classifications.')
    self.logger.info(f' ... resolved {ltp_resolved:,} species by considering conflicting LTP 16S rRNA classifications.')
    self.logger.info(f' ... resolved {intra_ani_resolved:,} species by considering intra-specific ANI values.')
    self.logger.info(f' ... resolved {gtdb_family_resolved:,} species by considering conflicting GTDB family classifications.')
    self.logger.info(f' ... resolved {gtdb_genus_resolved:,} species by considering conflicting GTDB genus classifications.')
    self.logger.info(f' ... resolved {gtdb_sp_resolved:,} species by considering conflicting GTDB species classifications.')
    self.logger.info(f' ... resolved {ncbi_type_resolved:,} species by considering type material designations at NCBI.')

    if unresolved_sp_count > 0:
        self.logger.warning(f'There are {unresolved_sp_count:,} unresolved species with multiple type strain genomes.')
        self.logger.warning('These should be handled before proceeding with the next step of GTDB species updating.')
        self.logger.warning("This can be done by manual curation and adding genomes to 'untrustworthy_type_ledger'.")

    self.logger.info(f'Identified {prev_gtdb_sp_conflicts:,} cases where resolved type strain conflicts with prior GTDB assignment.')