class IntraGenusANI(object): """Calculate intra-genus ANI/AF values between GTDB representative genomes.""" def __init__(self, ani_cache_file, cpus, output_dir): """Initialization.""" check_dependencies(['fastANI']) self.cpus = cpus self.output_dir = output_dir self.logger = logging.getLogger('timestamp') self.fastani = FastANI(ani_cache_file, cpus) def run(self, target_genus, gtdb_metadata_file, genomic_path_file): """Dereplicate GTDB species clusters using ANI/AF criteria.""" # create GTDB genome sets self.logger.info('Creating GTDB genome set.') genomes = Genomes() genomes.load_from_metadata_file(gtdb_metadata_file) genomes.load_genomic_file_paths(genomic_path_file) self.logger.info( ' - genome set has {:,} species clusters spanning {:,} genomes.'. format(len(genomes.sp_clusters), genomes.sp_clusters.total_num_genomes())) # identify GTDB representatives from target genus self.logger.info('Identifying GTDB representatives from target genus.') target_gids = set() for gid in genomes: if genomes[gid].is_gtdb_sp_rep( ) and genomes[gid].gtdb_taxa.genus == target_genus: target_gids.add(gid) self.logger.info(' - identified {:,} genomes.'.format( len(target_gids))) # calculate FastANI ANI/AF between target genomes self.logger.info('Calculating pairwise ANI between target genomes.') ani_af = self.fastani.pairwise(target_gids, genomes.genomic_files, check_cache=True) self.fastani.write_cache(silence=True) # write out results genus_label = target_genus.replace('g__', '').lower() fout = open( os.path.join(self.output_dir, '{}_rep_ani.tsv'.format(genus_label)), 'w') fout.write( 'Query ID\tQuery species\tTarget ID\tTarget species\tANI\tAF\n') for qid in target_gids: for rid in target_gids: ani, af = FastANI.symmetric_ani(ani_af, qid, rid) fout.write('{}\t{}\t{}\t{}\t{:.3f}\t{:.3f}\n'.format( qid, genomes[qid].gtdb_taxa.species, rid, genomes[rid].gtdb_taxa.species, ani, af)) fout.close()
class ResolveTypes(): """Resolve cases where a species has multiple genomes assembled from the type strain.""" def __init__(self, ani_cache_file, cpus, output_dir): """Initialization.""" self.ltp_dir = 'rna_ltp_132' self.ltp_results_file = 'ssu.taxonomy.tsv' self.LTP_METADATA = namedtuple( 'LTP_METADATA', 'taxonomy taxa species ssu_len evalue bitscore aln_len perc_iden perc_aln' ) self.ltp_pi_threshold = 99.0 self.ltp_pa_threshold = 90.0 self.ltp_ssu_len_threshold = 900 self.ltp_evalue_threshold = 1e-10 self.output_dir = output_dir self.logger = logging.getLogger('timestamp') self.cpus = cpus self.fastani = FastANI(ani_cache_file, cpus) self.ani_pickle_dir = os.path.join(self.output_dir, 'ani_pickles') if not os.path.exists(self.ani_pickle_dir): os.makedirs(self.ani_pickle_dir) def _parse_ltp_taxonomy_str(self, ltp_taxonomy_str): """Parse taxa and species from LTP taxonomy string.""" if ';type sp.|' in ltp_taxonomy_str: taxa = ltp_taxonomy_str.split(';type sp.|')[0].split(';') elif ';|' in ltp_taxonomy_str: taxa = ltp_taxonomy_str.split(';|')[0].split(';') elif '|' in ltp_taxonomy_str: taxa = ltp_taxonomy_str.split('|')[0].split(';') elif ltp_taxonomy_str[-1] == ';': taxa = ltp_taxonomy_str[0:-1].split(';') else: taxa = ltp_taxonomy_str.split(';') sp = taxa[-1] if ' subsp. ' in sp: sp = ' '.join(sp.split()[0:2]) # validate that terminal taxon appears to be a # valid binomial species name if (sp[0].islower() or any(c.isdigit() for c in sp) or any(c.isupper() for c in sp[1:])): print(ltp_taxonomy_str, taxa) assert False return taxa, 's__' + sp def parse_ltp_metadata(self, type_gids, cur_genomes): """Parse Living Tree Project 16S rRNA metadata.""" metadata = defaultdict(list) for gid in type_gids: genome_path = os.path.dirname( os.path.abspath(cur_genomes[gid].genomic_file)) ltp_file = os.path.join(genome_path, self.ltp_dir, self.ltp_results_file) if os.path.exists(ltp_file): with open(ltp_file) as f: header = f.readline().strip().split('\t') taxonomy_index = header.index('taxonomy') ssu_len_index = header.index('length') evalue_index = header.index('blast_evalue') bitscore_index = header.index('blast_bitscore') aln_len_index = header.index('blast_align_len') pi_index = header.index('blast_perc_identity') for line in f: tokens = line.strip().split('\t') taxonomy = tokens[taxonomy_index] ssu_len = int(tokens[ssu_len_index]) evalue = float(tokens[evalue_index]) bitscore = float(tokens[bitscore_index]) aln_len = int(tokens[aln_len_index]) pi = float(tokens[pi_index]) taxa, sp = self._parse_ltp_taxonomy_str(taxonomy) metadata[gid].append( self.LTP_METADATA(taxonomy=taxonomy, taxa=taxa, species=sp, ssu_len=ssu_len, evalue=evalue, bitscore=bitscore, aln_len=aln_len, perc_iden=pi, perc_aln=aln_len * 100.0 / ssu_len)) return metadata def ltp_defined_species(self, ltp_taxonomy_file): """Get all species present in the LTP database.""" ltp_species = set() with open(ltp_taxonomy_file, encoding='utf-8') as f: for line in f: tokens = line.strip().split('\t') taxonomy = tokens[1] _taxa, sp = self._parse_ltp_taxonomy_str(taxonomy) ltp_species.add(sp) return ltp_species def ltp_species(self, gid, ltp_metadata): """Get high confident species assignments.""" sp = set() for hit in ltp_metadata[gid]: # check if hit should be trusted if (hit.perc_iden >= self.ltp_pi_threshold and hit.perc_aln >= self.ltp_pa_threshold and hit.ssu_len >= self.ltp_ssu_len_threshold and hit.evalue < self.ltp_evalue_threshold): sp.add(hit.species) return sp def check_strain_ani(self, gid_anis, untrustworthy_gids): """Check if genomes meet strain ANI criteria.""" for gid1, gid2 in combinations(gid_anis, 2): if gid1 in untrustworthy_gids or gid2 in untrustworthy_gids: continue if gid_anis[gid1][gid2] < 99: return False return True def resolve_by_intra_specific_ani(self, gid_anis): """Resolve by removing intra-specific genomes with divergent ANI values.""" if len(gid_anis) <= 2: return False, {} # consider most divergent genome as untrustworthy untrustworthy_gids = {} while True: # find most divergent genome min_ani = 100 untrustworthy_gid = None for gid in gid_anis: if gid in untrustworthy_gids: continue anis = [ ani for cur_gid, ani in gid_anis[gid].items() if cur_gid not in untrustworthy_gids ] if np_mean(anis) < min_ani: min_ani = np_mean(anis) untrustworthy_gid = gid untrustworthy_gids[ untrustworthy_gid] = f'{min_ani:.2f}% ANI to other type strain genomes' all_similar = self.check_strain_ani(gid_anis, untrustworthy_gids) if all_similar: return True, untrustworthy_gids remaining_genomes = len(gid_anis) - len(untrustworthy_gids) if remaining_genomes <= 2 or len(untrustworthy_gids) >= len( gid_anis): return False, {} def resolve_by_ncbi_types(self, gid_anis, type_gids, cur_genomes): """Resolve by consulting NCBI type material metadata.""" untrustworthy_gids = {} ncbi_type_count = 0 for gid in type_gids: if not cur_genomes[gid].is_ncbi_type_strain(): untrustworthy_gids[ gid] = 'Not classified as assembled from type material at NCBI' else: ncbi_type_count += 1 all_similar = self.check_strain_ani(gid_anis, untrustworthy_gids) if all_similar and len(untrustworthy_gids) > 0 and ncbi_type_count > 0: return True, untrustworthy_gids return False, {} def resolve_by_ncbi_reps(self, gid_anis, type_gids, cur_genomes): """Resovle by considering genomes annotated as representative genomes at NCBI.""" untrustworthy_gids = {} ncbi_rep_count = 0 for gid in type_gids: if not cur_genomes[gid].is_ncbi_representative(): untrustworthy_gids[ gid] = 'Excluded in favour of RefSeq representative or reference genome' else: ncbi_rep_count += 1 all_similar = self.check_strain_ani(gid_anis, untrustworthy_gids) if all_similar and ncbi_rep_count >= 1: return True, untrustworthy_gids return False, {} def resolve_gtdb_family(self, gid_anis, ncbi_sp, type_gids, cur_genomes): """Resolve by identifying genomes with a conflicting GTDB family assignment.""" genus = 'g__' + generic_name(ncbi_sp) gtdb_genus_rep = cur_genomes.gtdb_type_species_of_genus(genus) if not gtdb_genus_rep: return False, {} expected_gtdb_family = cur_genomes[gtdb_genus_rep].gtdb_taxa.family untrustworthy_gids = {} matched_family = 0 for gid in type_gids: if cur_genomes[gid].gtdb_taxa.family == expected_gtdb_family: matched_family += 1 else: # genome is classified to a different GTDB family than # expected for this species untrustworthy_gids[ gid] = f'Conflicting GTDB family assignment of {cur_genomes[gid].gtdb_taxa.family}, expected {expected_gtdb_family}' all_similar = self.check_strain_ani(gid_anis, untrustworthy_gids) # conflict is resolved if remaining genomes pass ANI similarity test, if all_similar and len(untrustworthy_gids) > 0 and matched_family > 0: return True, untrustworthy_gids return False, {} def resolve_gtdb_genus(self, gid_anis, ncbi_sp, type_gids, cur_genomes): """Resolve by identifying genomes with a conflicting GTDB genus assignments.""" ncbi_genus = 'g__' + generic_name(ncbi_sp) untrustworthy_gids = {} matched_genus = 0 for gid in type_gids: canonical_gtdb_genus = canonical_taxon( cur_genomes[gid].gtdb_taxa.genus) if ncbi_genus == canonical_gtdb_genus: matched_genus += 1 else: untrustworthy_gids[ gid] = f'Conflicting GTDB genus assignment of {cur_genomes[gid].gtdb_taxa.genus}, expected {ncbi_genus}' all_similar = self.check_strain_ani(gid_anis, untrustworthy_gids) if all_similar and len(untrustworthy_gids) > 0 and matched_genus > 0: return True, untrustworthy_gids return False, {} def resolve_gtdb_species(self, gid_anis, ncbi_sp, type_gids, cur_genomes): """Resolve by identifying genomes with a conflicting GTDB species assignments to different type material.""" ncbi_sp_epithet = specific_epithet(ncbi_sp) untrustworthy_gids = {} matched_sp_epithet = 0 for gid in type_gids: if ncbi_sp_epithet == cur_genomes[gid].gtdb_taxa.specific_epithet: matched_sp_epithet += 1 else: # check if genome is classified to a GTDB species cluster supported # by a type strain genome in which case we should consider this # genome untrustworthy gtdb_sp = cur_genomes[gid].gtdb_taxa.species if gtdb_sp != 's__': gtdb_sp_rid = cur_genomes.gtdb_sp_rep(gtdb_sp) if cur_genomes[gtdb_sp_rid].is_effective_type_strain(): # genome has been assigned to another species # defined by a type strain genome ani, af = self.fastani.symmetric_ani_cached( gid, gtdb_sp_rid, cur_genomes[gid].genomic_file, cur_genomes[gtdb_sp_rid].genomic_file) untrustworthy_gids[ gid] = f'Conflicting GTDB species assignment of {cur_genomes[gid].gtdb_taxa.species} [ANI={ani:.2f}%; AF={af:.2f}%]' all_similar = self.check_strain_ani(gid_anis, untrustworthy_gids) # conflict is resolved if remaining genomes pass ANI similarity test, if all_similar and len( untrustworthy_gids) > 0 and matched_sp_epithet > 0: return True, untrustworthy_gids return False, {} def resolve_validated_untrustworthy_ncbi_genomes(self, gid_anis, ncbi_sp, type_gids, ltp_metadata, ltp_defined_species, cur_genomes): """Resolve by identifying genomes marked as `untrustworthy as type` at NCBI and with conflicting LTP assignments.""" if ncbi_sp not in ltp_defined_species: return False, {} untrustworthy_gids = {} for gid in type_gids: if 'untrustworthy as type' in cur_genomes[ gid].excluded_from_refseq_note.lower(): ltp_species = self.ltp_species(gid, ltp_metadata) if ncbi_sp not in ltp_species and len(ltp_species) > 0: untrustworthy_gids[ gid] = f"Conflicting 16S rRNA hits to LTP database of {' / '.join(set(ltp_species))}" all_similar = self.check_strain_ani(gid_anis, untrustworthy_gids) # conflict is resolved if remaining genomes pass ANI similarity test, if all_similar and len(untrustworthy_gids) > 0: return True, untrustworthy_gids return False, {} def resolve_ltp_conflict(self, gid_anis, ncbi_sp, type_gids, ltp_metadata, require_conflict_sp): """Resolve by considering BLAST hits of 16S rRNA genes to LTP database.""" untrustworthy_gids = {} genomes_matching_expected_sp = 0 for gid in type_gids: expected_sp_count = 0 match_unexpected_sp = [] for hit in ltp_metadata[gid]: # check if hit should be trusted if (hit.perc_iden >= self.ltp_pi_threshold and hit.perc_aln >= self.ltp_pa_threshold and hit.ssu_len >= self.ltp_ssu_len_threshold and hit.evalue < self.ltp_evalue_threshold): ltp_sp = hit.species if ltp_sp == ncbi_sp: expected_sp_count += 1 else: match_unexpected_sp.append(ltp_sp) if expected_sp_count == 0 and len( match_unexpected_sp) >= require_conflict_sp: if len(match_unexpected_sp) > 0: untrustworthy_gids[ gid] = f"Conflicting 16S rRNA hits to LTP database of {' / '.join(set(match_unexpected_sp))}" else: untrustworthy_gids[ gid] = "Lack of 16S rRNA hits to LTP database" elif expected_sp_count > len(match_unexpected_sp): genomes_matching_expected_sp += 1 all_similar = self.check_strain_ani(gid_anis, untrustworthy_gids) if all_similar and len( untrustworthy_gids) > 0 and genomes_matching_expected_sp > 0: return True, untrustworthy_gids return False, {} def parse_untrustworthy_type_ledger(self, untrustworthy_type_ledger): """Parse file indicating genomes considered to be untrustworthy as type material.""" manual_untrustworthy_types = {} with open(untrustworthy_type_ledger) as f: header = f.readline().strip().split('\t') ncbi_sp_index = header.index('NCBI species') reason_index = header.index('Reason for declaring untrustworthy') for line in f: tokens = line.strip().split('\t') gid = canonical_gid(tokens[0]) manual_untrustworthy_types[gid] = (tokens[ncbi_sp_index], tokens[reason_index]) return manual_untrustworthy_types def sp_with_mult_type_strains(self, cur_genomes): """Identify NCBI species with multiple type strain of species genomes.""" sp_type_strain_genomes = defaultdict(set) for gid in cur_genomes: if cur_genomes[gid].is_effective_type_strain(): ncbi_sp = cur_genomes[gid].ncbi_taxa.species if ncbi_sp != 's__': # yes, NCBI has genomes marked as assembled from type material # that do not actually have a binomial species name sp_type_strain_genomes[ncbi_sp].add(gid) multi_type_strains_sp = { ncbi_sp: gids for ncbi_sp, gids in sp_type_strain_genomes.items() if len(gids) > 1 } return multi_type_strains_sp def calculate_type_strain_ani(self, ncbi_sp, type_gids, cur_genomes, use_pickled_results): """Calculate pairwise ANI between type strain genomes.""" ncbi_sp_str = ncbi_sp[3:].lower().replace(' ', '_') if not use_pickled_results: # *** ani_af = self.fastani.pairwise(type_gids, cur_genomes.genomic_files) pickle.dump( ani_af, open(os.path.join(self.ani_pickle_dir, f'{ncbi_sp_str}.pkl'), 'wb')) else: ani_af = pickle.load( open(os.path.join(self.ani_pickle_dir, f'{ncbi_sp_str}.pkl'), 'rb')) anis = [] afs = [] gid_anis = defaultdict(lambda: {}) gid_afs = defaultdict(lambda: {}) all_similar = True for gid1, gid2 in combinations(type_gids, 2): ani, af = FastANI.symmetric_ani(ani_af, gid1, gid2) if ani < 99 or af < 0.65: all_similar = False anis.append(ani) afs.append(af) gid_anis[gid1][gid2] = ani gid_anis[gid2][gid1] = ani gid_afs[gid1][gid2] = af gid_afs[gid2][gid1] = af return all_similar, anis, afs, gid_anis, gid_afs def run(self, cur_gtdb_metadata_file, cur_genomic_path_file, qc_passed_file, ncbi_genbank_assembly_file, ltp_taxonomy_file, gtdb_type_strains_ledger, untrustworthy_type_ledger, ncbi_env_bioproject_ledger): """Resolve cases where a species has multiple genomes assembled from the type strain.""" # get species in LTP reference database self.logger.info( 'Determining species defined in LTP reference database.') ltp_defined_species = self.ltp_defined_species(ltp_taxonomy_file) self.logger.info( f' - identified {len(ltp_defined_species):,} species.') # create current GTDB genome sets self.logger.info('Creating current GTDB genome set.') cur_genomes = Genomes() cur_genomes.load_from_metadata_file( cur_gtdb_metadata_file, gtdb_type_strains_ledger=gtdb_type_strains_ledger, create_sp_clusters=False, qc_passed_file=qc_passed_file, ncbi_genbank_assembly_file=ncbi_genbank_assembly_file, untrustworthy_type_ledger=untrustworthy_type_ledger, ncbi_env_bioproject_ledger=ncbi_env_bioproject_ledger) cur_genomes.load_genomic_file_paths(cur_genomic_path_file) # parsing genomes manually established to be untrustworthy as type self.logger.info( 'Determining genomes manually annotated as untrustworthy as type.') manual_untrustworthy_types = self.parse_untrustworthy_type_ledger( untrustworthy_type_ledger) self.logger.info( f' - identified {len(manual_untrustworthy_types):,} genomes manually annotated as untrustworthy as type.' ) # Identify NCBI species with multiple genomes assembled from type strain of species. This # is done using a series of heuristics that aim to ensure that the selected type strain # genome is reliable. More formal evaluation and a manuscript descirbing this selection # process is ultimately required. Ideally, the community will eventually adopt a # database that indicates a single `type genome assembly` for each species instead # of just indicating a type strain from which many (sometimes dissimilar) assemblies exist. self.logger.info( 'Determining number of type strain genomes in each NCBI species.') multi_type_strains_sp = self.sp_with_mult_type_strains(cur_genomes) self.logger.info( f' - identified {len(multi_type_strains_sp):,} NCBI species with multiple assemblies indicated as being type strain genomes.' ) # resolve species with multiple type strain genomes fout = open( os.path.join(self.output_dir, 'multi_type_strain_species.tsv'), 'w') fout.write( 'NCBI species\tNo. type strain genomes\t>=99% ANI\tMean ANI\tStd ANI\tMean AF\tStd AF\tResolution\tGenome IDs\n' ) fout_genomes = open( os.path.join(self.output_dir, 'type_strain_genomes.tsv'), 'w') fout_genomes.write( 'Genome ID\tUntrustworthy\tNCBI species\tGTDB genus\tGTDB species\tLTP species\tConflict with prior GTDB assignment' ) fout_genomes.write( '\tMean ANI\tStd ANI\tMean AF\tStd AF\tExclude from RefSeq\tNCBI taxonomy\tGTDB taxonomy\tReason for GTDB untrustworthy as type\n' ) fout_unresolved = open( os.path.join(self.output_dir, 'unresolved_type_strain_genomes.tsv'), 'w') fout_unresolved.write( 'Genome ID\tNCBI species\tGTDB genus\tGTDB species\tLTP species') fout_unresolved.write( '\tMean ANI\tStd ANI\tMean AF\tStd AF\tExclude from RefSeq\tNCBI taxonomy\tGTDB taxonomy\n' ) fout_high_divergence = open( os.path.join(self.output_dir, 'highly_divergent_type_strain_genomes.tsv'), 'w') fout_high_divergence.write( 'Genome ID\tNCBI species\tGTDB genus\tGTDB species\tLTP species\tMean ANI\tStd ANI\tMean AF\tStd AF\tExclude from RefSeq\tNCBI taxonomy\tGTDB taxonomy\n' ) fout_untrustworthy = open( os.path.join(self.output_dir, 'untrustworthy_type_material.tsv'), 'w') fout_untrustworthy.write( 'Genome ID\tNCBI species\tGTDB species\tLTP species\tReason for declaring untrustworthy\n' ) for gid in manual_untrustworthy_types: ncbi_sp, reason = manual_untrustworthy_types[gid] fout_untrustworthy.write('{}\t{}\t{}\t{}\t{}\t{}\n'.format( gid, ncbi_sp, cur_genomes[gid].gtdb_taxa.species, '<not tested>', 'n/a', 'Manual curation: ' + reason)) processed = 0 num_divergent = 0 unresolved_sp_count = 0 ncbi_ltp_resolved = 0 intra_ani_resolved = 0 ncbi_type_resolved = 0 ncbi_rep_resolved = 0 gtdb_family_resolved = 0 gtdb_genus_resolved = 0 gtdb_sp_resolved = 0 ltp_resolved = 0 # *** Perhaps should be an external flag, but used right now to speed up debugging use_pickled_results = False if use_pickled_results: self.logger.warning( 'Using previously calculated ANI results in: {}'.format( self.ani_pickle_dir)) prev_gtdb_sp_conflicts = 0 self.logger.info( 'Resolving species with multiple type strain genomes:') for ncbi_sp, type_gids in sorted(multi_type_strains_sp.items(), key=lambda kv: len(kv[1])): assert len(type_gids) > 1 status_str = '-> Processing {} with {:,} type strain genomes [{:,} of {:,} ({:.2f}%)].'.format( ncbi_sp, len(type_gids), processed + 1, len(multi_type_strains_sp), (processed + 1) * 100.0 / len(multi_type_strains_sp)).ljust(128) sys.stdout.write('{}\r'.format(status_str)) sys.stdout.flush() processed += 1 # calculate ANI between type strain genomes all_similar, anis, afs, gid_anis, gid_afs = self.calculate_type_strain_ani( ncbi_sp, type_gids, cur_genomes, use_pickled_results) # read LTP metadata for genomes ltp_metadata = self.parse_ltp_metadata(type_gids, cur_genomes) untrustworthy_gids = {} gtdb_resolved_sp_conflict = False unresolved_species = False note = 'All type strain genomes have ANI >99% and AF >65%.' if not all_similar: note = '' # need to establish which genomes are untrustworthy as type num_divergent += 1 unresolved_species = True # write out highly divergent cases for manual inspection; # these should be compared to the automated selection if np_mean(anis) < 95: for gid in type_gids: ltp_species = self.ltp_species(gid, ltp_metadata) fout_high_divergence.write( '{}\t{}\t{}\t{}\t{}\t{:.2f}\t{:.3f}\t{:.3f}\t{:.4f}\t{}\t{}\t{}\n' .format(gid, ncbi_sp, cur_genomes[gid].gtdb_taxa.genus, cur_genomes[gid].gtdb_taxa.species, ' / '.join(ltp_species), np_mean(list(gid_anis[gid].values())), np_std(list(gid_anis[gid].values())), np_mean(list(gid_afs[gid].values())), np_std(list(gid_afs[gid].values())), cur_genomes[gid].excluded_from_refseq_note, cur_genomes[gid].ncbi_taxa, cur_genomes[gid].gtdb_taxa)) # filter genomes marked as `untrustworthy as type` at NCBI and where the LTP # assignment also suggest the asserted type material is incorrect resolved, untrustworthy_gids = self.resolve_validated_untrustworthy_ncbi_genomes( gid_anis, ncbi_sp, type_gids, ltp_metadata, ltp_defined_species, cur_genomes) if resolved: note = "Species resolved by removing genomes considered `untrustworthy as type` and with a LTP BLAST hit confirming the assembly is likely untrustworthy" ncbi_ltp_resolved += 1 # try to resolve by LTP 16S BLAST results if not resolved: resolved, untrustworthy_gids = self.resolve_ltp_conflict( gid_anis, ncbi_sp, type_gids, ltp_metadata, 0) if resolved: note = 'Species resolved by identifying conflicting or lack of LTP BLAST results' ltp_resolved += 1 # try to resolve species using intra-specific ANI test if not resolved: resolved, untrustworthy_gids = self.resolve_by_intra_specific_ani( gid_anis) if resolved: note = 'Species resolved by intra-specific ANI test' intra_ani_resolved += 1 # try to resolve by GTDB family assignment if not resolved: resolved, untrustworthy_gids = self.resolve_gtdb_family( gid_anis, ncbi_sp, type_gids, cur_genomes) if resolved: note = 'Species resolved by consulting GTDB family classifications' gtdb_family_resolved += 1 # try to resolve by GTDB genus assignment if not resolved: resolved, untrustworthy_gids = self.resolve_gtdb_genus( gid_anis, ncbi_sp, type_gids, cur_genomes) if resolved: note = 'Species resolved by consulting GTDB genus classifications' gtdb_genus_resolved += 1 # try to resolve by GTDB species assignment if not resolved: resolved, untrustworthy_gids = self.resolve_gtdb_species( gid_anis, ncbi_sp, type_gids, cur_genomes) if resolved: note = 'Species resolved by consulting GTDB species classifications' gtdb_sp_resolved += 1 # try to resolve by considering genomes annotated as type material at NCBI, # which includes considering if genomes are marked as untrustworthy as type if not resolved: resolved, untrustworthy_gids = self.resolve_by_ncbi_types( gid_anis, type_gids, cur_genomes) if resolved: note = 'Species resolved by consulting NCBI assembled from type metadata' ncbi_type_resolved += 1 # try to resovle by considering genomes annotated as representative genomes at NCBI if not resolved: resolved, untrustworthy_gids = self.resolve_by_ncbi_reps( gid_anis, type_gids, cur_genomes) if resolved: note = 'Species resolved by considering NCBI representative genomes' ncbi_rep_resolved += 1 if resolved: unresolved_species = False # check if type strain genomes marked as trusted or untrusted conflict # with current GTDB species assignment untrustworthy_gtdb_sp_match = False trusted_gtdb_sp_match = False for gid in type_gids: gtdb_canonical_epithet = canonical_taxon( specific_epithet( cur_genomes[gid].gtdb_taxa.species)) if gtdb_canonical_epithet == specific_epithet(ncbi_sp): if gid in untrustworthy_gids: untrustworthy_gtdb_sp_match = True else: trusted_gtdb_sp_match = True if untrustworthy_gtdb_sp_match and not trusted_gtdb_sp_match: prev_gtdb_sp_conflicts += 1 gtdb_resolved_sp_conflict = True else: note = 'Species is unresolved; manual curation is required!' unresolved_sp_count += 1 if unresolved_species: for gid in type_gids: ltp_species = self.ltp_species(gid, ltp_metadata) fout_unresolved.write( '{}\t{}\t{}\t{}\t{}\t{:.2f}\t{:.3f}\t{:.3f}\t{:.4f}\t{}\t{}\t{}\n' .format(gid, ncbi_sp, cur_genomes[gid].gtdb_taxa.genus, cur_genomes[gid].gtdb_taxa.species, ' / '.join(ltp_species), np_mean(list(gid_anis[gid].values())), np_std(list(gid_anis[gid].values())), np_mean(list(gid_afs[gid].values())), np_std(list(gid_afs[gid].values())), cur_genomes[gid].excluded_from_refseq_note, cur_genomes[gid].ncbi_taxa, cur_genomes[gid].gtdb_taxa)) # remove genomes marked as untrustworthy as type at NCBI if one or more potential type strain genomes remaining ncbi_untrustworthy_gids = set([ gid for gid in type_gids if 'untrustworthy as type' in cur_genomes[gid].excluded_from_refseq_note ]) if len(type_gids - set(untrustworthy_gids) - ncbi_untrustworthy_gids) >= 1: for gid in ncbi_untrustworthy_gids: untrustworthy_gids[ gid] = "Genome annotated as `untrustworthy as type` at NCBI and there are other potential type strain genomes available" # report cases where genomes marked as untrustworthy as type at NCBI are being retained as potential type strain genomes num_ncbi_untrustworthy = len(ncbi_untrustworthy_gids) for gid in type_gids: if (gid not in untrustworthy_gids and 'untrustworthy as type' in cur_genomes[gid].excluded_from_refseq_note): self.logger.warning( "Retaining genome {} from {} despite being marked as `untrustworthy as type` at NCBI [{:,} of {:,} considered untrustworthy]." .format(gid, ncbi_sp, num_ncbi_untrustworthy, len(type_gids))) # write out genomes identified as being untrustworthy for gid, reason in untrustworthy_gids.items(): ltp_species = self.ltp_species(gid, ltp_metadata) if 'untrustworthy as type' in cur_genomes[ gid].excluded_from_refseq_note: reason += "; considered `untrustworthy as type` at NCBI" fout_untrustworthy.write('{}\t{}\t{}\t{}\t{}\n'.format( gid, ncbi_sp, cur_genomes[gid].gtdb_taxa.species, ' / '.join(ltp_species), reason)) # Sanity check that if the untrustworthy genome has an LTP to only the # expected species, that all other genomes also have a hit to the # expected species (or potentially no hit). Otherwise, more consideration # should be given to the genome with the conflicting LTP hit. if len(ltp_species) == 1 and ncbi_sp in ltp_species: other_sp = set() for test_gid in type_gids: ltp_species = self.ltp_species(test_gid, ltp_metadata) if ltp_species and ncbi_sp not in ltp_species: other_sp.update(ltp_species) if other_sp: self.logger.warning( f'Genome {gid} marked as untrustworthy, but this conflicts with high confidence LTP 16S rRNA assignment.' ) # write out information about all type genomes for gid in type_gids: ltp_species = self.ltp_species(gid, ltp_metadata) fout_genomes.write( '{}\t{}\t{}\t{}\t{}\t{}\t{}\t{:.2f}\t{:.3f}\t{:.3f}\t{:.4f}\t{}\t{}\t{}\t{}\n' .format(gid, gid in untrustworthy_gids, ncbi_sp, cur_genomes[gid].gtdb_taxa.genus, cur_genomes[gid].gtdb_taxa.species, ' / '.join(ltp_species), gtdb_resolved_sp_conflict, np_mean(list(gid_anis[gid].values())), np_std(list(gid_anis[gid].values())), np_mean(list(gid_afs[gid].values())), np_std(list(gid_afs[gid].values())), cur_genomes[gid].excluded_from_refseq_note, cur_genomes[gid].ncbi_taxa, cur_genomes[gid].gtdb_taxa, untrustworthy_gids.get(gid, ''))) fout.write( '{}\t{}\t{}\t{:.2f}\t{:.3f}\t{:.3f}\t{:.4f}\t{}\t{}\n'.format( ncbi_sp, len(type_gids), all_similar, np_mean(anis), np_std(anis), np_mean(afs), np_std(afs), note, ', '.join(type_gids))) sys.stdout.write('\n') fout.close() fout_unresolved.close() fout_high_divergence.close() fout_genomes.close() fout_untrustworthy.close() self.logger.info( f'Identified {num_divergent:,} species with 1 or more divergent type strain genomes.' ) self.logger.info( f' - resolved {ncbi_ltp_resolved:,} species by removing NCBI `untrustworthy as type` genomes with a conflicting LTP 16S rRNA classifications.' ) self.logger.info( f' - resolved {ltp_resolved:,} species by considering conflicting LTP 16S rRNA classifications.' ) self.logger.info( f' - resolved {intra_ani_resolved:,} species by considering intra-specific ANI values.' ) self.logger.info( f' - resolved {gtdb_family_resolved:,} species by considering conflicting GTDB family classifications.' ) self.logger.info( f' - resolved {gtdb_genus_resolved:,} species by considering conflicting GTDB genus classifications.' ) self.logger.info( f' - resolved {gtdb_sp_resolved:,} species by considering conflicting GTDB species classifications.' ) self.logger.info( f' - resolved {ncbi_type_resolved:,} species by considering type material designations at NCBI.' ) self.logger.info( f' - resolved {ncbi_rep_resolved:,} species by considering RefSeq reference and representative designations at NCBI.' ) if unresolved_sp_count > 0: self.logger.warning( f'There are {unresolved_sp_count:,} unresolved species with multiple type strain genomes.' ) self.logger.warning( 'These should be handled before proceeding with the next step of GTDB species updating.' ) self.logger.warning( "This can be done by manual curation and adding genomes to 'untrustworthy_type_ledger'." ) self.logger.info( f'Identified {prev_gtdb_sp_conflicts:,} cases where resolved type strain conflicts with prior GTDB assignment.' )