def __init__(self):
    """Set up an empty genome collection."""
    self.logger = logging.getLogger('timestamp')

    # Primary storage: canonical genome ID -> genome record.
    self.genomes = {}

    # Species clusters associated with this genome set.
    self.sp_clusters = SpeciesClusters()

    # Map from canonical GID back to the full NCBI accession.
    self.full_gid = {}

    # NOTE: kept only because the FastANI interface currently requires
    # genomic data to be passed in as a dictionary; remove when possible.
    self.genomic_files = {}
def __init__(self):
    """Set up an empty genome collection."""
    self.logger = logging.getLogger('timestamp')

    # Primary storage: canonical genome ID -> genome record.
    self.genomes = {}

    # Species clusters associated with this genome set.
    self.sp_clusters = SpeciesClusters()

    # NOTE: kept only because the FastANI interface currently requires
    # genomic data to be passed in as a dictionary; remove when possible.
    self.genomic_files = {}

    # Bidirectional user (U_) <-> UBA genome ID translation tables; can be
    # removed once UBA genomes are no longer part of the archaeal genome set.
    self.user_uba_id_map = {}
    self.uba_user_id_map = {}
class Genomes(object):
    """A collection of genomes indexed by canonical genome ID."""

    def __init__(self):
        """Initialization."""

        # canonical genome ID -> Genome record
        self.genomes = {}

        # GTDB species clusters spanned by this genome set
        self.sp_clusters = SpeciesClusters()

        # this should be removed, but the FastANI interface
        # currently requires data to be passed in as a dictionary
        self.genomic_files = {}

        # user (U_) <-> UBA genome ID translation tables; can be removed
        # once UBA genomes are no longer part of the archaeal genome set
        self.user_uba_id_map = {}
        self.uba_user_id_map = {}

        self.logger = logging.getLogger('timestamp')

    def __str__(self):
        """User-friendly string representation."""

        return '{{num_genomes:{}}}'.format(len(self.genomes))

    def __iter__(self):
        """Iterate over genome IDs comprising genome set."""

        for gid in self.genomes:
            yield gid

    def __getitem__(self, gid):
        """Get genome, accepting either a user (U_) or UBA genome ID."""

        # translate user IDs to UBA IDs so either form can be used as a key
        gid = self.user_uba_id_map.get(gid, gid)
        return self.genomes[gid]

    def __contains__(self, gid):
        """Check if genome is in genome set."""

        gid = self.user_uba_id_map.get(gid, gid)
        return gid in self.genomes

    def __len__(self):
        """Size of genome set."""

        return len(self.genomes)

    def _convert_int(self, value, default_value=0):
        """Convert database value to integer, mapping empty or 'none' to the default."""

        return int(value) if value and value != 'none' else default_value

    def _convert_float(self, value, default_value=0.0):
        """Convert database value to float, mapping empty or 'none' to the default."""

        return float(value) if value and value != 'none' else default_value

    def _apply_ncbi_taxonomy_ledgers(self,
                                     species_exception_file,
                                     genus_exception_file):
        """Apply corrections to NCBI taxonomy.

        Both ledgers are tab-separated files with a header line; the first
        column is a genome accession and the second the corrected species or
        genus name. Either file may be None, in which case it is skipped.
        """

        species_updates = {}
        if species_exception_file:
            with open(species_exception_file, encoding='utf-8') as f:
                f.readline()  # skip header
                for line in f:
                    line_split = [token.strip() for token in line.strip().split('\t')]
                    gid = canonical_gid(line_split[0].strip())

                    sp = line_split[1].strip().replace('Candidatus ', '')
                    if gid not in self.genomes:
                        self.logger.warning(f'Genome {gid} in species exception list not defined in genome set.')
                        continue

                    if not sp.startswith('s__'):
                        sp = 's__' + sp

                    self.genomes[gid].ncbi_taxa.species = sp
                    species_updates[gid] = sp

        if genus_exception_file:
            with open(genus_exception_file, encoding='utf-8') as f:
                f.readline()  # skip header
                for line in f:
                    line_split = [token.strip() for token in line.strip().split('\t')]
                    gid = canonical_gid(line_split[0].strip())

                    genus = line_split[1].strip()
                    if gid not in self.genomes:
                        self.logger.warning(f'Genome {gid} in genus exception list not defined in genome set.')
                        continue

                    if genus.startswith('g__'):
                        genus = genus[3:]

                    self.genomes[gid].ncbi_taxa.genus = f'g__{genus}'

                    # propagate the genus correction into the binomial species name
                    species = self.genomes[gid].ncbi_taxa.species
                    if species != 's__':
                        specific = self.genomes[gid].ncbi_taxa.specific_epithet
                        self.genomes[gid].ncbi_taxa.species = f's__{genus} {specific}'

                    # sanity check ledgers
                    if gid in species_updates and genus not in species_updates[gid]:
                        self.logger.error(f'Species and genus ledgers have conflicting assignments for {gid}.')
                        sys.exit(-1)

    def gtdb_type_species_of_genus(self, gtdb_genus):
        """Get genome assembled from type species of genus.

        Returns None if no genome in the set is from the type species of
        the specified GTDB genus.
        """

        for gid in self.genomes:
            if not self.genomes[gid].is_gtdb_type_species():
                continue

            if self.genomes[gid].gtdb_taxa.genus == gtdb_genus:
                return gid

        return None

    def gtdb_sp_rep(self, gtdb_sp):
        """Get representative genome for GTDB species cluster.

        Exits the program if no representative exists for the species,
        since this indicates an inconsistent genome set.
        """

        for gid in self.genomes:
            if not self.genomes[gid].is_gtdb_sp_rep():
                continue

            if self.genomes[gid].gtdb_taxa.species == gtdb_sp:
                return gid

        self.logger.error(f'Failed to find representative of GTDB species for {gtdb_sp}.')
        sys.exit(-1)

    def ncbi_sp_effective_type_genomes(self):
        """Get effective type genomes for each NCBI species."""

        ncbi_sp_type_strain_genomes = defaultdict(set)
        for gid in self.genomes:
            if self.genomes[gid].is_effective_type_strain():
                ncbi_sp = self.genomes[gid].ncbi_taxa.species
                if ncbi_sp != 's__':
                    # yes, NCBI has genomes marked as assembled from type material
                    # that do not actually have a binomial species name
                    ncbi_sp_type_strain_genomes[ncbi_sp].add(gid)

        return ncbi_sp_type_strain_genomes

    def sort_by_assembly_score(self):
        """Yield genome IDs sorted by their assembly score, best first."""

        for gid in sorted(self.genomes.keys(),
                          key=lambda gid: self.genomes[gid].score_assembly(),
                          reverse=True):
            yield gid

    def get_gid(self, idx):
        """Get ID of genome at specific index."""

        return list(self.genomes)[idx]

    def gtdb_type_strain_genomes(self):
        """Get genomes considered type strain of species by GTDB."""

        type_strain_gids = set()
        for gid, genome in self.genomes.items():
            if genome.is_gtdb_type_strain():
                type_strain_gids.add(gid)

        return type_strain_gids

    def get_ncbi_type_strain_genomes(self):
        """Get type strain genomes for each NCBI species."""

        type_strain_genomes = defaultdict(set)
        for gid, genome in self.genomes.items():
            if genome.is_effective_type_strain():
                type_strain_genomes[genome.ncbi_taxa.species].add(gid)

        return type_strain_genomes

    def named_ncbi_species(self):
        """Get genomes in valid or effectively published, including Candidatus, species in NCBI taxonomy."""

        named_ncbi_sp = defaultdict(set)
        for gid in self.genomes:
            if not is_placeholder_taxon(self.genomes[gid].ncbi_taxa.species):
                named_ncbi_sp[self.genomes[gid].ncbi_taxa.species].add(gid)

        return named_ncbi_sp

    def load_genomic_file_paths(self, genome_path_file):
        """Determine path to genomic FASTA file for each genome.

        The path file is tab-separated with the genome accession in the
        first column and the directory of the assembly in the second.
        """

        # use a context manager so the file handle is always closed
        # (previously the handle was leaked)
        with open(genome_path_file) as f:
            for line in f:
                line_split = line.strip().split('\t')

                gid = line_split[0]
                gid = canonical_gid(gid)
                if gid in self.genomes:
                    genome_path = line_split[1]
                    accession = os.path.basename(os.path.normpath(genome_path))
                    genomic_file = os.path.join(genome_path, accession + '_genomic.fna')
                    self.genomes[gid].genomic_file = genomic_file
                    self.genomic_files[gid] = genomic_file

                    #*** Need to handle UBA genomes that can reference via their U_ ID when run through MASH
                    self.genomic_files[accession] = genomic_file

    def parse_untrustworthy_type_ledger(self, untrustworth_type_ledger):
        """Determine genomes that should be considered untrustworthy as type material."""

        untrustworthy_as_type = set()
        with open(untrustworth_type_ledger) as f:
            f.readline()  # skip header
            for line in f:
                tokens = line.strip().split('\t')
                untrustworthy_as_type.add(canonical_gid(tokens[0]))

        return untrustworthy_as_type

    def set_gtdbtk_classification(self, gtdbtk_classify_file, prev_genomes):
        """Update classification of genomes based on GTDB-Tk results.

        Returns a tuple (num_updated, num_ncbi_sp): the number of genomes
        whose GTDB taxonomy was updated from GTDB-Tk, and the number whose
        genus/species were reset to the NCBI assignment because the NCBI
        species was previously unseen.
        """

        # get species names in previous GTDB release
        prev_ncbi_sp = set()
        prev_gtdb_sp = set()
        for gid in prev_genomes:
            prev_ncbi_sp.add(prev_genomes[gid].ncbi_taxa.species)
            prev_gtdb_sp.add(prev_genomes[gid].gtdb_taxa.species)

        # set new genomes to the predicted GTDB-Tk classification, but
        # change genus and species classifications of a genome if it has
        # a previously unseen NCBI species assignment. This is a problematic
        # case for curation as GTDB-Tk is unaware of these assignments.
        # A common example is a new genome being the most basal
        # member of a genus, this genome being classified to this genus by
        # GTDB-Tk, but this genome being from a newly proposed genera which
        # should be favored in order to have the GTDB reflect the opinion
        # of the community. New NCBI taxa above the rank of genus are not considered
        # as these are relatively uncommon and are picked up for manual curation
        # using curation trees that specifically highlight genomes with previously
        # unseen NCBI taxon names (see update_curation_trees)
        gtdbtk_classifications = read_gtdbtk_classifications(gtdbtk_classify_file)
        num_updated = 0
        num_ncbi_sp = 0
        for gid in self.genomes:
            if gid in gtdbtk_classifications:
                num_updated += 1

                gtdbtk_taxa = Taxa(';'.join(gtdbtk_classifications[gid]))
                self.genomes[gid].gtdb_taxa.update_taxa(gtdbtk_taxa)

                ncbi_sp = self.genomes[gid].ncbi_taxa.species
                if (ncbi_sp == 's__'
                        or ncbi_sp in prev_ncbi_sp
                        or ncbi_sp in prev_gtdb_sp):
                    continue

                # previously unseen NCBI species: favor the NCBI genus and
                # species assignments (ranks 5 and 6 of the taxonomy string)
                self.genomes[gid].gtdb_taxa.set_taxa(5, self.genomes[gid].ncbi_taxa.genus)
                self.genomes[gid].gtdb_taxa.set_taxa(6, ncbi_sp)
                num_ncbi_sp += 1

        return num_updated, num_ncbi_sp

    def set_prev_gtdb_classifications(self, prev_genomes):
        """Set genomes to GTDB assignments in previous release."""

        # set current genomes to have same GTDB assignments as in previous
        # GTDB release. This is necessary since genomes may have different
        # NCBI accession numbers between releases and thus the previous GTDB
        # taxonomy will not be reflected in the latest GTDB database. The
        # exception is if a genome has changed domains, in which case the
        # previous assignment is invalid.
        self.logger.info('Setting GTDB taxonomy of genomes in current genome set.')
        update_count = 0
        conflicting_domain_count = 0
        for prev_gid in prev_genomes:
            if prev_gid in self.genomes:
                if prev_genomes[prev_gid].gtdb_taxa != self.genomes[prev_gid].gtdb_taxa:
                    if prev_genomes[prev_gid].gtdb_taxa.domain == self.genomes[prev_gid].gtdb_taxa.domain:
                        # verify updating genomes without assigned GTDB taxa below domain
                        assert self.genomes[prev_gid].gtdb_taxa.phylum == 'p__'

                        update_count += 1
                        self.genomes[prev_gid].gtdb_taxa.update_taxa(prev_genomes[prev_gid].gtdb_taxa)
                    else:
                        conflicting_domain_count += 1

        self.logger.info(f' - updated {update_count:,} genomes.')
        self.logger.info(f' - identified {conflicting_domain_count:,} genomes with conflicting domain assignments.')

    def load_from_metadata_file(self,
                                metadata_file,
                                species_exception_file=None,
                                genus_exception_file=None,
                                gtdb_type_strains_ledger=None,
                                create_sp_clusters=True,
                                uba_genome_file=None,
                                qc_passed_file=None,
                                ncbi_genbank_assembly_file=None,
                                untrustworthy_type_ledger=None):
        """Create genome set from file(s).

        Args:
            metadata_file: tab-separated GTDB metadata file; must contain
                the header columns indexed below.
            species_exception_file, genus_exception_file: optional NCBI
                taxonomy correction ledgers applied after loading.
            gtdb_type_strains_ledger: optional ledger of genomes to force
                as type strain of species.
            create_sp_clusters: if True, populate self.sp_clusters from the
                GTDB representative assignments.
            uba_genome_file: optional list of UBA genome IDs to retain.
            qc_passed_file: optional list of genomes passing QC; when given,
                only these genomes are loaded.
            ncbi_genbank_assembly_file: optional NCBI assembly report used
                to flag genomes excluded from RefSeq.
            untrustworthy_type_ledger: optional ledger of genomes to mark
                as untrustworthy as type material.
        """

        pass_qc_gids = set()
        if qc_passed_file:
            with open(qc_passed_file) as f:
                f.readline()  # skip header
                for line in f:
                    line_split = line.strip().split('\t')
                    pass_qc_gids.add(line_split[0].strip())
            self.logger.info(f' - identified {len(pass_qc_gids):,} genomes passing QC.')

        valid_uba_ids = set()
        if uba_genome_file:
            with open(uba_genome_file) as f:
                for line in f:
                    line_split = line.strip().split('\t')
                    valid_uba_ids.add(line_split[0].strip())
            self.logger.info(f' - identified {len(valid_uba_ids):,} UBA genomes to retain.')

        gtdb_type_strains = set()
        if gtdb_type_strains_ledger:
            with open(gtdb_type_strains_ledger) as f:
                f.readline()  # skip header
                for line in f:
                    tokens = line.strip().split('\t')
                    gid = canonical_gid(tokens[0].strip())
                    gtdb_type_strains.add(gid)
            self.logger.info(f' - identified {len(gtdb_type_strains):,} manually annotated as type strain genomes.')

        excluded_from_refseq_note = {}
        if ncbi_genbank_assembly_file:
            excluded_from_refseq_note = exclude_from_refseq(ncbi_genbank_assembly_file)

        untrustworthy_as_type = set()
        if untrustworthy_type_ledger:
            untrustworthy_as_type = self.parse_untrustworthy_type_ledger(untrustworthy_type_ledger)
            self.logger.info(f' - identified {len(untrustworthy_as_type):,} genomes annotated as untrustworthy as type.')

        with open(metadata_file, encoding='utf-8') as f:
            headers = f.readline().strip().split('\t')

            genome_index = headers.index('accession')

            gtdb_taxonomy_index = headers.index('gtdb_taxonomy')
            ncbi_taxonomy_index = headers.index('ncbi_taxonomy')
            ncbi_taxonomy_unfiltered_index = headers.index('ncbi_taxonomy_unfiltered')

            gtdb_type_index = headers.index('gtdb_type_designation')
            gtdb_type_sources_index = headers.index('gtdb_type_designation_sources')
            gtdb_type_species_of_genus_index = headers.index('gtdb_type_species_of_genus')
            ncbi_strain_identifiers_index = headers.index('ncbi_strain_identifiers')
            ncbi_type_index = headers.index('ncbi_type_material_designation')
            ncbi_asm_level_index = headers.index('ncbi_assembly_level')
            ncbi_genome_representation_index = headers.index('ncbi_genome_representation')
            ncbi_refseq_cat_index = headers.index('ncbi_refseq_category')
            ncbi_genome_cat_index = headers.index('ncbi_genome_category')

            comp_index = headers.index('checkm_completeness')
            cont_index = headers.index('checkm_contamination')
            sh_100_index = None
            if 'checkm_strain_heterogeneity_100' in headers:
                sh_100_index = headers.index('checkm_strain_heterogeneity_100')
            gs_index = headers.index('genome_size')
            contig_count_index = headers.index('contig_count')
            n50_index = headers.index('n50_contigs')
            scaffold_count_index = headers.index('scaffold_count')
            ambiguous_bases_index = headers.index('ambiguous_bases')
            total_gap_len_index = headers.index('total_gap_length')
            ssu_count_index = headers.index('ssu_count')
            ssu_length_index = headers.index('ssu_length')
            ncbi_molecule_count_index = headers.index('ncbi_molecule_count')
            ncbi_unspanned_gaps_index = headers.index('ncbi_unspanned_gaps')
            ncbi_spanned_gaps_index = headers.index('ncbi_spanned_gaps')

            gtdb_genome_rep_index = headers.index('gtdb_genome_representative')
            gtdb_rep_index = headers.index('gtdb_representative')

            if 'lpsn_priority_year' in headers:
                # this information will be missing from the previous
                # GTDB metadata file as we strip this out due to
                # concerns over republishing this information
                lpsn_priority_index = headers.index('lpsn_priority_year')
                dsmz_priority_index = headers.index('dsmz_priority_year')
                straininfo_priority_index = headers.index('straininfo_priority_year')

            for line in f:
                line_split = line.strip().split('\t')

                ncbi_accn = line_split[genome_index]
                gid = canonical_gid(ncbi_accn)

                if gid.startswith('U_'):
                    # check if genome has a UBA identifier
                    org_name_index = headers.index('organism_name')
                    org_name = line_split[org_name_index]
                    if '(UBA' in org_name:
                        uba_id = org_name[org_name.find('(')+1:-1]
                        if uba_id in valid_uba_ids:
                            self.user_uba_id_map[gid] = uba_id
                            self.uba_user_id_map[uba_id] = gid
                            gid = uba_id
                        else:
                            continue  # retain only valid UBA genomes
                    else:
                        continue  # skip non-UBA user genomes

                if pass_qc_gids and gid not in pass_qc_gids:
                    continue

                gtdb_taxonomy = Taxa(line_split[gtdb_taxonomy_index])
                ncbi_taxonomy = Taxa(line_split[ncbi_taxonomy_index])
                ncbi_taxonomy_unfiltered = Taxa(line_split[ncbi_taxonomy_unfiltered_index])

                gtdb_type = line_split[gtdb_type_index]
                gtdb_type_sources = line_split[gtdb_type_sources_index]
                if gid in gtdb_type_strains:
                    # manual curation overrides the database designation
                    gtdb_type = 'type strain of species'
                    gtdb_type_sources = 'GTDB curator'
                gtdb_type_species_of_genus = line_split[gtdb_type_species_of_genus_index] == 't'

                ncbi_type = line_split[ncbi_type_index]
                ncbi_strain_identifiers = line_split[ncbi_strain_identifiers_index]
                ncbi_asm_level = line_split[ncbi_asm_level_index]
                ncbi_genome_representation = line_split[ncbi_genome_representation_index]
                ncbi_refseq_cat = line_split[ncbi_refseq_cat_index]
                ncbi_genome_cat = line_split[ncbi_genome_cat_index]

                comp = float(line_split[comp_index])
                cont = float(line_split[cont_index])
                sh_100 = 0
                # explicit None test: a truthiness test would incorrectly
                # skip the column if it happened to be at index 0
                if sh_100_index is not None:
                    sh_100 = self._convert_float(line_split[sh_100_index])
                gs = int(line_split[gs_index])
                contig_count = int(line_split[contig_count_index])
                n50 = int(line_split[n50_index])
                scaffold_count = int(line_split[scaffold_count_index])
                ambiguous_bases = int(line_split[ambiguous_bases_index])
                total_gap_len = int(line_split[total_gap_len_index])
                ssu_count = int(line_split[ssu_count_index])
                ssu_length = self._convert_int(line_split[ssu_length_index])
                ncbi_molecule_count = self._convert_int(line_split[ncbi_molecule_count_index])
                ncbi_unspanned_gaps = self._convert_int(line_split[ncbi_unspanned_gaps_index])
                ncbi_spanned_gaps = self._convert_int(line_split[ncbi_spanned_gaps_index])

                gtdb_is_rep = line_split[gtdb_rep_index] == 't'
                gtdb_rid = canonical_gid(line_split[gtdb_genome_rep_index])
                if create_sp_clusters:
                    self.sp_clusters.update_sp_cluster(gtdb_rid, gid, gtdb_taxonomy.species)

                if 'lpsn_priority_year' in headers:
                    lpsn_priority_year = self._convert_int(line_split[lpsn_priority_index], Genome.NO_PRIORITY_YEAR)
                    dsmz_priority_year = self._convert_int(line_split[dsmz_priority_index], Genome.NO_PRIORITY_YEAR)
                    straininfo_priority_year = self._convert_int(line_split[straininfo_priority_index], Genome.NO_PRIORITY_YEAR)
                else:
                    lpsn_priority_year = Genome.NO_PRIORITY_YEAR
                    dsmz_priority_year = Genome.NO_PRIORITY_YEAR
                    straininfo_priority_year = Genome.NO_PRIORITY_YEAR

                self.genomes[gid] = Genome(gid,
                                           ncbi_accn,
                                           gtdb_rid,
                                           gtdb_is_rep,
                                           gtdb_taxonomy,
                                           ncbi_taxonomy,
                                           ncbi_taxonomy_unfiltered,
                                           gtdb_type,
                                           gtdb_type_sources,
                                           gtdb_type_species_of_genus,
                                           gid in untrustworthy_as_type,
                                           ncbi_type,
                                           ncbi_strain_identifiers,
                                           ncbi_asm_level,
                                           ncbi_genome_representation,
                                           ncbi_refseq_cat,
                                           ncbi_genome_cat,
                                           excluded_from_refseq_note.get(gid, ''),
                                           comp,
                                           cont,
                                           sh_100,
                                           gs,
                                           contig_count,
                                           n50,
                                           scaffold_count,
                                           ambiguous_bases,
                                           total_gap_len,
                                           ssu_count,
                                           ssu_length,
                                           ncbi_molecule_count,
                                           ncbi_unspanned_gaps,
                                           ncbi_spanned_gaps,
                                           lpsn_priority_year,
                                           dsmz_priority_year,
                                           straininfo_priority_year)

        self._apply_ncbi_taxonomy_ledgers(species_exception_file,
                                          genus_exception_file)
def run(self,
        prev_gtdb_metadata_file,
        cur_gtdb_metadata_file,
        cur_uba_gid_file,
        genomes_new_updated_file,
        qc_passed_file,
        gtdbtk_classify_file,
        ncbi_genbank_assembly_file,
        untrustworthy_type_file,
        gtdb_type_strains_ledger):
    """Identify species representatives that have changed from previous release.

    Writes 'rep_change_summary.tsv' (one row per previous representative with
    its change status in each category) and 'rep_change_detailed.tsv' (one row
    per individual change) to self.output_dir, and logs summary statistics.
    """

    # create previous and current GTDB genome sets
    self.logger.info('Creating previous GTDB genome set.')
    prev_genomes = Genomes()
    prev_genomes.load_from_metadata_file(prev_gtdb_metadata_file,
                                         gtdb_type_strains_ledger=gtdb_type_strains_ledger,
                                         uba_genome_file=cur_uba_gid_file,
                                         ncbi_genbank_assembly_file=ncbi_genbank_assembly_file,
                                         untrustworthy_type_ledger=untrustworthy_type_file)
    self.logger.info(f' ...previous genome set contains {len(prev_genomes):,} genomes.')
    self.logger.info(' ...previous genome set has {:,} species clusters spanning {:,} genomes.'.format(
        len(prev_genomes.sp_clusters),
        prev_genomes.sp_clusters.total_num_genomes()))

    self.logger.info('Creating current GTDB genome set.')
    cur_genomes = Genomes()
    cur_genomes.load_from_metadata_file(cur_gtdb_metadata_file,
                                        gtdb_type_strains_ledger=gtdb_type_strains_ledger,
                                        create_sp_clusters=False,
                                        uba_genome_file=cur_uba_gid_file,
                                        qc_passed_file=qc_passed_file,
                                        ncbi_genbank_assembly_file=ncbi_genbank_assembly_file,
                                        untrustworthy_type_ledger=untrustworthy_type_file)
    self.logger.info(f' ...current genome set contains {len(cur_genomes):,} genomes.')

    # get previous and current genomes from type strains
    self.logger.info('Determining genomes identified as being assembled from type strain.')
    prev_type_strain_gids = prev_genomes.gtdb_type_strain_genomes()
    cur_type_strain_gids = cur_genomes.gtdb_type_strain_genomes()
    new_type_strain_gids = cur_type_strain_gids - prev_type_strain_gids
    self.logger.info(' ...identified {:,} previous and {:,} current genomes from type strain.'.format(
        len(prev_type_strain_gids),
        len(cur_type_strain_gids)))
    self.logger.info(' ...{:,} type strain genomes are new to the current genome set.'.format(
        len(new_type_strain_gids)))

    # created expanded previous GTDB species clusters
    self.logger.info('Creating species clusters of new and updated genomes based on GTDB-Tk classifications.')
    new_updated_sp_clusters = SpeciesClusters()
    new_updated_sp_clusters.create_expanded_clusters(prev_genomes.sp_clusters,
                                                     genomes_new_updated_file,
                                                     qc_passed_file,
                                                     gtdbtk_classify_file)
    self.logger.info('Identified {:,} expanded species clusters spanning {:,} genomes.'.format(
        len(new_updated_sp_clusters),
        new_updated_sp_clusters.total_num_genomes()))

    # determine status of each previous GTDB representative
    self.logger.info('Determining status of each previous GTDB representative.')

    fout_summary = open(os.path.join(self.output_dir, 'rep_change_summary.tsv'), 'w')
    fout_summary.write('Genome ID\tPrevious GTDB species\tNo. genomes in cluster')
    fout_summary.write('\tGENOMIC_CHANGE\tNCBI_SPECIES_CHANGE\tTYPE_STRAIN_CHANGE\tDOMAIN_CHECK')
    fout_summary.write('\tNew type strains\tRepresentative changed\n')

    fout_detailed = open(os.path.join(self.output_dir, 'rep_change_detailed.tsv'), 'w')
    fout_detailed.write('Genome ID\tPrevious GTDB species\tChange type\tChange\n')

    # tally sets for each change category; a representative lands in exactly
    # one set per category, which drives the summary statistics reported below
    unchanged_genome = set()
    updated_genome = set()
    lost_genome = set()
    user_genome = set()
    unchanged_sp = set()
    reassigned_sp = set()
    unchanged_type_strain = set()
    lost_type_strain = set()
    gain_type_strain = set()
    new_type_strain = set()
    changed_domain = set()
    unchanged_domain = set()
    num_rep_changes = 0
    first_type_strain = set()
    for prev_rid, prev_gtdb_sp in prev_genomes.sp_clusters.species():
        fout_summary.write(f'{prev_rid}\t{prev_gtdb_sp}\t{len(prev_genomes.sp_clusters[prev_rid])}')
        if prev_rid in cur_genomes:
            # check if genome assembly has been updated
            if prev_rid in new_updated_sp_clusters.updated_gids:
                updated_genome.add(prev_rid)
                fout_summary.write('\tUPDATED')
                prev_ncbi_accn = prev_genomes[prev_rid].ncbi_accn
                cur_ncbi_accn = cur_genomes[prev_rid].ncbi_accn
                assert(prev_ncbi_accn != cur_ncbi_accn)
                fout_detailed.write((f'{prev_rid}\t{prev_gtdb_sp}\tGENOMIC_CHANGE:UPDATED\tNCBI accession updated from '
                                     f'{prev_genomes[prev_rid].ncbi_accn} to {cur_genomes[prev_rid].ncbi_accn}\n'))
            else:
                unchanged_genome.add(prev_rid)
                fout_summary.write('\tUNCHANGED')

            # check if NCBI species assignment has changed
            prev_ncbi_sp = prev_genomes[prev_rid].ncbi_taxa.species
            cur_ncbi_sp = cur_genomes[prev_rid].ncbi_taxa.species
            # comparison is on the specific epithet so a genus transfer alone
            # does not count as a species reassignment
            if prev_genomes[prev_rid].ncbi_taxa.specific_epithet == cur_genomes[prev_rid].ncbi_taxa.specific_epithet:
                unchanged_sp.add(prev_rid)
                fout_summary.write('\tUNCHANGED')
            else:
                reassigned_sp.add(prev_rid)
                fout_summary.write('\tREASSIGNED')
                fout_detailed.write(f'{prev_rid}\t{prev_gtdb_sp}\tNCBI_SPECIES_CHANGE:REASSIGNED\tNCBI species reassigned from {prev_ncbi_sp} to {cur_ncbi_sp}\n')

            # check if type material status has changed
            if prev_rid in prev_type_strain_gids and prev_rid in cur_type_strain_gids:
                unchanged_type_strain.add(prev_rid)
                fout_summary.write('\tUNCHANGED')
            elif prev_rid not in prev_type_strain_gids and prev_rid not in cur_type_strain_gids:
                unchanged_type_strain.add(prev_rid)
                fout_summary.write('\tUNCHANGED')
            elif prev_rid in prev_type_strain_gids and prev_rid not in cur_type_strain_gids:
                lost_type_strain.add(prev_rid)
                fout_summary.write('\tLOST')
                fout_detailed.write(f'{prev_rid}\t{prev_gtdb_sp}\tTYPE_STRAIN_CHANGE:LOST\tNo longer considered a genome from type strain\n')
            elif prev_rid not in prev_type_strain_gids and prev_rid in cur_type_strain_gids:
                gain_type_strain.add(prev_rid)
                fout_summary.write('\tGAINED')
                fout_detailed.write(f'{prev_rid}\t{prev_gtdb_sp}\tTYPE_STRAIN_CHANGE:GAINED\tNow considered a genome from type strain\n')
            else:
                # the four cases above are exhaustive
                assert(False)

            # check if domain assignment has changed
            if prev_genomes[prev_rid].gtdb_taxa.domain != cur_genomes[prev_rid].gtdb_taxa.domain:
                changed_domain.add(prev_rid)
                fout_detailed.write('{}\t{}\tDOMAIN_CHECK:REASSIGNED\tRepresentative changed from {} to {}\n'.format(
                    prev_rid,
                    prev_gtdb_sp,
                    prev_genomes[prev_rid].gtdb_taxa.domain,
                    cur_genomes[prev_rid].gtdb_taxa.domain))
                fout_summary.write('\tREASSIGNED')
            else:
                unchanged_domain.add(prev_rid)
                fout_summary.write('\tUNCHANGED')

            # check if genome cluster has new genomes assembled from the type strain of the species
            sp_gids = prev_genomes.sp_clusters[prev_rid]
            if prev_rid in new_updated_sp_clusters:
                sp_gids = sp_gids.union(new_updated_sp_clusters[prev_rid])
            new_ts = new_type_strain_gids.intersection(sp_gids)
            if new_ts:
                if not prev_type_strain_gids.intersection(sp_gids) and not new_ts.intersection(gain_type_strain):
                    first_type_strain.add(prev_gtdb_sp)

                new_type_strain.add(prev_rid)
                fout_detailed.write('{}\t{}\tNEW_TYPE_STRAINS:NEW\tSpecies cluster has {:,} new genomes from type strain: {}\n'.format(
                    prev_rid,
                    prev_gtdb_sp,
                    len(new_ts),
                    ','.join(new_ts)))
            fout_summary.write(f'\t{len(new_ts)}')

            # a representative is unchanged only if every category is unchanged
            if (prev_rid in unchanged_genome
                    and prev_rid in unchanged_sp
                    and prev_rid in unchanged_type_strain
                    and prev_rid in unchanged_domain):
                fout_summary.write('\tNO')
            else:
                fout_summary.write('\tYES')
                num_rep_changes += 1

            fout_summary.write('\n')
        else:
            lost_genome.add(prev_rid)
            fout_summary.write('\t{}\t{}\t{}\t{}\t{}\n'.format('LOST','N/A','N/A','N/A', 'YES'))
            fout_detailed.write(f'{prev_rid}\t{prev_gtdb_sp}\tGENOMIC_CHANGE:LOST\tGenome not present in current GTDB release\n')
            num_rep_changes += 1

    fout_summary.close()
    fout_detailed.close()

    num_prev_sp_clusters = len(prev_genomes.sp_clusters)
    num_rep_changes_perc = num_rep_changes*100.0/num_prev_sp_clusters
    self.logger.info(f' ... identified {num_rep_changes:,} ({num_rep_changes_perc:.1f}%) species with a change to the representative genome.')

    self.logger.info('Genomic changes:')
    unchanged_perc = len(unchanged_genome)*100.0 / num_prev_sp_clusters
    updated_perc = len(updated_genome)*100.0 / num_prev_sp_clusters
    lost_perc = len(lost_genome)*100.0 / num_prev_sp_clusters
    user_perc = len(user_genome)*100.0 / num_prev_sp_clusters
    self.logger.info(f' unchanged_genome: {len(unchanged_genome):,} ({unchanged_perc:.1f}%)')
    self.logger.info(f' updated_genome: {len(updated_genome):,} ({updated_perc:.1f}%)')
    self.logger.info(f' lost_genome: {len(lost_genome):,} ({lost_perc:.1f}%)')
    self.logger.info(f' user_genome: {len(user_genome):,} ({user_perc:.1f}%)')

    self.logger.info('NCBI species assignment changes:')
    # species percentages are relative to representatives still present
    cur_sp_count = len(unchanged_genome) + len(updated_genome)
    unchanged_sp_perc = len(unchanged_sp)*100.0 / cur_sp_count
    reassigned_sp_perc = len(reassigned_sp)*100.0 / cur_sp_count
    self.logger.info(f' unchanged_sp: {len(unchanged_sp):,} ({unchanged_sp_perc:.1f}%)')
    self.logger.info(f' reassigned_sp: {len(reassigned_sp):,} ({reassigned_sp_perc:.1f}%)')

    self.logger.info('Status of type strain genome declarations:')
    prev_ts_count = len(unchanged_type_strain) + len(lost_type_strain)
    unchanged_type_strain_perc = len(unchanged_type_strain)*100.0 / prev_ts_count
    lost_type_strain_perc = len(lost_type_strain)*100.0 / prev_ts_count
    gain_type_strain_perc = len(gain_type_strain)*100.0 / prev_ts_count
    new_type_strain_perc = len(new_type_strain)*100.0 / prev_ts_count
    self.logger.info(f' unchanged_type_strain: {len(unchanged_type_strain):,} ({unchanged_type_strain_perc:.1f}%)')
    self.logger.info(f' lost_type_strain: {len(lost_type_strain):,} ({lost_type_strain_perc:.1f}%)')
    self.logger.info(f' gain_type_strain: {len(gain_type_strain):,} ({gain_type_strain_perc:.1f}%)')
    self.logger.info(f' new_type_strain: {len(new_type_strain):,} ({new_type_strain_perc:.1f}%)')

    self.logger.info('GTDB domain assignment change:')
    unchanged_domain_perc = len(unchanged_domain)*100.0 / num_prev_sp_clusters
    changed_domain_perc = len(changed_domain)*100.0 / num_prev_sp_clusters
    self.logger.info(f' unchanged: {len(unchanged_domain):,} ({unchanged_domain_perc:.1f}%)')
    self.logger.info(f' reassigned: {len(changed_domain):,} ({changed_domain_perc:.1f}%)')

    # NOTE(review): looks like leftover debug output — consider removing or
    # converting to a logger call
    print('first_type_strain', len(first_type_strain))
def run(self, rep_change_summary_file, prev_gtdb_metadata_file,
        prev_genomic_path_file, cur_gtdb_metadata_file,
        cur_genomic_path_file, uba_genome_paths, genomes_new_updated_file,
        qc_passed_file, gtdbtk_classify_file, ncbi_genbank_assembly_file,
        untrustworthy_type_file, gtdb_type_strains_ledger,
        sp_priority_ledger):
    """Perform initial actions required for changed representatives.

    Loads the previous and current genome sets, dispatches the per-category
    action handlers (lost genome, updated assembly, lost type strain, domain
    change, improved representative, naming priority), then writes
    'updated_species_reps.tsv' and 'updated_sp_clusters.tsv' to
    self.output_dir.
    """

    # create previous and current GTDB genome sets
    self.logger.info('Creating previous GTDB genome set.')
    prev_genomes = Genomes()
    prev_genomes.load_from_metadata_file(
        prev_gtdb_metadata_file,
        gtdb_type_strains_ledger=gtdb_type_strains_ledger,
        uba_genome_file=uba_genome_paths,
        ncbi_genbank_assembly_file=ncbi_genbank_assembly_file,
        untrustworthy_type_ledger=untrustworthy_type_file)
    self.logger.info(
        ' ... previous genome set has {:,} species clusters spanning {:,} genomes.'
        .format(len(prev_genomes.sp_clusters),
                prev_genomes.sp_clusters.total_num_genomes()))

    self.logger.info('Creating current GTDB genome set.')
    cur_genomes = Genomes()
    cur_genomes.load_from_metadata_file(
        cur_gtdb_metadata_file,
        gtdb_type_strains_ledger=gtdb_type_strains_ledger,
        create_sp_clusters=False,
        uba_genome_file=uba_genome_paths,
        qc_passed_file=qc_passed_file,
        ncbi_genbank_assembly_file=ncbi_genbank_assembly_file,
        untrustworthy_type_ledger=untrustworthy_type_file)
    self.logger.info(
        f' ... current genome set contains {len(cur_genomes):,} genomes.')

    # get path to previous and current genomic FASTA files
    self.logger.info(
        'Reading path to previous and current genomic FASTA files.')
    prev_genomes.load_genomic_file_paths(prev_genomic_path_file)
    prev_genomes.load_genomic_file_paths(uba_genome_paths)
    cur_genomes.load_genomic_file_paths(cur_genomic_path_file)
    cur_genomes.load_genomic_file_paths(uba_genome_paths)

    # created expanded previous GTDB species clusters
    new_updated_sp_clusters = SpeciesClusters()
    self.logger.info(
        'Creating species clusters of new and updated genomes based on GTDB-Tk classifications.'
    )
    new_updated_sp_clusters.create_expanded_clusters(
        prev_genomes.sp_clusters, genomes_new_updated_file, qc_passed_file,
        gtdbtk_classify_file)
    self.logger.info(
        'Identified {:,} expanded species clusters spanning {:,} genomes.'.
        format(len(new_updated_sp_clusters),
               new_updated_sp_clusters.total_num_genomes()))

    # initialize species priority manager
    self.sp_priority_mngr = SpeciesPriorityManager(sp_priority_ledger)

    # take required action for each changed representatives
    self.action_genomic_lost(rep_change_summary_file, prev_genomes,
                             cur_genomes, new_updated_sp_clusters)
    self.action_genomic_update(rep_change_summary_file, prev_genomes,
                               cur_genomes, new_updated_sp_clusters)
    self.action_type_strain_lost(rep_change_summary_file, prev_genomes,
                                 cur_genomes, new_updated_sp_clusters)
    self.action_domain_change(rep_change_summary_file, prev_genomes,
                              cur_genomes)

    # the 'else' branch below is a debugging shortcut that reloads a
    # previously pickled result instead of recomputing it
    if True:  #***
        improved_reps = self.action_improved_rep(prev_genomes, cur_genomes,
                                                 new_updated_sp_clusters)

        pickle.dump(
            improved_reps,
            open(os.path.join(self.output_dir, 'improved_reps.pkl'), 'wb'))
    else:
        self.logger.warning(
            'Reading improved_reps for pre-cached file. Generally used only for debugging.'
        )
        improved_reps = pickle.load(
            open(os.path.join(self.output_dir, 'improved_reps.pkl'), 'rb'))

    for prev_rid, (new_rid, action) in improved_reps.items():
        self.update_rep(prev_rid, new_rid, action)

    self.action_naming_priority(prev_genomes, cur_genomes,
                                new_updated_sp_clusters)

    # report basic statistics
    # a retired species has no replacement representative (new_rid is None)
    num_retired_sp = sum(
        [1 for v in self.new_reps.values() if v[0] is None])
    num_replaced_rids = sum(
        [1 for v in self.new_reps.values() if v[0] is not None])
    self.logger.info(f'Identified {num_retired_sp:,} retired species.')
    self.logger.info(
        f'Identified {num_replaced_rids:,} species with a modified representative genome.'
    )

    self.action_log.close()

    # write out representatives for existing species clusters
    fout = open(os.path.join(self.output_dir, 'updated_species_reps.tsv'),
                'w')
    fout.write(
        'Previous representative ID\tNew representative ID\tAction\tRepresentative status\n'
    )
    for rid in prev_genomes.sp_clusters:
        if rid in self.new_reps:
            new_rid, action = self.new_reps[rid]
            if new_rid is not None:
                fout.write(f'{rid}\t{new_rid}\t{action}\tREPLACED\n')
            else:
                fout.write(f'{rid}\t{new_rid}\t{action}\tLOST\n')
        else:
            fout.write(f'{rid}\t{rid}\tNONE\tUNCHANGED\n')

    fout.close()

    # write out updated species clusters
    out_file = os.path.join(self.output_dir, 'updated_sp_clusters.tsv')
    self.write_updated_clusters(prev_genomes, cur_genomes, self.new_reps,
                                new_updated_sp_clusters, out_file)
def run(self, prev_gtdb_metadata_file, cur_gtdb_metadata_file,
        genomes_new_updated_file, qc_passed_file, gtdbtk_classify_file,
        ncbi_genbank_assembly_file, untrustworthy_type_file,
        disband_cluster_ledger, gtdb_type_strains_ledger,
        ncbi_env_bioproject_ledger):
    """Identify species representatives that have changed from previous release.

    For every previous GTDB species representative this classifies:
      - genomic change (UNCHANGED / UPDATED / LOST),
      - NCBI species assignment change (compared on specific epithet only),
      - type strain status change (UNCHANGED / LOST / GAINED),
      - GTDB domain assignment change,
      - NCBI assembly quality (anomalous vs. good),
      - whether the cluster is explicitly flagged to be disbanded,
      - number of new type strain genomes added to the cluster.

    Results are written to 'rep_change_summary.tsv' (one row per previous
    representative) and 'rep_change_detailed.tsv' (one row per individual
    change) under self.output_dir, and aggregate statistics are logged.

    All parameters are file paths forwarded to Genomes.load_from_metadata_file,
    SpeciesClusters.create_expanded_clusters, or parse_disbanded_cluster_ledger;
    presumably TSV metadata/ledger files — exact formats are defined by those
    routines (TODO confirm against their docs).
    """

    # create previous and current GTDB genome sets
    self.logger.info('Creating previous GTDB genome set.')
    prev_genomes = Genomes()
    prev_genomes.load_from_metadata_file(
        prev_gtdb_metadata_file,
        gtdb_type_strains_ledger=gtdb_type_strains_ledger,
        ncbi_genbank_assembly_file=ncbi_genbank_assembly_file,
        untrustworthy_type_ledger=untrustworthy_type_file,
        ncbi_env_bioproject_ledger=ncbi_env_bioproject_ledger)
    self.logger.info(
        f' - previous genome set contains {len(prev_genomes):,} genomes.')
    self.logger.info(
        ' - previous genome set has {:,} species clusters spanning {:,} genomes.'
        .format(len(prev_genomes.sp_clusters),
                prev_genomes.sp_clusters.total_num_genomes()))

    self.logger.info('Creating current GTDB genome set.')
    cur_genomes = Genomes()
    # note: species clusters are deliberately not built for the current set
    # (create_sp_clusters=False); cluster membership of new/updated genomes
    # comes from the GTDB-Tk classifications read below
    cur_genomes.load_from_metadata_file(
        cur_gtdb_metadata_file,
        gtdb_type_strains_ledger=gtdb_type_strains_ledger,
        create_sp_clusters=False,
        qc_passed_file=qc_passed_file,
        ncbi_genbank_assembly_file=ncbi_genbank_assembly_file,
        untrustworthy_type_ledger=untrustworthy_type_file,
        ncbi_env_bioproject_ledger=ncbi_env_bioproject_ledger)

    # get previous and current genomes from type strains
    self.logger.info(
        'Determining genomes identified as being assembled from type strain.'
    )
    prev_type_strain_gids = prev_genomes.gtdb_type_strain_genomes()
    cur_type_strain_gids = cur_genomes.gtdb_type_strain_genomes()
    # genomes that became type strain material in the current release
    new_type_strain_gids = cur_type_strain_gids - prev_type_strain_gids
    self.logger.info(
        ' - identified {:,} previous and {:,} current genomes from type strain.'
        .format(len(prev_type_strain_gids), len(cur_type_strain_gids)))
    self.logger.info(
        ' - {:,} type strain genomes are new to the current genome set.'.
        format(len(new_type_strain_gids)))

    # create expanded previous GTDB species clusters
    self.logger.info(
        'Creating species clusters of new and updated genomes based on GTDB-Tk classifications.'
    )
    new_updated_sp_clusters = SpeciesClusters()
    new_updated_sp_clusters.create_expanded_clusters(
        prev_genomes, genomes_new_updated_file, qc_passed_file,
        gtdbtk_classify_file)
    self.logger.info(
        'Identified {:,} expanded species clusters spanning {:,} genomes.'.
        format(len(new_updated_sp_clusters),
               new_updated_sp_clusters.total_num_genomes()))

    # read GTDB clusters to be disbanded
    self.logger.info(
        'Parsing ledger indicating GTDB clusters to be disbanded.')
    disbanded_rids = parse_disbanded_cluster_ledger(disband_cluster_ledger)
    self.logger.info(' - identified {:,} clusters to be disbanded.'.format(
        len(disbanded_rids)))

    # determine status of each previous GTDB representative;
    # the summary file gets one row per representative built from
    # sequential, positional writes — column order below must match the
    # header written here
    self.logger.info(
        'Determining status of each previous GTDB representative.')

    fout_summary = open(
        os.path.join(self.output_dir, 'rep_change_summary.tsv'), 'w')
    fout_summary.write('Genome ID\tPrevious GTDB species\tNo. genomes in cluster')
    fout_summary.write(
        '\tGENOMIC_CHANGE\tNCBI_SPECIES_CHANGE\tTYPE_STRAIN_CHANGE\tDOMAIN_CHECK\tNCBI_ASSEMBLY_QUALITY\tDISBANDED_CHECK'
    )
    fout_summary.write('\tNew type strains\tRepresentative changed\n')

    fout_detailed = open(
        os.path.join(self.output_dir, 'rep_change_detailed.tsv'), 'w')
    fout_detailed.write(
        'Genome ID\tPrevious GTDB species\tChange type\tChange\n')

    # tally sets; these also drive the per-row "Representative changed"
    # decision and the summary statistics logged at the end
    unchanged_genome = set()
    updated_genome = set()
    lost_genome = set()
    unchanged_sp = set()
    reassigned_sp = set()
    unchanged_type_strain = set()
    lost_type_strain = set()
    gain_type_strain = set()
    new_type_strain = set()
    ncbi_anomalous_assembly = set()
    changed_domain = set()
    unchanged_domain = set()
    num_rep_changes = 0
    disbanded_count = 0
    for prev_rid, prev_gtdb_sp in prev_genomes.sp_clusters.species():
        fout_summary.write(
            f'{prev_rid}\t{prev_gtdb_sp}\t{len(prev_genomes.sp_clusters[prev_rid])}'
        )

        if prev_rid in cur_genomes:
            # check if genome assembly has been updated
            if prev_rid in new_updated_sp_clusters.updated_gids:
                updated_genome.add(prev_rid)
                fout_summary.write('\tUPDATED')
                # an updated assembly implies the NCBI accession changed
                prev_ncbi_accn = prev_genomes[prev_rid].ncbi_accn
                cur_ncbi_accn = cur_genomes[prev_rid].ncbi_accn
                assert prev_ncbi_accn != cur_ncbi_accn
                fout_detailed.write((
                    f'{prev_rid}\t{prev_gtdb_sp}\tGENOMIC_CHANGE:UPDATED\tNCBI accession updated from '
                    f'{prev_genomes[prev_rid].ncbi_accn} to {cur_genomes[prev_rid].ncbi_accn}\n'
                ))
            else:
                unchanged_genome.add(prev_rid)
                fout_summary.write('\tUNCHANGED')

            # check if NCBI species assignment has changed;
            # compared on the specific epithet only, so a transfer to a
            # different genus with the same epithet is NOT a reassignment
            prev_ncbi_sp = prev_genomes[prev_rid].ncbi_taxa.species
            cur_ncbi_sp = cur_genomes[prev_rid].ncbi_taxa.species
            if prev_genomes[
                    prev_rid].ncbi_taxa.specific_epithet == cur_genomes[
                        prev_rid].ncbi_taxa.specific_epithet:
                unchanged_sp.add(prev_rid)
                fout_summary.write('\tUNCHANGED')
            else:
                reassigned_sp.add(prev_rid)
                fout_summary.write('\tREASSIGNED')
                fout_detailed.write(
                    f'{prev_rid}\t{prev_gtdb_sp}\tNCBI_SPECIES_CHANGE:REASSIGNED\tNCBI species reassigned from {prev_ncbi_sp} to {cur_ncbi_sp}\n'
                )

            # check if type material status has changed;
            # the four membership combinations are enumerated exhaustively
            if prev_rid in prev_type_strain_gids and prev_rid in cur_type_strain_gids:
                unchanged_type_strain.add(prev_rid)
                fout_summary.write('\tUNCHANGED')
            elif prev_rid not in prev_type_strain_gids and prev_rid not in cur_type_strain_gids:
                unchanged_type_strain.add(prev_rid)
                fout_summary.write('\tUNCHANGED')
            elif prev_rid in prev_type_strain_gids and prev_rid not in cur_type_strain_gids:
                lost_type_strain.add(prev_rid)
                fout_summary.write('\tLOST')
                fout_detailed.write(
                    f'{prev_rid}\t{prev_gtdb_sp}\tTYPE_STRAIN_CHANGE:LOST\tNo longer considered a genome from type strain\n'
                )
            elif prev_rid not in prev_type_strain_gids and prev_rid in cur_type_strain_gids:
                gain_type_strain.add(prev_rid)
                fout_summary.write('\tGAINED')
                fout_detailed.write(
                    f'{prev_rid}\t{prev_gtdb_sp}\tTYPE_STRAIN_CHANGE:GAINED\tNow considered a genome from type strain\n'
                )
            else:
                assert False  # unreachable: all four cases covered above

            # check if domain assignment has changed
            if prev_genomes[prev_rid].gtdb_taxa.domain != cur_genomes[
                    prev_rid].gtdb_taxa.domain:
                changed_domain.add(prev_rid)
                fout_detailed.write(
                    '{}\t{}\tDOMAIN_CHECK:REASSIGNED\tRepresentative changed from {} to {}\n'
                    .format(prev_rid, prev_gtdb_sp,
                            prev_genomes[prev_rid].gtdb_taxa.domain,
                            cur_genomes[prev_rid].gtdb_taxa.domain))
                fout_summary.write('\tREASSIGNED')
            else:
                unchanged_domain.add(prev_rid)
                fout_summary.write('\tUNCHANGED')

            # check if NCBI has marked genome assembly as problematic
            if cur_genomes[prev_rid].is_ncbi_many_frameshifted_proteins(
            ) or cur_genomes[prev_rid].is_ncbi_anomalous_assembly():
                ncbi_anomalous_assembly.add(prev_rid)
                fout_summary.write('\tNCBI_ANOMALOUS_ASSEMBLY')
                fout_detailed.write(
                    '{}\t{}\tNCBI_ASSEMBLY_METADATA:NCBI_ANOMALOUS_ASSEMBLY\tExcluded = {}\n'
                    .format(
                        prev_rid, prev_gtdb_sp,
                        cur_genomes[prev_rid].excluded_from_refseq_note))
            else:
                fout_summary.write('\tNCBI_GOOD_ASSEMBLY')

            # check if GTDB species cluster is flagged to be disbanded
            if prev_rid in disbanded_rids:
                disbanded_count += 1
                fout_summary.write('\tTRUE')
                fout_detailed.write(
                    '{}\t{}\tEXPLICIT_UPDATE:DISBANDED\t\n'.format(
                        prev_rid, prev_gtdb_sp))
            else:
                fout_summary.write('\tFALSE')

            # check if genome cluster has new genomes assembled from the type strain of the species;
            # membership is taken over the previous cluster expanded with any
            # new/updated genomes classified into it by GTDB-Tk
            sp_gids = prev_genomes.sp_clusters[prev_rid]
            if prev_rid in new_updated_sp_clusters:
                sp_gids = sp_gids.union(new_updated_sp_clusters[prev_rid])
            new_ts = new_type_strain_gids.intersection(sp_gids)
            if new_ts:
                new_type_strain.add(prev_rid)
                fout_detailed.write(
                    '{}\t{}\tNEW_TYPE_STRAINS:NEW\tSpecies cluster has {:,} new genomes from type strain: {}\n'
                    .format(prev_rid, prev_gtdb_sp, len(new_ts),
                            ','.join(new_ts)))
            fout_summary.write(f'\t{len(new_ts)}')

            # check if representative has changed: a representative is
            # unchanged only if the genome, NCBI species, type strain
            # status, and domain are all unchanged and the cluster is not
            # being disbanded
            if (prev_rid in unchanged_genome and prev_rid in unchanged_sp
                    and prev_rid in unchanged_type_strain
                    and prev_rid in unchanged_domain
                    and prev_rid not in disbanded_rids):
                fout_summary.write('\tNO')
            else:
                fout_summary.write('\tYES')
                num_rep_changes += 1

            fout_summary.write('\n')
        else:
            # representative no longer in current genome set
            # NOTE(review): this row writes 6 trailing fields, but the
            # header defines 8 trailing columns — confirm whether the
            # 'New type strains' / 'Representative changed' columns are
            # intentionally collapsed for LOST rows
            lost_genome.add(prev_rid)
            fout_summary.write('\t{}\t{}\t{}\t{}\t{}\t{}\n'.format(
                'LOST', 'N/A', 'N/A', 'N/A', 'N/A', 'YES'))
            fout_detailed.write(
                f'{prev_rid}\t{prev_gtdb_sp}\tGENOMIC_CHANGE:LOST\tGenome not present in current GTDB release\n'
            )
            num_rep_changes += 1

    fout_summary.close()
    fout_detailed.close()

    # report summary statistics
    # NOTE(review): the percentage denominators below (num_prev_sp_clusters,
    # cur_sp_count, prev_ts_count) raise ZeroDivisionError when empty —
    # presumably never the case for a real release; confirm
    num_prev_sp_clusters = len(prev_genomes.sp_clusters)
    num_rep_changes_perc = num_rep_changes * 100.0 / num_prev_sp_clusters
    self.logger.info(
        f' - identified {num_rep_changes:,} ({num_rep_changes_perc:.1f}%) species with a change to the representative genome.'
    )

    self.logger.info('Genomic changes:')
    unchanged_perc = len(unchanged_genome) * 100.0 / num_prev_sp_clusters
    updated_perc = len(updated_genome) * 100.0 / num_prev_sp_clusters
    lost_perc = len(lost_genome) * 100.0 / num_prev_sp_clusters
    self.logger.info(
        f' unchanged_genome: {len(unchanged_genome):,} ({unchanged_perc:.1f}%)'
    )
    self.logger.info(
        f' updated_genome: {len(updated_genome):,} ({updated_perc:.1f}%)')
    self.logger.info(
        f' lost_genome: {len(lost_genome):,} ({lost_perc:.1f}%)')

    self.logger.info('NCBI species assignment changes:')
    # species changes are only assessed for genomes still present
    # (unchanged or updated), so percentages use that count
    cur_sp_count = len(unchanged_genome) + len(updated_genome)
    unchanged_sp_perc = len(unchanged_sp) * 100.0 / cur_sp_count
    reassigned_sp_perc = len(reassigned_sp) * 100.0 / cur_sp_count
    self.logger.info(
        f' unchanged_sp: {len(unchanged_sp):,} ({unchanged_sp_perc:.1f}%)'
    )
    self.logger.info(
        f' reassigned_sp: {len(reassigned_sp):,} ({reassigned_sp_perc:.1f}%)'
    )

    self.logger.info('Status of type strain genome declarations:')
    prev_ts_count = len(unchanged_type_strain) + len(lost_type_strain)
    unchanged_type_strain_perc = len(
        unchanged_type_strain) * 100.0 / prev_ts_count
    lost_type_strain_perc = len(lost_type_strain) * 100.0 / prev_ts_count
    gain_type_strain_perc = len(gain_type_strain) * 100.0 / prev_ts_count
    new_type_strain_perc = len(new_type_strain) * 100.0 / prev_ts_count
    self.logger.info(
        f' unchanged_type_strain: {len(unchanged_type_strain):,} ({unchanged_type_strain_perc:.1f}%)'
    )
    self.logger.info(
        f' lost_type_strain: {len(lost_type_strain):,} ({lost_type_strain_perc:.1f}%)'
    )
    self.logger.info(
        f' gain_type_strain: {len(gain_type_strain):,} ({gain_type_strain_perc:.1f}%)'
    )
    self.logger.info(
        f' new_type_strain: {len(new_type_strain):,} ({new_type_strain_perc:.1f}%)'
    )

    self.logger.info('GTDB domain assignment change:')
    unchanged_domain_perc = len(
        unchanged_domain) * 100.0 / num_prev_sp_clusters
    changed_domain_perc = len(
        changed_domain) * 100.0 / num_prev_sp_clusters
    self.logger.info(
        f' unchanged: {len(unchanged_domain):,} ({unchanged_domain_perc:.1f}%)'
    )
    self.logger.info(
        f' reassigned: {len(changed_domain):,} ({changed_domain_perc:.1f}%)'
    )

    self.logger.info(
        'Identified {:,} representatives marked as anomalous assemblies at NCBI.'
        .format(len(ncbi_anomalous_assembly)))

    self.logger.info(
        'Identified {:,} GTDB clusters to be disbanded.'.format(
            disbanded_count))