def run(self, gtdb_clusters_file, prev_gtdb_metadata_file, cur_gtdb_metadata_file, uba_genome_paths, qc_passed_file): """Perform initial actions required for changed representatives.""" # create previous and current GTDB genome sets self.logger.info('Creating previous GTDB genome set.') prev_genomes = Genomes() prev_genomes.load_from_metadata_file(prev_gtdb_metadata_file, uba_genome_file=uba_genome_paths) self.logger.info( ' ... previous genome set has {:,} species clusters spanning {:,} genomes.' .format(len(prev_genomes.sp_clusters), prev_genomes.sp_clusters.total_num_genomes())) self.logger.info('Creating current GTDB genome set.') cur_genomes = Genomes() cur_genomes.load_from_metadata_file(cur_gtdb_metadata_file, create_sp_clusters=False, uba_genome_file=uba_genome_paths, qc_passed_file=qc_passed_file) self.logger.info( f' ... current genome set contains {len(cur_genomes):,} genomes.') # read named GTDB species clusters self.logger.info('Reading GTDB species clusters.') cur_clusters, _ = read_clusters(gtdb_clusters_file) self.logger.info( ' ... identified {:,} clusters spanning {:,} genomes.'.format( len(cur_clusters), sum([len(gids) + 1 for gids in cur_clusters.values()]))) # create curate tree and table indicating new NCBI taxa as these # should be considered by GTDB curators self.new_ncbi_taxa(prev_genomes, cur_genomes, cur_clusters)
def run(self, gtdb_clusters_file, cur_gtdb_metadata_file, uba_genome_paths,
        qc_passed_file, ncbi_misclassified_file, ncbi_genbank_assembly_file,
        untrustworthy_type_file, ani_af_rep_vs_nonrep,
        gtdb_type_strains_ledger, sp_priority_ledger, genus_priority_ledger,
        dsmz_bacnames_file):
    """Identify NCBI species that are synonyms under the GTDB and write out a synonym table.

    Loads the current GTDB genome set and named species clusters, determines
    type strain and consensus synonyms from genomes with misclassified NCBI
    species assignments, and writes the resulting synonym table.

    :param gtdb_clusters_file: File defining GTDB species clusters (parsed by read_clusters).
    :param cur_gtdb_metadata_file: Metadata file for the current GTDB release.
    :param ncbi_misclassified_file: Table of genomes with erroneous NCBI species assignments.
    :param ani_af_rep_vs_nonrep: Pickle file with ANI/AF between representative and non-representative genomes.
    """

    # create current GTDB genome set
    self.logger.info('Creating current GTDB genome set.')
    cur_genomes = Genomes()
    cur_genomes.load_from_metadata_file(
        cur_gtdb_metadata_file,
        gtdb_type_strains_ledger=gtdb_type_strains_ledger,
        create_sp_clusters=False,
        uba_genome_file=uba_genome_paths,
        qc_passed_file=qc_passed_file,
        ncbi_genbank_assembly_file=ncbi_genbank_assembly_file,
        untrustworthy_type_ledger=untrustworthy_type_file)
    self.logger.info(
        f' ... current genome set contains {len(cur_genomes):,} genomes.')

    # read named GTDB species clusters; the cluster radius is not needed here
    self.logger.info(
        'Reading named and previous placeholder GTDB species clusters.')
    cur_clusters, _rep_radius = read_clusters(gtdb_clusters_file)
    self.logger.info(
        ' ... identified {:,} clusters spanning {:,} genomes.'.format(
            len(cur_clusters),
            sum([len(gids) + 1 for gids in cur_clusters.values()])))

    # identified genomes with misclassified species assignments at NCBI
    self.logger.info(
        'Identify genomes with misclassified NCBI species assignments.')
    ncbi_species_mngr = NCBI_SpeciesManager(cur_genomes, cur_clusters,
                                            self.output_dir)
    ncbi_misclassified_gids = ncbi_species_mngr.parse_ncbi_misclassified_table(
        ncbi_misclassified_file)
    self.logger.info(
        ' - identified {:,} genomes with erroneous NCBI species assignments'.format(
            len(ncbi_misclassified_gids)))

    # identify NCBI species considered to be synonyms under the GTDB
    type_strain_synonyms = ncbi_species_mngr.identify_type_strain_synonyms(
        ncbi_misclassified_gids)
    consensus_synonyms = ncbi_species_mngr.identify_consensus_synonyms(
        ncbi_misclassified_gids)

    # read ANI and AF between representatives and non-representative genomes;
    # use a context manager so the pickle file handle is closed (was leaked)
    self.logger.info(
        'Reading ANI and AF between representative and non-representative genomes.')
    with open(ani_af_rep_vs_nonrep, 'rb') as f:
        ani_af = pickle.load(f)

    # write out synonyms
    ncbi_species_mngr.write_synonym_table(type_strain_synonyms,
                                          consensus_synonyms,
                                          ani_af,
                                          sp_priority_ledger,
                                          genus_priority_ledger,
                                          dsmz_bacnames_file)
def run(self, gtdb_clusters_file, cur_gtdb_metadata_file, cur_genomic_path_file, uba_genome_paths, qc_passed_file, ncbi_genbank_assembly_file, untrustworthy_type_file, gtdb_type_strains_ledger, sp_priority_ledger, genus_priority_ledger, dsmz_bacnames_file): """Cluster genomes to selected GTDB representatives.""" # create current GTDB genome sets self.logger.info('Creating current GTDB genome set.') cur_genomes = Genomes() cur_genomes.load_from_metadata_file( cur_gtdb_metadata_file, gtdb_type_strains_ledger=gtdb_type_strains_ledger, create_sp_clusters=False, uba_genome_file=uba_genome_paths, qc_passed_file=qc_passed_file, ncbi_genbank_assembly_file=ncbi_genbank_assembly_file, untrustworthy_type_ledger=untrustworthy_type_file) self.logger.info( f' ... current genome set contains {len(cur_genomes):,} genomes.') # get path to previous and current genomic FASTA files self.logger.info('Reading path to current genomic FASTA files.') cur_genomes.load_genomic_file_paths(cur_genomic_path_file) cur_genomes.load_genomic_file_paths(uba_genome_paths) # read named GTDB species clusters self.logger.info( 'Reading named and previous placeholder GTDB species clusters.') cur_clusters, rep_radius = read_clusters(gtdb_clusters_file) self.logger.info( ' ... identified {:,} clusters spanning {:,} genomes.'.format( len(cur_clusters), sum([len(gids) + 1 for gids in cur_clusters.values()]))) # identify genomes with erroneous NCBI species assignments self.logger.info( 'Identifying genomes with erroneous NCBI species assignments as established by ANI type strain genomes.' ) self.identify_misclassified_genomes_ani(cur_genomes, cur_clusters) self.logger.info( 'Identifying genomes with erroneous NCBI species assignments as established by GTDB cluster of type strain genomes.' ) self.identify_misclassified_genomes_cluster(cur_genomes, cur_clusters)
def run(self, gtdb_clusters_file, prev_gtdb_metadata_file, cur_gtdb_metadata_file, qc_passed_file, ncbi_genbank_assembly_file, untrustworthy_type_file, gtdb_type_strains_ledger, ncbi_env_bioproject_ledger): """Perform initial actions required for changed representatives.""" # create previous and current GTDB genome sets self.logger.info('Creating previous GTDB genome set.') prev_genomes = Genomes() prev_genomes.load_from_metadata_file( prev_gtdb_metadata_file, gtdb_type_strains_ledger=gtdb_type_strains_ledger, ncbi_genbank_assembly_file=ncbi_genbank_assembly_file, untrustworthy_type_ledger=untrustworthy_type_file, ncbi_env_bioproject_ledger=ncbi_env_bioproject_ledger) self.logger.info( ' - previous genome set has {:,} species clusters spanning {:,} genomes.' .format(len(prev_genomes.sp_clusters), prev_genomes.sp_clusters.total_num_genomes())) self.logger.info('Creating current GTDB genome set.') cur_genomes = Genomes() cur_genomes.load_from_metadata_file( cur_gtdb_metadata_file, gtdb_type_strains_ledger=gtdb_type_strains_ledger, create_sp_clusters=False, qc_passed_file=qc_passed_file, ncbi_genbank_assembly_file=ncbi_genbank_assembly_file, untrustworthy_type_ledger=untrustworthy_type_file, ncbi_env_bioproject_ledger=ncbi_env_bioproject_ledger) # read named GTDB species clusters self.logger.info('Reading GTDB species clusters.') cur_clusters, _ = read_clusters(gtdb_clusters_file) self.logger.info( ' - identified {:,} clusters spanning {:,} genomes.'.format( len(cur_clusters), sum([len(gids) + 1 for gids in cur_clusters.values()]))) # create curate tree and table indicating new NCBI taxa as these # should be considered by GTDB curators self.new_ncbi_taxa(prev_genomes, cur_genomes, cur_clusters)
def run(self, gtdb_clusters_file, prev_gtdb_metadata_file,
        cur_gtdb_metadata_file, uba_genome_paths, qc_passed_file,
        gtdbtk_classify_file, ncbi_genbank_assembly_file,
        untrustworthy_type_file, gtdb_type_strains_ledger,
        sp_priority_ledger, gtdb_taxa_updates_ledger, dsmz_bacnames_file):
    """Perform initial actions required for changed representatives.

    Carries forward GTDB taxonomy from the previous release to the current
    genome set, applies explicit taxa updates from the ledger, and produces
    tables of new NCBI genera and families for GTDB curators.
    """

    # create previous and current GTDB genome sets
    self.logger.info('Creating previous GTDB genome set.')
    prev_genomes = Genomes()
    prev_genomes.load_from_metadata_file(
        prev_gtdb_metadata_file,
        gtdb_type_strains_ledger=gtdb_type_strains_ledger,
        uba_genome_file=uba_genome_paths,
        ncbi_genbank_assembly_file=ncbi_genbank_assembly_file,
        untrustworthy_type_ledger=untrustworthy_type_file)
    self.logger.info(
        ' ... previous genome set has {:,} species clusters spanning {:,} genomes.'
        .format(len(prev_genomes.sp_clusters),
                prev_genomes.sp_clusters.total_num_genomes()))

    self.logger.info('Creating current GTDB genome set.')
    cur_genomes = Genomes()
    cur_genomes.load_from_metadata_file(
        cur_gtdb_metadata_file,
        gtdb_type_strains_ledger=gtdb_type_strains_ledger,
        create_sp_clusters=False,
        uba_genome_file=uba_genome_paths,
        qc_passed_file=qc_passed_file,
        ncbi_genbank_assembly_file=ncbi_genbank_assembly_file,
        untrustworthy_type_ledger=untrustworthy_type_file,
        gtdbtk_classify_file=gtdbtk_classify_file)
    self.logger.info(
        f' ... current genome set contains {len(cur_genomes):,} genomes.')

    # read named GTDB species clusters
    self.logger.info('Reading GTDB species clusters.')
    cur_clusters, _ = read_clusters(gtdb_clusters_file)
    self.logger.info(
        ' ... identified {:,} clusters spanning {:,} genomes.'.format(
            len(cur_clusters),
            sum([len(gids) + 1 for gids in cur_clusters.values()])))

    # set current genomes to have same GTDB assignments as in previous
    # GTDB release. This is necessary, since genomes may have different
    # NCBI accession numbers between releases and thus the previous GTDB
    # taxonomy will not be reflected in the latest GTDB database. The
    # exception is if a genome has changed domains, in which case the
    # previous assignment is invalid.
    self.logger.info(
        'Setting GTDB taxonomy of genomes in current genome set.')
    update_count = 0
    conflicting_domain_count = 0
    for prev_gid in prev_genomes:
        if prev_gid in cur_genomes:
            if prev_genomes[prev_gid].gtdb_taxa != cur_genomes[
                    prev_gid].gtdb_taxa:
                if prev_genomes[prev_gid].gtdb_taxa.domain == cur_genomes[
                        prev_gid].gtdb_taxa.domain:
                    # carry forward the previous taxonomy unchanged
                    update_count += 1
                    cur_genomes[prev_gid].gtdb_taxa.update_taxa(
                        prev_genomes[prev_gid].gtdb_taxa)
                else:
                    # domain changed between releases: previous assignment
                    # is invalid and is not carried forward
                    conflicting_domain_count += 1
    self.logger.info(f' ... updated {update_count:,} genomes.')
    self.logger.info(
        f' ... identified {conflicting_domain_count:,} genomes with conflicting domain assignments.'
    )

    # get explicit updates to previous GTDB taxa
    self.logger.info('Reading explicit taxa updates.')
    explicit_taxon_updates = self._parse_explicit_taxa_updates(
        gtdb_taxa_updates_ledger)
    self.logger.info(
        f' ... identified {len(explicit_taxon_updates):,} updates.')

    self.logger.info(
        'Updating current genomes to reflect explicit taxa updates.')
    update_count = 0
    for cur_taxon, new_taxon in explicit_taxon_updates.items():
        # taxa are prefixed with their rank, e.g. 'g__Bacillus'
        rank_prefix = cur_taxon[0:3]
        rank_index = Taxonomy.rank_prefixes.index(rank_prefix)
        for gid in cur_genomes:
            if cur_genomes[gid].gtdb_taxa.get_taxa(
                    rank_index) == cur_taxon:
                update_count += 1
                cur_genomes[gid].gtdb_taxa.set_taxa(rank_index, new_taxon)

                if rank_prefix == 'g__':
                    # should also update the species name
                    new_sp = cur_genomes[gid].gtdb_taxa.species.replace(
                        cur_taxon[3:], new_taxon[3:])
                    cur_genomes[gid].gtdb_taxa.set_taxa(
                        rank_index + 1, new_sp)

    self.logger.info(f' ... updated {update_count:,} genomes.')

    # initialize species priority manager
    self.sp_priority_mngr = SpeciesPriorityManager(sp_priority_ledger,
                                                   dsmz_bacnames_file)

    # create table with new NCBI genera that likely need to be incorporated into
    # this release of the GTDB
    self.new_ncbi_genera(prev_genomes, cur_genomes, cur_clusters,
                         gtdbtk_classify_file)

    self.new_ncbi_families(prev_genomes, cur_genomes, cur_clusters,
                           gtdbtk_classify_file)
def run(self, qc_file, gtdb_metadata_file, gtdb_final_clusters,
        species_exception_file, output_dir):
    """Quality check all potential GTDB genomes.

    Validates that all genomes in species clusters pass QC, reports
    incongruent specific/genus names, and writes canonical and validation
    taxonomy files plus genome ID lists for bacteria and archaea.
    """

    # identify genomes failing quality criteria
    self.logger.info('Reading QC file.')
    passed_qc = read_qc_file(qc_file)
    self.logger.info('Identified %d genomes passing QC.' % len(passed_qc))

    # get GTDB and NCBI taxonomy strings for each genome
    self.logger.info(
        'Reading NCBI and GTDB taxonomy from GTDB metadata file.')
    ncbi_taxonomy, ncbi_update_count = read_gtdb_ncbi_taxonomy(
        gtdb_metadata_file, species_exception_file)
    prev_gtdb_taxonomy = read_gtdb_taxonomy(gtdb_metadata_file)
    self.logger.info(
        'Read NCBI taxonomy for %d genomes with %d manually defined updates.'
        % (len(ncbi_taxonomy), ncbi_update_count))
    self.logger.info('Read GTDB taxonomy for %d genomes.' %
                     len(prev_gtdb_taxonomy))

    # get GTDB metadata
    type_metadata = read_gtdb_metadata(gtdb_metadata_file, [
        'gtdb_type_designation', 'gtdb_type_designation_sources',
        'gtdb_type_species_of_genus'
    ])

    quality_metadata = read_quality_metadata(gtdb_metadata_file)

    # read species clusters
    sp_clusters, species, _rep_radius = read_clusters(gtdb_final_clusters)
    self.logger.info('Read %d species clusters.' % len(sp_clusters))

    # sanity check species clusters all defined by genomes passing QC
    for gid in sp_clusters:
        if gid not in passed_qc:
            self.logger.error(
                'Genome %s defines a species cluster, but fails QC.' % gid)
            sys.exit(-1)

    # modify GTDB taxonomy to reflect new species clustering and report incongruencies
    self.logger.info(
        'Identifying species with incongruent specific names.')
    self._incongruent_specific_names(species, ncbi_taxonomy,
                                     prev_gtdb_taxonomy, type_metadata,
                                     output_dir)

    self._incongruent_genus_names(species, ncbi_taxonomy,
                                  prev_gtdb_taxonomy, type_metadata,
                                  output_dir)

    # get GIDs for canonical and validation trees
    # (canonical = representatives only; validation additionally contains the
    # highest-quality non-representative genome of each cluster)
    fout_bac_can_gtdb = open(
        os.path.join(output_dir, 'bac_can_taxonomy.tsv'), 'w')
    fout_bac_val_gtdb = open(
        os.path.join(output_dir, 'bac_val_taxonomy.tsv'), 'w')
    fout_ar_can_gtdb = open(
        os.path.join(output_dir, 'ar_can_taxonomy.tsv'), 'w')
    fout_ar_val_gtdb = open(
        os.path.join(output_dir, 'ar_val_taxonomy.tsv'), 'w')

    fout_bac_val = open(
        os.path.join(output_dir, 'gids_bac_validation.lst'), 'w')
    fout_ar_val = open(os.path.join(output_dir, 'gids_ar_validation.lst'),
                       'w')
    fout_bac_can = open(os.path.join(output_dir, 'gids_bac_canonical.lst'),
                        'w')
    fout_ar_can = open(os.path.join(output_dir, 'gids_ar_canonical.lst'),
                       'w')
    fout_bac_val.write('#Accession\tSpecies\tNote\n')
    fout_ar_val.write('#Accession\tSpecies\tNote\n')
    fout_bac_can.write('#Accession\tSpecies\tNote\n')
    fout_ar_can.write('#Accession\tSpecies\tNote\n')

    for rid in sp_clusters:
        # route output to the bacterial or archaeal files based on domain
        domain = prev_gtdb_taxonomy[rid][0]
        if domain == 'd__Bacteria':
            fout_val = fout_bac_val
            fout_can = fout_bac_can
            fout_can_gtdb = fout_bac_can_gtdb
            fout_val_gtdb = fout_bac_val_gtdb
        elif domain == 'd__Archaea':
            fout_val = fout_ar_val
            fout_can = fout_ar_can
            fout_can_gtdb = fout_ar_can_gtdb
            fout_val_gtdb = fout_ar_val_gtdb
        else:
            self.logger.error('Genome %s has no GTDB domain assignment.'
                              % rid)
            sys.exit(-1)

        # substitute proposed species name into GTDB taxonomy
        taxa = prev_gtdb_taxonomy[rid][0:6] + [species[rid]]
        new_gtdb_str = '; '.join(taxa)

        fout_can_gtdb.write('%s\t%s\n' % (rid, new_gtdb_str))
        fout_val_gtdb.write('%s\t%s\n' % (rid, new_gtdb_str))

        fout_val.write(
            '%s\t%s\t%s\n' %
            (rid, species[rid], 'GTDB type or representative genome'))
        fout_can.write(
            '%s\t%s\t%s\n' %
            (rid, species[rid], 'GTDB type or representative genome'))

        cluster_gids = set(sp_clusters[rid])
        for gid in cluster_gids:
            if gid not in passed_qc:
                self.logger.error(
                    'Genome %s is in a species cluster, but fails QC.' %
                    gid)
                sys.exit(-1)

        if len(cluster_gids) > 0:
            # select highest-quality genome
            q = quality_score(cluster_gids, quality_metadata)
            gid = max(q.items(), key=operator.itemgetter(1))[0]

            taxa = prev_gtdb_taxonomy[gid][0:6] + [species[rid]]
            new_gtdb_str = '; '.join(taxa)

            fout_val.write(
                '%s\t%s\t%s\n' %
                (gid, species[rid],
                 'selected highest-quality genome (Q=%.2f)' % q[gid]))
            fout_val_gtdb.write('%s\t%s\n' % (gid, new_gtdb_str))

    fout_bac_val.close()
    fout_ar_val.close()
    fout_bac_can.close()
    fout_ar_can.close()

    fout_bac_can_gtdb.close()
    fout_bac_val_gtdb.close()
    fout_ar_can_gtdb.close()
    fout_ar_val_gtdb.close()
def run(self, updated_sp_rep_file, gtdb_clusters_file,
        prev_gtdb_metadata_file, cur_gtdb_metadata_file, uba_genome_paths,
        qc_passed_file, gtdbtk_classify_file, ncbi_genbank_assembly_file,
        untrustworthy_type_file, synonym_file, gtdb_type_strains_ledger):
    """Summary statistics indicating changes to GTDB species clusters.

    Compares the previous and current GTDB genome sets and species clusters,
    writes per-cluster change statistics to
    'gtdb_sp_clusters_change_stats.tsv', and logs aggregate genome and
    representative statistics.

    NOTE(review): synonym_file is accepted but not used in this method —
    confirm before removing.
    """

    # create previous and current GTDB genome sets
    self.logger.info('Creating previous GTDB genome set.')
    prev_genomes = Genomes()
    prev_genomes.load_from_metadata_file(
        prev_gtdb_metadata_file,
        gtdb_type_strains_ledger=gtdb_type_strains_ledger,
        uba_genome_file=uba_genome_paths,
        ncbi_genbank_assembly_file=ncbi_genbank_assembly_file,
        untrustworthy_type_ledger=untrustworthy_type_file)
    self.logger.info(
        ' ... previous genome set has {:,} species clusters spanning {:,} genomes.'
        .format(len(prev_genomes.sp_clusters),
                prev_genomes.sp_clusters.total_num_genomes()))

    self.logger.info('Creating current GTDB genome set.')
    cur_genomes = Genomes()
    cur_genomes.load_from_metadata_file(
        cur_gtdb_metadata_file,
        gtdb_type_strains_ledger=gtdb_type_strains_ledger,
        create_sp_clusters=False,
        uba_genome_file=uba_genome_paths,
        qc_passed_file=qc_passed_file,
        ncbi_genbank_assembly_file=ncbi_genbank_assembly_file,
        untrustworthy_type_ledger=untrustworthy_type_file)
    self.logger.info(
        f' ... current genome set contains {len(cur_genomes):,} genomes.')

    # update current genomes with GTDB-Tk classifications
    self.logger.info(
        'Updating current genomes with GTDB-Tk classifications.')
    num_updated, num_ncbi_sp = cur_genomes.set_gtdbtk_classification(
        gtdbtk_classify_file, prev_genomes)
    self.logger.info(
        f' ... set GTDB taxa for {num_updated:,} genomes with {num_ncbi_sp:,} genomes using NCBI genus and species name.'
    )

    # report changes in genome sets
    self.logger.info('Comparing previous and current genome sets.')
    prev_gids = set(prev_genomes)
    new_gids = set(cur_genomes)
    num_same_genomes = len(prev_gids.intersection(new_gids))
    num_lost_genomes = len(prev_gids - new_gids)
    num_new_genomes = len(new_gids - prev_gids)
    self.logger.info(
        f' ... identified {num_same_genomes:,} genomes as being present in both genome sets.'
    )
    self.logger.info(
        f' ... identified {num_lost_genomes:,} genomes as being lost from the previous genome set.'
    )
    self.logger.info(
        f' ... identified {num_new_genomes:,} genomes as being new to the current genome set.'
    )

    # get changes to representatives of previous GTDB species clusters
    updated_rids = self._parse_updated_sp_reps(updated_sp_rep_file)

    # get new GTDB species clusters
    self.logger.info('Reading current GTDB clusters.')
    new_clusters, _ = read_clusters(gtdb_clusters_file)
    self.logger.info(
        ' ... current genome set has {:,} species clusters spanning {:,} genomes.'
        .format(len(new_clusters),
                sum(len(cids) for cids in new_clusters.values())))

    # map each genome (including representatives) to its new cluster
    new_rid_map = {}
    for rid, cids in new_clusters.items():
        for cid in cids:
            new_rid_map[cid] = rid

    # UBA genome sanity check
    prev_uba_count = sum(1 for gid in prev_genomes if gid.startswith('UBA'))
    cur_uba_count = sum(1 for gid in cur_genomes if gid.startswith('UBA'))
    new_uba_count = sum(1 for cids in new_clusters.values()
                        for cid in cids if cid.startswith('UBA'))
    # verify BEFORE logging success (original logged first, then asserted)
    assert prev_uba_count == cur_uba_count == new_uba_count
    self.logger.info(
        f'Verified all genome / cluster sets contain the same number of UBA genomes: {prev_uba_count:,}'
    )

    # tabulate changes in GTDB species clusters
    self.logger.info('Calculating statistics of GTDB species clusters.')
    fout = open(
        os.path.join(self.output_dir, 'gtdb_sp_clusters_change_stats.tsv'),
        'w')
    fout.write(
        'Previous representative\tPrevious name\tNew representative\tNew name\tRepresentative status\tName status'
    )
    fout.write(
        '\tNo. previous genomes\tNo. current genomes\tNo. same\tNo. lost\tNo. new\tNo. migrated in\tNo. migrated out\tNote\n'
    )

    rep_lost_count = 0
    rep_changed_count = 0
    rep_unchanged_count = 0
    rep_merged_count = 0

    name_lost_count = 0
    name_changed_count = 0
    name_unchanged_count = 0
    name_merged_count = 0

    prev_cluster_ids = set()
    total_num_same = 0
    total_num_lost = 0
    total_num_new = 0
    total_num_migrated_in = 0
    total_num_migrated_out = 0
    for prev_rid, prev_cids in prev_genomes.sp_clusters.items():
        prev_gtdb_sp = prev_genomes[prev_rid].gtdb_taxa.species

        new_rid = updated_rids[prev_rid]
        prev_cluster_ids.add(new_rid)
        note = ''
        if new_rid is None:
            new_rid = 'none'
            # BUG FIX: was `new_sp = 'none'`, leaving new_gtdb_sp undefined
            # (NameError on first iteration) or stale from a prior iteration
            new_gtdb_sp = 'none'
            rep_status = 'LOST'
            name_status = 'LOST'  # what does this mean; presumable a species name can be recycled elsewhere!

            new_cluster = set()

            rep_lost_count += 1
            name_lost_count += 1
        elif new_rid not in new_clusters:
            # representative must have been merged when selecting
            # representatives for NCBI species
            merged_rid = new_rid_map[new_rid]
            merged_sp = cur_genomes[merged_rid].gtdb_taxa.species
            note = 'merged with {} with representative {}'.format(
                merged_sp, merged_rid)

            new_rid = 'none'
            # BUG FIX: new_gtdb_sp was never assigned in this branch
            new_gtdb_sp = 'none'
            rep_status = 'MERGED'
            name_status = 'MERGED'

            new_cluster = set()

            rep_merged_count += 1
            name_merged_count += 1
        else:
            new_gtdb_sp = cur_genomes[new_rid].gtdb_taxa.species
            new_cluster = new_clusters[new_rid]

            if prev_rid == new_rid:
                rep_status = 'UNCHANGED'
                rep_unchanged_count += 1
            else:
                rep_status = 'CHANGED'
                rep_changed_count += 1

            if prev_gtdb_sp == new_gtdb_sp:
                name_status = 'UNCHANGED'
                name_unchanged_count += 1
            else:
                name_status = 'CHANGED'
                name_changed_count += 1

        fout.write('{}\t{}\t{}\t{}\t{}\t{}'.format(prev_rid, prev_gtdb_sp,
                                                   new_rid, new_gtdb_sp,
                                                   rep_status, name_status))

        num_same = len(new_cluster.intersection(prev_cids))
        num_lost = len(prev_cids - new_gids)
        num_new = len(new_cluster - prev_gids)
        num_migrated_in = len(
            (new_cluster - prev_cids).intersection(prev_gids))
        num_migrated_out = len(
            (prev_cids - new_cluster).intersection(new_gids))
        assert len(new_cluster) == len(
            prev_cids
        ) - num_lost + num_new + num_migrated_in - num_migrated_out
        assert len(prev_cids) == num_same + num_lost + num_migrated_out

        fout.write('\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\n'.format(
            len(prev_cids), len(new_cluster), num_same, num_lost, num_new,
            num_migrated_in, num_migrated_out, note))

        total_num_same += num_same
        total_num_lost += num_lost
        total_num_new += num_new
        total_num_migrated_in += num_migrated_in
        total_num_migrated_out += num_migrated_out

    # add in new GTDB species clusters
    new_cluster_count = 0
    for new_rid in new_clusters:
        if new_rid in prev_cluster_ids:
            continue

        new_gtdb_sp = cur_genomes[new_rid].gtdb_taxa.species
        rep_status = 'NEW'
        name_status = 'NEW'
        new_cluster_count += 1
        fout.write('{}\t{}\t{}\t{}\t{}\t{}'.format('n/a', 'n/a', new_rid,
                                                   new_gtdb_sp, rep_status,
                                                   name_status))

        num_new = len(new_clusters[new_rid] - prev_gids)
        num_migrated_in = len(
            new_clusters[new_rid].intersection(prev_gids))
        assert len(new_clusters[new_rid]) == num_new + num_migrated_in
        # BUG FIX: row had 7 placeholders for 8 arguments, silently dropping
        # the Note column and misaligning rows with the 8-column header
        fout.write('\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\n'.format(
            0, len(new_clusters[new_rid]), 0, 0, num_new, num_migrated_in,
            0, ''))
        total_num_new += num_new
        total_num_migrated_in += num_migrated_in

    fout.close()  # was never closed in the original

    # report genome statistics
    num_union = len(new_gids.union(prev_gids))
    assert num_union == total_num_same + total_num_lost + total_num_new + total_num_migrated_in
    assert total_num_migrated_in == total_num_migrated_out
    self.logger.info(
        f'There were {len(prev_gids):,} genomes in the previous genome sets.'
    )
    self.logger.info(
        ' ... identified {:,} ({:.2f}%) genomes that were assigned to same species cluster.'
        .format(total_num_same, total_num_same * 100.0 / len(prev_gids)))
    self.logger.info(
        ' ... identified {:,} ({:.2f}%) genomes that were lost from the species cluster.'
        .format(total_num_lost, total_num_lost * 100.0 / len(prev_gids)))
    self.logger.info(
        ' ... identified {:,} ({:.2f}%) genomes that migrated between species cluster.'
        .format(total_num_migrated_in,
                total_num_migrated_in * 100.0 / len(prev_gids)))
    self.logger.info(
        ' ... identified {:,} new genomes which is a {:.2f}% increase.'.
        format(total_num_new,
               len(new_gids) * 100.0 / len(prev_gids) - 100))

    # report representative statistics
    assert len(new_clusters) == len(
        prev_genomes.sp_clusters
    ) + new_cluster_count - rep_lost_count - rep_merged_count
    self.logger.info(
        f'There are {len(new_clusters):,} total GTDB species representatives.'
    )
    self.logger.info(
        ' ... identified {:,} ({:.2f}%) unchanged representatives.'.format(
            rep_unchanged_count,
            rep_unchanged_count * 100.0 / len(prev_genomes.sp_clusters)))
    self.logger.info(
        ' ... identified {:,} ({:.2f}%) changed representatives.'.format(
            rep_changed_count,
            rep_changed_count * 100.0 / len(prev_genomes.sp_clusters)))
    self.logger.info(
        ' ... identified {:,} ({:.2f}%) lost representatives.'.format(
            rep_lost_count,
            rep_lost_count * 100.0 / len(prev_genomes.sp_clusters)))
    self.logger.info(
        ' ... identified {:,} ({:.2f}%) merged representatives.'.format(
            rep_merged_count,
            rep_merged_count * 100.0 / len(prev_genomes.sp_clusters)))
    self.logger.info(
        ' ... identified {:,} new representatives which is a {:.2f}% increase.'
        .format(
            new_cluster_count,
            len(new_clusters) * 100.0 / len(prev_genomes.sp_clusters) -
            100))

    # message fix: this line reports UNCHANGED cluster names (was
    # ambiguously worded as just "cluster names")
    self.logger.info(
        ' ... identified {:,} ({:.2f}%) unchanged cluster names.'.format(
            name_unchanged_count,
            name_unchanged_count * 100.0 / len(prev_genomes.sp_clusters)))
    self.logger.info(
        ' ... identified {:,} ({:.2f}%) changed cluster names.'.format(
            name_changed_count,
            name_changed_count * 100.0 / len(prev_genomes.sp_clusters)))
    self.logger.info(
        ' ... identified {:,} ({:.2f}%) lost cluster names.'.format(
            name_lost_count,
            name_lost_count * 100.0 / len(prev_genomes.sp_clusters)))
    self.logger.info(
        ' ... identified {:,} ({:.2f}%) merged cluster names.'.format(
            name_merged_count,
            name_merged_count * 100.0 / len(prev_genomes.sp_clusters)))
    self.logger.info(
        ' ... identified {:,} ({:.2f}%) new cluster names.'.format(
            new_cluster_count,
            new_cluster_count * 100.0 / len(prev_genomes.sp_clusters)))
def run(self, gtdb_clusters_file, prev_gtdb_metadata_file,
        cur_gtdb_metadata_file, qc_passed_file, ncbi_genbank_assembly_file,
        untrustworthy_type_file, gtdb_type_strains_ledger,
        ncbi_env_bioproject_ledger):
    """Summary statistics indicating changes to GTDB species cluster membership.

    Maps previous representatives to the new species clusters, writes
    per-cluster change statistics to 'gtdb_sp_clusters_change_stats.tsv',
    and logs aggregate genome and representative statistics.
    """

    # create previous and current GTDB genome sets
    self.logger.info('Creating previous GTDB genome set.')
    prev_genomes = Genomes()
    prev_genomes.load_from_metadata_file(
        prev_gtdb_metadata_file,
        gtdb_type_strains_ledger=gtdb_type_strains_ledger,
        ncbi_genbank_assembly_file=ncbi_genbank_assembly_file,
        untrustworthy_type_ledger=untrustworthy_type_file,
        ncbi_env_bioproject_ledger=ncbi_env_bioproject_ledger)
    self.logger.info(
        ' - previous genome set has {:,} species clusters spanning {:,} genomes.'.format(
            len(prev_genomes.sp_clusters),
            prev_genomes.sp_clusters.total_num_genomes()))

    self.logger.info('Creating current GTDB genome set.')
    cur_genomes = Genomes()
    cur_genomes.load_from_metadata_file(
        cur_gtdb_metadata_file,
        gtdb_type_strains_ledger=gtdb_type_strains_ledger,
        create_sp_clusters=False,
        qc_passed_file=qc_passed_file,
        ncbi_genbank_assembly_file=ncbi_genbank_assembly_file,
        untrustworthy_type_ledger=untrustworthy_type_file,
        ncbi_env_bioproject_ledger=ncbi_env_bioproject_ledger)

    # report changes in genome sets
    self.logger.info('Comparing previous and current genome sets.')
    prev_gids = set(prev_genomes)
    new_gids = set(cur_genomes)
    num_same_genomes = len(prev_gids.intersection(new_gids))
    num_lost_genomes = len(prev_gids - new_gids)
    num_new_genomes = len(new_gids - prev_gids)
    self.logger.info(
        f' - identified {num_same_genomes:,} genomes as being present in both genome sets.')
    self.logger.info(
        f' - identified {num_lost_genomes:,} genomes as being lost from the previous genome set.')
    self.logger.info(
        f' - identified {num_new_genomes:,} genomes as being new to the current genome set.')

    # get new GTDB species clusters
    self.logger.info('Reading current GTDB clusters.')
    new_clusters, _ = read_clusters(gtdb_clusters_file)
    self.logger.info(' - current genome set has {:,} species clusters spanning {:,} genomes.'.format(
        len(new_clusters),
        sum(len(cids) for cids in new_clusters.values())))

    # map each genome to its new cluster representative
    new_rid_map = {}
    for rid, cids in new_clusters.items():
        for cid in cids:
            new_rid_map[cid] = rid

    # get mapping of previous GTDB representatives to new GTDB species clusters
    self.logger.info(
        'Mapping previous GTDB representatives to new representatives.')
    prev_to_new_rid = prev_genomes.sp_clusters.updated_representatives(
        new_clusters)
    self.logger.info(
        ' - mapped {:,} previous representatives.'.format(len(prev_to_new_rid)))

    # several previous representatives may map to the same new cluster (a merger)
    new_to_prev_rids = defaultdict(list)
    for prev_rid, new_rid in prev_to_new_rid.items():
        new_to_prev_rids[new_rid].append(prev_rid)

    # tabulate changes in GTDB species clusters
    self.logger.info('Calculating statistics of GTDB species clusters.')

    fout = open(os.path.join(self.output_dir,
                             'gtdb_sp_clusters_change_stats.tsv'), 'w')
    fout.write(
        'New representative\tPrevious representative(s)\tPrevious name(s)\tRepresentative status')
    fout.write(
        '\tNo. previous genomes\tNo. current genomes\tNo. same\tNo. lost\tNo. new\tNo. migrated in\tNo. migrated out\n')

    rep_lost_count = 0
    rep_changed_count = 0
    rep_unchanged_count = 0
    rep_merger_count = 0

    prev_cluster_ids = set()
    total_num_same = 0
    total_num_lost = 0
    total_num_new = 0
    total_num_migrated_in = 0
    total_num_migrated_out = 0
    for new_rid, prev_rids in new_to_prev_rids.items():
        prev_cluster_ids.add(new_rid)

        prev_gtdb_sp = [
            prev_genomes[prev_rid].gtdb_taxa.species for prev_rid in prev_rids]

        prev_cids = set()
        for prev_rid in prev_rids:
            prev_cids.update(prev_genomes.sp_clusters[prev_rid])

        if new_rid is None:
            new_rid = 'none'
            rep_status = 'LOST'
            new_cluster = set()
            rep_lost_count += len(prev_rids)
        else:
            new_cluster = new_clusters[new_rid]
            if len(prev_rids) == 1:
                if prev_rids[0] == new_rid:
                    rep_status = 'UNCHANGED'
                    rep_unchanged_count += 1
                else:
                    rep_status = 'CHANGED'
                    rep_changed_count += 1
            else:
                rep_status = 'MERGER'
                rep_merger_count += len(prev_rids)

        fout.write('{}\t{}\t{}\t{}'.format(
            new_rid, ', '.join(prev_rids), ', '.join(prev_gtdb_sp), rep_status))

        num_same = len(new_cluster.intersection(prev_cids))
        num_new = len(new_cluster - prev_gids)
        num_lost = len(prev_cids - new_gids)
        num_migrated_in = len(
            (new_cluster - prev_cids).intersection(prev_gids))
        num_migrated_out = len(
            (prev_cids - new_cluster).intersection(new_gids))
        assert len(new_cluster) == len(prev_cids) - num_lost + \
            num_new + num_migrated_in - num_migrated_out
        assert len(prev_cids) == num_same + num_lost + num_migrated_out

        fout.write('\t{}\t{}\t{}\t{}\t{}\t{}\t{}\n'.format(
            len(prev_cids), len(new_cluster), num_same, num_lost, num_new,
            num_migrated_in, num_migrated_out))

        total_num_same += num_same
        total_num_lost += num_lost
        total_num_new += num_new
        total_num_migrated_in += num_migrated_in
        total_num_migrated_out += num_migrated_out

    assert len(prev_genomes.sp_clusters) == rep_unchanged_count + \
        rep_changed_count + rep_merger_count + rep_lost_count

    # add in new GTDB species clusters
    new_cluster_count = 0
    for new_rid in new_clusters:
        if new_rid in prev_cluster_ids:
            continue

        rep_status = 'NEW'
        new_cluster_count += 1
        # BUG FIX: row previously wrote 5 fields
        # ('n/a', 'n/a', new_rid, new_gtdb_sp, rep_status) against the
        # 4-column header and with the new representative in the wrong
        # column; write the fields in header order instead
        fout.write('{}\t{}\t{}\t{}'.format(new_rid, 'n/a', 'n/a', rep_status))

        num_new = len(new_clusters[new_rid] - prev_gids)
        num_migrated_in = len(
            new_clusters[new_rid].intersection(prev_gids))
        assert len(new_clusters[new_rid]) == num_new + num_migrated_in
        fout.write('\t{}\t{}\t{}\t{}\t{}\t{}\t{}\n'.format(
            0, len(new_clusters[new_rid]), 0, 0, num_new, num_migrated_in, 0))
        total_num_new += num_new
        total_num_migrated_in += num_migrated_in

    fout.close()  # was never closed in the original

    assert len(new_gids.union(prev_gids)) == total_num_same + \
        total_num_lost + total_num_new + total_num_migrated_in
    assert total_num_migrated_in == total_num_migrated_out

    # report genome statistics
    # (equivalent to same + lost + migrated_out given the assert above)
    assert len(prev_gids) == total_num_same + \
        total_num_lost + total_num_migrated_in
    self.logger.info(
        f'There were {len(prev_gids):,} genomes in the previous release.')
    self.logger.info(' - identified {:,} ({:.2f}%) genomes that were assigned to same species cluster.'.format(
        total_num_same, total_num_same*100.0/len(prev_gids)))
    self.logger.info(' - identified {:,} ({:.2f}%) genomes that were lost from the species cluster.'.format(
        total_num_lost, total_num_lost*100.0/len(prev_gids)))
    self.logger.info(' - identified {:,} ({:.2f}%) genomes that migrated between species cluster.'.format(
        total_num_migrated_in, total_num_migrated_in*100.0/len(prev_gids)))
    self.logger.info('Identified {:,} new genomes which is a {:.2f}% increase.'.format(
        total_num_new, len(new_gids)*100.0/len(prev_gids) - 100))

    # report representative statistics
    assert len(prev_genomes.sp_clusters) == rep_unchanged_count + \
        rep_changed_count + rep_lost_count + rep_merger_count
    self.logger.info(
        f'There were {len(prev_genomes.sp_clusters):,} previous GTDB species representatives.')
    self.logger.info(' - identified {:,} ({:.2f}%) unchanged representatives.'.format(
        rep_unchanged_count, rep_unchanged_count*100.0/len(prev_genomes.sp_clusters)))
    self.logger.info(' - identified {:,} ({:.2f}%) changed representatives.'.format(
        rep_changed_count, rep_changed_count*100.0/len(prev_genomes.sp_clusters)))
    self.logger.info(' - identified {:,} ({:.2f}%) lost representatives.'.format(
        rep_lost_count, rep_lost_count*100.0/len(prev_genomes.sp_clusters)))
    self.logger.info(' - identified {:,} ({:.2f}%) merged representatives.'.format(
        rep_merger_count, rep_merger_count*100.0/len(prev_genomes.sp_clusters)))
    self.logger.info('Identified {:,} new representatives which is a {:.2f}% increase.'.format(
        new_cluster_count, len(new_clusters)*100.0/len(prev_genomes.sp_clusters) - 100))
def run(self, named_cluster_file, cur_gtdb_metadata_file, uba_genome_paths,
        qc_passed_file, ncbi_genbank_assembly_file, untrustworthy_type_file,
        ani_af_rep_vs_nonrep, gtdb_type_strains_ledger, sp_priority_ledger):
    """Cluster genomes to selected GTDB representatives.

    Identifies NCBI species that are synonyms of the species assigned to
    GTDB representatives and writes these to a synonym table for curation.

    :param named_cluster_file: file with named GTDB species clusters.
    :param cur_gtdb_metadata_file: metadata for genomes in current release.
    :param uba_genome_paths: file indicating path to UBA genomes.
    :param qc_passed_file: file listing genomes passing QC.
    :param ncbi_genbank_assembly_file: NCBI GenBank assembly summary file.
    :param untrustworthy_type_file: ledger of untrustworthy type material.
    :param ani_af_rep_vs_nonrep: pickle with ANI/AF between representative
        and non-representative genomes.
    :param gtdb_type_strains_ledger: ledger of curated type strain genomes.
    :param sp_priority_ledger: ledger establishing priority of species names.
    """

    # create current GTDB genome set
    self.logger.info('Creating current GTDB genome set.')
    cur_genomes = Genomes()
    cur_genomes.load_from_metadata_file(
        cur_gtdb_metadata_file,
        gtdb_type_strains_ledger=gtdb_type_strains_ledger,
        create_sp_clusters=False,
        uba_genome_file=uba_genome_paths,
        qc_passed_file=qc_passed_file,
        ncbi_genbank_assembly_file=ncbi_genbank_assembly_file,
        untrustworthy_type_ledger=untrustworthy_type_file)
    self.logger.info(
        f' ... current genome set contains {len(cur_genomes):,} genomes.')

    # read named GTDB species clusters
    self.logger.info(
        'Reading named and previous placeholder GTDB species clusters.')
    clusters, rep_radius = read_clusters(named_cluster_file)
    self.logger.info(
        ' ... identified {:,} clusters spanning {:,} genomes.'.format(
            len(clusters),
            # +1 accounts for the representative itself, which is not
            # listed among its own clustered genome IDs
            sum([len(gids) + 1 for gids in clusters.values()])))

    # identify NCBI species with multiple genomes assembled from type strain of species
    self.logger.info(
        'Determining effective type strain genomes in each NCBI species.')
    ncbi_sp_type_strain_genomes = cur_genomes.ncbi_sp_effective_type_genomes()
    self.logger.info(
        ' ... identified effective type strain genomes for {:,} NCBI species.'
        .format(len(ncbi_sp_type_strain_genomes)))

    # verify that type genomes for a species are contained in a
    # single GTDB species cluster
    rid_map = {}
    for rid, gids in clusters.items():
        rid_map[rid] = rid
        for gid in gids:
            rid_map[gid] = rid

    for ncbi_sp, type_gids in ncbi_sp_type_strain_genomes.items():
        gtdb_rids = set([rid_map[gid]
                         for gid in type_gids
                         if gid in rid_map])
        if len(gtdb_rids) > 1:
            self.logger.warning(
                'Type strain genomes from NCBI species {} were assigned to {:,} GTDB species clusters: {}.'
                .format(ncbi_sp,
                        len(gtdb_rids),
                        [(rid, cur_genomes[rid].gtdb_taxa.species)
                         for rid in gtdb_rids]))

    # identify synonyms
    self.logger.info('Identifying synonyms.')
    synonyms = defaultdict(list)
    failed_type_strain_priority = 0
    for rid, gids in clusters.items():
        rep_ncbi_sp = cur_genomes[rid].ncbi_taxa.species

        # find species that are a synonym to the current representative,
        # using the best quality genome for each species to establish
        # synonym statistics such as ANI and AF
        type_gids = [gid for gid in gids
                     if cur_genomes[gid].is_effective_type_strain()]

        # a representative that is not itself an effective type strain
        # genome cannot anchor synonym statistics for type strain genomes
        # in its cluster
        if not cur_genomes[rid].is_effective_type_strain() and len(type_gids) > 0:
            failed_type_strain_priority += 1
            continue

        # process type strain genomes from highest to lowest quality so the
        # first genome seen for each NCBI species is its best representative
        q = {gid: cur_genomes[gid].score_type_strain() for gid in type_gids}
        q_sorted = sorted(q.items(), key=lambda kv: (kv[1], kv[0]), reverse=True)
        processed_sp = set()
        for gid, _quality in q_sorted:
            cur_ncbi_sp = cur_genomes[gid].ncbi_taxa.species

            if cur_ncbi_sp in processed_sp:
                continue

            if cur_ncbi_sp != rep_ncbi_sp:
                synonyms[rid].append(gid)
                processed_sp.add(cur_ncbi_sp)

    self.logger.info(
        ' ... identified {:,} GTDB representatives resulting in {:,} synonyms.'
        .format(len(synonyms),
                sum([len(gids) for gids in synonyms.values()])))

    if failed_type_strain_priority:
        # fixed typo in original message: "priotize" -> "prioritize"
        self.logger.warning(
            f'Identified {failed_type_strain_priority:,} non-type strain representatives that failed to prioritize an effective type strain genome.')

    # read ANI and AF between representatives and non-representative genomes;
    # use a context manager so the file handle is closed deterministically
    # (original called pickle.load(open(...)) and leaked the handle)
    self.logger.info(
        'Reading ANI and AF between representative and non-representative genomes.')
    with open(ani_af_rep_vs_nonrep, 'rb') as f:
        ani_af = pickle.load(f)

    # write out synonyms
    self.write_synonym_table(synonyms,
                             cur_genomes,
                             ani_af,
                             sp_priority_ledger)
def run(self, metadata_file, genome_path_file, final_cluster_file):
    """Cluster User genomes to GTDB species clusters.

    Assigns unclustered User genomes meeting minimum quality criteria
    (completeness > 50%, contamination < 10%) to their closest established
    species cluster and writes the resulting clusters to
    'gtdb_user_clusters.tsv' in the output directory.

    :param metadata_file: file with quality statistics for all genomes.
    :param genome_path_file: file indicating path to genome FASTA files.
    :param final_cluster_file: file with established species clusters.
    """

    # get path to genome FASTA files
    self.logger.info('Reading path to genome FASTA files.')
    genome_files = read_genome_path(genome_path_file)

    # read existing cluster information
    self.logger.info('Reading already established species clusters.')
    sp_clusters, species, rep_radius = read_clusters(final_cluster_file)

    # collect every genome already in a cluster (representatives included)
    clustered_genomes = set()
    for rep_id in sp_clusters:
        clustered_genomes.add(rep_id)
        clustered_genomes.update(sp_clusters[rep_id])

    self.logger.info('Identified %d species clusters spanning %d genomes.' % (
        len(sp_clusters),
        len(clustered_genomes)))

    # get User genomes to cluster
    self.logger.info('Parse quality statistics for all genomes.')
    quality_metadata = read_quality_metadata(metadata_file)

    user_genomes = set()
    for gid in quality_metadata:
        if gid in clustered_genomes:
            continue

        # minimum quality criteria for considering a User genome
        if (quality_metadata[gid].checkm_completeness > 50
                and quality_metadata[gid].checkm_contamination < 10):
            user_genomes.add(gid)
    self.logger.info('Identified %d User genomes to cluster.' % len(user_genomes))

    # calculate Mash ANI estimates between unclustered genomes
    self.logger.info(
        'Calculating Mash ANI estimates between User genomes and species clusters.')
    mash_anis = self._mash_ani(genome_files, user_genomes, sp_clusters)

    # cluster User genomes to species clusters
    self.logger.info('Assigning User genomes to closest species cluster.')
    self._cluster(genome_files,
                  sp_clusters,
                  rep_radius,
                  user_genomes,
                  mash_anis)

    # recount genomes spanned by the clusters now that User genomes have
    # been assigned; +1 accounts for each representative itself
    # (original rebound the set 'clustered_genomes' to an int counter,
    # which was confusing — a distinct name is used instead)
    num_clustered_genomes = sum(1 + len(gids) for gids in sp_clusters.values())
    self.logger.info('The %d species clusters span %d genomes, including User genomes.' % (
        len(sp_clusters),
        num_clustered_genomes))

    # report clustering; use a context manager so the file is closed even
    # if a write fails (original used bare open()/close())
    user_cluster_file = os.path.join(self.output_dir, 'gtdb_user_clusters.tsv')
    with open(user_cluster_file, 'w') as fout:
        fout.write('Type genome\tNo. clustered genomes\tClustered genomes\n')
        for rep_id in sp_clusters:
            fout.write('%s\t%d\t%s\n' % (
                rep_id,
                len(sp_clusters[rep_id]),
                ','.join(sp_clusters[rep_id])))