def run(self, qc_file, gtdb_metadata_file, gtdb_final_clusters, species_exception_file, output_dir):
    """Write taxonomy files and genome lists for canonical and validation trees.

    The species clusters are checked against the genomes passing QC, species
    with incongruent specific or genus names are reported, and the proposed
    species name of each cluster is substituted into the previous GTDB
    taxonomy of its representative. Bacterial (`bac`) and archaeal (`ar`)
    results are written to separate files in `output_dir`:

      {bac,ar}_can_taxonomy.tsv      taxonomy of each cluster representative
      {bac,ar}_val_taxonomy.tsv      as above, plus the highest-quality
                                     clustered genome of each cluster
      gids_{bac,ar}_canonical.lst    representative, species and note
      gids_{bac,ar}_validation.lst   as above, plus the highest-quality
                                     clustered genome of each cluster

    Parameters:
      qc_file                 file read with read_qc_file()
      gtdb_metadata_file      GTDB metadata file providing taxonomy, type and
                              quality information
      gtdb_final_clusters     species cluster file read with read_clusters()
      species_exception_file  passed through to read_gtdb_ncbi_taxonomy()
      output_dir              directory receiving all output files

    The program is terminated with sys.exit(-1) if a representative or
    clustered genome fails QC, or if a representative has neither a
    bacterial nor an archaeal GTDB domain.
    """

    # function-scope import so the block does not depend on file-level imports
    from contextlib import ExitStack

    # identify genomes passing quality criteria
    self.logger.info('Reading QC file.')
    passed_qc = read_qc_file(qc_file)
    self.logger.info('Identified %d genomes passing QC.' % len(passed_qc))

    # get GTDB and NCBI taxonomy strings for each genome
    self.logger.info(
        'Reading NCBI and GTDB taxonomy from GTDB metadata file.')
    ncbi_taxonomy, ncbi_update_count = read_gtdb_ncbi_taxonomy(
        gtdb_metadata_file, species_exception_file)
    prev_gtdb_taxonomy = read_gtdb_taxonomy(gtdb_metadata_file)
    self.logger.info(
        'Read NCBI taxonomy for %d genomes with %d manually defined updates.'
        % (len(ncbi_taxonomy), ncbi_update_count))
    self.logger.info('Read GTDB taxonomy for %d genomes.' %
                     len(prev_gtdb_taxonomy))

    # get GTDB metadata
    type_metadata = read_gtdb_metadata(gtdb_metadata_file, [
        'gtdb_type_designation',
        'gtdb_type_designation_sources',
        'gtdb_type_species_of_genus'
    ])
    quality_metadata = read_quality_metadata(gtdb_metadata_file)

    # read species clusters
    sp_clusters, species, _rep_radius = read_clusters(gtdb_final_clusters)
    self.logger.info('Read %d species clusters.' % len(sp_clusters))

    # sanity check species clusters all defined by genomes passing QC
    for gid in sp_clusters:
        if gid not in passed_qc:
            self.logger.error(
                'Genome %s defines a species cluster, but fails QC.' % gid)
            sys.exit(-1)

    # report incongruencies between the new species clustering and the
    # NCBI / previous GTDB taxonomy
    self.logger.info(
        'Identifying species with incongruent specific names.')
    self._incongruent_specific_names(species,
                                     ncbi_taxonomy,
                                     prev_gtdb_taxonomy,
                                     type_metadata,
                                     output_dir)
    self._incongruent_genus_names(species,
                                  ncbi_taxonomy,
                                  prev_gtdb_taxonomy,
                                  type_metadata,
                                  output_dir)

    rep_note = 'GTDB type or representative genome'

    # get GIDs for canonical and validation trees; the ExitStack guarantees
    # every output file is closed, including on the sys.exit() paths below
    with ExitStack() as stack:
        def open_output(filename):
            return stack.enter_context(
                open(os.path.join(output_dir, filename), 'w'))

        fout_bac_can_gtdb = open_output('bac_can_taxonomy.tsv')
        fout_bac_val_gtdb = open_output('bac_val_taxonomy.tsv')
        fout_ar_can_gtdb = open_output('ar_can_taxonomy.tsv')
        fout_ar_val_gtdb = open_output('ar_val_taxonomy.tsv')

        fout_bac_val = open_output('gids_bac_validation.lst')
        fout_ar_val = open_output('gids_ar_validation.lst')
        fout_bac_can = open_output('gids_bac_canonical.lst')
        fout_ar_can = open_output('gids_ar_canonical.lst')

        # only the genome list files carry a header line
        for fout in (fout_bac_val, fout_ar_val, fout_bac_can, fout_ar_can):
            fout.write('#Accession\tSpecies\tNote\n')

        # (validation list, canonical list, canonical taxonomy, validation taxonomy)
        domain_outputs = {
            'd__Bacteria': (fout_bac_val, fout_bac_can,
                            fout_bac_can_gtdb, fout_bac_val_gtdb),
            'd__Archaea': (fout_ar_val, fout_ar_can,
                           fout_ar_can_gtdb, fout_ar_val_gtdb),
        }

        for rid in sp_clusters:
            domain = prev_gtdb_taxonomy[rid][0]
            if domain not in domain_outputs:
                self.logger.error(
                    'Genome %s has no GTDB domain assignment.' % rid)
                sys.exit(-1)
            fout_val, fout_can, fout_can_gtdb, fout_val_gtdb = domain_outputs[domain]

            # substitute proposed species name into GTDB taxonomy
            taxa = prev_gtdb_taxonomy[rid][0:6] + [species[rid]]
            new_gtdb_str = '; '.join(taxa)
            fout_can_gtdb.write('%s\t%s\n' % (rid, new_gtdb_str))
            fout_val_gtdb.write('%s\t%s\n' % (rid, new_gtdb_str))

            fout_val.write('%s\t%s\t%s\n' % (rid, species[rid], rep_note))
            fout_can.write('%s\t%s\t%s\n' % (rid, species[rid], rep_note))

            cluster_gids = set(sp_clusters[rid])
            for gid in cluster_gids:
                if gid not in passed_qc:
                    self.logger.error(
                        'Genome %s is in a species cluster, but fails QC.' % gid)
                    sys.exit(-1)

            if cluster_gids:
                # select highest-quality genome of the cluster as an
                # additional genome for the validation tree
                q = quality_score(cluster_gids, quality_metadata)
                gid = max(q.items(), key=operator.itemgetter(1))[0]

                taxa = prev_gtdb_taxonomy[gid][0:6] + [species[rid]]
                new_gtdb_str = '; '.join(taxa)

                fout_val.write('%s\t%s\t%s\n' % (
                    gid,
                    species[rid],
                    'selected highest-quality genome (Q=%.2f)' % q[gid]))
                fout_val_gtdb.write('%s\t%s\n' % (gid, new_gtdb_str))
def run(self, genomes_new_updated_file, qc_passed_file, batch_size):
    """Perform initial classification of new and updated genomes using GTDB-Tk.

    New/updated genomes passing QC are split into batch files under
    `<output_dir>/genome_batch_files`, GTDB-Tk `classify_wf` is run for each
    batch that does not yet have a `gtdbtk_batch<idx>` output directory, and
    the per-batch summary files are merged into
    `<output_dir>/gtdbtk_classify.tsv`.

    Parameters:
      genomes_new_updated_file  tab-separated file with a header row; the
                                first column is the genome ID and the
                                'Genomic file' column the genome path
      qc_passed_file            file read with read_qc_file()
      batch_size                maximum number of genomes per batch file

    If the batch directory already exists its batch files are reused, and
    any genome not yet listed in one of them is placed in one additional
    batch file.
    """

    # get list of genomes passing QC
    self.logger.info('Reading genomes passing QC.')
    gids_pass_qc = read_qc_file(qc_passed_file)
    self.logger.info(f' - identified {len(gids_pass_qc):,} genomes.')

    # get path to genomes passing QC
    self.logger.info(
        'Reading path to genomic file for new/updated genomes passing QC.')
    genomic_files = []
    new_updated_gids = set()
    total_count = 0
    with open(genomes_new_updated_file, encoding='utf-8') as f:
        header = f.readline().strip().split('\t')
        genomic_file_index = header.index('Genomic file')

        for line in f:
            tokens = line.strip().split('\t')
            gid = tokens[0]
            total_count += 1
            if gid in gids_pass_qc:
                genomic_files.append((gid, tokens[genomic_file_index]))
                new_updated_gids.add(gid)
    self.logger.info(
        f' - identified {len(genomic_files):,} of {total_count:,} genomes as passing QC.')

    # create batch files
    genome_batch_files = []
    batch_dir = os.path.join(self.output_dir, 'genome_batch_files')
    if os.path.exists(batch_dir):
        self.logger.warning(
            f'Using existing genome batch files in {batch_dir}.')

        # check if there are genomes not already in a batch file. Ideally,
        # this would never happen, but sometimes we process past this step
        # and then identify genomes missing in the database. These need to
        # be put into a batch file for processing.
        missing_gids = set(new_updated_gids)
        last_batch_idx = 0
        for batch_file in os.listdir(batch_dir):
            batch_path = os.path.join(batch_dir, batch_file)
            genome_batch_files.append(batch_path)

            # batch files are named genomes_<idx>.lst
            idx = int(batch_file.split('_')[1].replace('.lst', ''))
            last_batch_idx = max(last_batch_idx, idx)

            with open(batch_path) as f:
                for line in f:
                    # batch file rows are: <genomic file>\t<genome ID>
                    tokens = line.strip().split('\t')
                    missing_gids.discard(tokens[1])

        if missing_gids:
            genome_batch_file = os.path.join(
                batch_dir, f'genomes_{last_batch_idx+1}.lst')
            genome_batch_files.append(genome_batch_file)
            self.logger.info('Added the batch file {} with {:,} genomes.'.format(
                genome_batch_file,
                len(missing_gids)))

            with open(genome_batch_file, 'w') as fout:
                for gid, gf in genomic_files:
                    if gid in missing_gids:
                        fout.write('{}\t{}\n'.format(gf, gid))
    else:
        os.makedirs(batch_dir)
        for batch_idx, start in enumerate(range(0, len(genomic_files), batch_size)):
            genome_batch_file = os.path.join(
                batch_dir, f'genomes_{batch_idx}.lst')
            genome_batch_files.append(genome_batch_file)

            with open(genome_batch_file, 'w') as fout:
                for gid, gf in genomic_files[start:start + batch_size]:
                    fout.write('{}\t{}\n'.format(gf, gid))

    # process genomes with GTDB-Tk in batches
    for genome_batch_file in genome_batch_files:
        batch_idx = ntpath.basename(genome_batch_file).split('_')[
            1].replace('.lst', '')
        out_dir = os.path.join(self.output_dir, f'gtdbtk_batch{batch_idx}')
        if os.path.exists(out_dir):
            self.logger.warning(
                f'Skipping genome batch {batch_idx} as output directory already exists.')
            continue

        os.makedirs(out_dir)
        cmd = 'gtdbtk classify_wf --cpus {} --force --batchfile {} --out_dir {}'.format(
            self.cpus,
            genome_batch_file,
            out_dir)
        print(cmd)
        # NOTE(review): `run` is presumably a command-execution helper
        # imported at file level, not this method -- confirm
        run(cmd)

    # combine summary files
    gtdbtk_processed = set()
    write_header = True
    with open(os.path.join(self.output_dir, 'gtdbtk_classify.tsv'), 'w') as fout:
        for dir_name in os.listdir(self.output_dir):
            if not dir_name.startswith('gtdbtk_batch'):
                continue

            results_dir = os.path.join(self.output_dir, dir_name)
            for summary_name in ('gtdbtk.ar122.summary.tsv',
                                 'gtdbtk.bac120.summary.tsv'):
                summary_file = os.path.join(results_dir, summary_name)
                if not os.path.exists(summary_file):
                    # a summary can be absent, e.g. presumably when a batch
                    # holds no genomes from that domain or the GTDB-Tk run
                    # failed; genomes lost this way are reported as
                    # skipped below
                    self.logger.warning(
                        f'GTDB-Tk summary file not found: {summary_file}')
                    continue

                with open(summary_file, encoding='utf-8') as f:
                    header = f.readline()
                    if write_header:
                        fout.write(header)
                        write_header = False

                    for line in f:
                        gid = line.strip().split('\t')[0]
                        if gid in new_updated_gids:
                            # Ideally, this shouldn't be necessary, but
                            # sometimes we process past this step and then
                            # identify genomes missing in the database. This
                            # can result in GTDB-Tk having been applied to
                            # genomes that looked like they were "new", but
                            # really were just erroneously missing from the
                            # database.
                            fout.write(line)
                            gtdbtk_processed.add(gid)

    self.logger.info(
        'Identified {:,} genomes as being processed by GTDB-Tk.'.format(len(gtdbtk_processed)))
    skipped_gids = new_updated_gids - gtdbtk_processed
    if skipped_gids:
        self.logger.warning('Identified {:,} genomes as being skipped by GTDB-Tk.'.format(
            len(skipped_gids)))
def create_expanded_clusters(self, original_sp_clusters, genomes_new_updated_file, qc_passed_file, gtdbtk_classify_file):
    """Add new and updated genomes to the species clusters they classify into.

    Each GTDB-Tk classification naming an existing species is used to place
    the genome into the cluster of that species' original representative via
    update_sp_cluster(). Genomes failing QC, genomes without a species
    assignment ('s__'), and genomes that are neither new nor updated are
    only counted.

    The program is terminated with sys.exit(-1) if GTDB-Tk names a species
    without an original representative, or if an updated genome is assigned
    to a different species than before.
    """

    # this method must only be run on a freshly created object
    assert not (self.new_gids or self.updated_gids)

    # read GTDB-Tk classifications for new and updated genomes
    gtdbtk_classifications = read_gtdbtk_classifications(gtdbtk_classify_file)
    self.logger.info(
        f' ... identified {len(gtdbtk_classifications):,} classifications.'
    )

    # get new and updated genomes in current GTDB release
    self.new_gids, self.updated_gids = read_cur_new_updated(genomes_new_updated_file)
    self.logger.info(
        f' ... identified {len(self.new_gids):,} new and {len(self.updated_gids):,} updated genomes.'
    )

    # get list of genomes passing QC
    qc_gids = read_qc_file(qc_passed_file)
    new_pass_qc = len(self.new_gids.intersection(qc_gids))
    updated_pass_qc = len(self.updated_gids.intersection(qc_gids))
    self.logger.info(
        f' ... identified {new_pass_qc:,} new and {updated_pass_qc:,} updated genomes as passing QC.'
    )

    # species name -> original representative genome
    species_to_rep = {}
    for rep_id, sp_name in original_sp_clusters.species_names.items():
        species_to_rep[sp_name] = rep_id

    # genome in an original cluster -> species name of that cluster
    gid_to_species = {}
    for rep_id, member_ids in original_sp_clusters.sp_clusters.items():
        sp_name = original_sp_clusters.species_names[rep_id]
        gid_to_species.update(dict.fromkeys(member_ids, sp_name))

    # expand species clusters
    num_failed_qc = 0
    new_sp = 0
    num_prev_genomes = 0
    for gid, lineage in gtdbtk_classifications.items():
        if gid not in qc_gids:
            # ***HACK: this should not be necessary, except GTDB-Tk was run external
            # of complete workflow for R95
            num_failed_qc += 1
            continue

        sp = lineage[6]
        if sp == 's__':
            new_sp += 1
            continue

        if sp not in species_to_rep:
            self.logger.error(
                f'GTDB-Tk results indicated a new species for {gid}: {sp}')
            sys.exit(-1)

        rep_id = species_to_rep[sp]

        if gid in self.new_gids:
            self.update_sp_cluster(rep_id, gid, sp)
            continue

        if gid not in self.updated_gids:
            # ***HACK: a genome that is neither 'new' nor 'updated' should be
            # an error, except GTDB-Tk was run external to workflow in R95
            num_prev_genomes += 1
            continue

        self.update_sp_cluster(rep_id, gid, sp)

        orig_sp = gid_to_species[gid]
        if orig_sp != sp:
            # Really, should handle this case. This will be fine so long as the
            # genome isn't a species representative. If a species representative
            # has changed to the point where it no longer clusters with its
            # previous genome that requires some real thought.
            self.logger.warning(
                f'Updated genomes {gid} reassigned from {orig_sp} to {sp}.'
            )
            sys.exit(-1)

    # ***HACK: this should not be necessary, except GTDB-Tk was run external
    # of complete workflow for R95
    print('failed_qc', num_failed_qc)
    print('prev_genome_count', num_prev_genomes)

    self.logger.info(
        f' ... identified {new_sp:,} genomes not assigned to an existing GTDB species cluster'
    )

    assert len(self.sp_clusters) == len(self.species_names)
def run(self, genomes_new_updated_file, qc_passed_file, batch_size):
    """Perform initial classification of new and updated genomes using GTDB-Tk.

    New/updated genomes passing QC are processed in batches of `batch_size`.
    Each batch gets its own `<output_dir>/batch_<idx>` directory holding the
    genome list and the GTDB-Tk `classify_wf` output; a batch whose
    directory already exists is skipped. The per-batch summary files are
    then merged into `<output_dir>/gtdbtk_classify.tsv`.

    Parameters:
      genomes_new_updated_file  tab-separated file with a header row; the
                                first column is the genome ID and the
                                'Genomic file' column the genome path
      qc_passed_file            file read with read_qc_file()
      batch_size                maximum number of genomes per batch
    """

    # get list of genomes passing QC
    self.logger.info('Reading genomes passing QC.')
    gids_pass_qc = read_qc_file(qc_passed_file)
    self.logger.info(f' ... identified {len(gids_pass_qc):,} genomes.')

    # get path to genomes passing QC
    self.logger.info(
        'Reading path to genomic file for new/updated genomes passing QC.')
    genomic_files = []
    total_count = 0
    with open(genomes_new_updated_file, encoding='utf-8') as f:
        header = f.readline().strip().split('\t')
        genomic_file_index = header.index('Genomic file')

        for line in f:
            line_split = line.strip().split('\t')
            gid = line_split[0]
            total_count += 1
            if gid in gids_pass_qc:
                genomic_files.append((gid, line_split[genomic_file_index]))
    self.logger.info(
        f' ... identified {len(genomic_files):,} of {total_count:,} genomes as passing QC.'
    )

    # process genomes with GTDB-Tk in batches
    for batch_idx, start in enumerate(range(0, len(genomic_files), batch_size)):
        batch_dir = os.path.join(self.output_dir, 'batch_{}'.format(batch_idx))
        if os.path.exists(batch_dir):
            self.logger.warning(
                f'Skipping {batch_dir} as directory already exists.')
            continue

        os.makedirs(batch_dir)

        # batch file rows are: <genomic file>\t<genome ID>
        genome_list_file = os.path.join(batch_dir, 'genomes.lst')
        with open(genome_list_file, 'w') as fout:
            for gid, gf in genomic_files[start:start + batch_size]:
                fout.write('{}\t{}\n'.format(gf, gid))

        cmd = 'gtdbtk classify_wf --cpus {} --force --batchfile {} --out_dir {}'.format(
            self.cpus,
            genome_list_file,
            batch_dir)
        print(cmd)
        exit_status = os.system(cmd)
        if exit_status != 0:
            # report the failure rather than silently ignoring it; remaining
            # batches are still processed as before
            self.logger.warning(
                f'GTDB-Tk returned non-zero exit status {exit_status} for {batch_dir}.')

    # combine summary files
    write_header = True
    with open(os.path.join(self.output_dir, 'gtdbtk_classify.tsv'), 'w') as fout:
        for dir_name in os.listdir(self.output_dir):
            if not dir_name.startswith('batch_'):
                continue

            batch_dir = os.path.join(self.output_dir, dir_name)
            for summary_name in ('gtdbtk.ar122.summary.tsv',
                                 'gtdbtk.bac120.summary.tsv'):
                summary_file = os.path.join(batch_dir, summary_name)
                if not os.path.exists(summary_file):
                    # a summary can be absent, e.g. presumably when a batch
                    # holds no genomes from that domain or the GTDB-Tk run
                    # failed
                    self.logger.warning(
                        f'GTDB-Tk summary file not found: {summary_file}')
                    continue

                with open(summary_file, encoding='utf-8') as f:
                    header = f.readline()
                    if write_header:
                        fout.write(header)
                        write_header = False

                    for line in f:
                        fout.write(line)
def run(self, qc_file, metadata_file, genome_path_file, named_type_genome_file, type_genome_ani_file, mash_sketch_file, species_exception_file):
    """Cluster non-type genomes to the selected GTDB type genomes.

    An ANI radius is determined for every type genome, ANI is calculated
    between type and non-type genomes, and each non-type genome passing QC
    is clustered to a type genome using these species-specific radii.
    The radii are written to `gtdb_type_genome_ani_radius.tsv` and the
    clusters to `gtdb_type_genome_clusters.tsv` in `self.output_dir`.
    """

    # identify genomes failing quality criteria
    self.logger.info('Reading QC file.')
    passed_qc = read_qc_file(qc_file)
    self.logger.info('Identified %d genomes passing QC.' % len(passed_qc))

    # get type genomes: type genome ID -> NCBI species
    species_type_gid = {}
    with open(named_type_genome_file) as fin:
        columns = fin.readline().strip().split('\t')
        type_gid_index = columns.index('Type genome')
        sp_index = columns.index('NCBI species')

        for row in fin:
            fields = row.strip().split('\t')
            type_gid = fields[type_gid_index]
            species_type_gid[type_gid] = fields[sp_index]
    type_gids = set(species_type_gid)

    self.logger.info('Identified type genomes for %d species.' %
                     len(species_type_gid))

    # calculate circumscription radius for type genomes
    self.logger.info(
        'Determining ANI species circumscription for %d type genomes.' %
        len(type_gids))
    type_radius = self._type_genome_radius(type_gids, type_genome_ani_file)
    assert len(type_radius) == len(species_type_gid)

    radius_file = os.path.join(self.output_dir,
                               'gtdb_type_genome_ani_radius.tsv')
    write_rep_radius(type_radius, species_type_gid, radius_file)

    # get path to genome FASTA files, keeping only genomes passing QC
    self.logger.info('Reading path to genome FASTA files.')
    genome_files = read_genome_path(genome_path_file)
    self.logger.info('Read path for %d genomes.' % len(genome_files))

    failed_gids = [gid for gid in genome_files if gid not in passed_qc]
    for gid in failed_gids:
        del genome_files[gid]

    self.logger.info(
        'Considering %d genomes after removing unwanted User genomes.' %
        len(genome_files))
    assert len(genome_files) == len(passed_qc)

    # get GTDB and NCBI taxonomy strings for each genome
    self.logger.info('Reading NCBI taxonomy from GTDB metadata file.')
    ncbi_taxonomy, ncbi_update_count = read_gtdb_ncbi_taxonomy(
        metadata_file,
        species_exception_file)
    self.logger.info(
        'Read NCBI taxonomy for %d genomes with %d manually defined updates.'
        % (len(ncbi_taxonomy), ncbi_update_count))

    # calculate ANI between type and non-type genomes
    self.logger.info('Calculating ANI between type and non-type genomes.')
    ani_af = self._calculate_ani(type_gids,
                                 genome_files,
                                 ncbi_taxonomy,
                                 mash_sketch_file)

    # cluster remaining genomes to type genomes
    nontype_gids = set(genome_files) - set(type_radius)
    self.logger.info(
        'Clustering %d non-type genomes to type genomes using species specific ANI radii.'
        % len(nontype_gids))
    clusters = self._cluster(ani_af, nontype_gids, type_radius)

    # write out clusters
    cluster_file = os.path.join(self.output_dir,
                                'gtdb_type_genome_clusters.tsv')
    write_clusters(clusters, type_radius, species_type_gid, cluster_file)
def run(self, qc_file, metadata_file, gtdb_user_genomes_file, genome_path_file, type_genome_cluster_file, type_genome_synonym_file, ncbi_refseq_assembly_file, ncbi_genbank_assembly_file, ani_af_nontype_vs_type, species_exception_file, rnd_type_genome):
    """Infer de novo species clusters and type genomes for remaining genomes.

    Genomes passing QC that are neither type genomes nor already clustered
    to a type genome are given an ANI radius, representatives are selected
    among them, and all remaining genomes are clustered to the type and
    representative genomes. De novo clusters are assigned species names that
    avoid names already used by type species or synonyms. Results are
    written to `gtdb_rep_genome_info.tsv`, `gtdb_clusters_final.tsv` and
    `gtdb_ani_radius_final.tsv` in `self.output_dir`.
    """

    # identify genomes failing quality criteria
    self.logger.info('Reading QC file.')
    passed_qc = read_qc_file(qc_file)
    self.logger.info('Identified %d genomes passing QC.' % len(passed_qc))

    # get NCBI taxonomy strings for each genome
    self.logger.info('Reading NCBI taxonomy from GTDB metadata file.')
    ncbi_taxonomy, ncbi_update_count = read_gtdb_ncbi_taxonomy(
        metadata_file,
        species_exception_file)
    gtdb_taxonomy = read_gtdb_taxonomy(metadata_file)
    self.logger.info(
        'Read NCBI taxonomy for %d genomes with %d manually defined updates.'
        % (len(ncbi_taxonomy), ncbi_update_count))
    self.logger.info('Read GTDB taxonomy for %d genomes.' % len(gtdb_taxonomy))

    # parse NCBI assembly files
    self.logger.info('Parsing NCBI assembly files.')
    refseq_exclusion_notes = exclude_from_refseq(ncbi_refseq_assembly_file,
                                                 ncbi_genbank_assembly_file)

    # get path to genome FASTA files, keeping only genomes passing QC
    self.logger.info('Reading path to genome FASTA files.')
    genome_files = read_genome_path(genome_path_file)
    self.logger.info('Read path for %d genomes.' % len(genome_files))

    failed_gids = [gid for gid in genome_files if gid not in passed_qc]
    for gid in failed_gids:
        del genome_files[gid]

    self.logger.info(
        'Considering %d genomes as potential representatives after removing unwanted User genomes.'
        % len(genome_files))
    assert len(genome_files) == len(passed_qc)

    # determine type genomes and genomes clustered to type genomes
    (type_species,
     species_type_gid,
     type_gids,
     type_clustered_gids,
     type_radius) = self._parse_type_clusters(type_genome_cluster_file)
    assert len(type_species) == len(type_gids)
    self.logger.info('Identified %d type genomes.' % len(type_gids))
    self.logger.info('Identified %d clustered genomes.' % len(type_clustered_gids))

    # parse quality statistics for all genomes
    self.logger.info('Parse quality statistics for all genomes.')
    quality_metadata = read_quality_metadata(metadata_file)

    # calculate genome quality score
    self.logger.info('Calculating genome quality score.')
    quality_by_gid = quality_score(quality_metadata.keys(), quality_metadata)

    # determine genomes left to be clustered
    unclustered_gids = passed_qc - type_gids - type_clustered_gids
    self.logger.info('Identified %d unclustered genomes passing QC.' % len(unclustered_gids))

    # establish closest type genome for each unclustered genome
    self.logger.info(
        'Determining ANI circumscription for %d unclustered genomes.'
        % len(unclustered_gids))
    nontype_radius = self._nontype_radius(unclustered_gids,
                                          type_gids,
                                          ani_af_nontype_vs_type)

    # calculate Mash ANI estimates between unclustered genomes
    self.logger.info('Calculating Mash ANI estimates between unclustered genomes.')
    mash_anis = self._mash_ani_unclustered(genome_files, unclustered_gids)

    # select species representatives genomes in a greedy fashion based on genome quality
    denovo_reps = self._selected_rep_genomes(genome_files,
                                             nontype_radius,
                                             unclustered_gids,
                                             mash_anis,
                                             quality_metadata,
                                             rnd_type_genome)

    # cluster all non-type/non-rep genomes to species type/rep genomes
    cluster_radius = type_radius.copy()
    cluster_radius.update(nontype_radius)

    final_clusters, ani_af = self._cluster_genomes(genome_files,
                                                   denovo_reps,
                                                   type_gids,
                                                   passed_qc,
                                                   cluster_radius)
    rep_clusters = {gid: final_clusters[gid] for gid in denovo_reps}

    # get list of synonyms in order to restrict usage of species names
    synonyms = self._parse_synonyms(type_genome_synonym_file)
    self.logger.info('Identified %d synonyms.' % len(synonyms))

    # determine User genomes with NCBI accession number that may form species names
    user_to_genbank = self._gtdb_user_genomes(gtdb_user_genomes_file, metadata_file)
    self.logger.info(
        'Identified %d GTDB User genomes with NCBI accessions.'
        % len(user_to_genbank))

    # assign species names to de novo species clusters
    reserved_names = synonyms.union(type_species)
    self.logger.info('Identified %d species names already in use.' % len(reserved_names))
    self.logger.info('Assigning species name to each de novo species cluster.')
    cluster_sp_names = self._assign_species_names(rep_clusters,
                                                  reserved_names,
                                                  gtdb_taxonomy,
                                                  user_to_genbank)

    # write out file with details about selected representative genomes
    rep_info_file = os.path.join(self.output_dir, 'gtdb_rep_genome_info.tsv')
    self._write_rep_info(rep_clusters,
                         cluster_sp_names,
                         quality_metadata,
                         quality_by_gid,
                         refseq_exclusion_notes,
                         ani_af,
                         rep_info_file)

    # drop radius entries of genomes that do not represent a species cluster
    non_rep_gids = [gid for gid in cluster_radius if gid not in final_clusters]
    for gid in non_rep_gids:
        cluster_radius.pop(gid)

    # the de novo species names are extended in place with the type species
    cluster_sp_names.update(species_type_gid)
    all_species = cluster_sp_names

    self.logger.info('Writing %d species clusters to file.' % len(all_species))
    self.logger.info('Writing %d cluster radius information to file.' % len(cluster_radius))

    clusters_file = os.path.join(self.output_dir, 'gtdb_clusters_final.tsv')
    write_clusters(final_clusters, cluster_radius, all_species, clusters_file)

    radius_file = os.path.join(self.output_dir, 'gtdb_ani_radius_final.tsv')
    write_rep_radius(cluster_radius, all_species, radius_file)
def create_expanded_clusters(self, prev_genomes, genomes_new_updated_file, qc_passed_file, gtdbtk_classify_file):
    """Add new and updated genomes to the species clusters they classify into.

    Each GTDB-Tk classification naming an existing species is used to place
    the genome into the cluster of that species' previous representative via
    update_sp_cluster(). Genomes without a species assignment ('s__') are
    only counted. A warning is logged when an updated genome that was a GTDB
    species representative is now assigned to a different species.

    The program is terminated with sys.exit(-1) if GTDB-Tk names a species
    without a previous representative, or if a classified genome is neither
    new nor updated.
    """

    # this method must only be run on a freshly created object
    assert not (self.new_gids or self.updated_gids)

    # read GTDB-Tk classifications for new and updated genomes
    gtdbtk_classifications = read_gtdbtk_classifications(gtdbtk_classify_file)
    self.logger.info(
        f' - identified {len(gtdbtk_classifications):,} classifications.')

    # get new and updated genomes in current GTDB release
    self.new_gids, self.updated_gids = read_cur_new_updated(genomes_new_updated_file)
    self.logger.info(
        f' - identified {len(self.new_gids):,} new and {len(self.updated_gids):,} updated genomes.'
    )

    # get list of genomes passing QC
    qc_gids = read_qc_file(qc_passed_file)
    new_pass_qc = len(self.new_gids.intersection(qc_gids))
    updated_pass_qc = len(self.updated_gids.intersection(qc_gids))
    self.logger.info(
        f' - identified {new_pass_qc:,} new and {updated_pass_qc:,} updated genomes as passing QC.'
    )

    prev_sp_clusters = prev_genomes.sp_clusters

    # species name -> previous representative genome
    species_to_rep = {}
    for rep_id, sp_name in prev_sp_clusters.species_names.items():
        species_to_rep[sp_name] = rep_id

    # genome in a previous cluster -> species name of that cluster
    gid_to_species = {}
    for rep_id, member_ids in prev_sp_clusters.sp_clusters.items():
        sp_name = prev_sp_clusters.species_names[rep_id]
        gid_to_species.update(dict.fromkeys(member_ids, sp_name))

    # expand species clusters
    new_sp = 0
    for gid, lineage in gtdbtk_classifications.items():
        sp = lineage[6]
        if sp == 's__':
            new_sp += 1
            continue

        if sp not in species_to_rep:
            self.logger.error(
                f'GTDB-Tk results indicated a new species for {gid}: {sp}')
            sys.exit(-1)

        rep_id = species_to_rep[sp]

        if gid in self.new_gids:
            self.update_sp_cluster(rep_id, gid, sp)
            continue

        if gid not in self.updated_gids:
            self.logger.error(
                f"Genome {gid} specified in GTDB-Tk results is neither 'new' or 'updated'"
            )
            sys.exit(-1)

        self.update_sp_cluster(rep_id, gid, sp)

        orig_sp = gid_to_species[gid]
        if orig_sp != sp and prev_genomes[gid].is_gtdb_sp_rep():
            # If a GTDB species representative has changed to the point where
            # it no longer clusters with its previous genome this requires
            # some thought to ensure this situation is being handled. This is
            # deliberately a warning and not a fatal error.
            self.logger.warning(
                f'Updated GTDB representative {gid} reassigned from {orig_sp} to {sp} (manual inspection required to ensure this is properly resolved).'
            )

    self.logger.info(
        f' - identified {new_sp:,} genomes not assigned to an existing GTDB species cluster'
    )

    assert len(self.sp_clusters) == len(self.species_names)