def __init__(self, ani_ncbi_erroneous, ani_cache_file, cpus, output_dir):
    """Store run settings and create the FastANI wrapper.

    Parameters
    ----------
    ani_ncbi_erroneous
        Stored unchanged on the instance.
        NOTE(review): presumably an ANI threshold for flagging erroneous
        NCBI species assignments -- confirm against callers.
    ani_cache_file
        Passed to FastANI as its first argument.
    cpus
        Passed to FastANI as its second argument.
    output_dir
        Stored unchanged on the instance.
    """

    # plain configuration values
    self.ani_ncbi_erroneous = ani_ncbi_erroneous
    self.output_dir = output_dir

    # all classes in this code base share the 'timestamp' logger
    self.logger = logging.getLogger('timestamp')

    # helper used for ANI calculations
    self.fastani = FastANI(ani_cache_file, cpus)
def nonrep_radius(self, unclustered_gids, rep_gids, ani_af_rep_vs_nonrep):
    """Calculate circumscription radius for unclustered, nontype genomes.

    Every unclustered genome starts with the default radius `self.ani_sp`.
    The radius is then raised to the ANI of any representative that has a
    higher ANI to the genome and an alignment fraction of at least
    `self.af_sp`.

    Parameters
    ----------
    unclustered_gids
        Iterable of genome IDs to assign a radius to (iterated twice).
    rep_gids
        Iterable of representative genome IDs to compare against.
    ani_af_rep_vs_nonrep
        Path to a pickle file holding a nested mapping indexed as
        `ani_af[nonrep_gid][rep_gid]`.
        NOTE(review): the file is unpickled without validation; it must
        come from a trusted source.

    Returns
    -------
    dict
        Genome ID -> GenomeRadius(ani, af, neighbour_gid).
    """

    # set radius for genomes to default values
    nonrep_radius = {}
    for gid in unclustered_gids:
        nonrep_radius[gid] = GenomeRadius(ani=self.ani_sp,
                                          af=None,
                                          neighbour_gid=None)

    # determine closest type ANI neighbour and restrict ANI radius as
    # necessary; the context manager guarantees the handle is released
    with open(ani_af_rep_vs_nonrep, 'rb') as f:
        ani_af = pickle.load(f)

    for nonrep_gid in unclustered_gids:
        if nonrep_gid not in ani_af:
            continue

        for rep_gid in rep_gids:
            if rep_gid not in ani_af[nonrep_gid]:
                continue

            ani, af = FastANI.symmetric_ani(ani_af, nonrep_gid, rep_gid)

            if ani > nonrep_radius[nonrep_gid].ani and af >= self.af_sp:
                nonrep_radius[nonrep_gid] = GenomeRadius(ani=ani,
                                                         af=af,
                                                         neighbour_gid=rep_gid)

    # min()/max() raise ValueError on an empty sequence, so only report
    # summary statistics when there is at least one genome
    radii = [d.ani for d in nonrep_radius.values()]
    if radii:
        self.logger.info('ANI circumscription radius: min={:.2f}, mean={:.2f}, max={:.2f}'.format(
            min(radii),
            np_mean(radii),
            max(radii)))

    return nonrep_radius
def __init__(self, ani_sp, af_sp, ani_cache_file, cpus, output_dir):
    """Check external tools, store thresholds, and create the FastANI wrapper.

    Parameters
    ----------
    ani_sp
        Stored as `self.ani_sp`.
        NOTE(review): presumably the default species ANI threshold.
    af_sp
        Stored as `self.af_sp`.
        NOTE(review): presumably the minimum alignment fraction.
    ani_cache_file
        Passed to FastANI as its first argument.
    cpus
        Stored on the instance and passed to FastANI.
    output_dir
        Stored unchanged on the instance.
    """

    # must run first: whatever check_dependencies does on a missing tool
    # should happen before any state is set up
    check_dependencies(['fastANI', 'mash'])

    self.logger = logging.getLogger('timestamp')

    # run configuration
    self.output_dir = output_dir
    self.cpus = cpus

    # clustering thresholds
    self.af_sp = af_sp
    self.ani_sp = ani_sp
    self.min_mash_ani = 90.0

    # string values treated as boolean true
    self.true_str = ['t', 'T', 'true', 'True']

    self.fastani = FastANI(ani_cache_file, cpus)
def __init__(self, ani_sp, af_sp, ani_cache_file, cpus, output_dir):
    """Check external tools, store thresholds, and create helper objects.

    Parameters
    ----------
    ani_sp
        Stored as `self.ani_sp`.
        NOTE(review): presumably the default species ANI threshold.
    af_sp
        Stored as `self.af_sp`.
        NOTE(review): presumably the minimum alignment fraction.
    ani_cache_file
        Passed to FastANI as its first argument.
    cpus
        Stored on the instance and passed to FastANI.
    output_dir
        Stored unchanged on the instance.
    """

    # must run first: whatever check_dependencies does on a missing tool
    # should happen before any state is set up
    check_dependencies(['fastANI', 'mash'])

    self.logger = logging.getLogger('timestamp')

    # run configuration
    self.output_dir = output_dir
    self.cpus = cpus

    # clustering thresholds
    self.af_sp = af_sp
    self.ani_sp = ani_sp
    self.min_mash_ani = 90.0
    self.max_ani_neighbour = 97.0

    # record type for a genome assigned to a cluster
    self.ClusteredGenome = namedtuple('ClusteredGenome', 'ani af gid')

    self.fastani = FastANI(ani_cache_file, cpus)
def _cluster(self, ani_af, non_reps, rep_radius):
    """Cluster non-representative to representative genomes using species specific ANI thresholds.

    For each genome in `non_reps`, the representative with the highest ANI
    (ties broken by the higher AF) among those meeting `self.af_sp` is
    selected. The genome joins that representative's cluster only when the
    ANI is strictly greater than the representative's radius.

    Parameters
    ----------
    ani_af
        Nested mapping indexed as `ani_af[non_rid][rid]`.
    non_reps
        Sized iterable of non-representative genome IDs.
    rep_radius
        Mapping of representative ID -> object with an `ani` attribute.

    Returns
    -------
    dict
        Representative ID -> list of ClusteredGenome(ani, af, gid).
    """

    progress = '==> Processed {:,} of {:,} genomes [no. clustered = {:,}].\r'
    total = len(non_reps)

    clusters = {rep_id: [] for rep_id in rep_radius}

    num_clustered = 0
    for idx, non_rid in enumerate(non_reps):
        if idx % 100 == 0:
            sys.stdout.write(progress.format(idx+1, total, num_clustered))
            sys.stdout.flush()

        # genomes without any ANI results cannot be clustered
        if non_rid not in ani_af:
            continue

        best_rid = None
        best_ani = 0
        best_af = 0
        for rid in rep_radius:
            if rid not in ani_af[non_rid]:
                continue

            ani, af = FastANI.symmetric_ani(ani_af, rid, non_rid)
            if af < self.af_sp:
                continue

            # highest ANI wins; AF is the tie-breaker
            if ani > best_ani or (ani == best_ani and af > best_af):
                best_rid, best_ani, best_af = rid, ani, af

        if best_rid and best_ani > rep_radius[best_rid].ani:
            num_clustered += 1
            member = self.ClusteredGenome(gid=non_rid,
                                          ani=best_ani,
                                          af=best_af)
            clusters[best_rid].append(member)

    sys.stdout.write(progress.format(total, total, num_clustered))
    sys.stdout.flush()
    sys.stdout.write('\n')

    num_unclustered = total - num_clustered
    num_assigned = sum(len(members) for members in clusters.values())

    self.logger.info('Assigned {:,} genomes to {:,} representatives; {:,} genomes remain unclustered.'.format(
        num_assigned,
        len(clusters),
        num_unclustered))

    return clusters
def __init__(self, ani_cache_file, cpus, output_dir):
    """Set LTP parameters, create the FastANI wrapper, and make the pickle directory.

    Parameters
    ----------
    ani_cache_file
        Passed to FastANI as its first argument.
    cpus
        Stored on the instance and passed to FastANI.
    output_dir
        Stored on the instance; an 'ani_pickles' sub-directory is created
        inside it if it does not already exist.
    """

    # LTP input locations and the record type used for LTP hits
    self.ltp_dir = 'rna_ltp_132'
    self.ltp_results_file = 'ssu.taxonomy.tsv'
    self.LTP_METADATA = namedtuple('LTP_METADATA',
                                   'taxonomy taxa species ssu_len evalue bitscore aln_len perc_iden perc_aln')

    # LTP filtering thresholds
    # NOTE(review): pi/pa presumably mean percent identity / percent
    # alignment -- confirm where these thresholds are applied
    self.ltp_pi_threshold = 99.0
    self.ltp_pa_threshold = 90.0
    self.ltp_ssu_len_threshold = 900
    self.ltp_evalue_threshold = 1e-10

    # run configuration
    self.output_dir = output_dir
    self.logger = logging.getLogger('timestamp')
    self.cpus = cpus

    self.fastani = FastANI(ani_cache_file, cpus)

    # directory for pickled ANI results
    pickle_dir = os.path.join(self.output_dir, 'ani_pickles')
    self.ani_pickle_dir = pickle_dir
    if not os.path.exists(pickle_dir):
        os.makedirs(pickle_dir)
def __init__(self, ani_cache_file, cpus, output_dir):
    """Set action parameters, create the FastANI wrapper, and open the action log.

    Parameters
    ----------
    ani_cache_file
        Passed to FastANI as its first argument.
    cpus
        Passed to FastANI as its second argument.
    output_dir
        Stored on the instance; 'action_log.tsv' is created (or truncated)
        inside it and the handle is kept open on `self.action_log`.
    """

    self.output_dir = output_dir
    self.logger = logging.getLogger('timestamp')

    self.fastani = FastANI(ani_cache_file, cpus)

    # action parameters
    self.genomic_update_ani = 99.0
    self.genomic_update_af = 0.80

    self.new_rep_ani = 99.0
    self.new_rep_af = 0.80

    # increase in ANI score required to select new representative
    # NOTE(review): the attribute name suggests a quality-score
    # threshold rather than ANI -- confirm
    self.new_rep_qs_threshold = 10

    # the log stays open for the lifetime of the object; the header row
    # is written immediately
    log_path = os.path.join(self.output_dir, 'action_log.tsv')
    self.action_log = open(log_path, 'w')
    self.action_log.write(
        'Genome ID\tPrevious GTDB species\tAction\tParameters\n')

    self.new_reps = {}
class UpdateClusterDeNovo(object):
    """Infer de novo species clusters and representatives for remaining genomes."""

    def __init__(self, ani_sp, af_sp, ani_cache_file, cpus, output_dir):
        """Check external tools, store thresholds, and create the FastANI wrapper.

        Parameters
        ----------
        ani_sp
            Default ANI circumscription radius given to every genome.
        af_sp
            Minimum alignment fraction for an ANI hit to be considered.
        ani_cache_file
            Passed to FastANI as its first argument.
        cpus
            Stored on the instance and passed to FastANI and Mash.
        output_dir
            Directory in which all intermediate and result files are written.
        """

        check_dependencies(['fastANI', 'mash'])

        self.cpus = cpus
        self.output_dir = output_dir

        self.logger = logging.getLogger('timestamp')

        self.true_str = ['t', 'T', 'true', 'True']

        self.ani_sp = ani_sp
        self.af_sp = af_sp

        # genome pairs below this Mash ANI estimate are never sent to FastANI
        self.min_mash_ani = 90.0

        self.fastani = FastANI(ani_cache_file, cpus)

    def _parse_named_clusters(self, named_cluster_file):
        """Parse named GTDB species clusters.

        Parameters
        ----------
        named_cluster_file
            Tab-separated file with a header row containing the columns
            'Representative', 'No. clustered genomes', 'Clustered genomes',
            'Closest representative', 'ANI radius' and 'AF closest'.

        Returns
        -------
        tuple
            (set of representative IDs,
             set of genome IDs clustered to a representative,
             dict of representative ID -> GenomeRadius)
        """

        rep_gids = set()
        rep_clustered_gids = set()
        rep_radius = {}
        with open(named_cluster_file) as f:
            headers = f.readline().strip().split('\t')

            rep_index = headers.index('Representative')
            num_clustered_index = headers.index('No. clustered genomes')
            clustered_genomes_index = headers.index('Clustered genomes')
            closest_type_index = headers.index('Closest representative')
            ani_radius_index = headers.index('ANI radius')
            af_index = headers.index('AF closest')

            for line in f:
                line_split = line.strip().split('\t')

                rep_gid = line_split[rep_index]
                rep_gids.add(rep_gid)

                # the clustered-genomes column is only read when the count
                # says it is populated
                num_clustered = int(line_split[num_clustered_index])
                if num_clustered > 0:
                    for gid in line_split[clustered_genomes_index].split(','):
                        rep_clustered_gids.add(gid.strip())

                rep_radius[rep_gid] = GenomeRadius(ani=float(line_split[ani_radius_index]),
                                                   af=float(line_split[af_index]),
                                                   neighbour_gid=line_split[closest_type_index])

        return rep_gids, rep_clustered_gids, rep_radius

    def _nonrep_radius(self, unclustered_gids, rep_gids, ani_af_rep_vs_nonrep):
        """Calculate circumscription radius for unclustered, nontype genomes.

        Every unclustered genome starts with the default radius
        `self.ani_sp`, which is raised to the ANI of any representative with
        a higher ANI and an alignment fraction of at least `self.af_sp`.

        Parameters
        ----------
        unclustered_gids
            Iterable of genome IDs to assign a radius to (iterated twice).
        rep_gids
            Iterable of representative genome IDs to compare against.
        ani_af_rep_vs_nonrep
            Path to a pickle file holding a nested mapping indexed as
            `ani_af[nonrep_gid][rep_gid]`.
            NOTE(review): unpickled without validation; must be trusted.

        Returns
        -------
        dict
            Genome ID -> GenomeRadius(ani, af, neighbour_gid).
        """

        # set radius for genomes to default values
        nonrep_radius = {}
        for gid in unclustered_gids:
            nonrep_radius[gid] = GenomeRadius(ani=self.ani_sp,
                                              af=None,
                                              neighbour_gid=None)

        # determine closest type ANI neighbour and restrict ANI radius as necessary
        with open(ani_af_rep_vs_nonrep, 'rb') as f:
            ani_af = pickle.load(f)

        for nonrep_gid in unclustered_gids:
            if nonrep_gid not in ani_af:
                continue

            for rep_gid in rep_gids:
                if rep_gid not in ani_af[nonrep_gid]:
                    continue

                ani, af = symmetric_ani(ani_af, nonrep_gid, rep_gid)

                if ani > nonrep_radius[nonrep_gid].ani and af >= self.af_sp:
                    nonrep_radius[nonrep_gid] = GenomeRadius(ani=ani,
                                                             af=af,
                                                             neighbour_gid=rep_gid)

        # min()/max() raise ValueError on an empty sequence, so only report
        # summary statistics when there is at least one genome
        radii = [d.ani for d in nonrep_radius.values()]
        if radii:
            self.logger.info('ANI circumscription radius: min={:.2f}, mean={:.2f}, max={:.2f}'.format(
                min(radii),
                np_mean(radii),
                max(radii)))

        return nonrep_radius

    def _mash_ani_unclustered(self, cur_genomes, gids):
        """Calculate pairwise Mash ANI estimates between genomes.

        Writes the genome list, sketch and distance files into
        `self.output_dir`, logs how many directed genome pairs reach
        `self.min_mash_ani`, and returns the table read from the distance
        file (indexed as `mash_ani[qid][rid]`).
        """

        mash = Mash(self.cpus)

        # create Mash sketch for potential representative genomes
        mash_nontype_sketch_file = os.path.join(self.output_dir, 'gtdb_unclustered_genomes.msh')
        genome_list_file = os.path.join(self.output_dir, 'gtdb_unclustered_genomes.lst')
        mash.sketch(gids, cur_genomes.genomic_files, genome_list_file, mash_nontype_sketch_file)

        # get Mash distances
        mash_dist_file = os.path.join(self.output_dir, 'gtdb_unclustered_genomes.dst')
        mash.dist_pairwise(float(100 - self.min_mash_ani)/100, mash_nontype_sketch_file, mash_dist_file)

        # read Mash distances
        mash_ani = mash.read_ani(mash_dist_file)

        # report pairs above Mash threshold; each qualifying pair counts
        # twice since both directions are considered
        num_mash_ani_pairs = 0
        for qid in mash_ani:
            for rid in mash_ani[qid]:
                if mash_ani[qid][rid] >= self.min_mash_ani:
                    n_qid = cur_genomes.user_uba_id_map.get(qid, qid)
                    n_rid = cur_genomes.user_uba_id_map.get(rid, rid)
                    if n_qid != n_rid:
                        num_mash_ani_pairs += 2

        self.logger.info('Identified {:,} genome pairs with a Mash ANI >= {:.1f}%.'.format(
            num_mash_ani_pairs,
            self.min_mash_ani))

        return mash_ani

    def _selected_rep_genomes(self,
                              cur_genomes,
                              nonrep_radius,
                              unclustered_qc_gids,
                              mash_ani):
        """Select de novo representatives for species clusters in a greedy fashion using species-specific ANI thresholds.

        Genomes are visited from highest to lowest `score_type_strain()`
        (ties broken by genome ID). A genome becomes a new representative
        unless it falls within the radius of an already selected one.
        `nonrep_radius` is updated in place as closer representatives are
        found. Results are cached in 'cluster_reps.tsv' inside
        `self.output_dir`; when that file already exists it is read instead
        of recomputing.

        Returns
        -------
        set
            Genome IDs selected as de novo representatives.
        """

        # sort genomes by quality score
        self.logger.info('Selecting de novo representatives in a greedy manner based on quality.')
        q = {gid: cur_genomes[gid].score_type_strain() for gid in unclustered_qc_gids}
        q_sorted = sorted(q.items(), key=lambda kv: (kv[1], kv[0]), reverse=True)

        # greedily determine representatives for new species clusters
        cluster_rep_file = os.path.join(self.output_dir, 'cluster_reps.tsv')
        clusters = set()
        if not os.path.exists(cluster_rep_file):
            clustered_genomes = 0
            max_ani_pairs = 0
            for idx, (cur_gid, _score) in enumerate(q_sorted):

                # determine reference genomes to calculate ANI between
                ani_pairs = []
                if cur_gid in mash_ani:
                    for rep_gid in clusters:
                        if mash_ani[cur_gid].get(rep_gid, 0) >= self.min_mash_ani:
                            ani_pairs.append((cur_gid, rep_gid))
                            ani_pairs.append((rep_gid, cur_gid))

                # determine if genome clusters with representative
                clustered = False
                if ani_pairs:
                    if len(ani_pairs) > max_ani_pairs:
                        max_ani_pairs = len(ani_pairs)

                    ani_af = self.fastani.pairs(ani_pairs,
                                                cur_genomes.genomic_files,
                                                report_progress=False)

                    closest_rep_gid = None
                    closest_rep_ani = 0
                    closest_rep_af = 0
                    for rep_gid in clusters:
                        ani, af = symmetric_ani(ani_af, cur_gid, rep_gid)

                        if af >= self.af_sp:
                            if ani > closest_rep_ani or (ani == closest_rep_ani and af > closest_rep_af):
                                closest_rep_gid = rep_gid
                                closest_rep_ani = ani
                                closest_rep_af = af

                        # tighten this genome's own radius in case it ends
                        # up being a representative
                        if ani > nonrep_radius[cur_gid].ani and af >= self.af_sp:
                            nonrep_radius[cur_gid] = GenomeRadius(ani=ani,
                                                                  af=af,
                                                                  neighbour_gid=rep_gid)

                    if closest_rep_gid and closest_rep_ani > nonrep_radius[closest_rep_gid].ani:
                        clustered = True

                if not clustered:
                    # genome is a new species cluster representative
                    clusters.add(cur_gid)
                else:
                    clustered_genomes += 1

                if (idx+1) % 10 == 0 or idx+1 == len(q_sorted):
                    status_str = '-> Clustered {:,} of {:,} ({:.2f}%) genomes [ANI pairs: {:,}; clustered genomes: {:,}; clusters: {:,}].'.format(
                        idx+1,
                        len(q_sorted),
                        float(idx+1)*100/len(q_sorted),
                        max_ani_pairs,
                        clustered_genomes,
                        len(clusters)).ljust(96)
                    sys.stdout.write('{}\r'.format(status_str))
                    sys.stdout.flush()
                    # the reported maximum covers one reporting window
                    max_ani_pairs = 0
            sys.stdout.write('\n')

            # write out selected cluster representative
            with open(cluster_rep_file, 'w') as fout:
                for gid in clusters:
                    fout.write('{}\n'.format(gid))
        else:
            # read cluster reps from file
            self.logger.warning('Using previously determined cluster representatives.')
            with open(cluster_rep_file) as f:
                for line in f:
                    gid = line.strip()
                    clusters.add(gid)

        self.logger.info('Selected {:,} representative genomes for de novo species clusters.'.format(len(clusters)))

        return clusters

    def _cluster_genomes(self,
                         cur_genomes,
                         de_novo_rep_gids,
                         named_rep_gids,
                         final_cluster_radius):
        """Cluster new representatives to representatives of named GTDB species clusters.

        Every genome that is not a representative is assigned to the
        representative with the highest ANI (ties broken by AF) among those
        whose ANI is at least the representative's radius and whose AF is at
        least `self.af_sp`. Results are pickled to 'clusters.pkl' and
        'ani_af_rep_vs_nonrep.de_novo.pkl' in `self.output_dir`.

        Returns
        -------
        tuple
            (dict of representative ID -> list of ClusteredGenome,
             ANI/AF table returned by FastANI)
        """

        all_reps = de_novo_rep_gids.union(named_rep_gids)
        nonrep_gids = set(cur_genomes.genomes.keys()) - all_reps
        self.logger.info('Clustering {:,} genomes to {:,} named and de novo representatives.'.format(
            len(nonrep_gids), len(all_reps)))

        clusters_file = os.path.join(self.output_dir, 'clusters.pkl')
        ani_af_file = os.path.join(self.output_dir, 'ani_af_rep_vs_nonrep.de_novo.pkl')

        # developer toggle kept from the original code: set to False to
        # reuse the pickled results of a previous run
        recalculate = True
        if recalculate:
            # calculate MASH distance between non-representatives and representatives genomes
            mash = Mash(self.cpus)

            mash_rep_sketch_file = os.path.join(self.output_dir, 'gtdb_rep_genomes.msh')
            rep_genome_list_file = os.path.join(self.output_dir, 'gtdb_rep_genomes.lst')
            mash.sketch(all_reps, cur_genomes.genomic_files, rep_genome_list_file, mash_rep_sketch_file)

            mash_none_rep_sketch_file = os.path.join(self.output_dir, 'gtdb_nonrep_genomes.msh')
            non_rep_file = os.path.join(self.output_dir, 'gtdb_nonrep_genomes.lst')
            mash.sketch(nonrep_gids, cur_genomes.genomic_files, non_rep_file, mash_none_rep_sketch_file)

            # get Mash distances
            mash_dist_file = os.path.join(self.output_dir, 'gtdb_rep_vs_nonrep_genomes.dst')
            mash.dist(float(100 - self.min_mash_ani)/100,
                      mash_rep_sketch_file,
                      mash_none_rep_sketch_file,
                      mash_dist_file)

            # read Mash distances
            mash_ani = mash.read_ani(mash_dist_file)

            # calculate ANI between non-representatives and representatives genomes
            clusters = {}
            for gid in all_reps:
                clusters[gid] = []

            mash_ani_pairs = []
            for qid in mash_ani:
                n_qid = cur_genomes.user_uba_id_map.get(qid, qid)
                assert n_qid in nonrep_gids

                for rid in mash_ani[qid]:
                    n_rid = cur_genomes.user_uba_id_map.get(rid, rid)
                    assert n_rid in all_reps

                    if (mash_ani[qid][rid] >= self.min_mash_ani
                            and n_qid != n_rid):
                        mash_ani_pairs.append((n_qid, n_rid))
                        mash_ani_pairs.append((n_rid, n_qid))

            self.logger.info('Calculating ANI between {:,} species clusters and {:,} unclustered genomes ({:,} pairs):'.format(
                len(clusters),
                len(nonrep_gids),
                len(mash_ani_pairs)))
            ani_af = self.fastani.pairs(mash_ani_pairs, cur_genomes.genomic_files)

            # assign genomes to closest representatives
            # that is within the representatives ANI radius
            self.logger.info('Assigning genomes to closest representative.')
            for idx, cur_gid in enumerate(nonrep_gids):
                closest_rep_gid = None
                closest_rep_ani = 0
                closest_rep_af = 0
                for rep_gid in clusters:
                    ani, af = symmetric_ani(ani_af, cur_gid, rep_gid)

                    if ani >= final_cluster_radius[rep_gid].ani and af >= self.af_sp:
                        if ani > closest_rep_ani or (ani == closest_rep_ani and af > closest_rep_af):
                            closest_rep_gid = rep_gid
                            closest_rep_ani = ani
                            closest_rep_af = af

                if closest_rep_gid:
                    clusters[closest_rep_gid].append(ClusteredGenome(gid=cur_gid,
                                                                     ani=closest_rep_ani,
                                                                     af=closest_rep_af))
                else:
                    # no representative met both the radius and AF criteria,
                    # so there are no closest-representative details to report
                    self.logger.warning('Failed to assign genome {} to representative.'.format(cur_gid))
                    self.logger.warning(' ...no representative with an AF >{:.2f} identified.'.format(self.af_sp))

                status_str = '-> Assigned {:,} of {:,} ({:.2f}%) genomes.'.format(
                    idx+1,
                    len(nonrep_gids),
                    float(idx+1)*100/len(nonrep_gids)).ljust(86)
                sys.stdout.write('{}\r'.format(status_str))
                sys.stdout.flush()
            sys.stdout.write('\n')

            with open(clusters_file, 'wb') as f:
                pickle.dump(clusters, f)
            with open(ani_af_file, 'wb') as f:
                pickle.dump(ani_af, f)
        else:
            self.logger.warning('Using previously calculated results in: {}'.format('clusters.pkl'))
            with open(clusters_file, 'rb') as f:
                clusters = pickle.load(f)

            self.logger.warning('Using previously calculated results in: {}'.format('ani_af_rep_vs_nonrep.de_novo.pkl'))
            with open(ani_af_file, 'rb') as f:
                ani_af = pickle.load(f)

        return clusters, ani_af

    def run(self,
            named_cluster_file,
            cur_gtdb_metadata_file,
            cur_genomic_path_file,
            uba_genome_paths,
            qc_passed_file,
            ncbi_genbank_assembly_file,
            untrustworthy_type_file,
            ani_af_rep_vs_nonrep,
            gtdb_type_strains_ledger):
        """Infer de novo species clusters and representatives for remaining genomes.

        Loads the current genome set, reads the named species clusters,
        selects de novo representatives for genomes not yet clustered,
        assigns all non-representative genomes to a representative, and
        writes 'gtdb_clusters_de_novo.tsv' and 'gtdb_ani_radius_de_novo.tsv'
        to `self.output_dir`.
        """

        # create current GTDB genome sets
        self.logger.info('Creating current GTDB genome set.')
        cur_genomes = Genomes()
        cur_genomes.load_from_metadata_file(cur_gtdb_metadata_file,
                                            gtdb_type_strains_ledger=gtdb_type_strains_ledger,
                                            create_sp_clusters=False,
                                            uba_genome_file=uba_genome_paths,
                                            qc_passed_file=qc_passed_file,
                                            ncbi_genbank_assembly_file=ncbi_genbank_assembly_file,
                                            untrustworthy_type_ledger=untrustworthy_type_file)
        self.logger.info(f' ... current genome set contains {len(cur_genomes):,} genomes.')

        # get path to previous and current genomic FASTA files
        self.logger.info('Reading path to current genomic FASTA files.')
        cur_genomes.load_genomic_file_paths(cur_genomic_path_file)
        cur_genomes.load_genomic_file_paths(uba_genome_paths)

        # determine representatives and genomes clustered to each representative
        self.logger.info('Reading named GTDB species clusters.')
        named_rep_gids, rep_clustered_gids, rep_radius = self._parse_named_clusters(named_cluster_file)
        self.logger.info(' ... identified {:,} representative genomes.'.format(len(named_rep_gids)))
        self.logger.info(' ... identified {:,} clustered genomes.'.format(len(rep_clustered_gids)))

        # determine genomes left to be clustered
        unclustered_gids = set(cur_genomes.genomes.keys()) - named_rep_gids - rep_clustered_gids
        self.logger.info('Identified {:,} unclustered genomes passing QC.'.format(len(unclustered_gids)))

        # establish closest representative for each unclustered genome
        self.logger.info('Determining ANI circumscription for {:,} unclustered genomes.'.format(len(unclustered_gids)))
        nonrep_radius = self._nonrep_radius(unclustered_gids, named_rep_gids, ani_af_rep_vs_nonrep)

        # calculate Mash ANI estimates between unclustered genomes
        self.logger.info('Calculating Mash ANI estimates between unclustered genomes.')
        mash_anis = self._mash_ani_unclustered(cur_genomes, unclustered_gids)

        # select de novo species representatives in a greedy fashion based on genome quality
        de_novo_rep_gids = self._selected_rep_genomes(cur_genomes,
                                                      nonrep_radius,
                                                      unclustered_gids,
                                                      mash_anis)

        # cluster all non-representative genomes to representative genomes
        final_cluster_radius = rep_radius.copy()
        final_cluster_radius.update(nonrep_radius)

        final_clusters, ani_af = self._cluster_genomes(cur_genomes,
                                                       de_novo_rep_gids,
                                                       named_rep_gids,
                                                       final_cluster_radius)

        # remove genomes that are not representatives of a species cluster and then write out representative ANI radius
        for gid in set(final_cluster_radius) - set(final_clusters):
            del final_cluster_radius[gid]

        self.logger.info('Writing {:,} species clusters to file.'.format(len(final_clusters)))
        self.logger.info('Writing {:,} cluster radius information to file.'.format(len(final_cluster_radius)))

        write_clusters(final_clusters,
                       final_cluster_radius,
                       cur_genomes,
                       os.path.join(self.output_dir, 'gtdb_clusters_de_novo.tsv'))

        write_rep_radius(final_cluster_radius,
                         cur_genomes,
                         os.path.join(self.output_dir, 'gtdb_ani_radius_de_novo.tsv'))
class ClusterNamedTypes(object):
    """Cluster genomes to selected GTDB type genomes."""

    def __init__(self, ani_sp, af_sp, ani_cache_file, cpus, output_dir):
        """Check external tools, store thresholds, and create helper objects.

        Parameters
        ----------
        ani_sp
            Default ANI circumscription radius given to every type genome.
        af_sp
            Minimum alignment fraction for an ANI hit to be considered.
        ani_cache_file
            Passed to FastANI as its first argument.
        cpus
            Stored on the instance and passed to FastANI and Mash.
        output_dir
            Directory in which all intermediate and result files are written.
        """

        check_dependencies(['fastANI', 'mash'])

        self.cpus = cpus
        self.output_dir = output_dir

        self.logger = logging.getLogger('timestamp')

        self.ani_sp = ani_sp
        self.af_sp = af_sp

        # an error is logged when a type genome has a neighbour above this ANI
        self.max_ani_neighbour = 97.0

        # genome pairs below this Mash ANI estimate are never sent to FastANI
        self.min_mash_ani = 90.0

        self.ClusteredGenome = namedtuple('ClusteredGenome', 'ani af gid')

        self.fastani = FastANI(ani_cache_file, cpus)

    def _type_genome_radius(self, type_gids, type_genome_ani_file):
        """Calculate circumscription radius for type genomes.

        Parameters
        ----------
        type_gids
            Collection of type genome IDs.
        type_genome_ani_file
            Tab-separated file with a header row containing the columns
            'Type genome 1', 'Type genome 2', 'ANI' and 'AF'.

        Returns
        -------
        dict
            Type genome ID -> GenomeRadius(ani, af, neighbour_gid).
        """

        # set type radius for all type genomes to default values
        type_radius = {}
        for gid in type_gids:
            type_radius[gid] = GenomeRadius(ani=self.ani_sp,
                                            af=None,
                                            neighbour_gid=None)

        # determine closest ANI neighbour and restrict ANI radius as necessary
        with open(type_genome_ani_file) as f:
            header = f.readline().strip().split('\t')

            type_gid1_index = header.index('Type genome 1')
            type_gid2_index = header.index('Type genome 2')
            ani_index = header.index('ANI')
            af_index = header.index('AF')

            for line in f:
                line_split = line.strip().split('\t')

                type_gid1 = line_split[type_gid1_index]
                type_gid2 = line_split[type_gid2_index]

                if type_gid1 not in type_gids or type_gid2 not in type_gids:
                    continue

                ani = float(line_split[ani_index])
                af = float(line_split[af_index])

                if ani > type_radius[type_gid1].ani:
                    if af < self.af_sp:
                        # pairs with insufficient AF never restrict the radius
                        if ani >= self.ani_sp:
                            self.logger.warning(
                                'ANI for %s and %s is >%.2f, but AF <%.2f [pair skipped].'
                                % (type_gid1, type_gid2, ani, af))
                        continue

                    if ani > self.max_ani_neighbour:
                        self.logger.error('ANI neighbour %s is >%.2f for %s.'
                                          % (type_gid2, ani, type_gid1))

                    type_radius[type_gid1] = GenomeRadius(ani=ani,
                                                          af=af,
                                                          neighbour_gid=type_gid2)

        # min()/max() raise ValueError on an empty sequence, so only report
        # summary statistics when there is at least one type genome
        radii = [d.ani for d in type_radius.values()]
        if radii:
            self.logger.info(
                'ANI circumscription radius: min=%.2f, mean=%.2f, max=%.2f'
                % (min(radii), np_mean(radii), max(radii)))

        return type_radius

    def _calculate_ani(self, type_gids, genome_files, ncbi_taxonomy,
                       type_genome_sketch_file):
        """Calculate ANI between type and non-type genomes.

        Mash is used to pre-filter genome pairs; pairs with a Mash ANI of at
        least `self.min_mash_ani` are passed to FastANI in both directions.
        The FastANI result is pickled to 'ani_af_type_vs_nontype.pkl' in
        `self.output_dir` and returned. `ncbi_taxonomy` is accepted but not
        used by this method.
        """

        mash = Mash(self.cpus)

        # create Mash sketch for type genomes
        if not type_genome_sketch_file or not os.path.exists(type_genome_sketch_file):
            type_genome_list_file = os.path.join(self.output_dir,
                                                 'gtdb_type_genomes.lst')
            type_genome_sketch_file = os.path.join(self.output_dir,
                                                   'gtdb_type_genomes.msh')
            mash.sketch(type_gids, genome_files, type_genome_list_file,
                        type_genome_sketch_file)

        # create Mash sketch for non-type genomes
        nontype_gids = set()
        for gid in genome_files:
            if gid not in type_gids:
                nontype_gids.add(gid)

        nontype_genome_list_file = os.path.join(self.output_dir,
                                                'gtdb_nontype_genomes.lst')
        nontype_genome_sketch_file = os.path.join(self.output_dir,
                                                  'gtdb_nontype_genomes.msh')
        mash.sketch(nontype_gids, genome_files, nontype_genome_list_file,
                    nontype_genome_sketch_file)

        # get Mash distances
        mash_dist_file = os.path.join(self.output_dir,
                                      'gtdb_type_vs_nontype_genomes.dst')
        mash.dist(float(100 - self.min_mash_ani) / 100,
                  type_genome_sketch_file,
                  nontype_genome_sketch_file,
                  mash_dist_file)

        # read Mash distances
        mash_ani = mash.read_ani(mash_dist_file)

        # get pairs above Mash threshold
        mash_ani_pairs = []
        for qid in mash_ani:
            for rid in mash_ani[qid]:
                if mash_ani[qid][rid] >= self.min_mash_ani:
                    if qid != rid:
                        mash_ani_pairs.append((qid, rid))
                        mash_ani_pairs.append((rid, qid))

        self.logger.info(
            'Identified %d genome pairs with a Mash ANI >= %.1f%%.'
            % (len(mash_ani_pairs), self.min_mash_ani))

        # calculate ANI between pairs
        self.logger.info('Calculating ANI between %d genome pairs:'
                         % len(mash_ani_pairs))

        ani_af_file = os.path.join(self.output_dir, 'ani_af_type_vs_nontype.pkl')

        # developer toggle kept from the original code: set to False to
        # reuse the pickled results of a previous run
        recalculate = True
        if recalculate:
            ani_af = self.fastani.pairs(mash_ani_pairs, genome_files)
            with open(ani_af_file, 'wb') as f:
                pickle.dump(ani_af, f)
        else:
            with open(ani_af_file, 'rb') as f:
                ani_af = pickle.load(f)

        return ani_af

    def _cluster(self, ani_af, nontype_gids, type_radius):
        """Cluster non-type genomes to type genomes using species specific ANI thresholds.

        For each non-type genome, the type genome with the highest ANI (ties
        broken by the higher AF) among those meeting `self.af_sp` is
        selected. The genome joins that cluster only when the ANI is
        strictly greater than the type genome's radius.

        Parameters
        ----------
        ani_af
            Nested mapping indexed as `ani_af[nontype_gid][type_gid]`.
        nontype_gids
            Sized iterable of non-type genome IDs; may be empty.
        type_radius
            Mapping of type genome ID -> object with an `ani` attribute.

        Returns
        -------
        dict
            Type genome ID -> list of ClusteredGenome(ani, af, gid).
        """

        clusters = {}
        for rep_id in type_radius:
            clusters[rep_id] = []

        for idx, nontype_gid in enumerate(nontype_gids):
            if idx % 100 == 0:
                sys.stdout.write('==> Processed %d of %d genomes.\r'
                                 % (idx + 1, len(nontype_gids)))
                sys.stdout.flush()

            if nontype_gid not in ani_af:
                continue

            closest_type_gid = None
            closest_ani = 0
            closest_af = 0
            for type_gid in type_radius:
                if type_gid not in ani_af[nontype_gid]:
                    continue

                ani, af = symmetric_ani(ani_af, type_gid, nontype_gid)

                if af >= self.af_sp:
                    if ani > closest_ani or (ani == closest_ani and af > closest_af):
                        closest_type_gid = type_gid
                        closest_ani = ani
                        closest_af = af

            if closest_type_gid:
                if closest_ani > type_radius[closest_type_gid].ani:
                    clusters[closest_type_gid].append(
                        self.ClusteredGenome(gid=nontype_gid,
                                             ani=closest_ani,
                                             af=closest_af))

        # report the full count; the loop variable must not be used here as
        # it is undefined for empty input and one short otherwise
        sys.stdout.write('==> Processed %d of %d genomes.\r'
                         % (len(nontype_gids), len(nontype_gids)))
        sys.stdout.flush()
        sys.stdout.write('\n')

        self.logger.info(
            'Assigned %d genomes to representatives.'
            % sum([len(clusters[type_gid]) for type_gid in clusters]))

        return clusters

    def run(self, qc_file, metadata_file, genome_path_file,
            named_type_genome_file, type_genome_ani_file, mash_sketch_file,
            species_exception_file):
        """Cluster genomes to selected GTDB type genomes.

        Reads the QC, type genome, genome path and taxonomy inputs,
        determines the ANI radius of each type genome, clusters the
        remaining genomes to the type genomes, and writes
        'gtdb_type_genome_ani_radius.tsv' and
        'gtdb_type_genome_clusters.tsv' to `self.output_dir`.
        """

        # identify genomes failing quality criteria
        self.logger.info('Reading QC file.')
        passed_qc = read_qc_file(qc_file)
        self.logger.info('Identified %d genomes passing QC.' % len(passed_qc))

        # get type genomes
        type_gids = set()
        species_type_gid = {}
        with open(named_type_genome_file) as f:
            header = f.readline().strip().split('\t')

            type_gid_index = header.index('Type genome')
            sp_index = header.index('NCBI species')

            for line in f:
                line_split = line.strip().split('\t')

                type_gids.add(line_split[type_gid_index])
                species_type_gid[line_split[type_gid_index]] = line_split[sp_index]

        self.logger.info('Identified type genomes for %d species.'
                         % len(species_type_gid))

        # calculate circumscription radius for type genomes
        self.logger.info(
            'Determining ANI species circumscription for %d type genomes.'
            % len(type_gids))
        type_radius = self._type_genome_radius(type_gids, type_genome_ani_file)
        assert (len(type_radius) == len(species_type_gid))

        write_rep_radius(
            type_radius, species_type_gid,
            os.path.join(self.output_dir, 'gtdb_type_genome_ani_radius.tsv'))

        # get path to genome FASTA files
        self.logger.info('Reading path to genome FASTA files.')
        genome_files = read_genome_path(genome_path_file)
        self.logger.info('Read path for %d genomes.' % len(genome_files))

        # iterate over a copy of the keys since entries are removed
        for gid in set(genome_files):
            if gid not in passed_qc:
                genome_files.pop(gid)
        self.logger.info(
            'Considering %d genomes after removing unwanted User genomes.'
            % len(genome_files))
        assert (len(genome_files) == len(passed_qc))

        # get GTDB and NCBI taxonomy strings for each genome
        self.logger.info('Reading NCBI taxonomy from GTDB metadata file.')
        ncbi_taxonomy, ncbi_update_count = read_gtdb_ncbi_taxonomy(
            metadata_file, species_exception_file)
        self.logger.info(
            'Read NCBI taxonomy for %d genomes with %d manually defined updates.'
            % (len(ncbi_taxonomy), ncbi_update_count))

        # calculate ANI between type and non-type genomes
        self.logger.info('Calculating ANI between type and non-type genomes.')
        ani_af = self._calculate_ani(type_gids, genome_files, ncbi_taxonomy,
                                     mash_sketch_file)

        # cluster remaining genomes to type genomes
        nontype_gids = set(genome_files) - set(type_radius)
        self.logger.info(
            'Clustering %d non-type genomes to type genomes using species specific ANI radii.'
            % len(nontype_gids))
        clusters = self._cluster(ani_af, nontype_gids, type_radius)

        # write out clusters
        write_clusters(
            clusters, type_radius, species_type_gid,
            os.path.join(self.output_dir, 'gtdb_type_genome_clusters.tsv'))
class ClusterDeNovo(object):
    """Infer de novo species clusters and type genomes for remaining genomes."""

    def __init__(self, ani_sp, af_sp, ani_cache_file, cpus, output_dir):
        """Initialization.

        ani_sp: default ANI radius given to a genome before any closer
            neighbour is found.
        af_sp: minimum alignment fraction required before an ANI value
            is allowed to influence a radius or a cluster assignment.
        ani_cache_file, cpus: forwarded to FastANI (cpus also to Mash).
        output_dir: directory receiving all intermediate and result files.
        """

        check_dependencies(['fastANI', 'mash'])

        self.cpus = cpus
        self.output_dir = output_dir

        self.logger = logging.getLogger('timestamp')

        self.true_str = ['t', 'T', 'true', 'True']

        self.ani_sp = ani_sp
        self.af_sp = af_sp

        # Mash ANI pre-filter: only pairs at or above this value are
        # passed on to FastANI
        self.min_mash_ani = 90.0

        self.ClusteredGenome = namedtuple('ClusteredGenome', 'ani af gid')

        self.fastani = FastANI(ani_cache_file, cpus)

    def _parse_type_clusters(self, type_genome_cluster_file):
        """Parse type genomes clustering information.

        Reads a tab-separated file with a header row and returns the tuple
        (type_species, species_type_gid, type_gids, type_clustered_gids,
        type_radius), where species_type_gid maps type genome ID -> species
        and type_radius maps type genome ID -> GenomeRadius.
        """

        type_species = set()
        species_type_gid = {}
        type_gids = set()
        type_clustered_gids = set()
        type_radius = {}
        with open(type_genome_cluster_file) as f:
            headers = f.readline().strip().split('\t')

            type_sp_index = headers.index('NCBI species')
            type_genome_index = headers.index('Type genome')
            num_clustered_index = headers.index('No. clustered genomes')
            clustered_genomes_index = headers.index('Clustered genomes')
            closest_type_index = headers.index('Closest type genome')
            ani_radius_index = headers.index('ANI radius')
            af_index = headers.index('AF closest')

            for line in f:
                line_split = line.strip().split('\t')

                type_sp = line_split[type_sp_index]
                type_species.add(type_sp)

                type_gid = line_split[type_genome_index]
                type_gids.add(type_gid)

                species_type_gid[type_gid] = type_sp

                # the clustered genome column is only read when the
                # count column says it is populated
                num_clustered = int(line_split[num_clustered_index])
                if num_clustered > 0:
                    clustered = line_split[clustered_genomes_index].split(',')
                    type_clustered_gids.update(g.strip() for g in clustered)

                type_radius[type_gid] = GenomeRadius(
                    ani=float(line_split[ani_radius_index]),
                    af=float(line_split[af_index]),
                    neighbour_gid=line_split[closest_type_index])

        return type_species, species_type_gid, type_gids, type_clustered_gids, type_radius

    def _parse_synonyms(self, type_genome_synonym_file):
        """Parse synonyms.

        Returns the set of values found in the 'Synonym' column of a
        tab-separated file with a header row.
        """

        synonyms = set()
        with open(type_genome_synonym_file) as f:
            headers = f.readline().strip().split('\t')
            synonym_index = headers.index('Synonym')

            for line in f:
                line_split = line.strip().split('\t')
                synonyms.add(line_split[synonym_index])

        return synonyms

    def _nontype_radius(self, unclustered_gids, type_gids, ani_af_nontype_vs_type):
        """Calculate circumscription radius for unclustered, nontype genomes.

        Every unclustered genome starts with the default radius
        (ani=self.ani_sp). The radius is raised to the ANI of any type
        genome that is more similar than the current radius and meets the
        alignment fraction threshold. Returns a dict genome ID -> GenomeRadius.
        """

        # set radius for all unclustered genomes to default values
        nontype_radius = {}
        for gid in unclustered_gids:
            nontype_radius[gid] = GenomeRadius(ani=self.ani_sp,
                                               af=None,
                                               neighbour_gid=None)

        # determine closest type ANI neighbour and restrict ANI radius as necessary
        # NOTE: pickle can execute arbitrary code while loading; this file
        # must come from a trusted source.
        with open(ani_af_nontype_vs_type, 'rb') as f:
            ani_af = pickle.load(f)

        for nontype_gid in unclustered_gids:
            if nontype_gid not in ani_af:
                continue

            for type_gid in type_gids:
                if type_gid not in ani_af[nontype_gid]:
                    continue

                ani, af = symmetric_ani(ani_af, nontype_gid, type_gid)

                if ani > nontype_radius[nontype_gid].ani and af >= self.af_sp:
                    nontype_radius[nontype_gid] = GenomeRadius(ani=ani,
                                                               af=af,
                                                               neighbour_gid=type_gid)

        # min()/max() raise on an empty sequence, so only report
        # statistics when there is at least one genome
        if nontype_radius:
            radii = [d.ani for d in nontype_radius.values()]
            self.logger.info('ANI circumscription radius: min=%.2f, mean=%.2f, max=%.2f' % (
                min(radii),
                np_mean(radii),
                max(radii)))

        return nontype_radius

    def _mash_ani_unclustered(self, genome_files, gids):
        """Calculate pairwise Mash ANI estimates between genomes.

        Sketch, list and distance files are written to self.output_dir.
        Returns whatever Mash.read_ani() produces for the distance file;
        below it is indexed as mash_ani[query_id][reference_id].
        """

        mash = Mash(self.cpus)

        # create Mash sketch for potential representative genomes
        mash_nontype_sketch_file = os.path.join(self.output_dir, 'gtdb_unclustered_genomes.msh')
        genome_list_file = os.path.join(self.output_dir, 'gtdb_unclustered_genomes.lst')
        mash.sketch(gids, genome_files, genome_list_file, mash_nontype_sketch_file)

        # get Mash distances
        mash_dist_file = os.path.join(self.output_dir, 'gtdb_unclustered_genomes.dst')
        mash.dist_pairwise(float(100 - self.min_mash_ani)/100, mash_nontype_sketch_file, mash_dist_file)

        # read Mash distances
        mash_ani = mash.read_ani(mash_dist_file)

        # report pairs above Mash threshold; each qualifying non-self
        # entry is counted in both directions
        num_pairs = 0
        for qid in mash_ani:
            for rid in mash_ani[qid]:
                if mash_ani[qid][rid] >= self.min_mash_ani and qid != rid:
                    num_pairs += 2

        self.logger.info('Identified %d genome pairs with a Mash ANI >= %.1f%%.' % (
            num_pairs, self.min_mash_ani))

        return mash_ani

    def _selected_rep_genomes(self,
                              genome_files,
                              nontype_radius,
                              unclustered_qc_gids,
                              mash_ani,
                              quality_metadata,
                              rnd_type_genome):
        """Select representative genomes for species clusters in a greedy fashion using species-specific ANI thresholds.

        Genomes are visited in random order (rnd_type_genome) or by
        decreasing quality score. A genome becomes a new representative
        unless it falls within the radius of its closest existing
        representative. nontype_radius is updated in place. The selected
        representatives are cached in 'cluster_reps.tsv' in the output
        directory and re-read from there if the file already exists.
        Returns the set of representative genome IDs.
        """

        # establish order in which genomes are considered
        if rnd_type_genome:
            self.logger.info('Selecting random de novo type genomes.')
            # random.sample() requires a sequence; passing a set raises
            # TypeError as of Python 3.11
            candidate_gids = list(unclustered_qc_gids)
            sorted_gids = [(gid, 0)
                           for gid in random.sample(candidate_gids, len(candidate_gids))]
        else:
            self.logger.info('Selecting de novo type genomes in a greedy manner based on quality.')
            qscore = quality_score(unclustered_qc_gids, quality_metadata)
            sorted_gids = sorted(qscore.items(), key=operator.itemgetter(1), reverse=True)

        # greedily determine representatives for new species clusters
        cluster_rep_file = os.path.join(self.output_dir, 'cluster_reps.tsv')
        clusters = set()
        if not os.path.exists(cluster_rep_file):
            self.logger.info('Clustering genomes to identify representatives.')
            clustered_genomes = 0
            max_ani_pairs = 0
            for idx, (cur_gid, _score) in enumerate(sorted_gids):

                # determine reference genomes to calculate ANI between
                ani_pairs = []
                if cur_gid in mash_ani:
                    for rep_gid in clusters:
                        if mash_ani[cur_gid].get(rep_gid, 0) >= self.min_mash_ani:
                            ani_pairs.append((cur_gid, rep_gid))
                            ani_pairs.append((rep_gid, cur_gid))

                # determine if genome clusters with representative
                clustered = False
                if ani_pairs:
                    max_ani_pairs = max(max_ani_pairs, len(ani_pairs))

                    ani_af = self.fastani.pairs(ani_pairs, genome_files, report_progress=False)

                    closest_rep_gid = None
                    closest_rep_ani = 0
                    closest_rep_af = 0
                    for rep_gid in clusters:
                        ani, af = symmetric_ani(ani_af, cur_gid, rep_gid)

                        if af < self.af_sp:
                            continue

                        if ani > closest_rep_ani or (ani == closest_rep_ani and af > closest_rep_af):
                            closest_rep_gid = rep_gid
                            closest_rep_ani = ani
                            closest_rep_af = af

                        # tighten the radius of the current genome in case
                        # it ends up being a representative itself
                        if ani > nontype_radius[cur_gid].ani:
                            nontype_radius[cur_gid] = GenomeRadius(ani=ani,
                                                                   af=af,
                                                                   neighbour_gid=rep_gid)

                    if closest_rep_gid and closest_rep_ani > nontype_radius[closest_rep_gid].ani:
                        clustered = True

                if not clustered:
                    # genome is a new species cluster representative
                    clusters.add(cur_gid)
                else:
                    clustered_genomes += 1

                if (idx+1) % 10 == 0 or idx+1 == len(sorted_gids):
                    statusStr = '-> Clustered %d of %d (%.2f%%) genomes [ANI pairs: %d; clustered genomes: %d; clusters: %d].'.ljust(96) % (
                        idx+1,
                        len(sorted_gids),
                        float(idx+1)*100/len(sorted_gids),
                        max_ani_pairs,
                        clustered_genomes,
                        len(clusters))
                    sys.stdout.write('%s\r' % statusStr)
                    sys.stdout.flush()
                    # the reported maximum covers the genomes processed
                    # since the previous status line
                    max_ani_pairs = 0
            sys.stdout.write('\n')

            # write out selected cluster representative
            with open(cluster_rep_file, 'w') as fout:
                for gid in clusters:
                    fout.write('%s\n' % gid)
        else:
            # read cluster reps from file
            self.logger.warning('Using previously determined cluster representatives.')
            with open(cluster_rep_file) as f:
                for line in f:
                    clusters.add(line.strip())

        self.logger.info('Selected %d representative genomes for de novo species clusters.' % len(clusters))

        return clusters

    def _cluster_genomes(self,
                         genome_files,
                         rep_genomes,
                         type_gids,
                         passed_qc,
                         final_cluster_radius):
        """Cluster all non-type/representative genomes to selected type/representatives genomes.

        Returns (clusters, ani_af): clusters maps every type or
        representative genome ID to a list of ClusteredGenome tuples, and
        ani_af is the FastANI result for the Mash-filtered genome pairs.
        """

        all_reps = rep_genomes.union(type_gids)

        # calculate MASH distance between non-type/representative genomes and selected type/representatives genomes
        mash = Mash(self.cpus)

        mash_type_rep_sketch_file = os.path.join(self.output_dir, 'gtdb_rep_genomes.msh')
        type_rep_genome_list_file = os.path.join(self.output_dir, 'gtdb_rep_genomes.lst')
        mash.sketch(all_reps, genome_files, type_rep_genome_list_file, mash_type_rep_sketch_file)

        mash_none_rep_sketch_file = os.path.join(self.output_dir, 'gtdb_nonrep_genomes.msh')
        type_none_rep_file = os.path.join(self.output_dir, 'gtdb_nonrep_genomes.lst')
        mash.sketch(passed_qc - all_reps, genome_files, type_none_rep_file, mash_none_rep_sketch_file)

        # get Mash distances
        mash_dist_file = os.path.join(self.output_dir, 'gtdb_rep_vs_nonrep_genomes.dst')
        mash.dist(float(100 - self.min_mash_ani)/100,
                  mash_type_rep_sketch_file,
                  mash_none_rep_sketch_file,
                  mash_dist_file)

        # read Mash distances
        mash_ani = mash.read_ani(mash_dist_file)

        # calculate ANI between non-type/representative genomes and selected type/representatives genomes
        clusters = {gid: [] for gid in all_reps}

        genomes_to_cluster = passed_qc - set(clusters)
        ani_pairs = []
        for gid in genomes_to_cluster:
            if gid in mash_ani:
                for rep_gid in clusters:
                    if mash_ani[gid].get(rep_gid, 0) >= self.min_mash_ani:
                        ani_pairs.append((gid, rep_gid))
                        ani_pairs.append((rep_gid, gid))

        self.logger.info('Calculating ANI between %d species clusters and %d unclustered genomes (%d pairs):' % (
            len(clusters),
            len(genomes_to_cluster),
            len(ani_pairs)))
        ani_af = self.fastani.pairs(ani_pairs, genome_files)

        # assign genomes to closest representatives
        # that is within the representatives ANI radius
        self.logger.info('Assigning genomes to closest representative.')
        for idx, cur_gid in enumerate(genomes_to_cluster):
            closest_rep_gid = None
            closest_rep_ani = 0
            closest_rep_af = 0
            for rep_gid in clusters:
                ani, af = symmetric_ani(ani_af, cur_gid, rep_gid)

                if ani >= final_cluster_radius[rep_gid].ani and af >= self.af_sp:
                    if ani > closest_rep_ani or (ani == closest_rep_ani and af > closest_rep_af):
                        closest_rep_gid = rep_gid
                        closest_rep_ani = ani
                        closest_rep_af = af

            if closest_rep_gid:
                clusters[closest_rep_gid].append(self.ClusteredGenome(gid=cur_gid,
                                                                      ani=closest_rep_ani,
                                                                      af=closest_rep_af))
            else:
                # no representative satisfied both the ANI radius and the
                # AF threshold, so there is no candidate to report on
                self.logger.warning('Failed to assign genome %s to representative.' % cur_gid)
                self.logger.warning(' ...no representative with an AF >%.2f identified.' % self.af_sp)

            statusStr = '-> Assigned %d of %d (%.2f%%) genomes.'.ljust(86) % (
                idx+1,
                len(genomes_to_cluster),
                float(idx+1)*100/len(genomes_to_cluster))
            sys.stdout.write('%s\r' % statusStr)
            sys.stdout.flush()
        sys.stdout.write('\n')

        return clusters, ani_af

    def _assign_species_names(self, clusters, names_in_use, gtdb_taxonomy, gtdb_user_to_genbank):
        """Assign a species name to each species cluster.

        Clusters are processed from largest to smallest. A cluster takes
        the most common named GTDB species of its genomes when that name
        covers at least half of the named genomes and is not already in
        use; otherwise a placeholder of the form 's__<genus> sp<accession>'
        is derived from the representative. names_in_use is updated in
        place, and a report is written to 'gtdb_assigned_sp.tsv' in the
        output directory. Exits the program if a derived name collides
        with a name already in use.

        Returns a dict representative genome ID -> assigned species name.
        """

        cluster_sp_names = {}
        with open(os.path.join(self.output_dir, 'gtdb_assigned_sp.tsv'), 'w') as fout:
            fout.write('Representative genome\tAssigned species\tGTDB taxonomy\tNo. clustered genomes\tClustered GTDB genera\tClustered GTDB species\tSpecies name in use\tMost common name in use\tClustered genomes\n')

            for rid in sorted(clusters, key=lambda x: len(clusters[x]), reverse=True):
                clustered_gids = [c.gid for c in clusters[rid]]

                # find most common genus name in cluster
                gtdb_genera = [gtdb_taxonomy[gid][5] for gid in clustered_gids] + [gtdb_taxonomy[rid][5]]
                gtdb_genus_counter = Counter(gtdb_genera)
                gtdb_common_genus = None
                gtdb_common_genus_count = 0
                for genus, count in gtdb_genus_counter.most_common():
                    if genus != 'g__':
                        gtdb_common_genus = genus
                        gtdb_common_genus_count = count
                        break

                # in case of ties involving genus of representative genome,
                # defer to classification of representative
                rep_genus = gtdb_taxonomy[rid][5]
                if gtdb_genus_counter[rep_genus] == gtdb_common_genus_count and rep_genus != 'g__':
                    gtdb_common_genus = rep_genus

                # get most common GTDB species name
                gtdb_sp = [gtdb_taxonomy[gid][6] for gid in clustered_gids] + [gtdb_taxonomy[rid][6]]
                gtdb_sp_counter = Counter(gtdb_sp)
                gtdb_common_sp = None
                gtdb_common_sp_count = 0
                for sp, count in gtdb_sp_counter.most_common():
                    if sp != 's__':
                        gtdb_common_sp = sp
                        gtdb_common_sp_count = count
                        break

                most_common_in_use = gtdb_common_sp in names_in_use
                min_req_genomes = 0.5*(sum(gtdb_sp_counter.values()) - gtdb_sp_counter.get('s__', 0))

                # a cluster without any named species has a common count
                # and a required count of zero; it must fall through to a
                # derived name rather than be assigned the name None
                if (gtdb_common_sp is not None
                        and gtdb_common_sp_count >= min_req_genomes
                        and not most_common_in_use):
                    # assign common species if it occurs in >=50% of the clustered genomes,
                    # excluding genomes with no species assignment
                    names_in_use.add(gtdb_common_sp)
                    cluster_sp_names[rid] = gtdb_common_sp
                else:
                    # derive new species name from genus, if possible,
                    # and accession number of representative genome
                    genus = '{unresolved}'
                    if gtdb_common_genus and gtdb_common_genus != 'g__':
                        genus = gtdb_common_genus[3:]

                    acc = rid
                    if rid.startswith('U_'):
                        if rid in gtdb_user_to_genbank:
                            acc = gtdb_user_to_genbank[rid]
                        else:
                            # create accession from GTDB User ID of the form:
                            # U_<number>u.0 which will give 'sp<number>u'
                            acc = 'U_' + rid.replace('U_', '') + 'u.0'

                    derived_sp = 's__' + '%s sp%s' % (genus, acc[acc.rfind('_')+1:acc.rfind('.')])
                    if derived_sp in names_in_use:
                        self.logger.error('Derived species name already in use: %s, %s' % (derived_sp, acc))
                        sys.exit(-1)

                    names_in_use.add(derived_sp)
                    cluster_sp_names[rid] = derived_sp

                fout.write('%s\t%s\t%s\t%d\t%s\t%s\t%s\t%s\t%s\n' % (
                    rid,
                    cluster_sp_names[rid],
                    '; '.join(gtdb_taxonomy[rid]),
                    len(clustered_gids),
                    ', '.join("%s=%r" % (genus, count) for (genus, count) in gtdb_genus_counter.most_common()),
                    ', '.join("%s=%r" % (sp, count) for (sp, count) in gtdb_sp_counter.most_common()),
                    ', '.join("%s=%s" % (sp, sp in names_in_use) for sp, _count in gtdb_sp_counter.most_common()),
                    '%s=%d' % (gtdb_common_sp, gtdb_common_sp_count) if most_common_in_use else 'n/a',
                    ', '.join(clustered_gids)))

        return cluster_sp_names

    def _write_rep_info(self,
                        clusters,
                        cluster_sp_names,
                        quality_metadata,
                        genome_quality,
                        excluded_from_refseq_note,
                        ani_af,
                        output_file):
        """Write out information about selected representative genomes.

        One row is written per representative in clusters: its assigned
        species, assembly and quality statistics, and the mean/minimum
        ANI and AF between the representative and its cluster members
        ('n/a' for a cluster without members).
        """

        with open(output_file, 'w') as fout:
            fout.write('Species\tType genome\tNCBI assembly level\tNCBI genome category')
            fout.write('\tGenome size (bp)\tQuality score\tCompleteness (%)\tContamination (%)\tNo. scaffolds\tNo. contigs\tN50 contigs\tAmbiguous bases\tSSU count\tSSU length (bp)')
            fout.write('\tNo. genomes in cluster\tMean ANI\tMean AF\tMin ANI\tMin AF\tNCBI exclude from RefSeq\n')

            for gid in clusters:
                metadata = quality_metadata[gid]

                fout.write('%s\t%s\t%s\t%s' % (
                    cluster_sp_names[gid],
                    gid,
                    metadata.ncbi_assembly_level,
                    metadata.ncbi_genome_category))

                fout.write('\t%d\t%.2f\t%.2f\t%.2f\t%d\t%d\t%.1f\t%d\t%d\t%d' % (
                    metadata.genome_size,
                    genome_quality[gid],
                    metadata.checkm_completeness,
                    metadata.checkm_contamination,
                    metadata.scaffold_count,
                    metadata.contig_count,
                    metadata.n50_contigs,
                    metadata.ambiguous_bases,
                    metadata.ssu_count,
                    metadata.ssu_length if metadata.ssu_length else 0))

                anis = []
                afs = []
                for member in clusters[gid]:
                    # cluster members built by _cluster_genomes() are
                    # ClusteredGenome tuples, so the genome ID has to be
                    # taken from the 'gid' field; plain IDs are accepted too
                    member_gid = getattr(member, 'gid', member)
                    ani, af = symmetric_ani(ani_af, gid, member_gid)
                    anis.append(ani)
                    afs.append(af)

                if anis:
                    fout.write('\t%d\t%.1f\t%.2f\t%.1f\t%.2f\t%s\n' % (len(clusters[gid]),
                                                                       np_mean(anis), np_mean(afs),
                                                                       min(anis), min(afs),
                                                                       excluded_from_refseq_note.get(gid, '')))
                else:
                    fout.write('\t%d\t%s\t%s\t%s\t%s\t%s\n' % (len(clusters[gid]),
                                                               'n/a', 'n/a', 'n/a', 'n/a',
                                                               excluded_from_refseq_note.get(gid, '')))

    def _gtdb_user_genomes(self, gtdb_user_genomes_file, metadata_file):
        """Get map between GTDB User genomes and GenBank accessions.

        The user genome file is tab-separated with the GenBank accession
        in the first column and the UBA identifier in the fifth. Genomes
        are matched through the '(UBA...)' suffix of their organism name.
        """

        uba_to_genbank = {}
        with open(gtdb_user_genomes_file) as f:
            for line in f:
                line_split = line.strip().split('\t')
                gb_acc = line_split[0]
                uba_id = line_split[4]
                uba_to_genbank[uba_id] = gb_acc

        user_to_genbank = {}
        m = read_gtdb_metadata(metadata_file, ['organism_name'])
        for gid, metadata in m.items():
            if '(UBA' in str(metadata.organism_name):
                # text between the first '(' and the final character
                uba_id = metadata.organism_name[metadata.organism_name.find('(')+1:-1]
                if uba_id in uba_to_genbank:
                    user_to_genbank[gid] = uba_to_genbank[uba_id]

        return user_to_genbank

    def run(self, qc_file,
            metadata_file,
            gtdb_user_genomes_file,
            genome_path_file,
            type_genome_cluster_file,
            type_genome_synonym_file,
            ncbi_refseq_assembly_file,
            ncbi_genbank_assembly_file,
            ani_af_nontype_vs_type,
            species_exception_file,
            rnd_type_genome):
        """Infer de novo species clusters and type genomes for remaining genomes.

        Writes 'gtdb_rep_genome_info.tsv', 'gtdb_clusters_final.tsv' and
        'gtdb_ani_radius_final.tsv' (plus the intermediate files of the
        helper methods) to the output directory.
        """

        # identify genomes failing quality criteria
        self.logger.info('Reading QC file.')
        passed_qc = read_qc_file(qc_file)
        self.logger.info('Identified %d genomes passing QC.' % len(passed_qc))

        # get NCBI taxonomy strings for each genome
        self.logger.info('Reading NCBI taxonomy from GTDB metadata file.')
        ncbi_taxonomy, ncbi_update_count = read_gtdb_ncbi_taxonomy(metadata_file, species_exception_file)
        gtdb_taxonomy = read_gtdb_taxonomy(metadata_file)
        self.logger.info('Read NCBI taxonomy for %d genomes with %d manually defined updates.' % (len(ncbi_taxonomy), ncbi_update_count))
        self.logger.info('Read GTDB taxonomy for %d genomes.' % len(gtdb_taxonomy))

        # parse NCBI assembly files
        self.logger.info('Parsing NCBI assembly files.')
        excluded_from_refseq_note = exclude_from_refseq(ncbi_refseq_assembly_file, ncbi_genbank_assembly_file)

        # get path to genome FASTA files
        self.logger.info('Reading path to genome FASTA files.')
        genome_files = read_genome_path(genome_path_file)
        self.logger.info('Read path for %d genomes.' % len(genome_files))
        # iterate over a copy of the keys since entries are removed
        for gid in set(genome_files):
            if gid not in passed_qc:
                genome_files.pop(gid)
        self.logger.info('Considering %d genomes as potential representatives after removing unwanted User genomes.' % len(genome_files))
        assert(len(genome_files) == len(passed_qc))

        # determine type genomes and genomes clustered to type genomes
        type_species, species_type_gid, type_gids, type_clustered_gids, type_radius = self._parse_type_clusters(type_genome_cluster_file)
        assert(len(type_species) == len(type_gids))
        self.logger.info('Identified %d type genomes.' % len(type_gids))
        self.logger.info('Identified %d clustered genomes.' % len(type_clustered_gids))

        # calculate quality score for genomes
        self.logger.info('Parse quality statistics for all genomes.')
        quality_metadata = read_quality_metadata(metadata_file)

        # calculate genome quality score
        self.logger.info('Calculating genome quality score.')
        genome_quality = quality_score(quality_metadata.keys(), quality_metadata)

        # determine genomes left to be clustered
        unclustered_gids = passed_qc - type_gids - type_clustered_gids
        self.logger.info('Identified %d unclustered genomes passing QC.' % len(unclustered_gids))

        # establish closest type genome for each unclustered genome
        self.logger.info('Determining ANI circumscription for %d unclustered genomes.' % len(unclustered_gids))
        nontype_radius = self._nontype_radius(unclustered_gids, type_gids, ani_af_nontype_vs_type)

        # calculate Mash ANI estimates between unclustered genomes
        self.logger.info('Calculating Mash ANI estimates between unclustered genomes.')
        mash_anis = self._mash_ani_unclustered(genome_files, unclustered_gids)

        # select species representatives genomes in a greedy fashion based on genome quality
        rep_genomes = self._selected_rep_genomes(genome_files,
                                                 nontype_radius,
                                                 unclustered_gids,
                                                 mash_anis,
                                                 quality_metadata,
                                                 rnd_type_genome)

        # cluster all non-type/non-rep genomes to species type/rep genomes
        final_cluster_radius = type_radius.copy()
        final_cluster_radius.update(nontype_radius)

        final_clusters, ani_af = self._cluster_genomes(genome_files,
                                                       rep_genomes,
                                                       type_gids,
                                                       passed_qc,
                                                       final_cluster_radius)
        rep_clusters = {}
        for gid in rep_genomes:
            rep_clusters[gid] = final_clusters[gid]

        # get list of synonyms in order to restrict usage of species names
        synonyms = self._parse_synonyms(type_genome_synonym_file)
        self.logger.info('Identified %d synonyms.' % len(synonyms))

        # determine User genomes with NCBI accession number that may form species names
        gtdb_user_to_genbank = self._gtdb_user_genomes(gtdb_user_genomes_file, metadata_file)
        self.logger.info('Identified %d GTDB User genomes with NCBI accessions.' % len(gtdb_user_to_genbank))

        # assign species names to de novo species clusters
        names_in_use = synonyms.union(type_species)
        self.logger.info('Identified %d species names already in use.' % len(names_in_use))
        self.logger.info('Assigning species name to each de novo species cluster.')
        cluster_sp_names = self._assign_species_names(rep_clusters,
                                                      names_in_use,
                                                      gtdb_taxonomy,
                                                      gtdb_user_to_genbank)

        # write out file with details about selected representative genomes
        self._write_rep_info(rep_clusters,
                             cluster_sp_names,
                             quality_metadata,
                             genome_quality,
                             excluded_from_refseq_note,
                             ani_af,
                             os.path.join(self.output_dir, 'gtdb_rep_genome_info.tsv'))

        # remove genomes that are not representatives of a species cluster and then write out representative ANI radius
        for gid in set(final_cluster_radius) - set(final_clusters):
            del final_cluster_radius[gid]

        # NOTE: all_species is the same dict object as cluster_sp_names,
        # which is therefore extended with the type genome species
        all_species = cluster_sp_names
        all_species.update(species_type_gid)

        self.logger.info('Writing %d species clusters to file.' % len(all_species))
        self.logger.info('Writing %d cluster radius information to file.' % len(final_cluster_radius))

        write_clusters(final_clusters,
                       final_cluster_radius,
                       all_species,
                       os.path.join(self.output_dir, 'gtdb_clusters_final.tsv'))

        write_rep_radius(final_cluster_radius,
                         all_species,
                         os.path.join(self.output_dir, 'gtdb_ani_radius_final.tsv'))
class UpdateErroneousNCBI(object):
    """Identify genomes with erroneous NCBI species assignments."""

    def __init__(self, ani_ncbi_erroneous, ani_cache_file, cpus, output_dir):
        """Initialization.

        ani_ncbi_erroneous: genomes with an ANI to the type strain genome
            of their NCBI species below this value are reported as
            misclassified by identify_misclassified_genomes_ani().
        ani_cache_file, cpus: forwarded to FastANI.
        output_dir: directory receiving the result tables.
        """

        self.output_dir = output_dir
        self.logger = logging.getLogger('timestamp')

        self.ani_ncbi_erroneous = ani_ncbi_erroneous
        self.fastani = FastANI(ani_cache_file, cpus)

    def _genome_to_rep(self, cur_clusters):
        """Map each clustered genome ID to the ID of its representative."""

        gid_to_rid = {}
        for rid, cids in cur_clusters.items():
            for cid in cids:
                gid_to_rid[cid] = rid

        return gid_to_rid

    def _ncbi_species_genomes(self, cur_genomes, forbidden_names):
        """Group genome IDs by NCBI species, skipping unnamed and forbidden species."""

        ncbi_sp_gids = defaultdict(list)
        for gid in cur_genomes:
            ncbi_species = cur_genomes[gid].ncbi_taxa.species
            ncbi_specific = specific_epithet(ncbi_species)

            if ncbi_species != 's__' and ncbi_specific not in forbidden_names:
                ncbi_sp_gids[ncbi_species].append(gid)

        return ncbi_sp_gids

    def identify_misclassified_genomes_ani(self, cur_genomes, cur_clusters):
        """Identify genomes with erroneous NCBI species assignments, based on ANI to type strain genomes.

        A genome is reported when it carries the NCBI species name of a
        representative that is an effective type strain, sits in a
        different cluster, and has an ANI to that representative below
        self.ani_ncbi_erroneous. Results are written to
        'ncbi_misclassified_sp.ani_<threshold>.tsv' in the output directory.

        Returns the set of misclassified genome IDs.
        """

        forbidden_names = {'cyanobacterium'}

        # get mapping from genomes to their representatives
        gid_to_rid = self._genome_to_rep(cur_clusters)

        # get genomes with NCBI species assignment
        ncbi_sp_gids = self._ncbi_species_genomes(cur_genomes, forbidden_names)

        # get NCBI species anchored by a type strain genome
        ncbi_type_anchored_species = {}
        for rid in cur_clusters:
            if cur_genomes[rid].is_effective_type_strain():
                ncbi_type_species = cur_genomes[rid].ncbi_taxa.species
                if ncbi_type_species != 's__':
                    ncbi_type_anchored_species[ncbi_type_species] = rid
        self.logger.info(
            ' - identified {:,} NCBI species anchored by a type strain genome.'
            .format(len(ncbi_type_anchored_species)))

        # identify genomes with erroneous NCBI species assignments
        out_file = os.path.join(
            self.output_dir,
            'ncbi_misclassified_sp.ani_{}.tsv'.format(self.ani_ncbi_erroneous))

        misclassified_gids = set()
        with open(out_file, 'w') as fout:
            fout.write(
                'Genome ID\tNCBI species\tGenome cluster\tType species cluster\tANI to type strain\tAF to type strain\n'
            )

            for idx, (ncbi_species, species_gids) in enumerate(ncbi_sp_gids.items()):
                if ncbi_species not in ncbi_type_anchored_species:
                    continue

                type_rid = ncbi_type_anchored_species[ncbi_species]

                # need to check genomes that have the same NCBI species name
                # as a type strain genome, but reside in a different GTDB
                # species cluster
                gids_to_check = [gid for gid in species_gids
                                 if gid_to_rid[gid] != type_rid]
                if not gids_to_check:
                    continue

                gid_pairs = []
                for gid in gids_to_check:
                    gid_pairs.append((type_rid, gid))
                    gid_pairs.append((gid, type_rid))

                statusStr = '-> Establishing erroneous assignments for {} [ANI pairs: {:,}; {:,} of {:,} species].'.format(
                    ncbi_species,
                    len(gid_pairs),
                    idx + 1,
                    len(ncbi_sp_gids)).ljust(96)
                sys.stdout.write('{}\r'.format(statusStr))
                sys.stdout.flush()

                ani_af = self.fastani.pairs(gid_pairs,
                                            cur_genomes.genomic_files,
                                            report_progress=False,
                                            check_cache=True)

                for gid in gids_to_check:
                    ani, af = symmetric_ani(ani_af, type_rid, gid)
                    if ani < self.ani_ncbi_erroneous:
                        misclassified_gids.add(gid)
                        fout.write('{}\t{}\t{}\t{}\t{:.2f}\t{:.3f}\n'.format(
                            gid,
                            ncbi_species,
                            gid_to_rid[gid],
                            type_rid,
                            ani,
                            af))

            sys.stdout.write('\n')

        misclassified_species = {cur_genomes[gid].ncbi_taxa.species
                                 for gid in misclassified_gids}
        self.logger.info(
            ' - identified {:,} genomes from {:,} species as having misclassified NCBI species assignments.'
            .format(len(misclassified_gids), len(misclassified_species)))

        return misclassified_gids

    def identify_misclassified_genomes_cluster(self, cur_genomes, cur_clusters):
        """Identify genomes with erroneous NCBI species assignments, based on GTDB clustering of type strain genomes.

        A genome is reported when it carries the NCBI species name of an
        effective type strain genome but sits in a different cluster than
        that type strain genome. Results are written to
        'ncbi_misclassified_sp.gtdb_clustering.tsv' in the output
        directory. Exits the program if one NCBI species has effective
        type strain genomes in more than one cluster.

        Returns the set of misclassified genome IDs.
        """

        forbidden_names = {'cyanobacterium'}

        # get mapping from genomes to their representatives
        gid_to_rid = self._genome_to_rep(cur_clusters)

        # get genomes with NCBI species assignment
        ncbi_sp_gids = self._ncbi_species_genomes(cur_genomes, forbidden_names)

        # get NCBI species anchored by a type strain genome
        ncbi_type_anchored_species = {}
        for rid, cids in cur_clusters.items():
            for cid in cids:
                if not cur_genomes[cid].is_effective_type_strain():
                    continue

                ncbi_type_species = cur_genomes[cid].ncbi_taxa.species
                # the forbidden-name filter must look at the species of
                # this type strain genome, not at a species left over
                # from an earlier loop
                ncbi_specific = specific_epithet(ncbi_type_species)
                if ncbi_type_species == 's__' or ncbi_specific in forbidden_names:
                    continue

                if (ncbi_type_species in ncbi_type_anchored_species
                        and rid != ncbi_type_anchored_species[ncbi_type_species]):
                    self.logger.error(
                        'NCBI species {} has multiple effective type strain genomes in different clusters.'
                        .format(ncbi_type_species))
                    sys.exit(-1)

                ncbi_type_anchored_species[ncbi_type_species] = rid
        self.logger.info(
            ' - identified {:,} NCBI species anchored by a type strain genome.'
            .format(len(ncbi_type_anchored_species)))

        # identify genomes with erroneous NCBI species assignments
        out_file = os.path.join(self.output_dir,
                                'ncbi_misclassified_sp.gtdb_clustering.tsv')

        misclassified_gids = set()
        with open(out_file, 'w') as fout:
            fout.write(
                'Genome ID\tNCBI species\tGenome cluster\tType species cluster\n')

            for ncbi_species, species_gids in ncbi_sp_gids.items():
                if ncbi_species not in ncbi_type_anchored_species:
                    continue

                # find genomes with NCBI species assignments that are in a
                # different cluster than the type strain genome
                type_rid = ncbi_type_anchored_species[ncbi_species]
                for gid in species_gids:
                    cur_rid = gid_to_rid[gid]
                    if type_rid != cur_rid:
                        misclassified_gids.add(gid)
                        fout.write('{}\t{}\t{}\t{}\t\n'.format(
                            gid,
                            ncbi_species,
                            cur_rid,
                            type_rid))

            sys.stdout.write('\n')

        misclassified_species = {cur_genomes[gid].ncbi_taxa.species
                                 for gid in misclassified_gids}
        self.logger.info(
            ' - identified {:,} genomes from {:,} species as having misclassified NCBI species assignments.'
            .format(len(misclassified_gids), len(misclassified_species)))

        return misclassified_gids

    def run(self, gtdb_clusters_file,
            cur_gtdb_metadata_file,
            cur_genomic_path_file,
            uba_genome_paths,
            qc_passed_file,
            ncbi_genbank_assembly_file,
            untrustworthy_type_file,
            gtdb_type_strains_ledger,
            sp_priority_ledger,
            genus_priority_ledger,
            dsmz_bacnames_file):
        """Identify genomes with erroneous NCBI species assignments.

        Loads the current genome set and the GTDB species clusters, then
        runs both the ANI-based and the cluster-based identification.
        The sp_priority_ledger, genus_priority_ledger and
        dsmz_bacnames_file arguments are accepted but not used here.
        """

        # create current GTDB genome sets
        self.logger.info('Creating current GTDB genome set.')
        cur_genomes = Genomes()
        cur_genomes.load_from_metadata_file(
            cur_gtdb_metadata_file,
            gtdb_type_strains_ledger=gtdb_type_strains_ledger,
            create_sp_clusters=False,
            uba_genome_file=uba_genome_paths,
            qc_passed_file=qc_passed_file,
            ncbi_genbank_assembly_file=ncbi_genbank_assembly_file,
            untrustworthy_type_ledger=untrustworthy_type_file)
        self.logger.info(
            f' ... current genome set contains {len(cur_genomes):,} genomes.')

        # get path to previous and current genomic FASTA files
        self.logger.info('Reading path to current genomic FASTA files.')
        cur_genomes.load_genomic_file_paths(cur_genomic_path_file)
        cur_genomes.load_genomic_file_paths(uba_genome_paths)

        # read named GTDB species clusters
        self.logger.info(
            'Reading named and previous placeholder GTDB species clusters.')
        cur_clusters, rep_radius = read_clusters(gtdb_clusters_file)
        self.logger.info(
            ' ... identified {:,} clusters spanning {:,} genomes.'.format(
                len(cur_clusters),
                sum([len(gids) + 1 for gids in cur_clusters.values()])))

        # identify genomes with erroneous NCBI species assignments
        self.logger.info(
            'Identifying genomes with erroneous NCBI species assignments as established by ANI type strain genomes.'
        )
        self.identify_misclassified_genomes_ani(cur_genomes, cur_clusters)

        self.logger.info(
            'Identifying genomes with erroneous NCBI species assignments as established by GTDB cluster of type strain genomes.'
        )
        self.identify_misclassified_genomes_cluster(cur_genomes, cur_clusters)
def write_synonym_table(self,
                        type_strain_synonyms,
                        consensus_synonyms,
                        ani_af,
                        sp_priority_ledger,
                        genus_priority_ledger,
                        lpsn_gss_file):
    """Create table indicating species names that should be considered synonyms.

    Writes 'synonyms.tsv' to self.output_dir with one row per
    (representative, synonym genome) pair, first for the type strain
    synonyms and then for the majority vote synonyms. Each row holds
    details of the representative, the same details for the synonym
    genome, their ANI and AF, and an optional warning when the
    representative does not appear to have naming priority.

    type_strain_synonyms, consensus_synonyms: dicts mapping a
        representative genome ID to an iterable of synonym genome IDs.
    ani_af: ANI/AF results passed to FastANI.symmetric_ani().
    sp_priority_ledger, genus_priority_ledger, lpsn_gss_file: forwarded
        to SpeciesPriorityManager.
    """

    sp_priority_mngr = SpeciesPriorityManager(sp_priority_ledger,
                                              genus_priority_ledger,
                                              lpsn_gss_file,
                                              self.output_dir)

    out_file = os.path.join(self.output_dir, 'synonyms.tsv')

    incorrect_priority = 0
    failed_type_strain_priority = 0

    # the context manager guarantees the table is flushed and closed,
    # also when a lookup below raises
    with open(out_file, 'w') as fout:
        fout.write(
            'Synonym type\tNCBI species\tGTDB representative\tStrain IDs\tType sources\tPriority year'
        )
        fout.write('\tGTDB type species\tGTDB type strain\tNCBI assembly type')
        fout.write(
            '\tNCBI synonym\tHighest-quality synonym genome\tSynonym strain IDs\tSynonym type sources\tSynonym priority year'
        )
        fout.write(
            '\tSynonym GTDB type species\tSynonym GTDB type strain\tSynonym NCBI assembly type'
        )
        fout.write('\tANI\tAF\tWarnings\n')

        for synonyms, synonym_type in [
                (type_strain_synonyms, 'TYPE_STRAIN_SYNONYM'),
                (consensus_synonyms, 'MAJORITY_VOTE_SYNONYM')]:
            for rid, synonym_ids in synonyms.items():
                rep_genome = self.cur_genomes[rid]

                for gid in synonym_ids:
                    syn_genome = self.cur_genomes[gid]
                    ani, af = FastANI.symmetric_ani(ani_af, rid, gid)

                    # details of the representative genome
                    fout.write(synonym_type)
                    fout.write('\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}'.format(
                        rep_genome.ncbi_taxa.species,
                        rid,
                        ','.join(sorted(rep_genome.strain_ids())),
                        ','.join(sorted(rep_genome.gtdb_type_sources())
                                 ).upper().replace('STRAININFO', 'StrainInfo'),
                        sp_priority_mngr.species_priority_year(
                            self.cur_genomes, rid),
                        rep_genome.is_gtdb_type_species(),
                        rep_genome.is_gtdb_type_strain(),
                        rep_genome.ncbi_type_material))

                    # details of the synonym genome
                    synonym_priority_year = sp_priority_mngr.species_priority_year(
                        self.cur_genomes, gid)
                    if synonym_priority_year == Genome.NO_PRIORITY_YEAR:
                        synonym_priority_year = 'n/a'

                    fout.write('\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}'.format(
                        syn_genome.ncbi_taxa.species,
                        gid,
                        ','.join(sorted(syn_genome.strain_ids())),
                        ','.join(sorted(syn_genome.gtdb_type_sources())
                                 ).upper().replace('STRAININFO', 'StrainInfo'),
                        synonym_priority_year,
                        syn_genome.is_gtdb_type_species(),
                        syn_genome.is_gtdb_type_strain(),
                        syn_genome.ncbi_type_material))

                    fout.write('\t{:.3f}\t{:.4f}'.format(ani, af))

                    # flag rows where the representative does not appear
                    # to be the genome that should hold naming priority
                    if (rep_genome.is_effective_type_strain()
                            and syn_genome.is_effective_type_strain()):
                        priority_gid, note = sp_priority_mngr.species_priority(
                            self.cur_genomes, rid, gid)
                        if priority_gid != rid:
                            incorrect_priority += 1
                            fout.write('\tIncorrect priority: {}'.format(note))
                    elif (not rep_genome.is_gtdb_type_strain()
                          and syn_genome.is_gtdb_type_strain()):
                        failed_type_strain_priority += 1
                        fout.write(
                            '\tFailed to prioritize type strain of species')

                    fout.write('\n')

    if incorrect_priority:
        self.logger.warning(
            f' - identified {incorrect_priority:,} synonyms with incorrect priority.'
        )

    if failed_type_strain_priority:
        self.logger.warning(
            f' - identified {failed_type_strain_priority:,} synonyms that failed to priotize the type strain of the species.'
        )