def run(self, metadata_file, gtdb_user_genomes_file, gtdb_user_reps, ncbi_refseq_assembly_file, ncbi_genbank_assembly_file, gtdb_domain_report, qc_exception_file, species_exception_file, min_comp, max_cont, min_quality, sh_exception, min_perc_markers, max_contigs, min_N50, max_ambiguous, output_dir):
    """Quality check all potential GTDB genomes.

    Applies the QC criteria (completeness, contamination, quality score,
    marker percentage, contig count, N50, ambiguous bases) to every genome,
    writing 'qc_passed.tsv' and 'qc_failed.tsv' to output_dir. It then
    re-examines each named NCBI species to flag species whose type genomes
    (or all genomes) fail QC, writing 'type_genomes_fail_qc.tsv',
    'species_fail_qc.tsv', and 'species_lost.tsv'.
    """

    # get GTDB and NCBI taxonomy strings for each genome
    self.logger.info('Reading NCBI taxonomy from GTDB metadata file.')
    ncbi_taxonomy, ncbi_update_count = read_gtdb_ncbi_taxonomy(metadata_file, species_exception_file)
    ncbi_species = binomial_species(ncbi_taxonomy)
    gtdb_taxonomy = read_gtdb_taxonomy(metadata_file)
    self.logger.info('Read NCBI taxonomy for %d genomes with %d manually defined updates.' % (len(ncbi_taxonomy), ncbi_update_count))
    self.logger.info('Read GTDB taxonomy for %d genomes.' % len(gtdb_taxonomy))

    # determine User genomes to retain for consideration
    gtdb_user_to_genbank = self._gtdb_user_genomes(gtdb_user_genomes_file, metadata_file)
    self.logger.info('Identified %d GTDB User genomes with GenBank accessions to retain for potential inclusion in GTDB.' % len(gtdb_user_to_genbank))

    user_genomes = 0
    for line in open(gtdb_user_reps):
        line_split = line.strip().split('\t')
        gid, taxonomy = line_split
        if gid not in gtdb_user_to_genbank:
            # User representatives without a GenBank accession: bacterial
            # genomes are skipped, archaeal genomes are retained under
            # their User ID
            if 'd__Bacteria' in taxonomy:
                self.logger.warning('Bacterial genome %s has no NCBI accession and is being skipped.' % gid)
            else:
                gtdb_user_to_genbank[gid] = gid
                user_genomes += 1
    self.logger.info('Identified %d archaeal GTDB User genome WITHOUT GenBank accessions to retain for potential inclusion in GTDB.' % user_genomes)

    # parse genomes flagged as exceptions from QC (first column of file)
    qc_exceptions = set()
    for line in open(qc_exception_file):
        qc_exceptions.add(line.split('\t')[0].strip())
    self.logger.info('Identified %d genomes flagged as exceptions from QC.' % len(qc_exceptions))

    # calculate quality score for genomes
    self.logger.info('Parsing QC statistics for each genome.')
    quality_metadata = read_gtdb_metadata(metadata_file, ['checkm_completeness',
                                                         'checkm_contamination',
                                                         'checkm_strain_heterogeneity_100',
                                                         'contig_count',
                                                         'n50_contigs',
                                                         'ambiguous_bases',
                                                         'genome_size'])
    marker_perc = parse_marker_percentages(gtdb_domain_report)

    # parse NCBI assembly files
    self.logger.info('Parsing NCBI assembly files.')
    excluded_from_refseq_note = exclude_from_refseq(ncbi_refseq_assembly_file, ncbi_genbank_assembly_file)

    # get type material designations for each genome
    self.logger.info('Reading type material designations for genomes from GTDB metadata file.')
    type_metadata = read_gtdb_metadata(metadata_file, ['ncbi_type_material_designation',
                                                      'gtdb_type_designation',
                                                      'gtdb_type_designation_sources'])
    ncbi_tsp = ncbi_type_strain_of_species(type_metadata)
    gtdb_tsp = gtdb_type_strain_of_species(type_metadata)

    # QC all genomes
    self.logger.info('Validating genomes.')
    fout_retained = open(os.path.join(output_dir, 'qc_passed.tsv'), 'w')
    fout_failed = open(os.path.join(output_dir, 'qc_failed.tsv'), 'w')

    header = 'Accession\tNCBI species'
    header += '\tCompleteness (%)\tContamination (%)\tQuality\tStrain heterogeneity at 100%'
    header += '\tMarkers (%)\tNo. contigs\tN50 contigs\tAmbiguous bases'
    fout_retained.write(header + '\tNote\n')
    fout_failed.write(header)
    fout_failed.write('\tFailed completeness\tFailed contamination\tFailed quality')
    fout_failed.write('\tFailed marker percentage\tFailed no. contigs\tFailed N50 contigs\tFailed ambiguous bases\n')

    num_retained = 0
    num_filtered = 0
    for gid in quality_metadata:
        if gid.startswith('U_') and gid not in gtdb_user_to_genbank:
            # skip user genomes not marked for retention
            continue

        # pass_qc mutates failed_tests with a per-criterion failure tally
        failed_tests = defaultdict(int)
        passed_qc = pass_qc(quality_metadata[gid],
                            marker_perc[gid],
                            min_comp,
                            max_cont,
                            min_quality,
                            sh_exception,
                            min_perc_markers,
                            max_contigs,
                            min_N50,
                            max_ambiguous,
                            failed_tests)

        if passed_qc or gid in qc_exceptions:
            num_retained += 1
            # NOTE: index 6 of the taxonomy list is the species rank
            fout_retained.write('%s\t%s' % (gid, ncbi_taxonomy[gid][6]))
            fout_retained.write('\t%.2f\t%.2f\t%.2f\t%s\t%.2f\t%d\t%d\t%d\t%s\n' % (
                quality_metadata[gid].checkm_completeness,
                quality_metadata[gid].checkm_contamination,
                quality_metadata[gid].checkm_completeness-5*quality_metadata[gid].checkm_contamination,
                ('%.2f' % quality_metadata[gid].checkm_strain_heterogeneity_100) if quality_metadata[gid].checkm_strain_heterogeneity_100 else '-',
                marker_perc[gid],
                quality_metadata[gid].contig_count,
                quality_metadata[gid].n50_contigs,
                quality_metadata[gid].ambiguous_bases,
                'Passed QC' if passed_qc else 'Flagged as exception'))
        else:
            num_filtered += 1
            fout_failed.write('%s\t%s' % (gid, ncbi_taxonomy[gid][6]))
            fout_failed.write('\t%.2f\t%.2f\t%.2f\t%s\t%.2f\t%d\t%d\t%d' % (
                quality_metadata[gid].checkm_completeness,
                quality_metadata[gid].checkm_contamination,
                quality_metadata[gid].checkm_completeness-5*quality_metadata[gid].checkm_contamination,
                ('%.2f' % quality_metadata[gid].checkm_strain_heterogeneity_100) if quality_metadata[gid].checkm_strain_heterogeneity_100 else '-',
                marker_perc[gid],
                quality_metadata[gid].contig_count,
                quality_metadata[gid].n50_contigs,
                quality_metadata[gid].ambiguous_bases))
            fout_failed.write('\t%d\t%d\t%d\t%d\t%d\t%d\t%d\n' % (
                failed_tests['comp'],
                failed_tests['cont'],
                failed_tests['qual'],
                failed_tests['marker_perc'],
                failed_tests['contig_count'],
                failed_tests['N50'],
                failed_tests['ambig']))
    fout_retained.close()
    fout_failed.close()

    self.logger.info('Retained %d genomes and filtered %d genomes.' % (num_retained, num_filtered))

    # QC genomes in each named species
    self.logger.info('Performing QC of type genome for each of the %d NCBI species.' % len(ncbi_species))

    fout_type_fail = open(os.path.join(output_dir, 'type_genomes_fail_qc.tsv'), 'w')
    fout_type_fail.write('Species\tAccession\tGTDB taxonomy\tNCBI taxonomy\tType sources\tNCBI assembly type\tGenome size (bp)')
    fout_type_fail.write('\tCompleteness (%)\tContamination (%)\tQuality\tStrain heterogeneity at 100%')
    fout_type_fail.write('\tMarkers (%)\tNo. contigs\tN50 contigs\tAmbiguous bases\tNCBI exclude from RefSeq\tLost species\n')

    fout_fail_sp = open(os.path.join(output_dir, 'species_fail_qc.tsv'), 'w')
    fout_fail_sp.write('Species\tAccession\tGTDB taxonomy\tNCBI taxonomy\tAssembled from type material\tGenome size (bp)')
    fout_fail_sp.write('\tCompleteness (%)\tContamination (%)\tQuality\tStrain heterogeneity at 100%')
    fout_fail_sp.write('\tMarkers (%)\tNo. contigs\tN50 contigs\tAmbiguous bases')
    fout_fail_sp.write('\tFailed completeness\tFailed contamination\tFailed quality')
    fout_fail_sp.write('\tFailed marker percentage\tFailed no. contigs\tFailed N50 contigs\tFailed ambiguous bases')
    fout_fail_sp.write('\tNCBI exclude from RefSeq\n')

    fout_sp_lost = open(os.path.join(output_dir, 'species_lost.tsv'), 'w')
    fout_sp_lost.write('Species\tNo. genomes\tNo. type genomes')
    fout_sp_lost.write('\tFail completeness\tFail contamination\tFail quality\tFailed percent markers')
    fout_sp_lost.write('\tFail no. contigs\tFail N50 contigs\tFail ambiguous bases\n')

    lost_type = 0
    lost_sp = 0
    filtered_genomes = 0
    failed_tests_cumulative = defaultdict(int)
    for sp, gids in ncbi_species.items():
        # partition genomes of this species into type/non-type and pass/fail
        type_pass = set()
        type_fail = set()
        other_pass = set()
        other_fail = set()

        failed_tests_gids = {}
        for gid in gids:
            failed_tests = defaultdict(int)
            passed_qc = pass_qc(quality_metadata[gid],
                                marker_perc[gid],
                                min_comp,
                                max_cont,
                                min_quality,
                                sh_exception,
                                min_perc_markers,
                                max_contigs,
                                min_N50,
                                max_ambiguous,
                                failed_tests)
            failed_tests_gids[gid] = failed_tests

            if gid in gtdb_tsp or gid in ncbi_tsp:
                if passed_qc:
                    type_pass.add(gid)
                else:
                    type_fail.add(gid)
                    filtered_genomes += 1
            else:
                if passed_qc:
                    other_pass.add(gid)
                else:
                    other_fail.add(gid)
                    filtered_genomes += 1

            # tally per-criterion failures across all genomes
            for test, count in failed_tests.items():
                failed_tests_cumulative[test] += count

        if len(type_pass) >= 1:
            # great: one or more type genomes pass QC and will be selected as the type genome
            continue

        if len(type_fail):
            # all potential type genomes for species failed QC so report these for manual inspection
            lost_type += 1
            for gid in type_fail:
                fout_type_fail.write('%s\t%s\t%s\t%s\t%s\t%s\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%d\t%d\t%d\t%s\t%s\n' % (
                    sp,
                    gid,
                    '; '.join(gtdb_taxonomy[gid]),
                    '; '.join(ncbi_taxonomy[gid]),
                    type_metadata[gid].gtdb_type_designation_sources,
                    type_metadata[gid].ncbi_type_material_designation,
                    float(quality_metadata[gid].genome_size)/1e6,
                    quality_metadata[gid].checkm_completeness,
                    quality_metadata[gid].checkm_contamination,
                    quality_metadata[gid].checkm_completeness-5*quality_metadata[gid].checkm_contamination,
                    quality_metadata[gid].checkm_strain_heterogeneity_100,
                    marker_perc[gid],
                    quality_metadata[gid].contig_count,
                    quality_metadata[gid].n50_contigs,
                    quality_metadata[gid].ambiguous_bases,
                    excluded_from_refseq_note[gid],
                    len(other_pass) == 0))

        if len(other_pass) == 0:
            # no genomes for species pass QC so report loss of species
            lost_sp += 1
            fout_sp_lost.write('%s\t%d\t%d' % (sp, len(gids), len(type_fail)))
            fout_sp_lost.write('\t%d\t%d\t%d\t%d\t%d\t%d\t%d\n' % (
                sum([failed_tests_gids[gid]['comp'] for gid in gids]),
                sum([failed_tests_gids[gid]['cont'] for gid in gids]),
                sum([failed_tests_gids[gid]['qual'] for gid in gids]),
                sum([failed_tests_gids[gid]['marker_perc'] for gid in gids]),
                sum([failed_tests_gids[gid]['contig_count'] for gid in gids]),
                sum([failed_tests_gids[gid]['N50'] for gid in gids]),
                sum([failed_tests_gids[gid]['ambig'] for gid in gids])))

        for gid in type_fail.union(other_fail):
            fout_fail_sp.write('%s\t%s\t%s\t%s\t%s\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%d\t%d\t%d' % (
                sp,
                gid,
                '; '.join(gtdb_taxonomy[gid]),
                '; '.join(ncbi_taxonomy[gid]),
                gid in type_fail,
                float(quality_metadata[gid].genome_size)/1e6,
                quality_metadata[gid].checkm_completeness,
                quality_metadata[gid].checkm_contamination,
                quality_metadata[gid].checkm_completeness-5*quality_metadata[gid].checkm_contamination,
                quality_metadata[gid].checkm_strain_heterogeneity_100,
                marker_perc[gid],
                quality_metadata[gid].contig_count,
                quality_metadata[gid].n50_contigs,
                quality_metadata[gid].ambiguous_bases))
            fout_fail_sp.write('\t%d\t%d\t%d\t%d\t%d\t%d\t%d' % (
                failed_tests_gids[gid]['comp'],
                failed_tests_gids[gid]['cont'],
                failed_tests_gids[gid]['qual'],
                failed_tests_gids[gid]['marker_perc'],
                failed_tests_gids[gid]['contig_count'],
                failed_tests_gids[gid]['N50'],
                failed_tests_gids[gid]['ambig']))
            fout_fail_sp.write('\t%s\n' % excluded_from_refseq_note[gid])

    fout_type_fail.close()
    fout_fail_sp.close()
    fout_sp_lost.close()

    self.logger.info('Genomes filtered for each criterion:')
    for test in sorted(failed_tests_cumulative):
        self.logger.info('%s: %d' % (test, failed_tests_cumulative[test]))
    self.logger.info('Filtered %d genomes assigned to NCBI species.' % filtered_genomes)
    self.logger.info('Identified %d species with type genomes failing QC and %d total species failing QC.' % (lost_type, lost_sp))
def run(self, qc_file, gtdb_metadata_file, gtdb_final_clusters, output_dir):
    """Quality check all potential GTDB genomes.

    Sanity checks that every genome defining or belonging to a species
    cluster passed QC, reports incongruent specific/genus names, and writes
    the canonical and validation taxonomy/GID files for the bacterial and
    archaeal trees to output_dir.

    Parameters
    ----------
    qc_file : str
        File listing genomes that passed QC.
    gtdb_metadata_file : str
        GTDB metadata with NCBI/GTDB taxonomy and quality statistics.
    gtdb_final_clusters : str
        File with final species clusters.
    output_dir : str
        Directory for the taxonomy and GID list files.
    """

    # identify genomes failing quality criteria
    self.logger.info('Reading QC file.')
    passed_qc = read_qc_file(qc_file)
    self.logger.info('Identified %d genomes passing QC.' % len(passed_qc))

    # get GTDB and NCBI taxonomy strings for each genome
    self.logger.info(
        'Reading NCBI and GTDB taxonomy from GTDB metadata file.')
    ncbi_taxonomy = read_gtdb_ncbi_taxonomy(gtdb_metadata_file)
    prev_gtdb_taxonomy = read_gtdb_taxonomy(gtdb_metadata_file)
    self.logger.info('Read NCBI taxonomy for %d genomes.' % len(ncbi_taxonomy))
    self.logger.info('Read GTDB taxonomy for %d genomes.' % len(prev_gtdb_taxonomy))

    # get GTDB metadata
    type_metadata = read_gtdb_metadata(gtdb_metadata_file, [
        'gtdb_type_designation',
        'gtdb_type_designation_sources',
        'gtdb_type_species_of_genus'
    ])
    quality_metadata = read_quality_metadata(gtdb_metadata_file)

    # read species clusters
    sp_clusters, species = read_clusters(gtdb_final_clusters)
    self.logger.info('Read %d species clusters.' % len(sp_clusters))

    # sanity check species clusters all defined by genomes passing QC
    for gid in sp_clusters:
        if gid not in passed_qc:
            self.logger.error(
                'Genome %s defines a species cluster, but fails QC.' % gid)
            sys.exit(-1)

    # modify GTDB taxonomy to reflect new species clustering and report incongruencies
    self.logger.info(
        'Identifying species with incongruent specific names.')
    self._incongruent_specific_names(species, ncbi_taxonomy, prev_gtdb_taxonomy, type_metadata, output_dir)
    self._incongruent_genus_names(species, ncbi_taxonomy, prev_gtdb_taxonomy, type_metadata, output_dir)

    # get GIDs for canonical and validation trees
    fout_bac_can_gtdb = open(
        os.path.join(output_dir, 'bac_can_taxonomy.tsv'), 'w')
    fout_bac_val_gtdb = open(
        os.path.join(output_dir, 'bac_val_taxonomy.tsv'), 'w')
    fout_ar_can_gtdb = open(
        os.path.join(output_dir, 'ar_can_taxonomy.tsv'), 'w')
    fout_ar_val_gtdb = open(
        os.path.join(output_dir, 'ar_val_taxonomy.tsv'), 'w')

    fout_bac_val = open(
        os.path.join(output_dir, 'gids_bac_validation.lst'), 'w')
    fout_ar_val = open(os.path.join(output_dir, 'gids_ar_validation.lst'), 'w')
    fout_bac_can = open(os.path.join(output_dir, 'gids_bac_canonical.lst'), 'w')
    fout_ar_can = open(os.path.join(output_dir, 'gids_ar_canonical.lst'), 'w')
    fout_bac_val.write('#Accession\tSpecies\tNote\n')
    fout_ar_val.write('#Accession\tSpecies\tNote\n')
    fout_bac_can.write('#Accession\tSpecies\tNote\n')
    fout_ar_can.write('#Accession\tSpecies\tNote\n')

    for rid in sp_clusters:
        # route output to the domain-specific files
        domain = prev_gtdb_taxonomy[rid][0]
        if domain == 'd__Bacteria':
            fout_val = fout_bac_val
            fout_can = fout_bac_can
            fout_can_gtdb = fout_bac_can_gtdb
            fout_val_gtdb = fout_bac_val_gtdb
        elif domain == 'd__Archaea':
            fout_val = fout_ar_val
            fout_can = fout_ar_can
            fout_can_gtdb = fout_ar_can_gtdb
            fout_val_gtdb = fout_ar_val_gtdb
        else:
            self.logger.error('Genome %s has no GTDB domain assignment.' % rid)
            sys.exit(-1)

        # substitute proposed species name into GTDB taxonomy
        sp = species[rid]
        canonical_sp = parse_canonical_sp(sp)
        taxa = prev_gtdb_taxonomy[rid][0:6] + [canonical_sp]
        new_gtdb_str = '; '.join(taxa)

        fout_can_gtdb.write('%s\t%s\n' % (rid, new_gtdb_str))
        fout_val_gtdb.write('%s\t%s\n' % (rid, new_gtdb_str))
        fout_val.write('%s\t%s\t%s\n' % (rid, sp, 'GTDB type or representative genome'))
        fout_can.write('%s\t%s\t%s\n' % (rid, sp, 'GTDB type or representative genome'))

        cluster_gids = set(sp_clusters[rid])
        for gid in cluster_gids:
            if gid not in passed_qc:
                self.logger.error(
                    'Genome %s is in a species cluster, but fails QC.' % gid)
                sys.exit(-1)

        if len(cluster_gids) > 0:
            # select highest-quality genome
            q = quality_score(cluster_gids, quality_metadata)
            gid = max(q.items(), key=operator.itemgetter(1))[0]

            # Fix: write the taxonomy of the *selected* genome, using its own
            # higher-rank taxa rather than the representative's (matches the
            # revised version of this method which recomputes the taxa for gid)
            taxa = prev_gtdb_taxonomy[gid][0:6] + [canonical_sp]
            new_gtdb_str = '; '.join(taxa)

            fout_val.write(
                '%s\t%s\t%s\n' %
                (gid, sp, 'selected highest-quality genome (Q=%.2f)' % q[gid]))
            fout_val_gtdb.write('%s\t%s\n' % (gid, new_gtdb_str))

    fout_bac_val.close()
    fout_ar_val.close()
    fout_bac_can.close()
    fout_ar_can.close()
    fout_bac_can_gtdb.close()
    fout_bac_val_gtdb.close()
    fout_ar_can_gtdb.close()
    fout_ar_val_gtdb.close()
def run(self, qc_file, gtdb_metadata_file, gtdb_final_clusters, species_exception_file, output_dir):
    """Quality check all potential GTDB genomes.

    Sanity checks that every genome defining or belonging to a species
    cluster passed QC, reports incongruent specific/genus names, and writes
    canonical and validation taxonomy/GID files for the bacterial and
    archaeal trees to output_dir.
    """

    # identify genomes failing quality criteria
    self.logger.info('Reading QC file.')
    passed_qc = read_qc_file(qc_file)
    self.logger.info('Identified %d genomes passing QC.' % len(passed_qc))

    # get GTDB and NCBI taxonomy strings for each genome
    self.logger.info('Reading NCBI and GTDB taxonomy from GTDB metadata file.')
    ncbi_taxonomy, ncbi_update_count = read_gtdb_ncbi_taxonomy(gtdb_metadata_file, species_exception_file)
    prev_gtdb_taxonomy = read_gtdb_taxonomy(gtdb_metadata_file)
    self.logger.info('Read NCBI taxonomy for %d genomes with %d manually defined updates.' % (len(ncbi_taxonomy), ncbi_update_count))
    self.logger.info('Read GTDB taxonomy for %d genomes.' % len(prev_gtdb_taxonomy))

    # get GTDB metadata
    type_metadata = read_gtdb_metadata(gtdb_metadata_file,
                                       ['gtdb_type_designation',
                                        'gtdb_type_designation_sources',
                                        'gtdb_type_species_of_genus'])
    quality_metadata = read_quality_metadata(gtdb_metadata_file)

    # read species clusters (representative radius is unused here)
    sp_clusters, species, _rep_radius = read_clusters(gtdb_final_clusters)
    self.logger.info('Read %d species clusters.' % len(sp_clusters))

    # sanity check species clusters all defined by genomes passing QC
    for gid in sp_clusters:
        if gid not in passed_qc:
            self.logger.error('Genome %s defines a species cluster, but fails QC.' % gid)
            sys.exit(-1)

    # modify GTDB taxonomy to reflect new species clustering and report incongruencies
    self.logger.info('Identifying species with incongruent specific names.')
    self._incongruent_specific_names(species, ncbi_taxonomy, prev_gtdb_taxonomy, type_metadata, output_dir)
    self._incongruent_genus_names(species, ncbi_taxonomy, prev_gtdb_taxonomy, type_metadata, output_dir)

    # get GIDs for canonical and validation trees
    fout_bac_can_gtdb = open(os.path.join(output_dir, 'bac_can_taxonomy.tsv'), 'w')
    fout_bac_val_gtdb = open(os.path.join(output_dir, 'bac_val_taxonomy.tsv'), 'w')
    fout_ar_can_gtdb = open(os.path.join(output_dir, 'ar_can_taxonomy.tsv'), 'w')
    fout_ar_val_gtdb = open(os.path.join(output_dir, 'ar_val_taxonomy.tsv'), 'w')

    fout_bac_val = open(os.path.join(output_dir, 'gids_bac_validation.lst'), 'w')
    fout_ar_val = open(os.path.join(output_dir, 'gids_ar_validation.lst'), 'w')
    fout_bac_can = open(os.path.join(output_dir, 'gids_bac_canonical.lst'), 'w')
    fout_ar_can = open(os.path.join(output_dir, 'gids_ar_canonical.lst'), 'w')
    fout_bac_val.write('#Accession\tSpecies\tNote\n')
    fout_ar_val.write('#Accession\tSpecies\tNote\n')
    fout_bac_can.write('#Accession\tSpecies\tNote\n')
    fout_ar_can.write('#Accession\tSpecies\tNote\n')

    for rid in sp_clusters:
        # route output to the domain-specific files
        domain = prev_gtdb_taxonomy[rid][0]
        if domain == 'd__Bacteria':
            fout_val = fout_bac_val
            fout_can = fout_bac_can
            fout_can_gtdb = fout_bac_can_gtdb
            fout_val_gtdb = fout_bac_val_gtdb
        elif domain == 'd__Archaea':
            fout_val = fout_ar_val
            fout_can = fout_ar_can
            fout_can_gtdb = fout_ar_can_gtdb
            fout_val_gtdb = fout_ar_val_gtdb
        else:
            self.logger.error('Genome %s has no GTDB domain assignment.' % rid)
            sys.exit(-1)

        # substitute proposed species name into GTDB taxonomy
        sp = species[rid]
        canonical_sp = parse_canonical_sp(sp)
        taxa = prev_gtdb_taxonomy[rid][0:6] + [canonical_sp]
        new_gtdb_str = '; '.join(taxa)

        fout_can_gtdb.write('%s\t%s\n' % (rid, new_gtdb_str))
        fout_val_gtdb.write('%s\t%s\n' % (rid, new_gtdb_str))
        fout_val.write('%s\t%s\t%s\n' % (rid, sp, 'GTDB type or representative genome'))
        fout_can.write('%s\t%s\t%s\n' % (rid, sp, 'GTDB type or representative genome'))

        cluster_gids = set(sp_clusters[rid])
        for gid in cluster_gids:
            if gid not in passed_qc:
                self.logger.error('Genome %s is in a species cluster, but fails QC.' % gid)
                sys.exit(-1)

        if len(cluster_gids) > 0:
            # select highest-quality genome
            q = quality_score(cluster_gids, quality_metadata)
            gid = max(q.items(), key=operator.itemgetter(1))[0]

            # write the selected genome's own higher-rank taxa with the
            # proposed canonical species name
            taxa = prev_gtdb_taxonomy[gid][0:6] + [canonical_sp]
            new_gtdb_str = '; '.join(taxa)

            fout_val.write('%s\t%s\t%s\n' % (gid, sp, 'selected highest-quality genome (Q=%.2f)' % q[gid]))
            fout_val_gtdb.write('%s\t%s\n' % (gid, new_gtdb_str))

    fout_bac_val.close()
    fout_ar_val.close()
    fout_bac_can.close()
    fout_ar_can.close()
    fout_bac_can_gtdb.close()
    fout_bac_val_gtdb.close()
    fout_ar_can_gtdb.close()
    fout_ar_val_gtdb.close()
def representatives(self, species_derep_file, metadata_file, prev_rep_file, mash_pairwise_file, trusted_user_file, min_rep_comp, max_rep_cont, min_quality, max_contigs, min_N50, max_ambiguous, max_gap_length, output_file):
    """Identify additional representatives.

    Additional representatives are selected in a greedy fashion,
    by ordering genomes according to database source and
    estimated genome quality. A slight quality boost is given to
    genomes that were previously selected as a representative in
    order to try and retain more stability between releases.
    Genomes only added as a new representative if they cannot be
    clustered with an existing representative. Clustering is based
    on a conservative Mash distance threshold that reflects the
    95% ANI species criteria.

    Parameters
    ----------
    species_derep_file : str
        File listing selected representatives from named species.
    metadata_file : str
        Metadata, including CheckM estimates, for all genomes.
    prev_rep_file : str
        File indicating previous representatives to favour during selection.
    trusted_user_file : str
        File listing trusted User genomes that should be treated as if they are in GenBank.
    mash_pairwise_file : str
        File with pairwise Mash distances.
    min_rep_comp : float [0, 100]
        Minimum completeness for a genome to be a representative.
    max_rep_cont : float [0, 100]
        Maximum contamination for a genome to be a representative.
    min_quality : float [0, 100]
        Minimum quality (comp - 5*cont) for a genome to be a representative.
    max_contigs : int
        Maximum number of contigs for a genome to be a representative.
    min_N50 : int
        Minimum N50 of scaffolds for a genome to be a representative.
    max_ambiguous : int
        Maximum number of ambiguous bases within contigs for a genome to be a representative.
    max_gap_length : int
        Maximum number of ambiguous bases between contigs for a genome to be a representative.
    output_file : str
        Output file containing all genomes identified as representatives.
    """

    # read previous representatives and trusted user genomes
    prev_gtdb_reps = self._read_genome_list(prev_rep_file)
    trusted_user_genomes = self._read_genome_list(trusted_user_file)
    self.logger.info('Identified %d trusted User genomes.' % len(trusted_user_genomes))
    self.logger.info('Identified %d previous GTDB representatives.' % len(prev_gtdb_reps))

    # get genome and assembly quality
    genome_stats = self._genome_stats(metadata_file)

    # read initial representatives (first column of file; '#' lines are comments)
    init_rep_genomes = set()
    for line in open(species_derep_file):
        if line[0] == '#':
            continue

        genome_id = line.strip().split('\t')[0]
        init_rep_genomes.add(genome_id)
    self.logger.info('Identified %d initial representatives.' % len(init_rep_genomes))

    # remove existing representative genomes and genomes
    # of insufficient quality to be a representative
    genome_quality = {}
    potential_reps = set()
    for genome_id, stats in genome_stats.items():
        if genome_id in init_rep_genomes:
            continue

        # untrusted User genomes are never considered
        if genome_id.startswith('U_') and genome_id not in trusted_user_genomes:
            continue

        if (stats.checkm_completeness >= min_rep_comp
                and stats.checkm_contamination <= max_rep_cont
                and (stats.checkm_completeness - 5*stats.checkm_contamination) >= min_quality
                and stats.contig_count <= max_contigs
                and stats.n50_scaffolds >= min_N50
                and stats.ambiguous_bases <= max_ambiguous
                and stats.total_gap_length <= max_gap_length):
            potential_reps.add(genome_id)
            # quality score: completeness penalized by 5x contamination
            genome_quality[genome_id] = stats.checkm_completeness - 5*stats.checkm_contamination

    # perform greedy identification of new representatives
    ordered_genomes = self._order_genomes(potential_reps, genome_quality, trusted_user_genomes, prev_gtdb_reps)
    info = (('Comparing %d genomes to %d initial representatives.')
            % (len(ordered_genomes), len(init_rep_genomes)))
    self.logger.info(info)
    gtdb_taxonomy = read_gtdb_taxonomy(metadata_file)
    ncbi_taxonomy = read_gtdb_ncbi_taxonomy(metadata_file)
    representatives = self._greedy_representatives(init_rep_genomes,
                                                   ordered_genomes,
                                                   gtdb_taxonomy,
                                                   ncbi_taxonomy,
                                                   mash_pairwise_file)
    self.logger.info('Identified %d representatives.' % len(representatives))

    # read metadata for genomes
    (refseq_genomes, complete_genomes, representative_genomes) = ncbi.read_refseq_metadata(metadata_file)
    ncbi_type_strains = read_gtdb_ncbi_type_strain(metadata_file)

    # write out information for representative genomes,
    # preceded by a header recording the selection criteria
    fout = open(output_file, 'w')
    fout.write('# Selection criteria:\n')
    fout.write('# Species dereplication file: %s\n' % species_derep_file)
    fout.write('# Previous representative file: %s\n' % prev_rep_file)
    fout.write('# Trusted user genomes file: %s\n' % trusted_user_file)
    fout.write('# Genome quality metadata file: %s\n' % str(metadata_file))
    fout.write('# Min. representative completeness: %.2f\n' % min_rep_comp)
    fout.write('# Max. representative contamination: %.2f\n' % max_rep_cont)
    fout.write('# Mash strict threshold: %.3f\n' % self.mash_strict_threshold)
    fout.write('# Mash GTDB species threshold: %.3f\n' % self.mash_gtdb_species_threshold)
    fout.write('# Mash NCBI species threshold: %.3f\n' % self.mash_ncbi_species_threshold)
    fout.write('#\n')
    fout.write('# Genome Id\tGTDB Taxonomy\tNCBI Taxonomy\tNCBI Organism Name\tNCBI Type strain\tComplete\tRepresentative\n')
    for genome_id in representatives:
        representative = 'yes' if genome_id in representative_genomes else 'no'
        complete = 'yes' if genome_id in complete_genomes else 'no'
        ts = 'yes' if genome_id in ncbi_type_strains else 'no'
        # fall back to empty rank prefixes when a genome has no taxonomy
        gtdb_taxa_str = ';'.join(gtdb_taxonomy.get(genome_id, Taxonomy.rank_prefixes))
        ncbi_taxa_str = ';'.join(ncbi_taxonomy.get(genome_id, Taxonomy.rank_prefixes))
        fout.write('%s\t%s\t%s\t%s\t%s\t%s\t%s\n' % (genome_id,
                                                     gtdb_taxa_str,
                                                     ncbi_taxa_str,
                                                     genome_stats[genome_id].ncbi_organism_name,
                                                     ts,
                                                     complete,
                                                     representative))
    fout.close()
def cluster(self, rep_genome_file, metadata_file, mash_pairwise_file, output_file):
    """Cluster genomes based on Mash distances.

    Genomes are assigned to their closest representative,
    that is below the species cutoff. However, genomes
    assigned to different GTDB species are never clustered
    together. This allows refinement of species to be
    performed using alternative methods and ensures this
    will be respected.

    Parameters
    ----------
    rep_genome_file : str
        File indicating genome representative.
    metadata_file : str
        Metadata, including CheckM estimates, for all genomes.
    mash_pairwise_file : str
        File with pairwise Mash distances.
    output_file : str
        Output file indicating genome clusters.
    """

    # read previous representatives and trusted user genomes
    representatives = self._read_genome_list(rep_genome_file)
    self.logger.info('Identified %d representative genomes.' % len(representatives))

    # get genome and assembly quality
    genome_stats = self._genome_stats(metadata_file)
    gtdb_taxonomy = read_gtdb_taxonomy(metadata_file)
    ncbi_taxonomy = read_gtdb_ncbi_taxonomy(metadata_file)

    # read Mash distance between genomes
    self.logger.info('Reading pairwise Mash distances between genomes.')
    mash_dists = self._read_mash_dists(mash_pairwise_file)

    # cluster genomes
    self.logger.info('Clustering genomes.')
    clusters = {}
    for rep_id in representatives:
        clusters[rep_id] = []

    remaining_genomes = set(genome_stats) - representatives
    for i, genome_id in enumerate(remaining_genomes):
        if i % 100 == 0:
            sys.stdout.write('==> Processed %d of %d genomes.\r' % (i+1, len(remaining_genomes)))
            sys.stdout.flush()

        query_dists = mash_dists[genome_id]
        # NOTE: index 6 of the taxonomy list is the species rank
        query_gtdb_sp = gtdb_taxonomy[genome_id][6]
        query_ncbi_sp = ncbi_taxonomy[genome_id][6]

        # track the closest compatible representative seen so far
        assigned_rep = None
        min_d = 1.0
        for ref_id in representatives:
            d = query_dists.get(ref_id, 1.0)
            if d >= min_d:
                continue

            ref_gtdb_sp = gtdb_taxonomy[ref_id][6]
            if (d <= self.mash_strict_threshold
                    and (query_gtdb_sp == 's__' or ref_gtdb_sp == query_gtdb_sp)):
                # genomes meet the strict threshold for
                # clustering and don't conflict in their
                # assigned species names
                assigned_rep = ref_id
                min_d = d
                continue

            # beyond the strict threshold, only consider representatives
            # from the same named GTDB species
            if ref_gtdb_sp == 's__' or ref_gtdb_sp != query_gtdb_sp:
                continue

            if d <= self.mash_gtdb_species_threshold:
                # genomes are from same named species and
                # meet the threshold for clustering
                assigned_rep = ref_id
                min_d = d
            elif (d <= self.mash_ncbi_species_threshold
                    and self._canonical_species_name(ref_gtdb_sp) == query_ncbi_sp):
                # representative's canonical species name matches the
                # query's NCBI species and the NCBI threshold is met
                assigned_rep = ref_id
                min_d = d

        if assigned_rep:
            clusters[assigned_rep].append(genome_id)

    sys.stdout.write('==> Processed %d of %d genomes.\r' % (len(remaining_genomes), len(remaining_genomes)))
    sys.stdout.flush()
    sys.stdout.write('\n')

    # write out clusters, largest first; cluster size includes the representative
    fout = open(output_file, 'w')
    clustered_genomes = 0
    for c, cluster_rep in enumerate(sorted(clusters, key=lambda x: len(clusters[x]), reverse=True)):
        cluster_str = 'cluster_%d' % (c + 1)
        cluster = clusters[cluster_rep]
        clustered_genomes += len(cluster)
        fout.write('%s\t%s\t%d\t%s\n' % (cluster_rep, cluster_str, len(cluster) + 1, ','.join(cluster)))
    fout.close()

    self.logger.info('Assigned %d genomes to representatives.' % clustered_genomes)
def run(self, qc_file, metadata_file, gtdb_user_genomes_file, genome_path_file, type_genome_cluster_file, type_genome_synonym_file, ncbi_refseq_assembly_file, ncbi_genbank_assembly_file, ani_af_nontype_vs_type, species_exception_file, rnd_type_genome):
    """Infer de novo species clusters and type genomes for remaining genomes.

    Starting from the existing type-genome clusters, selects additional
    species representatives among the unclustered genomes (greedily, by
    genome quality), clusters all remaining genomes to them, assigns
    species names to the de novo clusters, and writes the final clusters
    and ANI radius files to self.output_dir.
    """

    # identify genomes failing quality criteria
    self.logger.info('Reading QC file.')
    passed_qc = read_qc_file(qc_file)
    self.logger.info('Identified %d genomes passing QC.' % len(passed_qc))

    # get NCBI taxonomy strings for each genome
    self.logger.info('Reading NCBI taxonomy from GTDB metadata file.')
    ncbi_taxonomy, ncbi_update_count = read_gtdb_ncbi_taxonomy(metadata_file, species_exception_file)
    gtdb_taxonomy = read_gtdb_taxonomy(metadata_file)
    self.logger.info('Read NCBI taxonomy for %d genomes with %d manually defined updates.' % (len(ncbi_taxonomy), ncbi_update_count))
    self.logger.info('Read GTDB taxonomy for %d genomes.' % len(gtdb_taxonomy))

    # parse NCBI assembly files
    self.logger.info('Parsing NCBI assembly files.')
    excluded_from_refseq_note = exclude_from_refseq(ncbi_refseq_assembly_file, ncbi_genbank_assembly_file)

    # get path to genome FASTA files
    self.logger.info('Reading path to genome FASTA files.')
    genome_files = parse_genome_path(genome_path_file)
    self.logger.info('Read path for %d genomes.' % len(genome_files))
    # iterate over a copy of the keys since entries are removed in the loop
    for gid in set(genome_files):
        if gid not in passed_qc:
            genome_files.pop(gid)
    self.logger.info('Considering %d genomes as potential representatives after removing unwanted User genomes.' % len(genome_files))
    assert(len(genome_files) == len(passed_qc))

    # determine type genomes and genomes clustered to type genomes
    type_species, species_type_gid, type_gids, type_clustered_gids, type_radius = self._parse_type_clusters(type_genome_cluster_file)
    assert(len(type_species) == len(type_gids))
    self.logger.info('Identified %d type genomes.' % len(type_gids))
    self.logger.info('Identified %d clustered genomes.' % len(type_clustered_gids))

    # calculate quality score for genomes
    self.logger.info('Parse quality statistics for all genomes.')
    quality_metadata = read_quality_metadata(metadata_file)

    # calculate genome quality score
    self.logger.info('Calculating genome quality score.')
    genome_quality = quality_score(quality_metadata.keys(), quality_metadata)

    # determine genomes left to be clustered
    unclustered_gids = passed_qc - type_gids - type_clustered_gids
    self.logger.info('Identified %d unclustered genomes passing QC.' % len(unclustered_gids))

    # establish closest type genome for each unclustered genome
    self.logger.info('Determining ANI circumscription for %d unclustered genomes.' % len(unclustered_gids))
    nontype_radius = self._nontype_radius(unclustered_gids, type_gids, ani_af_nontype_vs_type)

    # calculate Mash ANI estimates between unclustered genomes
    self.logger.info('Calculating Mash ANI estimates between unclustered genomes.')
    mash_anis = self._mash_ani_unclustered(genome_files, unclustered_gids)

    # select species representatives genomes in a greedy fashion based on genome quality
    rep_genomes = self._selected_rep_genomes(genome_files,
                                             nontype_radius,
                                             unclustered_gids,
                                             mash_anis,
                                             quality_metadata,
                                             rnd_type_genome)

    # cluster all non-type/non-rep genomes to species type/rep genomes;
    # non-type radii are merged over the type radii
    final_cluster_radius = type_radius.copy()
    final_cluster_radius.update(nontype_radius)
    final_clusters, ani_af = self._cluster_genomes(genome_files,
                                                   rep_genomes,
                                                   type_gids,
                                                   passed_qc,
                                                   final_cluster_radius)
    rep_clusters = {}
    for gid in rep_genomes:
        rep_clusters[gid] = final_clusters[gid]

    # get list of synonyms in order to restrict usage of species names
    synonyms = self._parse_synonyms(type_genome_synonym_file)
    self.logger.info('Identified %d synonyms.' % len(synonyms))

    # determine User genomes with NCBI accession number that may form species names
    gtdb_user_to_genbank = self._gtdb_user_genomes(gtdb_user_genomes_file, metadata_file)
    self.logger.info('Identified %d GTDB User genomes with NCBI accessions.' % len(gtdb_user_to_genbank))

    # assign species names to de novo species clusters
    names_in_use = synonyms.union(type_species)
    self.logger.info('Identified %d species names already in use.' % len(names_in_use))
    self.logger.info('Assigning species name to each de novo species cluster.')
    cluster_sp_names = self._assign_species_names(rep_clusters,
                                                  names_in_use,
                                                  gtdb_taxonomy,
                                                  gtdb_user_to_genbank)

    # write out file with details about selected representative genomes
    self._write_rep_info(rep_clusters,
                         cluster_sp_names,
                         quality_metadata,
                         genome_quality,
                         excluded_from_refseq_note,
                         ani_af,
                         os.path.join(self.output_dir, 'gtdb_rep_genome_info.tsv'))

    # remove genomes that are not representatives of a species cluster and then write out representative ANI radius
    for gid in set(final_cluster_radius) - set(final_clusters):
        del final_cluster_radius[gid]

    # NOTE: cluster_sp_names is aliased (not copied) and updated in place
    all_species = cluster_sp_names
    all_species.update(species_type_gid)

    self.logger.info('Writing %d species clusters to file.' % len(all_species))
    self.logger.info('Writing %d cluster radius information to file.' % len(final_cluster_radius))
    write_clusters(final_clusters, final_cluster_radius, all_species, os.path.join(self.output_dir, 'gtdb_clusters_final.tsv'))
    write_type_radius(final_cluster_radius, all_species, os.path.join(self.output_dir, 'gtdb_ani_radius_final.tsv'))
def dereplicate(self, metadata_file, prev_rep_file, exceptions_file, trusted_user_file, max_species, min_rep_comp, max_rep_cont, min_quality, max_contigs, min_N50, max_ambiguous, max_gap_length, strict_filtering, output_file):
    """Select representative genomes from named species.

    Each named species is dereplicated to a fixed number of representatives,
    taking care to retain all genomes marked as a 'reference' or
    'representative' at NCBI. Preference is then given to genomes marked as
    type strains at NCBI. Finally, genomes are selected based on estimated
    quality.

    Parameters
    ----------
    max_species : int
        Maximum number of genomes of the same species to retain.
    prev_rep_file : str
        File indicating previous representatives to favour during selection.
    trusted_genomes_file:
        File containing list of genomes to retain regardless of filtering criteria.
    metadata_file : str
        Metadata, including CheckM estimates, for all genomes.
    min_rep_comp : float [0, 100]
        Minimum completeness for a genome to be a representative.
    max_rep_cont : float [0, 100]
        Maximum contamination for a genome to be a representative.
    min_quality : float [0, 100]
        Minimum genome quality (comp-5*cont) for a genome to be a representative.
    max_contigs : int
        Maximum number of contigs for a genome to be a representative.
    min_N50 : int
        Minimum N50 of scaffolds for a genome to be a representative.
    max_ambiguous : int
        Maximum number of ambiguous bases for a genome to be a representative.
    max_gap_length : int
        Maximum number of ambiguous bases between contigs for a genome to be a representative.
    strict_filtering : boolean
        If True apply filtering to all genomes, otherwise apply lenient
        filtering to genomes where the chromosome and plasmids are reported
        as complete.
    output_file : str
        Output file to contain list of dereplicated genomes.
    """
    # identify previous reps, genomes to treat as exceptions,
    # and user genomes to process
    prev_gtdb_reps = self._read_genome_list(prev_rep_file)
    exception_genomes = self._read_genome_list(exceptions_file)
    trusted_user_genomes = self._read_genome_list(trusted_user_file)

    (refseq_genomes, complete_genomes, representative_genomes) = ncbi.read_refseq_metadata(metadata_file)
    self.logger.info('Identified %d RefSeq genomes.' % len(refseq_genomes))
    self.logger.info('Identified %d representative or reference genomes.' % len(representative_genomes))
    self.logger.info('Identified %d complete genomes.' % len(complete_genomes))
    self.logger.info('Identified %d genomes in exception list.' % len(exception_genomes))
    self.logger.info('Identified %d trusted user genomes.' % len(trusted_user_genomes))
    self.logger.info('Identified %d previous GTDB representatives.' % len(prev_gtdb_reps))

    # get genome and assembly quality
    genome_stats = self._genome_stats(metadata_file)

    # get genomes in each named GTDB species
    gtdb_taxonomy = read_gtdb_taxonomy(metadata_file)
    # NOTE(review): read_gtdb_ncbi_taxonomy is called here with a single
    # argument and treated as returning a dict; elsewhere in this codebase
    # it is called with a species-exception file and unpacked as a tuple —
    # confirm which signature this module's import provides.
    ncbi_taxonomy = read_gtdb_ncbi_taxonomy(metadata_file)

    species = {}
    species_index = Taxonomy.rank_index['s__']
    for genome_id, taxa in gtdb_taxonomy.items():
        sp = taxa[species_index]
        if sp != 's__':
            species[genome_id] = sp

    self.logger.info('Identified %d genomes with a GTDB species names.' % len(species))

    # identify genomes passing filtering criteria
    filtered_reps_file = output_file + '.filtered_reps'
    fout = open(filtered_reps_file, 'w')
    fout.write('Genome ID\tCompleteness\tContamination')
    fout.write('\tContig Count\tN50\tAmbiguous Bases\tTotal Gap Length')
    fout.write('\tNote\tNCBI Organism Name\n')

    lpsn_type_strains = defaultdict(set)    # GTDB species -> genome IDs flagged as LPSN type strains
    genomes_to_consider = []                # genomes passing all quality filters
    genome_quality = {}                     # genome ID -> comp - 5*cont
    filtered_reps = 0                       # NCBI representatives rejected by the filters
    lack_ncbi_taxonomy = 0                  # RefSeq genomes with no NCBI taxonomy string
    for genome_id in list(genome_stats.keys()):
        # User genomes are only considered when explicitly trusted
        if genome_id.startswith('U_') and genome_id not in trusted_user_genomes:
            continue

        stats = genome_stats[genome_id]
        comp = stats.checkm_completeness
        cont = stats.checkm_contamination

        keep = False
        if genome_id in exception_genomes:
            # exceptions bypass all quality filtering
            keep = True
        elif (comp >= min_rep_comp and cont <= max_rep_cont
                and (comp - 5*cont) >= min_quality
                and stats.contig_count <= max_contigs
                and stats.n50_scaffolds >= min_N50
                and stats.ambiguous_bases <= max_ambiguous
                and stats.total_gap_length <= max_gap_length):
            keep = True
        elif not strict_filtering:
            # check if genome appears to consist of only an unspanned
            # chromosome and unspanned plasmids and thus can be
            # subjected to a more lenient quality check
            if (stats.ncbi_assembly_level in ['Complete Genome', 'Chromosome']
                    and stats.ncbi_genome_representation == 'full'
                    and stats.scaffold_count == stats.ncbi_molecule_count
                    and stats.ncbi_unspanned_gaps == 0
                    and stats.ncbi_spanned_gaps <= 10
                    and stats.ambiguous_bases <= max_ambiguous
                    and stats.total_gap_length <= max_gap_length
                    and stats.ssu_count >= 1):
                # apply lenient quality check that should pick
                # up the vast majority (if not all) even highly
                # reduced genomes and those with substantial genome
                # duplication leading to high estimated contamination
                if comp >= 40 and cont <= 15:
                    keep = True

        if keep:
            genomes_to_consider.append(genome_id)
            genome_quality[genome_id] = comp - 5*cont
            if stats.lpsn_strain:
                gtdb_species = gtdb_taxonomy[genome_id][species_index]
                if gtdb_species != 's__':
                    lpsn_type_strains[gtdb_species].add(genome_id)

        # check if a representative at NCBI is being filtered
        if genome_id in representative_genomes:
            if genome_id not in genomes_to_consider:
                # record which strict criterion failed; a genome rejected by
                # the filters above always fails at least one of these
                if comp < min_rep_comp:
                    note = 'failed completeness criteria'
                elif cont > max_rep_cont:
                    note = 'failed contamination criteria'
                elif (comp - 5*cont) < min_quality:
                    note = 'failed genome quality criteria'
                elif stats.contig_count > max_contigs:
                    note = 'failed contig count criteria'
                elif stats.n50_scaffolds < min_N50:
                    note = 'failed scaffold N50 criteria'
                elif stats.ambiguous_bases > max_ambiguous:
                    note = 'failed ambiguous bases criteria'
                elif stats.total_gap_length > max_gap_length:
                    note = 'failed total gap length criteria'

                fout.write('%s\t%.2f\t%.2f\t%d\t%d\t%d\t%d\t%s\t%s\n' % (
                    genome_id, comp, cont,
                    stats.contig_count, stats.n50_scaffolds,
                    stats.ambiguous_bases, stats.total_gap_length,
                    note, stats.ncbi_organism_name))
                warning = ('Filtered RefSeq rep %s with comp=%.2f, cont=%.2f, contigs=%d, N50=%d'
                           % (genome_id, comp, cont, stats.contig_count, stats.n50_scaffolds))
                self.logger.warning(warning)
                filtered_reps += 1

        if genome_id in refseq_genomes and not stats.ncbi_taxonomy:
            # this should never happen, but sometimes the NCBI taxonomy
            # is missing information for some genomes probably due to when NCBI
            # updates the taxonomy database relative to RefSeq
            lack_ncbi_taxonomy += 1
            self.logger.warning('RefSeq representative %s has no assigned NCBI taxonomy.' % genome_id)

    fout.close()

    self.logger.info('Identified %d RefSeq representatives without an assigned NCBI taxonomy.' % lack_ncbi_taxonomy)
    self.logger.info('Filtered %d RefSeq representatives based on genome or assembly quality.' % filtered_reps)
    self.logger.info('Filtered RefSeq representatives written to %s' % filtered_reps_file)
    self.logger.info('Considering %d genomes after filtering for genome quality.' % (len(genomes_to_consider)))

    ncbi_type_strains = read_gtdb_ncbi_type_strain(metadata_file)
    self.logger.info('Identified %d genomes marked as type strains at NCBI.' % len(ncbi_type_strains))
    self.logger.info('Identified %d genomes marked as type strains at LPSN.' % sum([len(x) for x in list(lpsn_type_strains.values())]))

    # dereplicate named species
    genomes_to_retain = self._dereplicate_species(genomes_to_consider,
                                                  max_species,
                                                  species,
                                                  representative_genomes,
                                                  complete_genomes,
                                                  ncbi_type_strains,
                                                  lpsn_type_strains,
                                                  prev_gtdb_reps,
                                                  genome_quality)
    self.logger.info('Retained %d genomes.' % len(genomes_to_retain))

    # write results
    if not exceptions_file:
        exceptions_file = ''
    fout = open(output_file, 'w')
    fout.write('# Selection criteria:\n')
    fout.write('# Maximum species: %d\n' % max_species)
    fout.write('# Exception file: %s\n' % exceptions_file)
    fout.write('# Trusted user genomes file: %s\n' % trusted_user_file)
    fout.write('# Genome quality metadata file: %s\n' % str(metadata_file))
    fout.write('# Min. representative completeness: %s\n' % str(min_rep_comp))
    fout.write('# Max. representative contamination: %s\n' % str(max_rep_cont))
    fout.write('#\n')
    fout.write('# Genome Id\tGTDB Taxonomy\tNCBI Taxonomy\tNCBI Organism Name\tNCBI Type strain\tComplete\tRepresentative\n')
    for genome_id in genomes_to_retain:
        representative = 'yes' if genome_id in representative_genomes else 'no'
        complete = 'yes' if genome_id in complete_genomes else 'no'
        ts = 'yes' if genome_id in ncbi_type_strains else 'no'
        gtdb_taxa_str = ';'.join(gtdb_taxonomy.get(genome_id, Taxonomy.rank_prefixes))
        ncbi_taxa_str = ';'.join(ncbi_taxonomy.get(genome_id, Taxonomy.rank_prefixes))
        fout.write('%s\t%s\t%s\t%s\t%s\t%s\t%s\n' % (genome_id,
                                                     gtdb_taxa_str,
                                                     ncbi_taxa_str,
                                                     genome_stats[genome_id].ncbi_organism_name,
                                                     ts,
                                                     complete,
                                                     representative))
    fout.close()
def run(self, max_species, prev_rep_file, trusted_genomes_file, metadata_file, min_rep_comp, max_rep_cont, min_quality, max_contigs, min_N50, max_ambiguous, max_gap_length, strict_filtering, output_file):
    """Dereplicate genomes to a specific number per named species.

    Parameters
    ----------
    max_species : int
        Maximum number of genomes of the same species to retain.
    prev_rep_file : str
        File indicating previous representatives to favour during selection.
    trusted_genomes_file:
        File containing list of genomes to retain regardless of filtering criteria.
    metadata_file : str
        Metadata, including CheckM estimates, for all genomes.
    min_rep_comp : float [0, 100]
        Minimum completeness for a genome to be a representative.
    max_rep_cont : float [0, 100]
        Maximum contamination for a genome to be a representative.
    min_quality : float [0, 100]
        Minimum genome quality (comp-5*cont) for a genome to be a representative.
    max_contigs : int
        Maximum number of contigs for a genome to be a representative.
    min_N50 : int
        Minimum N50 of scaffolds for a genome to be a representative.
    max_ambiguous : int
        Maximum number of ambiguous bases for a genome to be a representative.
    max_gap_length : int
        Maximum number of ambiguous bases between contigs for a genome to be a representative.
    strict_filtering : boolean
        If True apply filtering to all genomes, otherwise apply lenient
        filtering to genomes where the chromosome and plasmids are reported
        as complete.
    output_file : str
        Output file to contain list of dereplicated genomes.
    """
    # genomes explicitly trusted by the user bypass quality filtering
    trusted_accessions = set()
    if trusted_genomes_file:
        with open(trusted_genomes_file) as f:
            for line in f:
                line_split = line.split('\t')
                trusted_accessions.add(line_split[0].strip())

    accession_to_taxid, complete_genomes, representative_genomes = ncbi.read_refseq_metadata(metadata_file, keep_db_prefix=True)
    self.logger.info('Identified %d RefSeq genomes.' % len(accession_to_taxid))
    self.logger.info('Identified %d representative or reference genomes.' % len(representative_genomes))
    self.logger.info('Identified %d complete genomes.' % len(complete_genomes))
    self.logger.info('Identified %d genomes in exception list.' % len(trusted_accessions))

    if trusted_accessions.difference(representative_genomes):
        self.logger.error('There are genomes in the exception list which are not representatives.')
        sys.exit()

    gtdb_taxonomy = read_gtdb_taxonomy(metadata_file)
    ncbi_taxonomy = read_gtdb_ncbi_taxonomy(metadata_file)
    ncbi_organism_names = read_gtdb_ncbi_organism_name(metadata_file)
    species = species_label(gtdb_taxonomy, ncbi_taxonomy, ncbi_organism_names)
    self.logger.info('Identified %d genomes with a GTDB or NCBI species names.' % len(species))

    # get previous representatives
    prev_gtdb_reps = set()
    with open(prev_rep_file) as f:
        for line in f:
            prev_gtdb_reps.add(line.strip().split('\t')[0])
    self.logger.info('Identified %d previous GTDB representatives.' % len(prev_gtdb_reps))

    # get genome quality
    genomes_to_consider = list(accession_to_taxid.keys())
    genome_stats = read_gtdb_metadata(metadata_file, ['checkm_completeness',
                                                      'checkm_contamination',
                                                      'contig_count',
                                                      'n50_scaffolds',
                                                      'ambiguous_bases',
                                                      'total_gap_length',
                                                      'scaffold_count',
                                                      'ssu_count',
                                                      'ncbi_molecule_count',
                                                      'ncbi_unspanned_gaps',
                                                      'ncbi_genome_representation',
                                                      'ncbi_spanned_gaps',
                                                      'ncbi_assembly_level',
                                                      'ncbi_taxonomy',
                                                      'ncbi_organism_name',
                                                      'lpsn_strain'])

    missing_quality = set(accession_to_taxid.keys()) - set(genome_stats.keys())
    if missing_quality:
        self.logger.error('There are %d genomes without metadata information.' % len(missing_quality))
        # fixed: was 'self.exit(-1)', which is not a defined method; the
        # surrounding code uses sys.exit for fatal errors
        sys.exit(-1)

    filtered_reps_file = output_file + '.filtered_reps'
    fout = open(filtered_reps_file, 'w')
    # fixed: header now matches the 8 columns written per row below
    fout.write('Genome ID\tCompleteness\tContamination\tContig Count\tN50\tAmbiguous Bases\tTotal Gap Length\tNote\n')

    lpsn_type_strains = defaultdict(set)    # NCBI species -> genome IDs flagged as LPSN type strains
    new_genomes_to_consider = []            # genomes passing all quality filters
    genome_quality = {}                     # genome ID -> comp - 5*cont
    filtered_reps = 0
    lack_ncbi_taxonomy = 0
    contig_filter_count = 0
    for genome_id in list(accession_to_taxid.keys()):
        stats = genome_stats[genome_id]

        # fixed: comp/cont were previously referenced in the missing-taxonomy
        # branch below before being assigned, causing a NameError
        comp = stats.checkm_completeness
        cont = stats.checkm_contamination

        if not stats.ncbi_taxonomy:
            lack_ncbi_taxonomy += 1
            fout.write('%s\t%.2f\t%.2f\t%d\t%d\t%d\t%d\t%s\n' % (genome_id,
                                                                 comp,
                                                                 cont,
                                                                 stats.contig_count,
                                                                 stats.n50_scaffolds,
                                                                 stats.ambiguous_bases,
                                                                 stats.total_gap_length,
                                                                 'no NCBI taxonomy'))
            self.logger.warning('Skipping %s as it has no assigned NCBI taxonomy.' % genome_id)
            continue

        keep = False
        if genome_id in trusted_accessions:
            keep = True
        elif (comp >= min_rep_comp and cont <= max_rep_cont
                and (comp - 5*cont) >= min_quality
                and stats.contig_count <= max_contigs
                and stats.n50_scaffolds >= min_N50
                and stats.ambiguous_bases <= max_ambiguous
                and stats.total_gap_length <= max_gap_length):
            keep = True
        elif not strict_filtering:
            # check if genome appears to consist of only an unspanned
            # chromosome and unspanned plasmids and thus can be
            # subjected to a more lenient quality check
            # NOTE(review): the lenient thresholds here (1000 ambiguous bases,
            # 100000 bp gap length, comp >= 50) are hard-coded, whereas the
            # analogous dereplicate() method uses the max_ambiguous /
            # max_gap_length parameters and comp >= 40 — confirm intentional.
            if (stats.ncbi_assembly_level in ['Complete Genome', 'Chromosome']
                    and stats.ncbi_genome_representation == 'full'
                    and stats.scaffold_count == stats.ncbi_molecule_count
                    and stats.ncbi_unspanned_gaps == 0
                    and stats.ncbi_spanned_gaps <= 10
                    and stats.ambiguous_bases <= 1000
                    and stats.total_gap_length <= 100000
                    and stats.ssu_count >= 1):
                # apply lenient quality check
                if comp >= 50 and cont <= 15:
                    keep = True

        if keep:
            new_genomes_to_consider.append(genome_id)
            genome_quality[genome_id] = comp - 5*cont
            if stats.lpsn_strain:
                ncbi_species = stats.ncbi_taxonomy.split(';')[6].strip()
                lpsn_type_strains[ncbi_species].add(genome_id)

        # check if a representative at NCBI is being filtered
        if genome_id in representative_genomes and genome_id not in new_genomes_to_consider:
            fout.write('%s\t%.2f\t%.2f\t%d\t%d\t%d\t%d\t%s\n' % (genome_id,
                                                                 comp,
                                                                 cont,
                                                                 stats.contig_count,
                                                                 stats.n50_scaffolds,
                                                                 stats.ambiguous_bases,
                                                                 stats.total_gap_length,
                                                                 stats.ncbi_organism_name))
            if stats.contig_count > 300:
                contig_filter_count += 1
            self.logger.warning('Filtered RefSeq representative %s with comp=%.2f, cont=%.2f, contigs=%d, N50=%d'
                                % (genome_id, comp, cont, stats.contig_count, stats.n50_scaffolds))
            filtered_reps += 1

    fout.close()

    # fixed: replaced leftover debug print with a logger call
    self.logger.info('contig_filter_count %d' % contig_filter_count)

    genomes_to_consider = new_genomes_to_consider
    self.logger.info('Skipped %d genomes without an assigned NCBI taxonomy.' % lack_ncbi_taxonomy)
    self.logger.info('Filtered %d representative or reference genomes based on genome or assembly quality.' % filtered_reps)
    self.logger.info('Filtered representative or reference genomes written to %s' % filtered_reps_file)
    self.logger.info('Considering %d genomes after filtering for genome quality.' % (len(genomes_to_consider)))

    ncbi_type_strains = read_gtdb_ncbi_type_strain(metadata_file)
    self.logger.info('Identified %d genomes marked as type strains at NCBI.' % len(ncbi_type_strains))
    self.logger.info('Identified %d genomes marked as type strains at LPSN.' % sum([len(x) for x in list(lpsn_type_strains.values())]))

    genomes_to_retain = self._dereplicate(genomes_to_consider,
                                          max_species,
                                          species,
                                          representative_genomes,
                                          complete_genomes,
                                          ncbi_type_strains,
                                          lpsn_type_strains,
                                          prev_gtdb_reps,
                                          genome_quality)
    self.logger.info('Retained %d genomes.' % len(genomes_to_retain))

    # write results
    if not trusted_genomes_file:
        trusted_genomes_file = ''
    fout = open(output_file, 'w')
    fout.write('# Selection criteria:\n')
    fout.write('# Maximum species: %d\n' % max_species)
    fout.write('# Trusted genomes file: %s\n' % trusted_genomes_file)
    fout.write('# Genome quality metadata file: %s\n' % str(metadata_file))
    fout.write('# Min. representative completeness: %s\n' % str(min_rep_comp))
    fout.write('# Max. representative contamination: %s\n' % str(max_rep_cont))
    fout.write('#\n')
    fout.write('# Genome Id\tGTDB Taxonomy\tNCBI Taxonomy\tType strain\tComplete\tRepresentative\n')
    for assembly_accession in genomes_to_retain:
        representative = 'yes' if assembly_accession in representative_genomes else 'no'
        complete = 'yes' if assembly_accession in complete_genomes else 'no'
        ts = 'yes' if assembly_accession in ncbi_type_strains else 'no'
        gtdb_taxa_str = ';'.join(gtdb_taxonomy.get(assembly_accession, Taxonomy.rank_prefixes))
        ncbi_taxa_str = ';'.join(ncbi_taxonomy.get(assembly_accession, Taxonomy.rank_prefixes))

        # prepend the GTDB database prefix for output
        if assembly_accession.startswith('GCF_'):
            assembly_accession = 'RS_' + assembly_accession
        elif assembly_accession.startswith('GCA_'):
            assembly_accession = 'GB_' + assembly_accession

        fout.write('%s\t%s\t%s\t%s\t%s\t%s\n' % (assembly_accession,
                                                 gtdb_taxa_str,
                                                 ncbi_taxa_str,
                                                 ts,
                                                 complete,
                                                 representative))
    fout.close()
def run(self, qc_file, metadata_file, gtdb_user_genomes_file, genome_path_file, type_genome_cluster_file, type_genome_synonym_file, ncbi_refseq_assembly_file, ncbi_genbank_assembly_file, ani_af_nontype_vs_type):
    """Infer de novo species clusters and type genomes for remaining genomes."""
    # identify genomes failing quality criteria
    self.logger.info('Reading QC file.')
    passed_qc = read_qc_file(qc_file)
    self.logger.info('Identified %d genomes passing QC.' % len(passed_qc))

    # get NCBI taxonomy strings for each genome
    self.logger.info('Reading NCBI taxonomy from GTDB metadata file.')
    ncbi_taxonomy = read_gtdb_ncbi_taxonomy(metadata_file)
    gtdb_taxonomy = read_gtdb_taxonomy(metadata_file)
    self.logger.info('Read NCBI taxonomy for %d genomes.' % len(ncbi_taxonomy))
    self.logger.info('Read GTDB taxonomy for %d genomes.' % len(gtdb_taxonomy))

    # parse NCBI assembly files
    self.logger.info('Parsing NCBI assembly files.')
    excluded_from_refseq_note = exclude_from_refseq(
        ncbi_refseq_assembly_file, ncbi_genbank_assembly_file)

    # get path to genome FASTA files
    self.logger.info('Reading path to genome FASTA files.')
    genome_files = parse_genome_path(genome_path_file)
    self.logger.info('Read path for %d genomes.' % len(genome_files))
    # restrict to genomes that passed QC; iterate over a copy of the keys
    # since entries are removed during iteration
    for gid in set(genome_files):
        if gid not in passed_qc:
            genome_files.pop(gid)
    self.logger.info(
        'Considering %d genomes as potential representatives after removing unwanted User genomes.'
        % len(genome_files))
    # every QC-passing genome must have a FASTA path; a mismatch indicates
    # inconsistent inputs
    assert (len(genome_files) == len(passed_qc))

    # determine type genomes and genomes clustered to type genomes
    type_species, species_type_gid, type_gids, type_clustered_gids = self._parse_type_clusters(
        type_genome_cluster_file)
    assert (len(type_species) == len(type_gids))
    self.logger.info('Identified %d type genomes.' % len(type_gids))
    self.logger.info('Identified %d clustered genomes.' % len(type_clustered_gids))

    # calculate quality score for genomes
    self.logger.info('Parse quality statistics for all genomes.')
    quality_metadata = read_quality_metadata(metadata_file)

    # calculate genome quality score
    self.logger.info('Calculating genome quality score.')
    genome_quality = quality_score(quality_metadata.keys(), quality_metadata)

    # determine genomes left to be clustered
    unclustered_gids = passed_qc - type_gids - type_clustered_gids
    #***unclustered_gids = set(list(unclustered_gids)[0:2000]) #***DEBUG
    self.logger.info('Identified %d unclustered genomes passing QC.' % len(unclustered_gids))

    # establish closest type genome for each unclustered genome
    self.logger.info(
        'Determining ANI circumscription for %d unclustered genomes.'
        % len(unclustered_gids))
    nontype_radius = self._nontype_radius(unclustered_gids,
                                          type_gids,
                                          ani_af_nontype_vs_type)

    # calculate Mash ANI estimates between unclustered genomes
    self.logger.info(
        'Calculating Mash ANI estimates between unclustered genomes.')
    mash_anis = self._mash_ani_unclustered(genome_files, unclustered_gids)

    # de novo cluster genomes in a greedy fashion based on genome quality
    clusters, ani_af = self._cluster_de_novo(genome_files,
                                             nontype_radius,
                                             unclustered_gids,
                                             mash_anis,
                                             quality_metadata)

    # get list of synonyms in order to restrict usage of species names
    synonyms = self._parse_synonyms(type_genome_synonym_file)
    self.logger.info('Identified %d synonyms.' % len(synonyms))

    # determine User genomes with NCBI accession number that may form species names
    gtdb_user_to_genbank = self._gtdb_user_genomes(gtdb_user_genomes_file, metadata_file)
    self.logger.info(
        'Identified %d GTDB User genomes with NCBI accessions.'
        % len(gtdb_user_to_genbank))

    # assign species names to de novo species clusters
    names_in_use = synonyms.union(type_species)
    self.logger.info('Identified %d species names already in use.' % len(names_in_use))
    self.logger.info('Assigning species name to each species cluster.')
    cluster_sp_names = self._assign_species_names(clusters,
                                                  names_in_use,
                                                  ncbi_taxonomy,
                                                  gtdb_taxonomy,
                                                  gtdb_user_to_genbank)

    # write out file with details about selected representative genomes
    self._write_rep_info(
        clusters,
        cluster_sp_names,
        quality_metadata,
        genome_quality,
        excluded_from_refseq_note,
        ani_af,
        os.path.join(self.output_dir, 'gtdb_rep_genome_info.tsv'))

    # report clustering
    write_clusters(
        clusters,
        cluster_sp_names,
        os.path.join(self.output_dir, 'gtdb_rep_genome_clusters.tsv'))

    # remove genomes that are not representatives of a species cluster and then write out representative ANI radius
    for gid in set(nontype_radius) - set(clusters):
        del nontype_radius[gid]

    # NOTE: all_species aliases cluster_sp_names, so the update below also
    # mutates cluster_sp_names; harmless here since it is not used afterwards
    all_species = cluster_sp_names
    all_species.update(species_type_gid)
    write_type_radius(
        nontype_radius,
        all_species,
        os.path.join(self.output_dir, 'gtdb_rep_genome_ani_radius.tsv'))

    # create single file specifying all GTDB clusters
    self._concat_cluster_files(
        type_genome_cluster_file,
        os.path.join(self.output_dir, 'gtdb_rep_genome_clusters.tsv'),
        os.path.join(self.output_dir, 'gtdb_clusters_final.tsv'))
def run(self, rna_name, gtdb_metadata_file, rna_file, min_rna_length, min_scaffold_length, min_quality, max_contigs, min_N50, tax_filter, genome_list, output_dir, align_method='ssu_align'):
    """Infer rRNA gene tree spanning select GTDB genomes.

    Parameters
    ----------
    rna_name : str
        Name of rRNA gene ('ssu' or 'lsu').
    gtdb_metadata_file : str
        File specifying GTDB metadata for each genome.
    rna_file : str
        File with rRNA gene sequences in FASTA format.
    min_rna_length : int
        Minimum required length of rRNA gene sequences.
    min_scaffold_length : int
        Minimum required length of scaffold containing rRNA gene sequence.
    min_quality : float [0, 100]
        Minimum genome quality for a genome to be include in tree.
    max_contigs : int
        Maximum number of contigs to include genome.
    min_N50 : int
        Minimum N50 to include genome.
    tax_filter : boolean
        Filter sequences based on incongruent taxonomy classification.
    genome_list : str
        Explicit list of genomes to use (ignores --ncbi_rep_only and --user_genomes).
    output_dir : str
        Directory to store results.
    align_method : str
        Method used to align SSU sequences: 'ssu_align' or 'mothur'.
    """
    if rna_name not in ['ssu', 'lsu']:
        self.logger.error('Unrecognized rRNA gene type: %s' % rna_name)
        sys.exit(-1)

    genome_metadata = read_gtdb_metadata(gtdb_metadata_file, [
        'checkm_completeness', 'checkm_contamination',
        'scaffold_count', 'n50_scaffolds',
        'organism_name', 'gtdb_representative'
    ])
    gtdb_taxonomy = read_gtdb_taxonomy(gtdb_metadata_file)

    # partition genomes by source and identify GTDB representatives
    user_genomes = set()
    uba_genomes = set()
    ncbi_genomes = set()
    rep_genomes = set()
    for genome_id in genome_metadata:
        org_name = str(genome_metadata[genome_id][4])
        if genome_id.startswith('U_'):
            if '(UBA' in org_name:
                uba_genomes.add(genome_id)
            else:
                user_genomes.add(genome_id)
        elif genome_id.startswith('RS_') or genome_id.startswith('GB_'):
            ncbi_genomes.add(genome_id)
        else:
            self.logger.warning('Unrecognized genome prefix: %s' % genome_id)

        if genome_metadata[genome_id][5] == 't':
            rep_genomes.add(genome_id)

    self.logger.info(
        'Initially considering %d genomes (%d NCBI, %d UBA, %d User).'
        % (len(genome_metadata), len(ncbi_genomes), len(uba_genomes), len(user_genomes)))
    self.logger.info('Identified %d representative genomes.' % len(rep_genomes))

    # get genomes specified in genome list by user
    genomes_to_consider = set()
    if genome_list:
        for line in open(genome_list):
            gid = line.rstrip().split('\t')[0]
            if gid.startswith('RS_') or gid.startswith('GB_') or gid.startswith('U_'):
                genomes_to_consider.add(gid)
        self.logger.info(
            'Restricting genomes to the %d in the genome list.'
            % len(genomes_to_consider))
    else:
        # filter genomes based on quality and database source
        # (fixed typo 'critieria' in log message)
        self.logger.info('Filtering genomes based on specified criteria.')
        self.logger.info('Filtering on minimum quality <%d.' % min_quality)
        self.logger.info('Filtering on number of contigs >%d.' % max_contigs)
        self.logger.info('Filtering on scaffold N50 <%d.' % min_N50)

        new_genomes_to_consider = []
        filtered_genomes = 0
        gt = 0   # filtered on genome type (not an NCBI/UBA representative)
        gq = 0   # filtered on genome quality
        sc = 0   # filtered on scaffold count
        n50 = 0  # filtered on N50
        for genome_id in genome_metadata:
            # only NCBI and UBA representative genomes are retained
            if (genome_id not in rep_genomes
                    or (genome_id not in ncbi_genomes and genome_id not in uba_genomes)):
                gt += 1
                filtered_genomes += 1
                continue

            comp, cont, scaffold_count, n50_contigs, _org_name, _rep = genome_metadata[genome_id]
            q = float(comp) - 5 * float(cont)
            failed = False
            if q < min_quality:
                gq += 1
                failed = True
            if int(scaffold_count) > max_contigs:
                sc += 1
                failed = True
            if int(n50_contigs) < min_N50:
                n50 += 1
                failed = True
            if failed:
                filtered_genomes += 1
                continue

            new_genomes_to_consider.append(genome_id)

        genomes_to_consider = new_genomes_to_consider
        self.logger.info(
            'Filtered %d genomes (%d on genome type, %d on genome quality, %d on number of contigs, %d on N50).'
            % (filtered_genomes, gt, gq, sc, n50))
        self.logger.info('Considering %d genomes after filtering.' % len(genomes_to_consider))

    # limit taxonomy to genomes being considered
    cur_gtdb_taxonomy = {gid: gtdb_taxonomy[gid] for gid in genomes_to_consider}

    # get rRNA gene sequences for each genome
    rna_output_file = self._get_rna_seqs(rna_name, rna_file,
                                         min_rna_length, min_scaffold_length,
                                         cur_gtdb_taxonomy, genomes_to_consider,
                                         output_dir)

    # identify erroneous rRNA gene sequences
    if tax_filter:
        self.logger.info('Filtering sequences with incongruent taxonomy strings.')
        # renamed from 'filter' to avoid shadowing the builtin
        filtered_seq_ids = self._tax_filter(rna_output_file,
                                            cur_gtdb_taxonomy,
                                            output_dir)
        self.logger.info('Filtered %d sequences.' % len(filtered_seq_ids))

        if len(filtered_seq_ids) > 0:
            rna_filtered_output = os.path.join(output_dir, 'gtdb_%s.tax_filter.fna' % rna_name)
            with open(rna_filtered_output, 'w') as fout:
                for seq_id, seq, annotation in seq_io.read_seq(rna_output_file, keep_annotation=True):
                    if seq_id not in filtered_seq_ids:
                        fout.write('>' + seq_id + ' ' + annotation + '\n')
                        fout.write(seq + '\n')
            rna_output_file = rna_filtered_output

    # align sequences with ssu-align or mothur
    if rna_name == 'ssu':
        if align_method == 'ssu_align':
            self.logger.info('Aligning sequences with ssu-align.')
            align_dir = os.path.join(output_dir, '%s_align' % rna_name)
            os.system('ssu-align --dna %s %s' % (rna_output_file, align_dir))
            os.system('ssu-mask --afa %s' % align_dir)
        elif align_method == 'mothur':
            self.logger.info('Aligning sequences with mothur.')
            align_dir = os.path.join(output_dir, 'mothur')
            if not os.path.exists(align_dir):
                os.makedirs(align_dir)

            mothur_cmd = 'mothur "#set.dir(output=%s, blastdir=/srv/sw/Mothur/1.39.5)' % align_dir
            mothur_cmd += '; align.seqs(candidate=%s, template=/srv/db/mothur/silva_128/silva.seed_v128.align, search=blast, flip=t, processors=%d)' % (
                rna_output_file, self.cpus)
            input_prefix = remove_extension(rna_output_file)
            align_file = os.path.join(align_dir, input_prefix + '.align')
            mothur_cmd += '; filter.seqs(fasta=%s, hard=/srv/db/mothur/silva_128/Lane1349.silva.filter, processors=%d);"' % (
                align_file, self.cpus)
            os.system(mothur_cmd)
            input_msa = os.path.join(align_dir, input_prefix + '.filter.fasta')
    elif rna_name == 'lsu':
        self.logger.info('Aligning sequences with ssu-align.')
        align_dir = os.path.join(output_dir, '%s_align' % rna_name)
        if not os.path.exists(align_dir):
            os.makedirs(align_dir)

        os.system('esl-sfetch --index %s' % rna_output_file)

        # search for sequences using domain-specific LSU HMMs
        cm_dir = os.path.join(os.path.dirname(os.path.realpath(__file__)), 'cm_files')
        for domain in ['archaea', 'bacteria', 'eukaryote']:
            self.logger.info('Matching LSU rRNA genes to %s-specific HMM.' % domain)
            table_out = os.path.join(align_dir, 'cmsearch.%s.%s.tblout' % (rna_name, domain))
            cm_file = os.path.join(cm_dir, 'lsu_%s.cm' % domain)
            log_file = os.path.join(align_dir, 'cmsearch.%s.%s.out' % (rna_name, domain))
            os.system('cmsearch --hmmonly --cpu %d --noali --tblout %s %s %s > %s'
                      % (self.cpus, table_out, cm_file, rna_output_file, log_file))

        # identify top hits for each domain
        self.logger.info('Identifying best domain-specific HMM for each LSU rRNA gene.')
        top_hits = {}
        for domain in ['archaea', 'bacteria', 'eukaryote']:
            table_out = os.path.join(align_dir, 'cmsearch.%s.%s.tblout' % (rna_name, domain))
            for line in open(table_out):
                if line[0] == '#':
                    continue

                line_split = line.split()
                seq_id = line_split[0]
                start_seq = int(line_split[7])
                end_seq = int(line_split[8])
                bitscore = float(line_split[14])
                prev_bitscore = top_hits.get(seq_id, [None, 0, 0, 0, 0])[4]
                if bitscore > prev_bitscore:
                    top_hits[seq_id] = [domain, seq_id, start_seq, end_seq, bitscore]

        # create MSA for each bacteria and archaea
        for domain in ['archaea', 'bacteria']:
            # create file of top hits
            top_hits_out = os.path.join(align_dir, 'top_hits.%s.%s.tsv' % (rna_name, domain))
            fout = open(top_hits_out, 'w')
            num_hits = 0
            for top_domain, seq_id, start_seq, end_seq, bitscore in top_hits.values():
                if top_domain == domain:
                    # fixed: format string was '%s\t%d\t%d\%f\n', which wrote
                    # a literal backslash instead of a tab before the bitscore
                    fout.write('%s\t%d\t%d\t%f\n' % (seq_id, start_seq, end_seq, bitscore))
                    num_hits += 1
            fout.close()

            # align top hits
            self.logger.info(
                'Creating MSA for %s LSU rRNA genes (%d sequences).'
                % (domain, num_hits))
            if num_hits > 0:
                seq_file = os.path.join(align_dir, 'cmsearch.%s.%s.fna' % (rna_name, domain))
                os.system(
                    "grep -v '^#' %s | awk '{print $1, $2, $3, $1}' | esl-sfetch -Cf %s - > %s"
                    % (top_hits_out, rna_output_file, seq_file))

                # fixed: previously reused the cm_file left over from the
                # HMM-search loop above (always the eukaryote model); align
                # against the model matching the current domain instead
                cm_file = os.path.join(cm_dir, 'lsu_%s.cm' % domain)
                align_file = os.path.join(align_dir, 'cmalign.%s.%s.stk' % (rna_name, domain))
                os.system('cmalign --dnaout --outformat Pfam %s %s > %s'
                          % (cm_file, seq_file, align_file))

                masked_file = os.path.join(align_dir, 'cmalign.%s.%s.mask.afa' % (rna_name, domain))
                os.system('esl-alimask -p --outformat AFA %s > %s'
                          % (align_file, masked_file))

    # trim sequences and infer tree
    if align_method == 'ssu_align':
        for domain in ['archaea', 'bacteria']:
            if rna_name == 'ssu':
                input_msa = os.path.join(align_dir, 'ssu_align.' + domain + '.mask.afa')
            elif rna_name == 'lsu':
                input_msa = os.path.join(align_dir, 'cmalign.%s.%s.mask.afa' % (rna_name, domain))

            if not os.path.exists(input_msa):
                continue

            trimmed_msa = os.path.join(output_dir, domain + '.trimmed.fna')
            self._trim_seqs(input_msa, trimmed_msa)

            # infer tree
            self.logger.info('Inferring tree for %s genes.' % domain)
            output_tree = os.path.join(output_dir, domain + '.tree')
            os.system('FastTreeMP -nosupport -nt -gtr -gamma %s > %s'
                      % (trimmed_msa, output_tree))
    elif align_method == 'mothur':
        # NOTE(review): input_prefix/input_msa are only defined by the
        # ssu+mothur alignment path above, so this branch assumes
        # rna_name == 'ssu' — confirm callers never pass lsu with mothur
        trimmed_msa = os.path.join(output_dir, input_prefix + '.trimmed.fna')
        self._trim_seqs(input_msa, trimmed_msa)

        # infer tree
        # fixed: log message previously contained an unfilled '%s' placeholder
        self.logger.info('Inferring tree for %s genes.' % input_prefix)
        output_tree = os.path.join(output_dir, input_prefix + '.tree')
        os.system('FastTreeMP -nosupport -nt -gtr -gamma %s > %s'
                  % (trimmed_msa, output_tree))
def run(self, metadata_file, trusted_comp, trusted_cont, max_contigs,
        min_N50, refseq_rep, output_file):
    """Determine trusted genomes based on genome statistics.

    Parameters
    ----------
    metadata_file : str
        Metadata, including CheckM estimates, for all genomes.
    trusted_comp : float [0, 100]
        Minimum completeness to trust genome for marker set inference.
    trusted_cont : float [0, 100]
        Maximum contamination to trust genome for marker set inference.
    max_contigs : int
        Maximum number of contigs within trusted genomes.
    min_N50 : int
        Minimum N50 of trusted genomes.
    refseq_rep : boolean
        If true, consider only RefSeq representative and reference genomes.
    output_file : str
        Output file to contain list of trusted genomes.
    """

    representative_genomes = None
    if refseq_rep:
        # only the representative set is used below; the accession->taxid
        # map and complete-genome set are intentionally discarded
        _accession_to_taxid, _complete_genomes, representative_genomes = ncbi.read_refseq_metadata(
            metadata_file, keep_db_prefix=True)

    gtdb_taxonomy = read_gtdb_taxonomy(metadata_file)
    # NOTE(review): called without a species exception file and assumed to
    # return a single dict here — confirm against the current helper API
    ncbi_taxonomy = read_gtdb_ncbi_taxonomy(metadata_file)

    trusted_genomes_stats = self._trusted_genomes(metadata_file,
                                                  trusted_comp, trusted_cont,
                                                  max_contigs, min_N50)

    if representative_genomes:
        # restrict to RefSeq representative/reference genomes
        self.logger.info('Limiting genomes to RefSeq representative.')
        trusted_genomes_stats = {
            genome_id: stats
            for genome_id, stats in trusted_genomes_stats.items()
            if genome_id in representative_genomes
        }

    self.logger.info('Identified %d trusted genomes.'
                     % len(trusted_genomes_stats))

    # write out trusted genomes along with the selection criteria;
    # context manager guarantees the handle is closed on error
    with open(output_file, 'w') as fout:
        fout.write('# Selection criteria:\n')
        fout.write('# Trusted completeness: %f\n' % trusted_comp)
        fout.write('# Trusted contamination: %f\n' % trusted_cont)
        fout.write('# Maximum contigs: %d\n' % max_contigs)
        fout.write('# Minimum N50: %d\n' % min_N50)
        fout.write('#\n')
        fout.write(
            '# Genome Id\tCompleteness,Contamination,Contig count,N50\tGTDB Taxonomy\tNCBI Taxonomy\n'
        )

        for assembly_accession, stats in trusted_genomes_stats.items():
            fout.write(
                '%s\t%s\t%s\t%s\n'
                % (assembly_accession,
                   ','.join(map(str, stats)),
                   ';'.join(gtdb_taxonomy.get(assembly_accession, ['none'])),
                   ';'.join(ncbi_taxonomy.get(assembly_accession, ['none']))))
def run(self, metadata_file, gtdb_user_genomes_file, gtdb_user_reps,
        ncbi_refseq_assembly_file, ncbi_genbank_assembly_file,
        gtdb_domain_report, min_comp, max_cont, min_quality, sh_exception,
        min_perc_markers, max_contigs, min_N50, max_ambiguous, output_dir):
    """Quality check all potential GTDB genomes.

    Writes qc_passed.tsv and qc_failed.tsv with per-genome QC results, and
    type_genomes_fail_qc.tsv, species_fail_qc.tsv and species_lost.tsv
    summarizing the impact of QC on each named NCBI species.

    Parameters
    ----------
    metadata_file : str
        GTDB metadata for all genomes.
    gtdb_user_genomes_file : str
        File mapping GTDB User genomes to GenBank accessions.
    gtdb_user_reps : str
        File with GTDB User representative genomes and their taxonomy.
    ncbi_refseq_assembly_file : str
        NCBI RefSeq assembly summary file.
    ncbi_genbank_assembly_file : str
        NCBI GenBank assembly summary file.
    gtdb_domain_report : str
        Report used to determine percentage of marker genes per genome.
    min_comp : float
        Minimum CheckM completeness to pass QC.
    max_cont : float
        Maximum CheckM contamination to pass QC.
    min_quality : float
        Minimum quality (completeness - 5*contamination) to pass QC.
    sh_exception : float
        Strain heterogeneity exception threshold (semantics defined by pass_qc).
    min_perc_markers : float
        Minimum percentage of marker genes to pass QC.
    max_contigs : int
        Maximum number of contigs to pass QC.
    min_N50 : int
        Minimum N50 of contigs to pass QC.
    max_ambiguous : int
        Maximum number of ambiguous bases to pass QC.
    output_dir : str
        Directory for output files.
    """

    # get GTDB and NCBI taxonomy strings for each genome
    self.logger.info('Reading NCBI taxonomy from GTDB metadata file.')
    ncbi_taxonomy = read_gtdb_ncbi_taxonomy(metadata_file)
    ncbi_species = binomial_species(ncbi_taxonomy)
    gtdb_taxonomy = read_gtdb_taxonomy(metadata_file)
    self.logger.info('Read NCBI taxonomy for %d genomes.'
                     % len(ncbi_taxonomy))
    self.logger.info('Read GTDB taxonomy for %d genomes.'
                     % len(gtdb_taxonomy))

    # determine User genomes to retain for consideration
    gtdb_user_to_genbank = self._gtdb_user_genomes(gtdb_user_genomes_file,
                                                   metadata_file)
    self.logger.info(
        'Identified %d GTDB User genomes with GenBank accessions to retain for potential inclusion in GTDB.'
        % len(gtdb_user_to_genbank))

    user_genomes = 0
    # context manager: previously the handle was never closed
    with open(gtdb_user_reps) as f:
        for line in f:
            line_split = line.strip().split('\t')
            gid, taxonomy = line_split
            if gid not in gtdb_user_to_genbank:
                if 'd__Bacteria' in taxonomy:
                    self.logger.warning(
                        'Bacterial genome %s has no NCBI accession and is being skipped.'
                        % gid)
                else:
                    # archaeal User genome without a GenBank accession is
                    # retained under its own identifier
                    gtdb_user_to_genbank[gid] = gid
                    user_genomes += 1
    self.logger.info(
        'Identified %d archaeal GTDB User genome WITHOUT GenBank accessions to retain for potential inclusion in GTDB.'
        % user_genomes)

    # calculate quality score for genomes
    self.logger.info('Parsing QC statistics for each genome.')
    quality_metadata = read_gtdb_metadata(metadata_file, [
        'checkm_completeness', 'checkm_contamination',
        'checkm_strain_heterogeneity_100', 'contig_count', 'n50_contigs',
        'ambiguous_bases', 'genome_size'
    ])
    marker_perc = parse_marker_percentages(gtdb_domain_report)

    # parse NCBI assembly files
    self.logger.info('Parsing NCBI assembly files.')
    excluded_from_refseq_note = exclude_from_refseq(
        ncbi_refseq_assembly_file, ncbi_genbank_assembly_file)

    # get type material designations for each genome
    self.logger.info(
        'Reading type material designations for genomes from GTDB metadata file.'
    )
    type_metadata = read_gtdb_metadata(metadata_file, [
        'ncbi_type_material_designation', 'gtdb_type_designation',
        'gtdb_type_designation_sources'
    ])
    ncbi_tsp = ncbi_type_strain_of_species(type_metadata)
    gtdb_tsp = gtdb_type_strain_of_species(type_metadata)

    # QC all genomes
    self.logger.info('Validating genomes.')
    fout_retained = open(os.path.join(output_dir, 'qc_passed.tsv'), 'w')
    fout_failed = open(os.path.join(output_dir, 'qc_failed.tsv'), 'w')

    header = 'Accession\tNCBI species'
    # BUG FIX: header is written verbatim (no %-formatting applied), so the
    # previous '100%%' appeared literally as '100%%' in the output files;
    # a single '%' is correct here (matches the other report headers below)
    header += '\tCompleteness (%)\tContamination (%)\tQuality\tStrain heterogeneity at 100%'
    header += '\tMarkers (%)\tNo. contigs\tN50 contigs\tAmbiguous bases'

    fout_retained.write(header + '\n')
    fout_failed.write(header)
    fout_failed.write(
        '\tFailed completeness\tFailed contamination\tFailed quality')
    fout_failed.write(
        '\tFailed marker percentage\tFailed no. contigs\tFailed N50 contigs\tFailed ambiguous bases\n'
    )

    num_retained = 0
    num_filtered = 0
    for gid in quality_metadata:
        if gid.startswith('U_') and gid not in gtdb_user_to_genbank:
            # skip user genomes not marked for retention
            continue

        failed_tests = defaultdict(int)  # populated in place by pass_qc
        passed_qc = pass_qc(quality_metadata[gid], marker_perc[gid],
                            min_comp, max_cont, min_quality, sh_exception,
                            min_perc_markers, max_contigs, min_N50,
                            max_ambiguous, failed_tests)

        if passed_qc:
            num_retained += 1
            fout_retained.write('%s\t%s' % (gid, ncbi_taxonomy[gid][6]))
            fout_retained.write(
                '\t%.2f\t%.2f\t%.2f\t%s\t%.2f\t%d\t%d\t%d\n'
                % (quality_metadata[gid].checkm_completeness,
                   quality_metadata[gid].checkm_contamination,
                   quality_metadata[gid].checkm_completeness
                   - 5 * quality_metadata[gid].checkm_contamination,
                   ('%.2f' % quality_metadata[gid].checkm_strain_heterogeneity_100)
                   if quality_metadata[gid].checkm_strain_heterogeneity_100 else '-',
                   marker_perc[gid],
                   quality_metadata[gid].contig_count,
                   quality_metadata[gid].n50_contigs,
                   quality_metadata[gid].ambiguous_bases))
        else:
            num_filtered += 1
            fout_failed.write('%s\t%s' % (gid, ncbi_taxonomy[gid][6]))
            fout_failed.write(
                '\t%.2f\t%.2f\t%.2f\t%s\t%.2f\t%d\t%d\t%d'
                % (quality_metadata[gid].checkm_completeness,
                   quality_metadata[gid].checkm_contamination,
                   quality_metadata[gid].checkm_completeness
                   - 5 * quality_metadata[gid].checkm_contamination,
                   ('%.2f' % quality_metadata[gid].checkm_strain_heterogeneity_100)
                   if quality_metadata[gid].checkm_strain_heterogeneity_100 else '-',
                   marker_perc[gid],
                   quality_metadata[gid].contig_count,
                   quality_metadata[gid].n50_contigs,
                   quality_metadata[gid].ambiguous_bases))
            fout_failed.write(
                '\t%d\t%d\t%d\t%d\t%d\t%d\t%d\n'
                % (failed_tests['comp'], failed_tests['cont'],
                   failed_tests['qual'], failed_tests['marker_perc'],
                   failed_tests['contig_count'], failed_tests['N50'],
                   failed_tests['ambig']))

    fout_retained.close()
    fout_failed.close()

    self.logger.info('Retained %d genomes and filtered %d genomes.'
                     % (num_retained, num_filtered))

    # QC genomes in each named species
    self.logger.info(
        'Performing QC of type genome for each of the %d NCBI species.'
        % len(ncbi_species))

    fout_type_fail = open(
        os.path.join(output_dir, 'type_genomes_fail_qc.tsv'), 'w')
    fout_type_fail.write(
        'Species\tAccession\tGTDB taxonomy\tNCBI taxonomy\tType sources\tNCBI assembly type\tGenome size (bp)'
    )
    fout_type_fail.write(
        '\tCompleteness (%)\tContamination (%)\tQuality\tStrain heterogeneity at 100%'
    )
    fout_type_fail.write(
        '\tMarkers (%)\tNo. contigs\tN50 contigs\tAmbiguous bases\tNCBI exclude from RefSeq\tLost species\n'
    )

    fout_fail_sp = open(os.path.join(output_dir, 'species_fail_qc.tsv'), 'w')
    fout_fail_sp.write(
        'Species\tAccession\tGTDB taxonomy\tNCBI taxonomy\tAssembled from type material\tGenome size (bp)'
    )
    fout_fail_sp.write(
        '\tCompleteness (%)\tContamination (%)\tQuality\tStrain heterogeneity at 100%'
    )
    fout_fail_sp.write(
        '\tMarkers (%)\tNo. contigs\tN50 contigs\tAmbiguous bases')
    fout_fail_sp.write(
        '\tFailed completeness\tFailed contamination\tFailed quality')
    fout_fail_sp.write(
        '\tFailed marker percentage\tFailed no. contigs\tFailed N50 contigs\tFailed ambiguous bases'
    )
    fout_fail_sp.write('\tNCBI exclude from RefSeq\n')

    fout_sp_lost = open(os.path.join(output_dir, 'species_lost.tsv'), 'w')
    fout_sp_lost.write('Species\tNo. genomes\tNo. type genomes')
    fout_sp_lost.write(
        '\tFail completeness\tFail contamination\tFail quality\tFailed percent markers'
    )
    fout_sp_lost.write(
        '\tFail no. contigs\tFail N50 contigs\tFail ambiguous bases\n')

    lost_type = 0
    lost_sp = 0
    filtered_genomes = 0
    failed_tests_cumulative = defaultdict(int)
    for sp, gids in ncbi_species.items():
        type_pass = set()
        type_fail = set()
        other_pass = set()
        other_fail = set()

        failed_tests_gids = {}
        for gid in gids:
            failed_tests = defaultdict(int)
            passed_qc = pass_qc(quality_metadata[gid], marker_perc[gid],
                                min_comp, max_cont, min_quality,
                                sh_exception, min_perc_markers, max_contigs,
                                min_N50, max_ambiguous, failed_tests)
            failed_tests_gids[gid] = failed_tests

            if gid in gtdb_tsp or gid in ncbi_tsp:
                if passed_qc:
                    type_pass.add(gid)
                else:
                    type_fail.add(gid)
                    filtered_genomes += 1
            else:
                if passed_qc:
                    other_pass.add(gid)
                else:
                    other_fail.add(gid)
                    filtered_genomes += 1

            # tally cumulative failure counts across all species
            for test, count in failed_tests.items():
                failed_tests_cumulative[test] += count

        if len(type_pass) >= 1:
            # great: one or more type genomes pass QC and will be
            # selected as the type genome
            continue

        if len(type_fail):
            # all potential type genomes for species failed QC so
            # report these for manual inspection
            lost_type += 1
            for gid in type_fail:
                fout_type_fail.write(
                    '%s\t%s\t%s\t%s\t%s\t%s\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%d\t%d\t%d\t%s\t%s\n'
                    % (sp, gid,
                       '; '.join(gtdb_taxonomy[gid]),
                       '; '.join(ncbi_taxonomy[gid]),
                       type_metadata[gid].gtdb_type_designation_sources,
                       type_metadata[gid].ncbi_type_material_designation,
                       # NOTE(review): value is genome_size/1e6 (Mb) though
                       # the column header says bp — confirm intended units
                       float(quality_metadata[gid].genome_size) / 1e6,
                       quality_metadata[gid].checkm_completeness,
                       quality_metadata[gid].checkm_contamination,
                       quality_metadata[gid].checkm_completeness
                       - 5 * quality_metadata[gid].checkm_contamination,
                       quality_metadata[gid].checkm_strain_heterogeneity_100,
                       marker_perc[gid],
                       quality_metadata[gid].contig_count,
                       quality_metadata[gid].n50_contigs,
                       quality_metadata[gid].ambiguous_bases,
                       excluded_from_refseq_note[gid],
                       len(other_pass) == 0))

        if len(other_pass) == 0:
            # no genomes for species pass QC so report loss of species
            lost_sp += 1
            fout_sp_lost.write('%s\t%d\t%d' % (sp, len(gids), len(type_fail)))
            fout_sp_lost.write(
                '\t%d\t%d\t%d\t%d\t%d\t%d\t%d\n'
                % (sum(failed_tests_gids[gid]['comp'] for gid in gids),
                   sum(failed_tests_gids[gid]['cont'] for gid in gids),
                   sum(failed_tests_gids[gid]['qual'] for gid in gids),
                   sum(failed_tests_gids[gid]['marker_perc'] for gid in gids),
                   sum(failed_tests_gids[gid]['contig_count'] for gid in gids),
                   sum(failed_tests_gids[gid]['N50'] for gid in gids),
                   sum(failed_tests_gids[gid]['ambig'] for gid in gids)))

            for gid in type_fail.union(other_fail):
                fout_fail_sp.write(
                    '%s\t%s\t%s\t%s\t%s\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%d\t%d\t%d'
                    % (sp, gid,
                       '; '.join(gtdb_taxonomy[gid]),
                       '; '.join(ncbi_taxonomy[gid]),
                       gid in type_fail,
                       float(quality_metadata[gid].genome_size) / 1e6,
                       quality_metadata[gid].checkm_completeness,
                       quality_metadata[gid].checkm_contamination,
                       quality_metadata[gid].checkm_completeness
                       - 5 * quality_metadata[gid].checkm_contamination,
                       quality_metadata[gid].checkm_strain_heterogeneity_100,
                       marker_perc[gid],
                       quality_metadata[gid].contig_count,
                       quality_metadata[gid].n50_contigs,
                       quality_metadata[gid].ambiguous_bases))
                fout_fail_sp.write(
                    '\t%d\t%d\t%d\t%d\t%d\t%d\t%d'
                    % (failed_tests_gids[gid]['comp'],
                       failed_tests_gids[gid]['cont'],
                       failed_tests_gids[gid]['qual'],
                       failed_tests_gids[gid]['marker_perc'],
                       failed_tests_gids[gid]['contig_count'],
                       failed_tests_gids[gid]['N50'],
                       failed_tests_gids[gid]['ambig']))
                fout_fail_sp.write('\t%s\n' % excluded_from_refseq_note[gid])

    fout_type_fail.close()
    fout_fail_sp.close()
    fout_sp_lost.close()

    self.logger.info('Genomes filtered for each criterion:')
    for test in sorted(failed_tests_cumulative):
        self.logger.info('%s: %d' % (test, failed_tests_cumulative[test]))

    self.logger.info('Filtered %d genomes assigned to NCBI species.'
                     % filtered_genomes)
    self.logger.info(
        'Identified %d species with type genomes failing QC and %d total species failing QC.'
        % (lost_type, lost_sp))