def __worker(self, metadata_file, nt_files, max_genomes, queue_in, queue_out):
    """Process each species in parallel.

    Pulls (species, genome_ids) tuples off queue_in, computes pairwise
    gANI/AF values between the highest-quality genomes of the species
    using the external 'ani_calculator' program, and pushes the results
    onto queue_out. A sentinel species of None terminates the worker.
    """
    metadata = read_gtdb_metadata(metadata_file,
                                  ['checkm_completeness',
                                   'checkm_contamination'])

    # genome quality score = completeness - 5 * contamination
    genome_quality = {}
    for genome_id, m in metadata.items():
        genome_quality[genome_id] = m.checkm_completeness - 5 * m.checkm_contamination

    while True:
        species, genome_ids = queue_in.get(block=True, timeout=None)
        if species is None:  # fix: identity test for sentinel (was '== None')
            break

        # select highest quality genomes
        if len(genome_ids) > max_genomes:
            t = [(gid, q) for gid, q in genome_quality.items() if gid in genome_ids]
            hq_genomes = sorted(t, key=lambda x: x[1], reverse=True)[0:max_genomes]
            genome_ids = [x[0] for x in hq_genomes]

        ani = []
        af = []
        results = ''
        tmp_dir = tempfile.mkdtemp()
        try:
            for gi, gj in itertools.combinations(genome_ids, 2):
                # reserve a unique output file name for ani_calculator
                tmp_file = tempfile.NamedTemporaryFile(delete=False, dir=tmp_dir)
                tmp_file.close()

                # NOTE(review): command built by string interpolation and run
                # through the shell; paths are assumed shell-safe local files.
                cmd = ('ani_calculator '
                       '-genome1fna %s '
                       '-genome2fna %s '
                       '-outfile %s -outdir %s '
                       '> /dev/null') % (nt_files[gi],
                                         nt_files[gj],
                                         tmp_file.name,
                                         tmp_dir)
                os.system(cmd)

                with open(tmp_file.name) as f:
                    f.readline()  # skip header line
                    for line in f:
                        results += '%s\t%s' % (species, line)

                        line_split = line.strip().split('\t')
                        # mean of the two directional gANI / AF estimates
                        ani.append(0.5 * (float(line_split[2]) + float(line_split[3])))
                        af.append(0.5 * (float(line_split[4]) + float(line_split[5])))
        finally:
            # fix: previously the temporary directory leaked if any
            # exception occurred while running or parsing ani_calculator
            shutil.rmtree(tmp_dir)

        queue_out.put((species, ani, af, genome_ids, results))
def read_quality_metadata(metadata_file):
    """Read statistics needed to determine genome quality."""
    # columns required by the genome quality scoring logic
    quality_fields = ['gtdb_taxonomy',
                      'checkm_completeness',
                      'checkm_contamination',
                      'checkm_strain_heterogeneity_100',
                      'genome_size',
                      'contig_count',
                      'n50_contigs',
                      'scaffold_count',
                      'ambiguous_bases',
                      'total_gap_length',
                      'ssu_count',
                      'ssu_length',
                      'mimag_high_quality',
                      'ncbi_assembly_level',
                      'ncbi_genome_representation',
                      'ncbi_refseq_category',
                      'ncbi_type_material_designation',
                      'ncbi_molecule_count',
                      'ncbi_unspanned_gaps',
                      'ncbi_spanned_gaps',
                      'ncbi_genome_category']

    return read_gtdb_metadata(metadata_file, quality_fields)
def _gtdb_user_genomes(self, gtdb_user_genomes_file, metadata_file):
    """Get map between GTDB User genomes and GenBank accessions.

    Parameters
    ----------
    gtdb_user_genomes_file : str
        TSV file with GenBank accession in column 1 and UBA ID in column 5.
    metadata_file : str
        GTDB metadata file providing organism names.

    Returns
    -------
    dict : d[user_genome_id] -> GenBank accession
    """
    # fix: close the mapping file (was a bare open() left dangling)
    uba_to_genbank = {}
    with open(gtdb_user_genomes_file) as f:
        for line in f:
            line_split = line.strip().split('\t')
            gb_acc = line_split[0]
            uba_id = line_split[4]
            uba_to_genbank[uba_id] = gb_acc

    user_to_genbank = {}
    m = read_gtdb_metadata(metadata_file, ['organism_name'])
    for gid, metadata in m.items():
        if '(UBA' in str(metadata.organism_name):
            # extract the UBA identifier from the trailing "(UBA...)" suffix
            uba_id = metadata.organism_name[metadata.organism_name.find('(')+1:-1]
            if uba_id in uba_to_genbank:
                user_to_genbank[gid] = uba_to_genbank[uba_id]

    return user_to_genbank
def _gtdb_user_genomes(self, gtdb_user_genomes_file, metadata_file):
    """Get map between GTDB User genomes and GenBank accessions.

    Returns
    -------
    dict : d[user_genome_id] -> GenBank accession
    """
    # fix: ensure the mapping file is closed (was a bare open() in a loop)
    uba_to_genbank = {}
    with open(gtdb_user_genomes_file) as fin:
        for line in fin:
            tokens = line.strip().split('\t')
            # column 1 = GenBank accession, column 5 = UBA identifier
            uba_to_genbank[tokens[4]] = tokens[0]

    user_to_genbank = {}
    m = read_gtdb_metadata(metadata_file, ['organism_name'])
    for gid, metadata in m.items():
        if '(UBA' in str(metadata.organism_name):
            # organism name ends with "(UBA...)"; pull out the UBA ID
            uba_id = metadata.organism_name[metadata.organism_name.find('(') + 1:-1]
            if uba_id in uba_to_genbank:
                user_to_genbank[gid] = uba_to_genbank[uba_id]

    return user_to_genbank
def _genome_stats(self, metadata_file):
    """Get genome and assembly quality metadata."""
    # metadata columns used for assessing genome/assembly quality
    fields = ['checkm_completeness',
              'checkm_contamination',
              'contig_count',
              'n50_scaffolds',
              'ambiguous_bases',
              'total_gap_length',
              'scaffold_count',
              'ssu_count',
              'gtdb_taxonomy',
              'ncbi_molecule_count',
              'ncbi_unspanned_gaps',
              'ncbi_genome_representation',
              'ncbi_spanned_gaps',
              'ncbi_assembly_level',
              'ncbi_taxonomy',
              'ncbi_organism_name',
              'lpsn_strain']

    return read_gtdb_metadata(metadata_file, fields)
def read_quality_metadata(metadata_file):
    """Read statistics needed to determine genome quality."""
    return read_gtdb_metadata(
        metadata_file,
        [
            # CheckM and assembly statistics
            'gtdb_taxonomy',
            'checkm_completeness',
            'checkm_contamination',
            'checkm_strain_heterogeneity_100',
            'genome_size',
            'contig_count',
            'n50_contigs',
            'scaffold_count',
            'ambiguous_bases',
            'total_gap_length',
            'ssu_count',
            'ssu_length',
            'mimag_high_quality',
            # NCBI assembly annotations
            'ncbi_assembly_level',
            'ncbi_genome_representation',
            'ncbi_refseq_category',
            'ncbi_type_material_designation',
            'ncbi_molecule_count',
            'ncbi_unspanned_gaps',
            'ncbi_spanned_gaps',
            'ncbi_genome_category',
        ])
def run(self, metadata_file, gtdb_user_genomes_file, gtdb_user_reps,
        ncbi_refseq_assembly_file, ncbi_genbank_assembly_file,
        gtdb_domain_report, qc_exception_file, species_exception_file,
        min_comp, max_cont, min_quality, sh_exception, min_perc_markers,
        max_contigs, min_N50, max_ambiguous, output_dir):
    """Quality check all potential GTDB genomes.

    Applies completeness, contamination, quality, marker-percentage,
    contig-count, N50 and ambiguous-base thresholds to every genome,
    writing qc_passed.tsv / qc_failed.tsv, and then reports NCBI species
    whose type genomes (or all genomes) fail QC.
    """
    # get GTDB and NCBI taxonomy strings for each genome
    self.logger.info('Reading NCBI taxonomy from GTDB metadata file.')
    ncbi_taxonomy, ncbi_update_count = read_gtdb_ncbi_taxonomy(metadata_file, species_exception_file)
    ncbi_species = binomial_species(ncbi_taxonomy)
    gtdb_taxonomy = read_gtdb_taxonomy(metadata_file)
    self.logger.info('Read NCBI taxonomy for %d genomes with %d manually defined updates.'
                     % (len(ncbi_taxonomy), ncbi_update_count))
    self.logger.info('Read GTDB taxonomy for %d genomes.' % len(gtdb_taxonomy))

    # determine User genomes to retain for consideration
    gtdb_user_to_genbank = self._gtdb_user_genomes(gtdb_user_genomes_file, metadata_file)
    self.logger.info('Identified %d GTDB User genomes with GenBank accessions to retain for potential inclusion in GTDB.'
                     % len(gtdb_user_to_genbank))

    # archaeal User genomes without a GenBank accession are kept under
    # their own ID; bacterial ones are skipped
    user_genomes = 0
    for line in open(gtdb_user_reps):
        line_split = line.strip().split('\t')
        gid, taxonomy = line_split
        if gid not in gtdb_user_to_genbank:
            if 'd__Bacteria' in taxonomy:
                self.logger.warning('Bacterial genome %s has no NCBI accession and is being skipped.' % gid)
            else:
                gtdb_user_to_genbank[gid] = gid
                user_genomes += 1
    self.logger.info('Identified %d archaeal GTDB User genome WITHOUT GenBank accessions to retain for potential inclusion in GTDB.'
                     % user_genomes)

    # parse genomes flagged as exceptions from QC
    qc_exceptions = set()
    for line in open(qc_exception_file):
        qc_exceptions.add(line.split('\t')[0].strip())
    self.logger.info('Identified %d genomes flagged as exceptions from QC.'
                     % len(qc_exceptions))

    # calculate quality score for genomes
    self.logger.info('Parsing QC statistics for each genome.')
    quality_metadata = read_gtdb_metadata(metadata_file,
                                          ['checkm_completeness',
                                           'checkm_contamination',
                                           'checkm_strain_heterogeneity_100',
                                           'contig_count',
                                           'n50_contigs',
                                           'ambiguous_bases',
                                           'genome_size'])
    marker_perc = parse_marker_percentages(gtdb_domain_report)

    # parse NCBI assembly files
    self.logger.info('Parsing NCBI assembly files.')
    excluded_from_refseq_note = exclude_from_refseq(ncbi_refseq_assembly_file, ncbi_genbank_assembly_file)

    # get type material designations for each genome
    self.logger.info('Reading type material designations for genomes from GTDB metadata file.')
    type_metadata = read_gtdb_metadata(metadata_file,
                                       ['ncbi_type_material_designation',
                                        'gtdb_type_designation',
                                        'gtdb_type_designation_sources'])
    ncbi_tsp = ncbi_type_strain_of_species(type_metadata)
    gtdb_tsp = gtdb_type_strain_of_species(type_metadata)

    # QC all genomes
    self.logger.info('Validating genomes.')
    fout_retained = open(os.path.join(output_dir, 'qc_passed.tsv'), 'w')
    fout_failed = open(os.path.join(output_dir, 'qc_failed.tsv'), 'w')

    header = 'Accession\tNCBI species'
    header += '\tCompleteness (%)\tContamination (%)\tQuality\tStrain heterogeneity at 100%'
    header += '\tMarkers (%)\tNo. contigs\tN50 contigs\tAmbiguous bases'

    fout_retained.write(header + '\tNote\n')
    fout_failed.write(header)
    fout_failed.write('\tFailed completeness\tFailed contamination\tFailed quality')
    fout_failed.write('\tFailed marker percentage\tFailed no. contigs\tFailed N50 contigs\tFailed ambiguous bases\n')

    num_retained = 0
    num_filtered = 0
    for gid in quality_metadata:
        if gid.startswith('U_') and gid not in gtdb_user_to_genbank:
            # skip user genomes not marked for retention
            continue

        failed_tests = defaultdict(int)
        passed_qc = pass_qc(quality_metadata[gid],
                            marker_perc[gid],
                            min_comp, max_cont, min_quality,
                            sh_exception, min_perc_markers,
                            max_contigs, min_N50, max_ambiguous,
                            failed_tests)

        if passed_qc or gid in qc_exceptions:
            num_retained += 1
            fout_retained.write('%s\t%s' % (gid, ncbi_taxonomy[gid][6]))
            fout_retained.write('\t%.2f\t%.2f\t%.2f\t%s\t%.2f\t%d\t%d\t%d\t%s\n' % (
                quality_metadata[gid].checkm_completeness,
                quality_metadata[gid].checkm_contamination,
                quality_metadata[gid].checkm_completeness - 5*quality_metadata[gid].checkm_contamination,
                ('%.2f' % quality_metadata[gid].checkm_strain_heterogeneity_100) if quality_metadata[gid].checkm_strain_heterogeneity_100 else '-',
                marker_perc[gid],
                quality_metadata[gid].contig_count,
                quality_metadata[gid].n50_contigs,
                quality_metadata[gid].ambiguous_bases,
                'Passed QC' if passed_qc else 'Flagged as exception'))
        else:
            num_filtered += 1
            fout_failed.write('%s\t%s' % (gid, ncbi_taxonomy[gid][6]))
            fout_failed.write('\t%.2f\t%.2f\t%.2f\t%s\t%.2f\t%d\t%d\t%d' % (
                quality_metadata[gid].checkm_completeness,
                quality_metadata[gid].checkm_contamination,
                quality_metadata[gid].checkm_completeness - 5*quality_metadata[gid].checkm_contamination,
                ('%.2f' % quality_metadata[gid].checkm_strain_heterogeneity_100) if quality_metadata[gid].checkm_strain_heterogeneity_100 else '-',
                marker_perc[gid],
                quality_metadata[gid].contig_count,
                quality_metadata[gid].n50_contigs,
                quality_metadata[gid].ambiguous_bases))
            fout_failed.write('\t%d\t%d\t%d\t%d\t%d\t%d\t%d\n' % (
                failed_tests['comp'],
                failed_tests['cont'],
                failed_tests['qual'],
                failed_tests['marker_perc'],
                failed_tests['contig_count'],
                failed_tests['N50'],
                failed_tests['ambig']))
    fout_retained.close()
    fout_failed.close()

    self.logger.info('Retained %d genomes and filtered %d genomes.' % (num_retained, num_filtered))

    # QC genomes in each named species
    self.logger.info('Performing QC of type genome for each of the %d NCBI species.' % len(ncbi_species))

    fout_type_fail = open(os.path.join(output_dir, 'type_genomes_fail_qc.tsv'), 'w')
    fout_type_fail.write('Species\tAccession\tGTDB taxonomy\tNCBI taxonomy\tType sources\tNCBI assembly type\tGenome size (bp)')
    fout_type_fail.write('\tCompleteness (%)\tContamination (%)\tQuality\tStrain heterogeneity at 100%')
    fout_type_fail.write('\tMarkers (%)\tNo. contigs\tN50 contigs\tAmbiguous bases\tNCBI exclude from RefSeq\tLost species\n')

    fout_fail_sp = open(os.path.join(output_dir, 'species_fail_qc.tsv'), 'w')
    fout_fail_sp.write('Species\tAccession\tGTDB taxonomy\tNCBI taxonomy\tAssembled from type material\tGenome size (bp)')
    fout_fail_sp.write('\tCompleteness (%)\tContamination (%)\tQuality\tStrain heterogeneity at 100%')
    fout_fail_sp.write('\tMarkers (%)\tNo. contigs\tN50 contigs\tAmbiguous bases')
    fout_fail_sp.write('\tFailed completeness\tFailed contamination\tFailed quality')
    fout_fail_sp.write('\tFailed marker percentage\tFailed no. contigs\tFailed N50 contigs\tFailed ambiguous bases')
    fout_fail_sp.write('\tNCBI exclude from RefSeq\n')

    fout_sp_lost = open(os.path.join(output_dir, 'species_lost.tsv'), 'w')
    fout_sp_lost.write('Species\tNo. genomes\tNo. type genomes')
    fout_sp_lost.write('\tFail completeness\tFail contamination\tFail quality\tFailed percent markers')
    fout_sp_lost.write('\tFail no. contigs\tFail N50 contigs\tFail ambiguous bases\n')

    lost_type = 0
    lost_sp = 0
    filtered_genomes = 0
    failed_tests_cumulative = defaultdict(int)
    for sp, gids in ncbi_species.items():
        type_pass = set()
        type_fail = set()
        other_pass = set()
        other_fail = set()

        failed_tests_gids = {}
        for gid in gids:
            failed_tests = defaultdict(int)
            passed_qc = pass_qc(quality_metadata[gid],
                                marker_perc[gid],
                                min_comp, max_cont, min_quality,
                                sh_exception, min_perc_markers,
                                max_contigs, min_N50, max_ambiguous,
                                failed_tests)
            failed_tests_gids[gid] = failed_tests

            # partition genomes into type material vs others
            if gid in gtdb_tsp or gid in ncbi_tsp:
                if passed_qc:
                    type_pass.add(gid)
                else:
                    type_fail.add(gid)
                    filtered_genomes += 1
            else:
                if passed_qc:
                    other_pass.add(gid)
                else:
                    other_fail.add(gid)
                    filtered_genomes += 1

            # tally failed species
            for test, count in failed_tests.items():
                failed_tests_cumulative[test] += count

        if len(type_pass) >= 1:
            # great: one or more type genomes pass QC and will be selected as the type genome
            continue

        if len(type_fail):
            # all potential type genomes for species failed QC so report these for manual inspection
            lost_type += 1
            for gid in type_fail:
                fout_type_fail.write('%s\t%s\t%s\t%s\t%s\t%s\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%d\t%d\t%d\t%s\t%s\n' % (
                    sp,
                    gid,
                    '; '.join(gtdb_taxonomy[gid]),
                    '; '.join(ncbi_taxonomy[gid]),
                    type_metadata[gid].gtdb_type_designation_sources,
                    type_metadata[gid].ncbi_type_material_designation,
                    float(quality_metadata[gid].genome_size)/1e6,
                    quality_metadata[gid].checkm_completeness,
                    quality_metadata[gid].checkm_contamination,
                    quality_metadata[gid].checkm_completeness - 5*quality_metadata[gid].checkm_contamination,
                    quality_metadata[gid].checkm_strain_heterogeneity_100,
                    marker_perc[gid],
                    quality_metadata[gid].contig_count,
                    quality_metadata[gid].n50_contigs,
                    quality_metadata[gid].ambiguous_bases,
                    excluded_from_refseq_note[gid],
                    len(other_pass) == 0))

        if len(other_pass) == 0:
            # no genomes for species pass QC so report loss of species
            lost_sp += 1
            fout_sp_lost.write('%s\t%d\t%d' % (sp, len(gids), len(type_fail)))
            fout_sp_lost.write('\t%d\t%d\t%d\t%d\t%d\t%d\t%d\n' % (
                sum([failed_tests_gids[gid]['comp'] for gid in gids]),
                sum([failed_tests_gids[gid]['cont'] for gid in gids]),
                sum([failed_tests_gids[gid]['qual'] for gid in gids]),
                sum([failed_tests_gids[gid]['marker_perc'] for gid in gids]),
                sum([failed_tests_gids[gid]['contig_count'] for gid in gids]),
                sum([failed_tests_gids[gid]['N50'] for gid in gids]),
                sum([failed_tests_gids[gid]['ambig'] for gid in gids])))

            for gid in type_fail.union(other_fail):
                fout_fail_sp.write('%s\t%s\t%s\t%s\t%s\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%d\t%d\t%d' % (
                    sp,
                    gid,
                    '; '.join(gtdb_taxonomy[gid]),
                    '; '.join(ncbi_taxonomy[gid]),
                    gid in type_fail,
                    float(quality_metadata[gid].genome_size)/1e6,
                    quality_metadata[gid].checkm_completeness,
                    quality_metadata[gid].checkm_contamination,
                    quality_metadata[gid].checkm_completeness - 5*quality_metadata[gid].checkm_contamination,
                    quality_metadata[gid].checkm_strain_heterogeneity_100,
                    marker_perc[gid],
                    quality_metadata[gid].contig_count,
                    quality_metadata[gid].n50_contigs,
                    quality_metadata[gid].ambiguous_bases))
                fout_fail_sp.write('\t%d\t%d\t%d\t%d\t%d\t%d\t%d' % (
                    failed_tests_gids[gid]['comp'],
                    failed_tests_gids[gid]['cont'],
                    failed_tests_gids[gid]['qual'],
                    failed_tests_gids[gid]['marker_perc'],
                    failed_tests_gids[gid]['contig_count'],
                    failed_tests_gids[gid]['N50'],
                    failed_tests_gids[gid]['ambig']))
                fout_fail_sp.write('\t%s\n' % excluded_from_refseq_note[gid])

    fout_type_fail.close()
    fout_fail_sp.close()
    fout_sp_lost.close()

    self.logger.info('Genomes filtered for each criterion:')
    for test in sorted(failed_tests_cumulative):
        self.logger.info('%s: %d' % (test, failed_tests_cumulative[test]))

    self.logger.info('Filtered %d genomes assigned to NCBI species.' % filtered_genomes)
    self.logger.info('Identified %d species with type genomes failing QC and %d total species failing QC.' % (lost_type, lost_sp))
def run(self, qc_file, gtdb_metadata_file, gtdb_final_clusters, species_exception_file, output_dir):
    """Quality check all potential GTDB genomes.

    Sanity checks that all species clusters are defined by genomes
    passing QC, reports incongruent species/genus names, and writes
    canonical and validation taxonomy/GID files per domain.
    """
    # identify genomes failing quality criteria
    self.logger.info('Reading QC file.')
    passed_qc = read_qc_file(qc_file)
    self.logger.info('Identified %d genomes passing QC.' % len(passed_qc))

    # get GTDB and NCBI taxonomy strings for each genome
    self.logger.info('Reading NCBI and GTDB taxonomy from GTDB metadata file.')
    ncbi_taxonomy, ncbi_update_count = read_gtdb_ncbi_taxonomy(gtdb_metadata_file, species_exception_file)
    prev_gtdb_taxonomy = read_gtdb_taxonomy(gtdb_metadata_file)
    self.logger.info('Read NCBI taxonomy for %d genomes with %d manually defined updates.' % (len(ncbi_taxonomy), ncbi_update_count))
    self.logger.info('Read GTDB taxonomy for %d genomes.' % len(prev_gtdb_taxonomy))

    # get GTDB metadata
    type_metadata = read_gtdb_metadata(gtdb_metadata_file,
                                       ['gtdb_type_designation',
                                        'gtdb_type_designation_sources',
                                        'gtdb_type_species_of_genus'])

    quality_metadata = read_quality_metadata(gtdb_metadata_file)

    # read species clusters
    sp_clusters, species, _rep_radius = read_clusters(gtdb_final_clusters)
    self.logger.info('Read %d species clusters.' % len(sp_clusters))

    # sanity check species clusters all defined by genomes passing QC
    for gid in sp_clusters:
        if gid not in passed_qc:
            self.logger.error('Genome %s defines a species cluster, but fails QC.' % gid)
            sys.exit(-1)

    # modify GTDB taxonomy to reflect new species clustering and report incongruencies
    self.logger.info('Identifying species with incongruent specific names.')
    self._incongruent_specific_names(species, ncbi_taxonomy, prev_gtdb_taxonomy, type_metadata, output_dir)

    self._incongruent_genus_names(species, ncbi_taxonomy, prev_gtdb_taxonomy, type_metadata, output_dir)

    # get GIDs for canonical and validation trees
    fout_bac_can_gtdb = open(os.path.join(output_dir, 'bac_can_taxonomy.tsv'), 'w')
    fout_bac_val_gtdb = open(os.path.join(output_dir, 'bac_val_taxonomy.tsv'), 'w')
    fout_ar_can_gtdb = open(os.path.join(output_dir, 'ar_can_taxonomy.tsv'), 'w')
    fout_ar_val_gtdb = open(os.path.join(output_dir, 'ar_val_taxonomy.tsv'), 'w')

    fout_bac_val = open(os.path.join(output_dir, 'gids_bac_validation.lst'), 'w')
    fout_ar_val = open(os.path.join(output_dir, 'gids_ar_validation.lst'), 'w')
    fout_bac_can = open(os.path.join(output_dir, 'gids_bac_canonical.lst'), 'w')
    fout_ar_can = open(os.path.join(output_dir, 'gids_ar_canonical.lst'), 'w')
    fout_bac_val.write('#Accession\tSpecies\tNote\n')
    fout_ar_val.write('#Accession\tSpecies\tNote\n')
    fout_bac_can.write('#Accession\tSpecies\tNote\n')
    fout_ar_can.write('#Accession\tSpecies\tNote\n')

    for rid in sp_clusters:
        # select per-domain output files
        domain = prev_gtdb_taxonomy[rid][0]
        if domain == 'd__Bacteria':
            fout_val = fout_bac_val
            fout_can = fout_bac_can
            fout_can_gtdb = fout_bac_can_gtdb
            fout_val_gtdb = fout_bac_val_gtdb
        elif domain == 'd__Archaea':
            fout_val = fout_ar_val
            fout_can = fout_ar_can
            fout_can_gtdb = fout_ar_can_gtdb
            fout_val_gtdb = fout_ar_val_gtdb
        else:
            self.logger.error('Genome %s has no GTDB domain assignment.' % rid)
            sys.exit(-1)

        # substitute proposed species name into GTDB taxonomy
        sp = species[rid]
        canonical_sp = parse_canonical_sp(sp)
        taxa = prev_gtdb_taxonomy[rid][0:6] + [canonical_sp]
        new_gtdb_str = '; '.join(taxa)

        fout_can_gtdb.write('%s\t%s\n' % (rid, new_gtdb_str))
        fout_val_gtdb.write('%s\t%s\n' % (rid, new_gtdb_str))

        fout_val.write('%s\t%s\t%s\n' % (rid, sp, 'GTDB type or representative genome'))
        fout_can.write('%s\t%s\t%s\n' % (rid, sp, 'GTDB type or representative genome'))

        cluster_gids = set(sp_clusters[rid])
        for gid in cluster_gids:
            if gid not in passed_qc:
                self.logger.error('Genome %s is in a species cluster, but fails QC.' % gid)
                sys.exit(-1)

        if len(cluster_gids) > 0:
            # select highest-quality genome
            q = quality_score(cluster_gids, quality_metadata)
            gid = max(q.items(), key=operator.itemgetter(1))[0]

            taxa = prev_gtdb_taxonomy[gid][0:6] + [canonical_sp]
            new_gtdb_str = '; '.join(taxa)

            fout_val.write('%s\t%s\t%s\n' % (gid, sp, 'selected highest-quality genome (Q=%.2f)' % q[gid]))
            fout_val_gtdb.write('%s\t%s\n' % (gid, new_gtdb_str))

    fout_bac_val.close()
    fout_ar_val.close()
    fout_bac_can.close()
    fout_ar_can.close()

    fout_bac_can_gtdb.close()
    fout_bac_val_gtdb.close()
    fout_ar_can_gtdb.close()
    fout_ar_val_gtdb.close()
def run(self, qc_file, gtdb_metadata_file, gtdb_final_clusters, output_dir):
    """Quality check all potential GTDB genomes.

    Variant without a species exception file: sanity checks species
    clusters against QC results, reports incongruent names, and writes
    canonical/validation taxonomy and GID files per domain.
    """
    # identify genomes failing quality criteria
    self.logger.info('Reading QC file.')
    passed_qc = read_qc_file(qc_file)
    self.logger.info('Identified %d genomes passing QC.' % len(passed_qc))

    # get GTDB and NCBI taxonomy strings for each genome
    self.logger.info(
        'Reading NCBI and GTDB taxonomy from GTDB metadata file.')
    ncbi_taxonomy = read_gtdb_ncbi_taxonomy(gtdb_metadata_file)
    prev_gtdb_taxonomy = read_gtdb_taxonomy(gtdb_metadata_file)
    self.logger.info('Read NCBI taxonomy for %d genomes.' %
                     len(ncbi_taxonomy))
    self.logger.info('Read GTDB taxonomy for %d genomes.' %
                     len(prev_gtdb_taxonomy))

    # get GTDB metadata
    type_metadata = read_gtdb_metadata(gtdb_metadata_file, [
        'gtdb_type_designation', 'gtdb_type_designation_sources',
        'gtdb_type_species_of_genus'
    ])

    quality_metadata = read_quality_metadata(gtdb_metadata_file)

    # read species clusters
    sp_clusters, species = read_clusters(gtdb_final_clusters)
    self.logger.info('Read %d species clusters.' % len(sp_clusters))

    # sanity check species clusters all defined by genomes passing QC
    for gid in sp_clusters:
        if gid not in passed_qc:
            self.logger.error(
                'Genome %s defines a species cluster, but fails QC.' % gid)
            sys.exit(-1)

    # modify GTDB taxonomy to reflect new species clustering and report incongruencies
    self.logger.info(
        'Identifying species with incongruent specific names.')
    self._incongruent_specific_names(species, ncbi_taxonomy,
                                     prev_gtdb_taxonomy, type_metadata,
                                     output_dir)

    self._incongruent_genus_names(species, ncbi_taxonomy,
                                  prev_gtdb_taxonomy, type_metadata,
                                  output_dir)

    # get GIDs for canonical and validation trees
    fout_bac_can_gtdb = open(
        os.path.join(output_dir, 'bac_can_taxonomy.tsv'), 'w')
    fout_bac_val_gtdb = open(
        os.path.join(output_dir, 'bac_val_taxonomy.tsv'), 'w')
    fout_ar_can_gtdb = open(
        os.path.join(output_dir, 'ar_can_taxonomy.tsv'), 'w')
    fout_ar_val_gtdb = open(
        os.path.join(output_dir, 'ar_val_taxonomy.tsv'), 'w')

    fout_bac_val = open(
        os.path.join(output_dir, 'gids_bac_validation.lst'), 'w')
    fout_ar_val = open(os.path.join(output_dir, 'gids_ar_validation.lst'),
                       'w')
    fout_bac_can = open(os.path.join(output_dir, 'gids_bac_canonical.lst'),
                        'w')
    fout_ar_can = open(os.path.join(output_dir, 'gids_ar_canonical.lst'),
                       'w')
    fout_bac_val.write('#Accession\tSpecies\tNote\n')
    fout_ar_val.write('#Accession\tSpecies\tNote\n')
    fout_bac_can.write('#Accession\tSpecies\tNote\n')
    fout_ar_can.write('#Accession\tSpecies\tNote\n')

    for rid in sp_clusters:
        # select per-domain output files
        domain = prev_gtdb_taxonomy[rid][0]
        if domain == 'd__Bacteria':
            fout_val = fout_bac_val
            fout_can = fout_bac_can
            fout_can_gtdb = fout_bac_can_gtdb
            fout_val_gtdb = fout_bac_val_gtdb
        elif domain == 'd__Archaea':
            fout_val = fout_ar_val
            fout_can = fout_ar_can
            fout_can_gtdb = fout_ar_can_gtdb
            fout_val_gtdb = fout_ar_val_gtdb
        else:
            self.logger.error('Genome %s has no GTDB domain assignment.' % rid)
            sys.exit(-1)

        # substitute proposed species name into GTDB taxonomy
        sp = species[rid]
        canonical_sp = parse_canonical_sp(sp)
        taxa = prev_gtdb_taxonomy[rid][0:6] + [canonical_sp]
        new_gtdb_str = '; '.join(taxa)

        fout_can_gtdb.write('%s\t%s\n' % (rid, new_gtdb_str))
        fout_val_gtdb.write('%s\t%s\n' % (rid, new_gtdb_str))

        fout_val.write('%s\t%s\t%s\n' %
                       (rid, sp, 'GTDB type or representative genome'))
        fout_can.write('%s\t%s\t%s\n' %
                       (rid, sp, 'GTDB type or representative genome'))

        cluster_gids = set(sp_clusters[rid])
        for gid in cluster_gids:
            if gid not in passed_qc:
                self.logger.error(
                    'Genome %s is in a species cluster, but fails QC.' % gid)
                sys.exit(-1)

        if len(cluster_gids) > 0:
            # select highest-quality genome
            q = quality_score(cluster_gids, quality_metadata)
            gid = max(q.items(), key=operator.itemgetter(1))[0]

            # NOTE(review): the representative's taxonomy string is reused
            # here rather than being rebuilt from prev_gtdb_taxonomy[gid] —
            # confirm this is intentional.
            fout_val.write(
                '%s\t%s\t%s\n' %
                (gid, sp,
                 'selected highest-quality genome (Q=%.2f)' % q[gid]))
            fout_val_gtdb.write('%s\t%s\n' % (gid, new_gtdb_str))

    fout_bac_val.close()
    fout_ar_val.close()
    fout_bac_can.close()
    fout_ar_can.close()

    fout_bac_can_gtdb.close()
    fout_bac_val_gtdb.close()
    fout_ar_can_gtdb.close()
    fout_ar_val_gtdb.close()
def propagate(self, options):
    """Propagate labels to all genomes in a cluster.

    Reads an explicit taxonomy for representative genomes, reconciles it
    with the taxonomies of genomes clustered with each representative,
    and writes the expanded taxonomy for every genome to
    options.output_taxonomy.
    """
    check_file_exists(options.input_taxonomy)
    check_file_exists(options.metadata_file)

    # get representative genome information
    rep_metadata = read_gtdb_metadata(options.metadata_file,
                                      ['gtdb_representative',
                                       'gtdb_clustered_genomes'])

    taxonomy = Taxonomy()
    explict_tax = taxonomy.read(options.input_taxonomy)
    expanded_taxonomy = {}
    incongruent_count = 0
    # fix: dict.iteritems() and xrange() were removed in Python 3; the file
    # otherwise targets Python 3 (f-strings are used elsewhere)
    for genome_id, taxon_list in explict_tax.items():
        taxonomy_str = ';'.join(taxon_list)

        # Propagate taxonomy strings if genome is a representative. Also, determine
        # if genomes clustered together have compatible taxonomies. Note that a genome
        # may not have metadata as it is possible a User has removed a genome that is
        # in the provided taxonomy file.
        _rep_genome, clustered_genomes = rep_metadata.get(genome_id, (None, None))
        if clustered_genomes:  # genome is a representative
            clustered_genome_ids = clustered_genomes.split(';')

            # get taxonomy of all genomes in cluster with a specified taxonomy
            clustered_genome_tax = {}
            for cluster_genome_id in clustered_genome_ids:
                if cluster_genome_id == genome_id:
                    continue
                if cluster_genome_id not in rep_metadata:
                    continue  # genome is no longer in the GTDB so ignore it
                if cluster_genome_id in explict_tax:
                    clustered_genome_tax[cluster_genome_id] = explict_tax[cluster_genome_id]

            # determine if representative and clustered genome taxonomy strings are congruent
            working_cluster_taxonomy = list(taxon_list)
            incongruent_with_rep = False
            for cluster_genome_id, cluster_tax in clustered_genome_tax.items():
                if incongruent_with_rep:
                    working_cluster_taxonomy = list(taxon_list)  # default to rep taxonomy
                    break

                for r in range(0, len(Taxonomy.rank_prefixes)):
                    if cluster_tax[r] == Taxonomy.rank_prefixes[r]:
                        break  # no more taxonomy information to consider

                    if cluster_tax[r] != taxon_list[r]:
                        if taxon_list[r] == Taxonomy.rank_prefixes[r]:
                            # clustered genome has a more specific taxonomy string which
                            # should be propagated to the representative if all clustered
                            # genomes are in agreement
                            if working_cluster_taxonomy[r] == Taxonomy.rank_prefixes[r]:
                                # make taxonomy more specific based on genomes in cluster
                                working_cluster_taxonomy[r] = cluster_tax[r]
                            elif working_cluster_taxonomy[r] != cluster_tax[r]:
                                # not all genomes agree on the assignment of this rank
                                # so leave it unspecified
                                working_cluster_taxonomy[r] = Taxonomy.rank_prefixes[r]
                                break
                        else:
                            # genomes in cluster have incongruent taxonomies so defer to representative
                            self.logger.warning("Genomes in cluster have incongruent taxonomies.")
                            self.logger.warning("Representative %s: %s" % (genome_id, taxonomy_str))
                            self.logger.warning("Clustered genome %s: %s" % (cluster_genome_id, ';'.join(cluster_tax)))
                            self.logger.warning("Deferring to taxonomy specified for representative.")

                            incongruent_count += 1
                            incongruent_with_rep = True
                            break

            cluster_taxonomy_str = ';'.join(working_cluster_taxonomy)

            # assign taxonomy to representative and all genomes in the cluster
            expanded_taxonomy[genome_id] = cluster_taxonomy_str
            for cluster_genome_id in clustered_genome_ids:
                expanded_taxonomy[cluster_genome_id] = cluster_taxonomy_str
        else:
            if genome_id in expanded_taxonomy:
                # genome has already been assigned a taxonomy based on its representative
                pass
            else:
                # genome is a singleton
                expanded_taxonomy[genome_id] = taxonomy_str

    self.logger.info('Identified %d clusters with incongruent taxonomies.' % incongruent_count)

    # write the expanded taxonomy (with-block guarantees the file is closed)
    with open(options.output_taxonomy, 'w') as fout:
        for genome_id, taxonomy_str in expanded_taxonomy.items():
            fout.write('%s\t%s\n' % (genome_id, taxonomy_str))

    self.logger.info('Taxonomy written to: %s' % options.output_taxonomy)
def run(self, input_tree, lineage_of_interest, outgroup, gtdb_metadata,
        num_taxa_to_retain, msa_file, keep_unclassified, output_dir):
    """Dereplicate tree.

    Parameters
    ----------
    input_tree : str
        Tree to dereplicate
    lineage_of_interest : str
        Named lineage where all taxa should be retain.
    outgroup : str
        Named lineage to use as outgroup.
    gtdb_metadata : str
        File containing metadata for taxa in tree.
    num_taxa_to_retain: int
        Taxa to retain in dereplicated lineages.
    msa_file : str
        Multiple sequence alignment to dereplicate along with tree.
    keep_unclassified : boolean
        Keep all taxa in unclassified lineages.
    output_dir:
        Output dir.
    """
    # read GTDB metadata
    self.logger.info('Reading metadata.')
    genome_metadata = read_gtdb_metadata(gtdb_metadata, [
        'checkm_completeness', 'checkm_contamination', 'gtdb_representative'
    ])

    # read tree
    self.logger.info('Reading tree.')
    tree = dendropy.Tree.get_from_path(input_tree,
                                       schema='newick',
                                       rooting='force-rooted',
                                       preserve_underscores=True)

    # locate node of interest and outgroup node by scanning internal labels
    self.logger.info('Identifying lineage of interest and outgroup.')
    node_of_interest = None
    outgroup_node = None
    for node in tree.preorder_node_iter():
        _support, taxon_str, _auxiliary_info = parse_label(node.label)
        if not taxon_str:
            continue

        # a node label may carry multiple ';'-separated taxon names
        for taxon in [t.strip() for t in taxon_str.split(';')]:
            if taxon == lineage_of_interest:
                node_of_interest = node
            elif taxon == outgroup:
                outgroup_node = node

    if not node_of_interest:
        self.logger.error(
            'Could not find specified lineage of interest: %s' %
            lineage_of_interest)
        sys.exit()

    if not outgroup_node:
        self.logger.error('Could not find outgroup: %s' % outgroup)
        sys.exit()

    # select taxa to retain
    self.logger.info('Selecting taxa to retain.')
    selected_taxa = self._select_taxa(tree, node_of_interest,
                                      outgroup_node, num_taxa_to_retain,
                                      keep_unclassified, genome_metadata)
    self.logger.info('Retaining %d taxa.' % len(selected_taxa))

    # prune tree
    self.logger.info('Pruning tree.')
    tree.retain_taxa(selected_taxa)

    # dereplicate MSA if requested
    if msa_file:
        self.logger.info('Dereplicating MSA.')
        msa_name, msa_ext = os.path.splitext(os.path.basename(msa_file))
        output_msa = os.path.join(output_dir, msa_name + '.derep' + msa_ext)
        self._derep_msa(msa_file, selected_taxa, output_msa)

    # write out results
    tree_name, tree_ext = os.path.splitext(os.path.basename(input_tree))
    output_tree = os.path.join(output_dir, tree_name + '.derep' + tree_ext)
    tree.write_to_path(output_tree,
                       schema='newick',
                       suppress_rooting=True,
                       unquoted_underscores=True)
def propagate(self, options):
    """Propagate labels to all genomes in a cluster.

    Reads the taxonomy of GTDB representatives and writes that taxonomy
    for the representative and every genome clustered with it. Genome IDs
    are canonicalized and, when a UBA mapping file is supplied, translated
    to UBA identifiers before lookup.
    """
    check_file_exists(options.input_taxonomy)
    check_file_exists(options.metadata_file)

    # optional mapping from User genome IDs to UBA identifiers
    user_to_uba = {}
    if options.uba_mapping_file:
        self.logger.info('Parsing genome ID mapping file.')
        with open(options.uba_mapping_file) as f:
            for line in f:
                tokens = line.strip().split('\t')
                if len(tokens) == 2:
                    user_to_uba[tokens[0]] = tokens[1]
        self.logger.info(' - found mappings for {:,} genomes.'.format(
            len(user_to_uba)))

    # get representative genome information, keyed by canonical/UBA IDs
    rep_metadata = read_gtdb_metadata(
        options.metadata_file,
        ['gtdb_representative', 'gtdb_clustered_genomes'])
    rep_metadata = {
        canonical_gid(gid): values
        for gid, values in rep_metadata.items()
    }
    rep_metadata = {
        user_to_uba.get(gid, gid): values
        for gid, values in rep_metadata.items()
    }

    explict_tax = Taxonomy().read(options.input_taxonomy)
    self.logger.info(f' - identified {len(rep_metadata):,} genomes')

    # sanity check all representatives have a taxonomy string
    rep_count = 0
    for gid in rep_metadata:
        is_rep_genome, clustered_genomes = rep_metadata.get(
            gid, (None, None))
        if is_rep_genome:
            rep_count += 1
            if gid not in explict_tax:
                self.logger.error(
                    'Expected to find {} in input taxonomy as it is a GTDB representative.'
                    .format(gid))
                sys.exit(-1)

    self.logger.info(
        'Identified {:,} representatives in metadata file and {:,} genomes in input taxonomy file.'
        .format(rep_count, len(explict_tax)))

    # propagate taxonomy of each representative to its clustered genomes;
    # fix: output file was never closed, so buffered writes could be lost
    with open(options.output_taxonomy, 'w') as fout:
        for rid, taxon_list in explict_tax.items():
            taxonomy_str = ';'.join(taxon_list)

            rid = canonical_gid(rid)
            rid = user_to_uba.get(rid, rid)
            is_rep_genome, clustered_genomes = rep_metadata[rid]
            if is_rep_genome:
                # assign taxonomy to representative and all genomes in the cluster
                fout.write('{}\t{}\n'.format(rid, taxonomy_str))
                for cid in [
                        gid.strip() for gid in clustered_genomes.split(';')
                ]:
                    cid = canonical_gid(cid)
                    cid = user_to_uba.get(cid, cid)
                    if cid != rid:
                        if cid in rep_metadata:
                            fout.write('{}\t{}\n'.format(cid, taxonomy_str))
                        else:
                            self.logger.warning(
                                'Skipping {} as it is not in GTDB metadata file.'
                                .format(cid))
            else:
                # fix: grammar in error message ("Did not expected" -> "Did not expect")
                self.logger.error(
                    'Did not expect to find {} in input taxonomy as it is not a GTDB representative.'
                    .format(rid))
                sys.exit(-1)

    self.logger.info('Taxonomy written to: {}'.format(
        options.output_taxonomy))
def run(self, max_species, prev_rep_file, trusted_genomes_file,
        metadata_file, min_rep_comp, max_rep_cont, min_quality,
        max_contigs, min_N50, max_ambiguous, max_gap_length,
        strict_filtering, output_file):
    """Dereplicate genomes to a specific number per named species.

    Parameters
    ----------
    max_species : int
        Maximum number of genomes of the same species to retain.
    prev_rep_file : str
        File indicating previous representatives to favour during selection.
    trusted_genomes_file:
        File containing list of genomes to retain regardless of filtering criteria.
    metadata_file : str
        Metadata, including CheckM estimates, for all genomes.
    min_rep_comp : float [0, 100]
        Minimum completeness for a genome to be a representative.
    max_rep_cont : float [0, 100]
        Maximum contamination for a genome to be a representative.
    min_quality : float [0, 100]
        Minimum genome quality (comp-5*cont) for a genome to be a representative.
    max_contigs : int
        Maximum number of contigs for a genome to be a representative.
    min_N50 : int
        Minimum N50 of scaffolds for a genome to be a representative.
    max_ambiguous : int
        Maximum number of ambiguous bases for a genome to be a representative.
    max_gap_length : int
        Maximum number of ambiguous bases between contigs for a genome to be a representative.
    strict_filtering : boolean
        If True apply filtering to all genomes, otherwise apply lenient
        filtering to genomes where the chromosome and plasmids are
        reported as complete.
    output_file : str
        Output file to contain list of dereplicated genomes.
    """
    # genomes to retain regardless of filtering criteria
    trusted_accessions = set()
    if trusted_genomes_file:
        for line in open(trusted_genomes_file):
            line_split = line.split('\t')
            trusted_accessions.add(line_split[0].strip())

    accession_to_taxid, complete_genomes, representative_genomes = ncbi.read_refseq_metadata(
        metadata_file, keep_db_prefix=True)
    self.logger.info('Identified %d RefSeq genomes.' %
                     len(accession_to_taxid))
    self.logger.info('Identified %d representative or reference genomes.' %
                     len(representative_genomes))
    self.logger.info('Identified %d complete genomes.' %
                     len(complete_genomes))
    self.logger.info('Identified %d genomes in exception list.' %
                     len(trusted_accessions))

    if trusted_accessions.difference(representative_genomes):
        self.logger.error(
            'There are genomes in the exception list which are not representatives.')
        sys.exit()

    gtdb_taxonomy = read_gtdb_taxonomy(metadata_file)
    ncbi_taxonomy = read_gtdb_ncbi_taxonomy(metadata_file)
    ncbi_organism_names = read_gtdb_ncbi_organism_name(metadata_file)
    species = species_label(gtdb_taxonomy, ncbi_taxonomy,
                            ncbi_organism_names)
    self.logger.info(
        'Identified %d genomes with a GTDB or NCBI species names.' %
        len(species))

    # get previous representatives
    prev_gtdb_reps = set()
    for line in open(prev_rep_file):
        prev_gtdb_reps.add(line.strip().split('\t')[0])
    self.logger.info('Identified %d previous GTDB representatives.' %
                     len(prev_gtdb_reps))

    # get genome quality
    genomes_to_consider = list(accession_to_taxid.keys())
    genome_stats = read_gtdb_metadata(metadata_file, [
        'checkm_completeness', 'checkm_contamination', 'contig_count',
        'n50_scaffolds', 'ambiguous_bases', 'total_gap_length',
        'scaffold_count', 'ssu_count', 'ncbi_molecule_count',
        'ncbi_unspanned_gaps', 'ncbi_genome_representation',
        'ncbi_spanned_gaps', 'ncbi_assembly_level', 'ncbi_taxonomy',
        'ncbi_organism_name', 'lpsn_strain'
    ])

    missing_quality = set(accession_to_taxid.keys()) - set(genome_stats.keys())
    if missing_quality:
        self.logger.error(
            'There are %d genomes without metadata information.' %
            len(missing_quality))
        # BUG FIX: was self.exit(-1), which is not a method of this class
        sys.exit(-1)

    filtered_reps_file = output_file + '.filtered_reps'
    fout = open(filtered_reps_file, 'w')
    # BUG FIX: header now matches the 8 fields written per row
    # (ambiguous bases and total gap length were missing)
    fout.write('Genome ID\tCompleteness\tContamination\tContig Count\tN50'
               '\tAmbiguous Bases\tTotal Gap Length\tNote\n')

    lpsn_type_strains = defaultdict(set)
    new_genomes_to_consider = []
    genome_quality = {}
    filtered_reps = 0
    lack_ncbi_taxonomy = 0
    contig_filter_count = 0
    for genome_id in list(accession_to_taxid.keys()):
        stats = genome_stats[genome_id]

        # BUG FIX: comp/cont must be defined before the missing-taxonomy
        # branch below, which previously referenced them before assignment
        comp = stats.checkm_completeness
        cont = stats.checkm_contamination

        if not stats.ncbi_taxonomy:
            lack_ncbi_taxonomy += 1
            fout.write('%s\t%.2f\t%.2f\t%d\t%d\t%d\t%d\t%s\n' %
                       (genome_id, comp, cont, stats.contig_count,
                        stats.n50_scaffolds, stats.ambiguous_bases,
                        stats.total_gap_length, 'no NCBI taxonomy'))
            self.logger.warning(
                'Skipping %s as it has no assigned NCBI taxonomy.' %
                genome_id)
            continue

        keep = False
        if genome_id in trusted_accessions:
            keep = True
        elif (comp >= min_rep_comp
              and cont <= max_rep_cont
              and (comp - 5 * cont) >= min_quality
              and stats.contig_count <= max_contigs
              and stats.n50_scaffolds >= min_N50
              and stats.ambiguous_bases <= max_ambiguous
              and stats.total_gap_length <= max_gap_length):
            keep = True
        elif not strict_filtering:
            # check if genome appears to consist of only an unspanned
            # chromosome and unspanned plasmids and thus can be
            # subjected to a more lenient quality check
            if (stats.ncbi_assembly_level in ['Complete Genome', 'Chromosome']
                    and stats.ncbi_genome_representation == 'full'
                    and stats.scaffold_count == stats.ncbi_molecule_count
                    and stats.ncbi_unspanned_gaps == 0
                    and stats.ncbi_spanned_gaps <= 10
                    and stats.ambiguous_bases <= 1000
                    and stats.total_gap_length <= 100000
                    and stats.ssu_count >= 1):
                # apply lenient quality check
                if comp >= 50 and cont <= 15:
                    keep = True

        if keep:
            new_genomes_to_consider.append(genome_id)
            genome_quality[genome_id] = comp - 5 * cont

            if stats.lpsn_strain:
                ncbi_species = stats.ncbi_taxonomy.split(';')[6].strip()
                lpsn_type_strains[ncbi_species].add(genome_id)

        # check if a representative at NCBI is being filtered
        if genome_id in representative_genomes and genome_id not in new_genomes_to_consider:
            fout.write('%s\t%.2f\t%.2f\t%d\t%d\t%d\t%d\t%s\n' %
                       (genome_id, comp, cont, stats.contig_count,
                        stats.n50_scaffolds, stats.ambiguous_bases,
                        stats.total_gap_length, stats.ncbi_organism_name))
            if stats.contig_count > 300:
                contig_filter_count += 1
            self.logger.warning(
                'Filtered RefSeq representative %s with comp=%.2f, cont=%.2f, contigs=%d, N50=%d'
                % (genome_id, comp, cont, stats.contig_count,
                   stats.n50_scaffolds))
            filtered_reps += 1

    fout.close()

    print('contig_filter_count', contig_filter_count)
    genomes_to_consider = new_genomes_to_consider
    self.logger.info(
        'Skipped %d genomes without an assigned NCBI taxonomy.' %
        lack_ncbi_taxonomy)
    self.logger.info(
        'Filtered %d representative or reference genomes based on genome or assembly quality.'
        % filtered_reps)
    self.logger.info(
        'Filtered representative or reference genomes written to %s' %
        filtered_reps_file)
    self.logger.info(
        'Considering %d genomes after filtering for genome quality.' %
        (len(genomes_to_consider)))

    # get type strain designations
    ncbi_type_strains = read_gtdb_ncbi_type_strain(metadata_file)
    self.logger.info(
        'Identified %d genomes marked as type strains at NCBI.' %
        len(ncbi_type_strains))
    self.logger.info(
        'Identified %d genomes marked as type strains at LPSN.' %
        sum([len(x) for x in list(lpsn_type_strains.values())]))

    # dereplicate named species
    genomes_to_retain = self._dereplicate(genomes_to_consider, max_species,
                                          species, representative_genomes,
                                          complete_genomes,
                                          ncbi_type_strains,
                                          lpsn_type_strains, prev_gtdb_reps,
                                          genome_quality)
    self.logger.info('Retained %d genomes.' % len(genomes_to_retain))

    # write results
    if not trusted_genomes_file:
        trusted_genomes_file = ''
    fout = open(output_file, 'w')
    fout.write('# Selection criteria:\n')
    fout.write('# Maximum species: %d\n' % max_species)
    fout.write('# Trusted genomes file: %s\n' % trusted_genomes_file)
    fout.write('# Genome quality metadata file: %s\n' % str(metadata_file))
    fout.write('# Min. representative completeness: %s\n' % str(min_rep_comp))
    fout.write('# Max. representative contamination: %s\n' % str(max_rep_cont))
    fout.write('#\n')
    fout.write(
        '# Genome Id\tGTDB Taxonomy\tNCBI Taxonomy\tType strain\tComplete\tRepresentative\n')
    for assembly_accession in genomes_to_retain:
        representative = 'yes' if assembly_accession in representative_genomes else 'no'
        complete = 'yes' if assembly_accession in complete_genomes else 'no'
        ts = 'yes' if assembly_accession in ncbi_type_strains else 'no'
        gtdb_taxa_str = ';'.join(
            gtdb_taxonomy.get(assembly_accession, Taxonomy.rank_prefixes))
        ncbi_taxa_str = ';'.join(
            ncbi_taxonomy.get(assembly_accession, Taxonomy.rank_prefixes))
        if assembly_accession.startswith('GCF_'):
            assembly_accession = 'RS_' + assembly_accession
        elif assembly_accession.startswith('GCA_'):
            assembly_accession = 'GB_' + assembly_accession
        fout.write('%s\t%s\t%s\t%s\t%s\t%s\n' %
                   (assembly_accession, gtdb_taxa_str, ncbi_taxa_str, ts,
                    complete, representative))
    fout.close()
def run(self, rna_name, gtdb_metadata_file, rna_file, min_rna_length,
        min_scaffold_length, min_quality, max_contigs, min_N50, tax_filter,
        genome_list, output_dir, align_method='ssu_align'):
    """Infer rRNA gene tree spanning select GTDB genomes.

    Parameters
    ----------
    rna_name : str
        Name of rRNA gene.
    gtdb_metadata_file : str
        File specifying GTDB metadata for each genome.
    rna_file : str
        File with rRNA gene sequences in FASTA format.
    min_rna_length : int
        Minimum required length of rRNA gene sequences.
    min_scaffold_length : int
        Minimum required length of scaffold containing rRNA gene sequence.
    min_quality : float [0, 100]
        Minimum genome quality for a genome to be include in tree.
    max_contigs : int
        Maximum number of contigs to include genome.
    min_N50 : int
        Minimum N50 to include genome.
    tax_filter : boolean
        Filter sequences based on incongruent taxonomy classification.
    genome_list : str
        Explicit list of genomes to use (ignores --ncbi_rep_only and --user_genomes).
    output_dir : str
        Directory to store results
    """
    if rna_name not in ['ssu', 'lsu']:
        self.logger.error('Unrecognized rRNA gene type: %s' % rna_name)
        sys.exit(-1)

    genome_metadata = read_gtdb_metadata(gtdb_metadata_file, [
        'checkm_completeness', 'checkm_contamination', 'scaffold_count',
        'n50_scaffolds', 'organism_name', 'gtdb_representative'
    ])

    gtdb_taxonomy = read_gtdb_taxonomy(gtdb_metadata_file)

    # classify genomes by database source and identify representatives
    user_genomes = set()
    uba_genomes = set()
    ncbi_genomes = set()
    rep_genomes = set()
    for genome_id in genome_metadata:
        org_name = str(genome_metadata[genome_id][4])
        if genome_id.startswith('U_'):
            if '(UBA' in org_name:
                uba_genomes.add(genome_id)
            else:
                user_genomes.add(genome_id)
        elif genome_id.startswith('RS_') or genome_id.startswith('GB_'):
            ncbi_genomes.add(genome_id)
        else:
            self.logger.warning('Unrecognized genome prefix: %s' % genome_id)

        rep = genome_metadata[genome_id][5] == 't'
        if rep:
            rep_genomes.add(genome_id)

    self.logger.info(
        'Initially considering %d genomes (%d NCBI, %d UBA, %d User).' %
        (len(genome_metadata), len(ncbi_genomes), len(uba_genomes),
         len(user_genomes)))
    self.logger.info('Identified %d representative genomes.' %
                     len(rep_genomes))

    # get genomes specified in genome list by user
    genomes_to_consider = set()
    if genome_list:
        for line in open(genome_list):
            gid = line.rstrip().split('\t')[0]
            if gid.startswith('RS_') or gid.startswith(
                    'GB_') or gid.startswith('U_'):
                genomes_to_consider.add(gid)
        self.logger.info(
            'Restricting genomes to the %d in the genome list.' %
            len(genomes_to_consider))
    else:
        # filter genomes based on quality and database source
        self.logger.info('Filtering genomes based on specified critieria.')
        self.logger.info('Filtering on minimum quality <%d.' % min_quality)
        self.logger.info('Filtering on number of contigs >%d.' % max_contigs)
        self.logger.info('Filtering on scaffold N50 <%d.' % min_N50)

        new_genomes_to_consider = []
        filtered_genomes = 0
        gt = 0  # filtered on genome type
        gq = 0  # filtered on genome quality
        sc = 0  # filtered on number of contigs
        n50 = 0  # filtered on N50
        for genome_id in genome_metadata:
            if genome_id not in rep_genomes:
                gt += 1
                filtered_genomes += 1
                continue
            if genome_id not in ncbi_genomes and genome_id not in uba_genomes:
                gt += 1
                filtered_genomes += 1
                continue

            comp, cont, scaffold_count, n50_contigs, _org_name, _rep = genome_metadata[
                genome_id]
            q = float(comp) - 5 * float(cont)
            if q < min_quality or int(scaffold_count) > max_contigs or int(
                    n50_contigs) < min_N50:
                if q < min_quality:
                    gq += 1
                if int(scaffold_count) > max_contigs:
                    sc += 1
                if int(n50_contigs) < min_N50:
                    n50 += 1
                filtered_genomes += 1
                continue
            new_genomes_to_consider.append(genome_id)

        genomes_to_consider = new_genomes_to_consider
        self.logger.info(
            'Filtered %d genomes (%d on genome type, %d on genome quality, %d on number of contigs, %d on N50).'
            % (filtered_genomes, gt, gq, sc, n50))
    self.logger.info('Considering %d genomes after filtering.' %
                     len(genomes_to_consider))

    # limit taxonomy to genomes being considered
    cur_gtdb_taxonomy = {}
    for gid in genomes_to_consider:
        cur_gtdb_taxonomy[gid] = gtdb_taxonomy[gid]

    # get rRNA gene sequences for each genome
    rna_output_file = self._get_rna_seqs(rna_name, rna_file, min_rna_length,
                                         min_scaffold_length,
                                         cur_gtdb_taxonomy,
                                         genomes_to_consider, output_dir)

    # identify erroneous rRNA gene sequences
    if tax_filter:
        self.logger.info(
            'Filtering sequences with incongruent taxonomy strings.')
        filter = self._tax_filter(rna_output_file, cur_gtdb_taxonomy,
                                  output_dir)
        self.logger.info('Filtered %d sequences.' % len(filter))
        if len(filter) > 0:
            rna_filtered_output = os.path.join(
                output_dir, 'gtdb_%s.tax_filter.fna' % rna_name)
            fout = open(rna_filtered_output, 'w')
            for seq_id, seq, annotation in seq_io.read_seq(
                    rna_output_file, keep_annotation=True):
                if seq_id not in filter:
                    fout.write('>' + seq_id + ' ' + annotation + '\n')
                    fout.write(seq + '\n')
            fout.close()
            rna_output_file = rna_filtered_output

    # align sequences with ssu-align or mothur
    if rna_name == 'ssu':
        if align_method == 'ssu_align':
            self.logger.info('Aligning sequences with ssu-align.')
            align_dir = os.path.join(output_dir, '%s_align' % rna_name)
            os.system('ssu-align --dna %s %s' % (rna_output_file, align_dir))
            os.system('ssu-mask --afa %s' % align_dir)
        elif align_method == 'mothur':
            self.logger.info('Aligning sequences with mothur.')
            align_dir = os.path.join(output_dir, 'mothur')
            if not os.path.exists(align_dir):
                os.makedirs(align_dir)
            mothur_cmd = 'mothur "#set.dir(output=%s, blastdir=/srv/sw/Mothur/1.39.5)' % align_dir
            mothur_cmd += '; align.seqs(candidate=%s, template=/srv/db/mothur/silva_128/silva.seed_v128.align, search=blast, flip=t, processors=%d)' % (
                rna_output_file, self.cpus)
            input_prefix = remove_extension(rna_output_file)
            align_file = os.path.join(align_dir, input_prefix + '.align')
            mothur_cmd += '; filter.seqs(fasta=%s, hard=/srv/db/mothur/silva_128/Lane1349.silva.filter, processors=%d);"' % (
                align_file, self.cpus)
            os.system(mothur_cmd)
            input_msa = os.path.join(align_dir,
                                     input_prefix + '.filter.fasta')
    elif rna_name == 'lsu':
        self.logger.info('Aligning sequences with ssu-align.')
        align_dir = os.path.join(output_dir, '%s_align' % rna_name)
        if not os.path.exists(align_dir):
            os.makedirs(align_dir)
        os.system('esl-sfetch --index %s' % rna_output_file)

        cm_dir = os.path.join(os.path.dirname(os.path.realpath(__file__)),
                              'cm_files')

        # search for sequences using domain-specific LSU HMMs
        for domain in ['archaea', 'bacteria', 'eukaryote']:
            self.logger.info(
                'Matching LSU rRNA genes to %s-specific HMM.' % domain)
            table_out = os.path.join(
                align_dir, 'cmsearch.%s.%s.tblout' % (rna_name, domain))
            cm_file = os.path.join(cm_dir, 'lsu_%s.cm' % domain)
            log_file = os.path.join(
                align_dir, 'cmsearch.%s.%s.out' % (rna_name, domain))
            os.system(
                'cmsearch --hmmonly --cpu %d --noali --tblout %s %s %s > %s'
                % (self.cpus, table_out, cm_file, rna_output_file, log_file))

        # identify top hits for each domain
        self.logger.info(
            'Identifying best domain-specific HMM for each LSU rRNA gene.')
        top_hits = {}
        for domain in ['archaea', 'bacteria', 'eukaryote']:
            table_out = os.path.join(
                align_dir, 'cmsearch.%s.%s.tblout' % (rna_name, domain))
            for line in open(table_out):
                if line[0] == '#':
                    continue
                line_split = line.split()
                seq_id = line_split[0]
                start_seq = int(line_split[7])
                end_seq = int(line_split[8])
                bitscore = float(line_split[14])
                prev_bitscore = top_hits.get(seq_id, [None, 0, 0, 0, 0])[4]
                if bitscore > prev_bitscore:
                    top_hits[seq_id] = [
                        domain, seq_id, start_seq, end_seq, bitscore
                    ]

        # create MSA for each bacteria and archaea
        for domain in ['archaea', 'bacteria']:
            # create file of top hits
            top_hits_out = os.path.join(
                align_dir, 'top_hits.%s.%s.tsv' % (rna_name, domain))
            fout = open(top_hits_out, 'w')
            num_hits = 0
            for top_domain, seq_id, start_seq, end_seq, bitscore in top_hits.values(
            ):
                if top_domain == domain:
                    # BUG FIX: format string was '%d\%f' which wrote a
                    # literal backslash instead of a tab separator
                    fout.write('%s\t%d\t%d\t%f\n' %
                               (seq_id, start_seq, end_seq, bitscore))
                    num_hits += 1
            fout.close()

            # align top hits
            self.logger.info(
                'Creating MSA for %s LSU rRNA genes (%d sequences).' %
                (domain, num_hits))
            if num_hits > 0:
                seq_file = os.path.join(
                    align_dir, 'cmsearch.%s.%s.fna' % (rna_name, domain))
                os.system(
                    "grep -v '^#' %s | awk '{print $1, $2, $3, $1}' | esl-sfetch -Cf %s - > %s"
                    % (top_hits_out, rna_output_file, seq_file))

                # BUG FIX: use the CM of the current domain; previously the
                # leftover eukaryote CM from the search loop was used here
                cm_file = os.path.join(cm_dir, 'lsu_%s.cm' % domain)
                align_file = os.path.join(
                    align_dir, 'cmalign.%s.%s.stk' % (rna_name, domain))
                os.system('cmalign --dnaout --outformat Pfam %s %s > %s' %
                          (cm_file, seq_file, align_file))

                masked_file = os.path.join(
                    align_dir, 'cmalign.%s.%s.mask.afa' % (rna_name, domain))
                os.system('esl-alimask -p --outformat AFA %s > %s' %
                          (align_file, masked_file))

    # trim sequences and infer tree
    if align_method == 'ssu_align':
        for domain in ['archaea', 'bacteria']:
            if rna_name == 'ssu':
                input_msa = os.path.join(
                    align_dir, 'ssu_align.' + domain + '.mask.afa')
            elif rna_name == 'lsu':
                input_msa = os.path.join(
                    align_dir,
                    'cmalign.%s.%s.mask.afa' % (rna_name, domain))

            if not os.path.exists(input_msa):
                continue

            trimmed_msa = os.path.join(output_dir, domain + '.trimmed.fna')
            self._trim_seqs(input_msa, trimmed_msa)

            # infer tree
            self.logger.info('Inferring tree for %s genes.' % domain)
            output_tree = os.path.join(output_dir, domain + '.tree')
            os.system('FastTreeMP -nosupport -nt -gtr -gamma %s > %s' %
                      (trimmed_msa, output_tree))
    elif align_method == 'mothur':
        trimmed_msa = os.path.join(output_dir,
                                   input_prefix + '.trimmed.fna')
        self._trim_seqs(input_msa, trimmed_msa)

        # infer tree
        # BUG FIX: log message previously had a '%s' placeholder with
        # no argument, logging the literal string '%s'
        self.logger.info('Inferring tree for %s genes.' % input_prefix)
        output_tree = os.path.join(output_dir, input_prefix + '.tree')
        os.system('FastTreeMP -nosupport -nt -gtr -gamma %s > %s' %
                  (trimmed_msa, output_tree))
def propagate(self, options):
    """Propagate labels to all genomes in a cluster.

    Expands the input taxonomy so every genome in a representative's
    cluster receives a taxonomy string, reconciling any more-specific
    assignments made to clustered genomes and deferring to the
    representative's taxonomy when genomes in a cluster disagree.
    """
    check_file_exists(options.input_taxonomy)
    check_file_exists(options.metadata_file)

    # get representative genome information
    rep_metadata = read_gtdb_metadata(
        options.metadata_file,
        ['gtdb_representative', 'gtdb_clustered_genomes'])

    taxonomy = Taxonomy()
    explict_tax = taxonomy.read(options.input_taxonomy)
    expanded_taxonomy = {}
    incongruent_count = 0
    # BUG FIX: dict.iteritems() and xrange() are Python 2 only and raise
    # at runtime under Python 3 (the rest of this file uses Python 3
    # constructs such as f-strings); replaced with items()/range()
    for genome_id, taxon_list in explict_tax.items():
        taxonomy_str = ';'.join(taxon_list)

        # Propagate taxonomy strings if genome is a representatives. Also, determine
        # if genomes clustered together have compatible taxonomies. Note that a genome
        # may not have metadata as it is possible a User has removed a genome that is
        # in the provided taxonomy file.
        _rep_genome, clustered_genomes = rep_metadata.get(
            genome_id, (None, None))
        if clustered_genomes:  # genome is a representative
            clustered_genome_ids = clustered_genomes.split(';')

            # get taxonomy of all genomes in cluster with a specified taxonomy
            clustered_genome_tax = {}
            for cluster_genome_id in clustered_genome_ids:
                if cluster_genome_id == genome_id:
                    continue
                if cluster_genome_id not in rep_metadata:
                    continue  # genome is no longer in the GTDB so ignore it
                if cluster_genome_id in explict_tax:
                    clustered_genome_tax[cluster_genome_id] = explict_tax[
                        cluster_genome_id]

            # determine if representative and clustered genome taxonomy strings are congruent
            working_cluster_taxonomy = list(taxon_list)
            incongruent_with_rep = False
            for cluster_genome_id, cluster_tax in clustered_genome_tax.items():
                if incongruent_with_rep:
                    working_cluster_taxonomy = list(
                        taxon_list)  # default to rep taxonomy
                    break

                for r in range(0, len(Taxonomy.rank_prefixes)):
                    if cluster_tax[r] == Taxonomy.rank_prefixes[r]:
                        break  # no more taxonomy information to consider

                    if cluster_tax[r] != taxon_list[r]:
                        if taxon_list[r] == Taxonomy.rank_prefixes[r]:
                            # clustered genome has a more specific taxonomy string which
                            # should be propagated to the representative if all clustered
                            # genomes are in agreement
                            if working_cluster_taxonomy[
                                    r] == Taxonomy.rank_prefixes[r]:
                                # make taxonomy more specific based on genomes in cluster
                                working_cluster_taxonomy[r] = cluster_tax[r]
                            elif working_cluster_taxonomy[r] != cluster_tax[r]:
                                # not all genomes agree on the assignment of this rank so leave it unspecified
                                working_cluster_taxonomy[
                                    r] = Taxonomy.rank_prefixes[r]
                                break
                        else:
                            # genomes in cluster have incongruent taxonomies so defer to representative
                            self.logger.warning(
                                "Genomes in cluster have incongruent taxonomies."
                            )
                            self.logger.warning("Representative %s: %s" %
                                                (genome_id, taxonomy_str))
                            self.logger.warning(
                                "Clustered genome %s: %s" %
                                (cluster_genome_id, ';'.join(cluster_tax)))
                            self.logger.warning(
                                "Deferring to taxonomy specified for representative."
                            )
                            incongruent_count += 1
                            incongruent_with_rep = True
                            break

            cluster_taxonomy_str = ';'.join(working_cluster_taxonomy)

            # assign taxonomy to representative and all genomes in the cluster
            expanded_taxonomy[genome_id] = cluster_taxonomy_str
            for cluster_genome_id in clustered_genome_ids:
                expanded_taxonomy[cluster_genome_id] = cluster_taxonomy_str
        else:
            if genome_id in expanded_taxonomy:
                # genome has already been assigned a taxonomy based on its representative
                pass
            else:
                # genome is a singleton
                expanded_taxonomy[genome_id] = taxonomy_str

    self.logger.info(
        'Identified %d clusters with incongruent taxonomies.' %
        incongruent_count)

    fout = open(options.output_taxonomy, 'w')
    for genome_id, taxonomy_str in expanded_taxonomy.items():
        fout.write('%s\t%s\n' % (genome_id, taxonomy_str))
    fout.close()

    self.logger.info('Taxonomy written to: %s' % options.output_taxonomy)
def run(self, metadata_file, gtdb_user_genomes_file, gtdb_user_reps,
        ncbi_refseq_assembly_file, ncbi_genbank_assembly_file,
        gtdb_domain_report, min_comp, max_cont, min_quality, sh_exception,
        min_perc_markers, max_contigs, min_N50, max_ambiguous, output_dir):
    """Quality check all potential GTDB genomes.

    Writes qc_passed.tsv / qc_failed.tsv for per-genome QC, and
    type_genomes_fail_qc.tsv / species_fail_qc.tsv / species_lost.tsv for
    per-species QC of NCBI named species.
    """
    # get GTDB and NCBI taxonomy strings for each genome
    self.logger.info('Reading NCBI taxonomy from GTDB metadata file.')
    ncbi_taxonomy = read_gtdb_ncbi_taxonomy(metadata_file)
    ncbi_species = binomial_species(ncbi_taxonomy)
    gtdb_taxonomy = read_gtdb_taxonomy(metadata_file)
    self.logger.info('Read NCBI taxonomy for %d genomes.' %
                     len(ncbi_taxonomy))
    self.logger.info('Read GTDB taxonomy for %d genomes.' %
                     len(gtdb_taxonomy))

    # determine User genomes to retain for consideration
    gtdb_user_to_genbank = self._gtdb_user_genomes(gtdb_user_genomes_file,
                                                   metadata_file)
    self.logger.info(
        'Identified %d GTDB User genomes with GenBank accessions to retain for potential inclusion in GTDB.'
        % len(gtdb_user_to_genbank))

    user_genomes = 0
    for line in open(gtdb_user_reps):
        line_split = line.strip().split('\t')
        gid, taxonomy = line_split
        if gid not in gtdb_user_to_genbank:
            if 'd__Bacteria' in taxonomy:
                self.logger.warning(
                    'Bacterial genome %s has no NCBI accession and is being skipped.'
                    % gid)
            else:
                gtdb_user_to_genbank[gid] = gid
                user_genomes += 1
    self.logger.info(
        'Identified %d archaeal GTDB User genome WITHOUT GenBank accessions to retain for potential inclusion in GTDB.'
        % user_genomes)

    # calculate quality score for genomes
    self.logger.info('Parsing QC statistics for each genome.')
    quality_metadata = read_gtdb_metadata(metadata_file, [
        'checkm_completeness', 'checkm_contamination',
        'checkm_strain_heterogeneity_100', 'contig_count', 'n50_contigs',
        'ambiguous_bases', 'genome_size'
    ])
    marker_perc = parse_marker_percentages(gtdb_domain_report)

    # parse NCBI assembly files
    self.logger.info('Parsing NCBI assembly files.')
    excluded_from_refseq_note = exclude_from_refseq(
        ncbi_refseq_assembly_file, ncbi_genbank_assembly_file)

    # get type material designations for each genome
    self.logger.info(
        'Reading type material designations for genomes from GTDB metadata file.'
    )
    type_metadata = read_gtdb_metadata(metadata_file, [
        'ncbi_type_material_designation', 'gtdb_type_designation',
        'gtdb_type_designation_sources'
    ])
    ncbi_tsp = ncbi_type_strain_of_species(type_metadata)
    gtdb_tsp = gtdb_type_strain_of_species(type_metadata)

    # QC all genomes
    self.logger.info('Validating genomes.')
    fout_retained = open(os.path.join(output_dir, 'qc_passed.tsv'), 'w')
    fout_failed = open(os.path.join(output_dir, 'qc_failed.tsv'), 'w')

    # BUG FIX: the header is written verbatim (no %-formatting applied),
    # so the previous '100%%' appeared literally in the output files;
    # the other report headers below correctly use a single '%'
    header = 'Accession\tNCBI species'
    header += '\tCompleteness (%)\tContamination (%)\tQuality\tStrain heterogeneity at 100%'
    header += '\tMarkers (%)\tNo. contigs\tN50 contigs\tAmbiguous bases'
    fout_retained.write(header + '\n')
    fout_failed.write(header)
    fout_failed.write(
        '\tFailed completeness\tFailed contamination\tFailed quality')
    fout_failed.write(
        '\tFailed marker percentage\tFailed no. contigs\tFailed N50 contigs\tFailed ambiguous bases\n'
    )

    num_retained = 0
    num_filtered = 0
    for gid in quality_metadata:
        if gid.startswith('U_') and gid not in gtdb_user_to_genbank:
            # skip user genomes not marked for retention
            continue

        failed_tests = defaultdict(int)
        passed_qc = pass_qc(quality_metadata[gid], marker_perc[gid],
                            min_comp, max_cont, min_quality, sh_exception,
                            min_perc_markers, max_contigs, min_N50,
                            max_ambiguous, failed_tests)

        if passed_qc:
            num_retained += 1
            fout_retained.write('%s\t%s' % (gid, ncbi_taxonomy[gid][6]))
            fout_retained.write(
                '\t%.2f\t%.2f\t%.2f\t%s\t%.2f\t%d\t%d\t%d\n' %
                (quality_metadata[gid].checkm_completeness,
                 quality_metadata[gid].checkm_contamination,
                 quality_metadata[gid].checkm_completeness -
                 5 * quality_metadata[gid].checkm_contamination,
                 ('%.2f' %
                  quality_metadata[gid].checkm_strain_heterogeneity_100)
                 if quality_metadata[gid].checkm_strain_heterogeneity_100
                 else '-', marker_perc[gid],
                 quality_metadata[gid].contig_count,
                 quality_metadata[gid].n50_contigs,
                 quality_metadata[gid].ambiguous_bases))
        else:
            num_filtered += 1
            fout_failed.write('%s\t%s' % (gid, ncbi_taxonomy[gid][6]))
            fout_failed.write(
                '\t%.2f\t%.2f\t%.2f\t%s\t%.2f\t%d\t%d\t%d' %
                (quality_metadata[gid].checkm_completeness,
                 quality_metadata[gid].checkm_contamination,
                 quality_metadata[gid].checkm_completeness -
                 5 * quality_metadata[gid].checkm_contamination,
                 ('%.2f' %
                  quality_metadata[gid].checkm_strain_heterogeneity_100)
                 if quality_metadata[gid].checkm_strain_heterogeneity_100
                 else '-', marker_perc[gid],
                 quality_metadata[gid].contig_count,
                 quality_metadata[gid].n50_contigs,
                 quality_metadata[gid].ambiguous_bases))
            fout_failed.write(
                '\t%d\t%d\t%d\t%d\t%d\t%d\t%d\n' %
                (failed_tests['comp'], failed_tests['cont'],
                 failed_tests['qual'], failed_tests['marker_perc'],
                 failed_tests['contig_count'], failed_tests['N50'],
                 failed_tests['ambig']))
    fout_retained.close()
    fout_failed.close()
    self.logger.info('Retained %d genomes and filtered %d genomes.' %
                     (num_retained, num_filtered))

    # QC genomes in each named species
    self.logger.info(
        'Performing QC of type genome for each of the %d NCBI species.' %
        len(ncbi_species))

    fout_type_fail = open(
        os.path.join(output_dir, 'type_genomes_fail_qc.tsv'), 'w')
    fout_type_fail.write(
        'Species\tAccession\tGTDB taxonomy\tNCBI taxonomy\tType sources\tNCBI assembly type\tGenome size (bp)'
    )
    fout_type_fail.write(
        '\tCompleteness (%)\tContamination (%)\tQuality\tStrain heterogeneity at 100%'
    )
    fout_type_fail.write(
        '\tMarkers (%)\tNo. contigs\tN50 contigs\tAmbiguous bases\tNCBI exclude from RefSeq\tLost species\n'
    )

    fout_fail_sp = open(os.path.join(output_dir, 'species_fail_qc.tsv'),
                        'w')
    fout_fail_sp.write(
        'Species\tAccession\tGTDB taxonomy\tNCBI taxonomy\tAssembled from type material\tGenome size (bp)'
    )
    fout_fail_sp.write(
        '\tCompleteness (%)\tContamination (%)\tQuality\tStrain heterogeneity at 100%'
    )
    fout_fail_sp.write(
        '\tMarkers (%)\tNo. contigs\tN50 contigs\tAmbiguous bases')
    fout_fail_sp.write(
        '\tFailed completeness\tFailed contamination\tFailed quality')
    fout_fail_sp.write(
        '\tFailed marker percentage\tFailed no. contigs\tFailed N50 contigs\tFailed ambiguous bases'
    )
    fout_fail_sp.write('\tNCBI exclude from RefSeq\n')

    fout_sp_lost = open(os.path.join(output_dir, 'species_lost.tsv'), 'w')
    fout_sp_lost.write('Species\tNo. genomes\tNo. type genomes')
    fout_sp_lost.write(
        '\tFail completeness\tFail contamination\tFail quality\tFailed percent markers'
    )
    fout_sp_lost.write(
        '\tFail no. contigs\tFail N50 contigs\tFail ambiguous bases\n')

    lost_type = 0
    lost_sp = 0
    filtered_genomes = 0
    failed_tests_cumulative = defaultdict(int)
    for sp, gids in ncbi_species.items():
        type_pass = set()
        type_fail = set()
        other_pass = set()
        other_fail = set()

        failed_tests_gids = {}
        for gid in gids:
            failed_tests = defaultdict(int)
            passed_qc = pass_qc(quality_metadata[gid], marker_perc[gid],
                                min_comp, max_cont, min_quality,
                                sh_exception, min_perc_markers, max_contigs,
                                min_N50, max_ambiguous, failed_tests)
            failed_tests_gids[gid] = failed_tests

            if gid in gtdb_tsp or gid in ncbi_tsp:
                if passed_qc:
                    type_pass.add(gid)
                else:
                    type_fail.add(gid)
                    filtered_genomes += 1
            else:
                if passed_qc:
                    other_pass.add(gid)
                else:
                    other_fail.add(gid)
                    filtered_genomes += 1

            # tally failed species
            for test, count in failed_tests.items():
                failed_tests_cumulative[test] += count

        if len(type_pass) >= 1:
            # great: one or more type genomes pass QC and will be selected as the type genome
            continue

        if len(type_fail):
            # all potential type genomes for species failed QC so report these for manual inspection
            lost_type += 1
            for gid in type_fail:
                # NOTE(review): checkm_strain_heterogeneity_100 is formatted
                # with %.2f here without the None guard used above — confirm
                # it is always numeric for type genomes
                fout_type_fail.write(
                    '%s\t%s\t%s\t%s\t%s\t%s\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%d\t%d\t%d\t%s\t%s\n'
                    % (sp, gid, '; '.join(gtdb_taxonomy[gid]), '; '.join(
                        ncbi_taxonomy[gid]),
                       type_metadata[gid].gtdb_type_designation_sources,
                       type_metadata[gid].ncbi_type_material_designation,
                       float(quality_metadata[gid].genome_size) / 1e6,
                       quality_metadata[gid].checkm_completeness,
                       quality_metadata[gid].checkm_contamination,
                       quality_metadata[gid].checkm_completeness -
                       5 * quality_metadata[gid].checkm_contamination,
                       quality_metadata[gid].checkm_strain_heterogeneity_100,
                       marker_perc[gid],
                       quality_metadata[gid].contig_count,
                       quality_metadata[gid].n50_contigs,
                       quality_metadata[gid].ambiguous_bases,
                       excluded_from_refseq_note[gid],
                       len(other_pass) == 0))

        if len(other_pass) == 0:
            # no genomes for species pass QC so report loss of species
            lost_sp += 1
            fout_sp_lost.write('%s\t%d\t%d' % (sp, len(gids),
                                               len(type_fail)))
            fout_sp_lost.write(
                '\t%d\t%d\t%d\t%d\t%d\t%d\t%d\n' %
                (sum([failed_tests_gids[gid]['comp'] for gid in gids]),
                 sum([failed_tests_gids[gid]['cont'] for gid in gids]),
                 sum([failed_tests_gids[gid]['qual'] for gid in gids]),
                 sum([
                     failed_tests_gids[gid]['marker_perc'] for gid in gids
                 ]),
                 sum([
                     failed_tests_gids[gid]['contig_count'] for gid in gids
                 ]),
                 sum([failed_tests_gids[gid]['N50'] for gid in gids]),
                 sum([failed_tests_gids[gid]['ambig'] for gid in gids])))

        for gid in type_fail.union(other_fail):
            fout_fail_sp.write(
                '%s\t%s\t%s\t%s\t%s\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%d\t%d\t%d'
                % (sp, gid, '; '.join(gtdb_taxonomy[gid]), '; '.join(
                    ncbi_taxonomy[gid]), gid in type_fail,
                   float(quality_metadata[gid].genome_size) / 1e6,
                   quality_metadata[gid].checkm_completeness,
                   quality_metadata[gid].checkm_contamination,
                   quality_metadata[gid].checkm_completeness -
                   5 * quality_metadata[gid].checkm_contamination,
                   quality_metadata[gid].checkm_strain_heterogeneity_100,
                   marker_perc[gid],
                   quality_metadata[gid].contig_count,
                   quality_metadata[gid].n50_contigs,
                   quality_metadata[gid].ambiguous_bases))
            fout_fail_sp.write(
                '\t%d\t%d\t%d\t%d\t%d\t%d\t%d' %
                (failed_tests_gids[gid]['comp'],
                 failed_tests_gids[gid]['cont'],
                 failed_tests_gids[gid]['qual'],
                 failed_tests_gids[gid]['marker_perc'],
                 failed_tests_gids[gid]['contig_count'],
                 failed_tests_gids[gid]['N50'],
                 failed_tests_gids[gid]['ambig']))
            fout_fail_sp.write('\t%s\n' % excluded_from_refseq_note[gid])

    fout_type_fail.close()
    fout_fail_sp.close()
    fout_sp_lost.close()

    self.logger.info('Genomes filtered for each criterion:')
    for test in sorted(failed_tests_cumulative):
        self.logger.info('%s: %d' % (test, failed_tests_cumulative[test]))

    self.logger.info('Filtered %d genomes assigned to NCBI species.' %
                     filtered_genomes)
    self.logger.info(
        'Identified %d species with type genomes failing QC and %d total species failing QC.'
        % (lost_type, lost_sp))