def run(self,
        gtdb_bac_taxonomy_file,
        gtdb_ar_taxonomy_file,
        silva_ssu_ref,
        silva_lsu_ref,
        ssu_blast_table,
        lsu_blast_table,
        output_dir):
    """Write tables linking GTDB taxonomy to SILVA accessions from BLAST hits.

    The bacterial and archaeal GTDB taxonomies are merged into one mapping,
    the annotation of every sequence in the SILVA SSU and LSU reference
    files is collected per sequence ID, and each BLAST table is then handed
    to ``self._parse_blast_table`` together with the matching SILVA
    annotations and minimum length threshold.

    Args:
        gtdb_bac_taxonomy_file: Bacterial GTDB taxonomy file.
        gtdb_ar_taxonomy_file: Archaeal GTDB taxonomy file.
        silva_ssu_ref: SILVA SSU reference sequences (annotation = taxonomy).
        silva_lsu_ref: SILVA LSU reference sequences (annotation = taxonomy).
        ssu_blast_table: BLAST results for SSU genes.
        lsu_blast_table: BLAST results for LSU genes.
        output_dir: Directory receiving ``ssu_silva.tsv`` and
            ``lsu_silva.tsv``; created when it does not exist.
    """
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    # merge the domain-specific GTDB taxonomies into a single lookup
    print('Reading GTDB taxonomy.')
    bac_taxa = Taxonomy().read(gtdb_bac_taxonomy_file)
    ar_taxa = Taxonomy().read(gtdb_ar_taxonomy_file)
    gtdb_taxonomy = bac_taxa.copy()
    gtdb_taxonomy.update(ar_taxa)

    print('Identified %d bacterial genomes to process.' % len(bac_taxa))
    print('Identified %d archaeal genomes to process.' % len(ar_taxa))
    print('Identified %d genomes to process.' % len(gtdb_taxonomy))

    # sequence ID -> SILVA annotation for both rRNA reference sets
    print('Reading SILVA 16S and 23S rRNA taxonomies.')
    silva_ssu_taxonomy = {
        seq_id: annotation
        for seq_id, _seq, annotation in seq_io.read_seq(silva_ssu_ref,
                                                        keep_annotation=True)
    }
    silva_lsu_taxonomy = {
        seq_id: annotation
        for seq_id, _seq, annotation in seq_io.read_seq(silva_lsu_ref,
                                                        keep_annotation=True)
    }

    # one output table per rRNA gene
    print('Parsing BLAST tables.')
    self._parse_blast_table(ssu_blast_table,
                            gtdb_taxonomy,
                            silva_ssu_taxonomy,
                            self.min_ssu_len,
                            os.path.join(output_dir, 'ssu_silva.tsv'))
    self._parse_blast_table(lsu_blast_table,
                            gtdb_taxonomy,
                            silva_lsu_taxonomy,
                            self.min_lsu_len,
                            os.path.join(output_dir, 'lsu_silva.tsv'))
def run(self,
        gtdb_bac_taxonomy_file,
        gtdb_ar_taxonomy_file,
        gtdb_path_file,
        gtdb_metadata_file,
        output_dir):
    """Create FASTA files with all 16S and 23S rRNA sequences from GTDB genomes.

    For every genome in the merged bacterial/archaeal taxonomy, the genes in
    ``<genome path>/rna_silva/ssu.fna`` and ``.../lsu_23S.fna`` are appended
    to ``ssu.fna`` and ``lsu.fna`` in ``output_dir``. Each header has the form
    ``>{genome}~{sequence} [ssu=N bp] [contig=M bp]`` where the contig length
    is taken from the matching ``*.hmm_summary.tsv`` table.

    The 16S and 23S files are handled independently: a genome lacking one
    of them still contributes the other and is counted only in the
    corresponding "missing" statistic.

    Args:
        gtdb_bac_taxonomy_file: Bacterial GTDB taxonomy file.
        gtdb_ar_taxonomy_file: Archaeal GTDB taxonomy file.
        gtdb_path_file: Tab-separated file of genome ID and genome directory.
        gtdb_metadata_file: Tab-separated metadata with a header line; the
            first column is the genome ID and the second the organism name.
            Names containing ``(UBA`` supply the UBA ID used in place of the
            genome ID.
        output_dir: Directory receiving the FASTA files; created if absent.

    Raises:
        ValueError: If a summary table lacks a 'Sequence length' column or a
            non-blank path line does not hold exactly two fields.
        KeyError: If a sequence has no entry in its summary table.
    """

    def read_contig_lengths(summary_file):
        """Return {gene ID: contig length} parsed from an HMM summary table."""
        contig_lengths = {}
        with open(summary_file) as f:
            header = f.readline().strip().split('\t')
            contig_len_index = header.index('Sequence length')
            for line in f:
                line_split = line.strip().split('\t')
                contig_lengths[line_split[0]] = int(
                    line_split[contig_len_index])
        return contig_lengths

    def write_rrna_genes(gid, genome_path, fasta_name, summary_name, fout):
        """Append one genome's rRNA genes to fout; False if it has no FASTA."""
        rna_dir = os.path.join(genome_path, 'rna_silva')
        fasta_file = os.path.join(rna_dir, fasta_name)
        if not os.path.exists(fasta_file):
            return False

        contig_lengths = read_contig_lengths(
            os.path.join(rna_dir, summary_name))
        for seq_id, seq in seq_io.read_seq(fasta_file):
            # NOTE(review): the 'ssu=' label is also written for 23S genes;
            # kept unchanged because downstream parsers may rely on it.
            fout.write('>%s~%s [ssu=%d bp] [contig=%d bp]\n' %
                       (gid, seq_id, len(seq), contig_lengths[seq_id]))
            fout.write('%s\n' % seq)
        return True

    def percent(count, total):
        """Percentage of total, defined as 0 for an empty taxonomy."""
        return count * 100.0 / total if total else 0.0

    # get User ID to UBA translation
    print('Reading GTDB metadata to translate User IDs to UBA IDs.')
    user_id_to_uba = {}
    with open(gtdb_metadata_file) as f:
        f.readline()  # skip header
        for line in f:
            line_split = line.strip().split('\t')
            gid = line_split[0]
            org_name = line_split[1]
            if '(UBA' in org_name:
                # the UBA ID is the text inside the final parentheses
                user_id_to_uba[gid] = org_name.split('(')[-1].replace(')', '')

    # read GTDB taxonomy
    print('Reading GTDB taxonomy.')
    gtdb_bac_taxonomy = Taxonomy().read(gtdb_bac_taxonomy_file)
    gtdb_ar_taxonomy = Taxonomy().read(gtdb_ar_taxonomy_file)
    gtdb_taxonomy = gtdb_bac_taxonomy.copy()
    gtdb_taxonomy.update(gtdb_ar_taxonomy)

    print('Identified %d bacterial genomes to process.' %
          len(gtdb_bac_taxonomy))
    print('Identified %d archaeal genomes to process.' %
          len(gtdb_ar_taxonomy))
    print('Identified %d genomes to process.' % len(gtdb_taxonomy))

    # read genome paths
    print('Reading path to genomes.')
    genome_paths = {}
    with open(gtdb_path_file) as f:
        for line in f:
            if not line.strip():
                continue  # tolerate blank lines, e.g. at end of file
            gid, gid_path = line.strip().split('\t')
            genome_paths[user_id_to_uba.get(gid, gid)] = gid_path

    # sanity check data
    missing_paths = set(gtdb_taxonomy) - set(genome_paths)
    if missing_paths:
        print(
            '[WARNING] There are %d genomes in the taxonomy file without a specified genome path.'
            % len(missing_paths))

    # create FASTA file with 16S and 23S rRNA sequence files
    print('Parsing 16S and 23S rRNA sequence files.')
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    missing_ssu = 0
    missing_lsu = 0
    ssu_out = os.path.join(output_dir, 'ssu.fna')
    lsu_out = os.path.join(output_dir, 'lsu.fna')
    # context managers guarantee both outputs are closed even on error
    with open(ssu_out, 'w') as fout_16S, open(lsu_out, 'w') as fout_23S:
        for i, gid in enumerate(gtdb_taxonomy):
            if i % 1000 == 0:
                print('Processed %d genomes.' % i)

            if gid not in genome_paths:
                print(
                    '[WARNING] Genome %s does not have a specified genome path.'
                    % gid)
                continue

            genome_path = genome_paths[gid]

            # a missing 16S file must not prevent the 23S genes being written
            if not write_rrna_genes(gid, genome_path, 'ssu.fna',
                                    'ssu.hmm_summary.tsv', fout_16S):
                missing_ssu += 1

            if not write_rrna_genes(gid, genome_path, 'lsu_23S.fna',
                                    'lsu_23S.hmm_summary.tsv', fout_23S):
                missing_lsu += 1

    num_genomes = len(gtdb_taxonomy)
    print(
        'There were %d of %d (%.2f%%) genomes without an identifier 16S rRNA gene.'
        % (missing_ssu, num_genomes, percent(missing_ssu, num_genomes)))
    print(
        'There were %d of %d (%.2f%%) genomes without an identifier 23S rRNA gene.'
        % (missing_lsu, num_genomes, percent(missing_lsu, num_genomes)))
def run(self,
        gtdb_bac_taxonomy_file,
        gtdb_ar_taxonomy_file,
        ssu_silva_table,
        lsu_silva_table,
        ssu_info_file,
        lsu_info_file,
        output_dir):
    """Parse tables with SILVA assignments to identify potentially erroneous 16S and 23S rRNA genes in GTDB genomes.

    Writes ``silva_incongruence_test.tsv`` to ``output_dir``. After the
    header row, the file is filled by three tests run in order: the
    multi-gene test on the SSU table, the multi-gene test on the LSU table,
    and the combined SSU/LSU test.

    Args:
        gtdb_bac_taxonomy_file: Bacterial GTDB taxonomy file.
        gtdb_ar_taxonomy_file: Archaeal GTDB taxonomy file.
        ssu_silva_table: Table of SILVA assignments for 16S genes.
        lsu_silva_table: Table of SILVA assignments for 23S genes.
        ssu_info_file: Tab-separated table describing the 16S genes; needs
            'SSU gene length' and 'Sequence length' columns.
        lsu_info_file: Tab-separated table describing the 23S genes; read
            with the same column names as ``ssu_info_file``.
        output_dir: Directory receiving the report; created if absent.

    Raises:
        ValueError: If an info table lacks one of the required columns.
    """

    def read_gene_info(info_file):
        """Return {genome~contig: (rRNA length, contig length)} for a table."""
        gene_info = {}
        with open(info_file) as f:
            header = f.readline().strip().split('\t')
            # NOTE(review): this column name is also used for the LSU table,
            # as in the original code; confirm the LSU file really labels
            # its gene length column 'SSU gene length'.
            rna_length_index = header.index('SSU gene length')
            contig_len_index = header.index('Sequence length')
            for line in f:
                line_split = line.strip().split('\t')
                # key is the text before the first '~' of column 0 joined
                # to the contig ID held in column 1
                gene_id = line_split[0].split('~')[0] + '~' + line_split[1]
                rna_length = int(line_split[rna_length_index])
                contig_length = int(line_split[contig_len_index])
                gene_info[gene_id] = (rna_length, contig_length)
        return gene_info

    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    columns = [
        'Genome ID', 'Test', 'Incongruent rank',
        'SILVA taxon A', 'SILVA taxon B',
        'SILVA taxonomy A', 'SILVA taxonomy B',
        'GTDB taxonomy', 'Note',
        'In reference tree A', 'In reference tree B',
        'Gene ID A', 'rRNA length', 'Contig length',
        'Gene ID B', 'rRNA length', 'Contig length',
    ]

    report_file = os.path.join(output_dir, 'silva_incongruence_test.tsv')
    # the context manager closes the report even if a later step raises
    with open(report_file, 'w') as fout:
        fout.write('\t'.join(columns) + '\n')

        # read GTDB taxonomy
        print('Reading GTDB taxonomy.')
        gtdb_bac_taxonomy = Taxonomy().read(gtdb_bac_taxonomy_file)
        gtdb_ar_taxonomy = Taxonomy().read(gtdb_ar_taxonomy_file)
        gtdb_taxonomy = gtdb_bac_taxonomy.copy()
        gtdb_taxonomy.update(gtdb_ar_taxonomy)

        # read genomes in SSU and LSU trees
        print('Reading genomes in 16S and 23S gene trees.')
        ssu_ref = read_gene_info(ssu_info_file)
        lsu_ref = read_gene_info(lsu_info_file)

        # run tests to find potentially incongruent 16S or 23S rRNA genes
        print(
            'Performing tests to identify potentially incongruent 16S or 23S rRNA genes.'
        )
        self._multigene_silva_assignment_test(
            ssu_silva_table, gtdb_taxonomy, ssu_ref, 'SSU',
            'Genome has multiple 16S rRNA genes with incongrent SILVA assignments.',
            fout)
        self._multigene_silva_assignment_test(
            lsu_silva_table, gtdb_taxonomy, lsu_ref, 'LSU',
            'Genome has multiple 23S rRNA genes with incongrent SILVA assignments.',
            fout)
        self._ssu_lsu_silva_assignment_test(
            ssu_silva_table, lsu_silva_table, gtdb_taxonomy, ssu_ref,
            lsu_ref, 'SSU/LSU',
            'Genome has a 16S and 23S rRNA gene with incongruent SILVA assignments.',
            fout)