def _run_prodigal(self, genome_id, fasta_path):
    """Call genes with Prodigal and store the renamed outputs with checksums.

    Parameters
    ----------
    genome_id : str
        Genome identifier; names the output directory and prefixes the
        renamed gene files.
    fasta_path : str
        Path to FASTA file to process.

    Returns
    -------
    tuple or None
        ``(aa_gene_file, nt_gene_file, gff_file, translation_table_file,
        best_translation_table)``. The nucleotide, GFF and translation
        table paths are None when ``self.proteins`` is truthy. None is
        returned instead of a tuple when Prodigal yields no summary and
        ``self.force`` is truthy.

    Raises
    ------
    Exception
        If Prodigal yields no summary and ``self.force`` is falsy.
    """
    genome_dir = os.path.join(self.marker_gene_dir, genome_id)

    # NOTE(review): the positional arguments look like (cpus, verbose) --
    # confirm against the BioLib Prodigal wrapper.
    gene_caller = BioLibProdigal(1, False)
    run_results = gene_caller.run([fasta_path],
                                  genome_dir,
                                  called_genes=self.proteins)

    # An empty result signals a failure inside BioLib Prodigal.
    if not run_results:
        if not self.force:
            raise Exception(
                "An error was encountered while running Prodigal.")
        return None

    # Only one genome was submitted, so take the first summary entry.
    stats = next(iter(run_results.values()))

    # Rename the protein file to follow GTDB conventions and the genome ID.
    aa_gene_file = os.path.join(genome_dir,
                                genome_id + self.protein_file_suffix)
    shutil.move(stats.aa_gene_file, aa_gene_file)

    nt_gene_file = gff_file = translation_table_file = None
    if not self.proteins:
        nt_gene_file = os.path.join(genome_dir,
                                    genome_id + self.nt_gene_file_suffix)
        shutil.move(stats.nt_gene_file, nt_gene_file)

        gff_file = os.path.join(genome_dir,
                                genome_id + self.gff_file_suffix)
        shutil.move(stats.gff_file, gff_file)

        # Record which translation table was selected and the coding
        # densities (written as percentages).
        translation_table_file = os.path.join(
            genome_dir, 'prodigal_translation_table.tsv')
        with open(translation_table_file, 'w') as tln_out:
            tln_out.write('%s\t%d\n' % ('best_translation_table',
                                        stats.best_translation_table))
            tln_out.write('%s\t%.2f\n' % ('coding_density_4',
                                          stats.coding_density_4 * 100))
            tln_out.write('%s\t%.2f\n' % ('coding_density_11',
                                          stats.coding_density_11 * 100))

    # Write a checksum file next to every output that was produced.
    produced = (nt_gene_file, gff_file, translation_table_file, aa_gene_file)
    for path in produced:
        if path is None:
            continue
        with open(path + CHECKSUM_SUFFIX, 'w') as checksum_fh:
            checksum_fh.write(sha256(path))

    return (aa_gene_file, nt_gene_file, gff_file, translation_table_file,
            stats.best_translation_table)
def _run_prodigal(self, genome_id, fasta_path, usr_tln_table):
    """Run Prodigal on one genome, skipping genomes already processed.

    Parameters
    ----------
    genome_id : str
        Genome identifier; names the output directory and prefixes the
        renamed gene files.
    fasta_path : str
        Path to FASTA file to process.
    usr_tln_table : int
        User-specified translation table, None if automatic.

    Returns
    -------
    tuple or None
        ``(aa_gene_file, nt_gene_file, gff_file, translation_table_file,
        best_translation_table, skipped)`` where ``skipped`` is True when
        the results of a previous run were reused and False when Prodigal
        was executed. None is returned instead of a tuple when Prodigal
        yields no summary and ``self.force`` is truthy.

    Raises
    ------
    Exception
        If Prodigal yields no summary and ``self.force`` is falsy.
    """
    # Set the paths for output files.
    output_dir = os.path.join(self.marker_gene_dir, genome_id)
    aa_gene_file = os.path.join(output_dir,
                                genome_id + self.protein_file_suffix)
    nt_gene_file = os.path.join(output_dir,
                                genome_id + self.nt_gene_file_suffix)
    gff_file = os.path.join(output_dir, genome_id + self.gff_file_suffix)
    translation_table_file = os.path.join(
        output_dir, 'prodigal_translation_table.tsv')
    out_files = (nt_gene_file, gff_file, translation_table_file,
                 aa_gene_file)

    # Check if this genome has already been processed (skip). A generator
    # lets all() stop at the first output that fails the check instead of
    # checking every file regardless.
    if all(file_has_checksum(path) for path in out_files):
        tln_table_file = TlnTableFile(translation_table_file)
        tln_table_file.read()
        self.warnings.info(f'Skipped Prodigal processing for: {genome_id}')
        return (aa_gene_file, nt_gene_file, gff_file,
                translation_table_file, tln_table_file.best_tln_table, True)

    # Run Prodigal.
    # NOTE(review): the positional arguments look like (cpus, verbose) --
    # confirm against the BioLib Prodigal wrapper.
    prodigal = BioLibProdigal(1, False)
    summary_stats = prodigal.run([fasta_path],
                                 output_dir,
                                 called_genes=False,
                                 translation_table=usr_tln_table)

    # An error occurred in BioLib Prodigal.
    if not summary_stats:
        if self.force:
            return None
        raise Exception("An error was encountered while running Prodigal.")

    # Only one genome was submitted, so take the first summary entry.
    summary_stats = next(iter(summary_stats.values()))

    # Rename output files to adhere to GTDB conventions and desired genome
    # ID.
    shutil.move(summary_stats.aa_gene_file, aa_gene_file)
    shutil.move(summary_stats.nt_gene_file, nt_gene_file)
    shutil.move(summary_stats.gff_file, gff_file)

    # Save translation table information (densities stored as percentages
    # rounded to two decimals).
    tln_table_file = TlnTableFile(
        translation_table_file,
        best_tln_table=summary_stats.best_translation_table,
        coding_density_4=round(summary_stats.coding_density_4 * 100, 2),
        coding_density_11=round(summary_stats.coding_density_11 * 100, 2))
    tln_table_file.write()

    # Create a hash of each file. Every path in out_files is always set in
    # this version, so no None check is needed.
    for out_file in out_files:
        with open(out_file + CHECKSUM_SUFFIX, 'w') as fh:
            fh.write(sha256(out_file))

    return (aa_gene_file, nt_gene_file, gff_file, translation_table_file,
            summary_stats.best_translation_table, False)
def _run_prodigal(self, genome_id, fasta_path):
    """Run Prodigal on one genome, reusing results of a previous run.

    Parameters
    ----------
    genome_id : str
        Genome identifier; names the output directory and prefixes the
        renamed gene files.
    fasta_path : str
        Path to FASTA file to process.

    Returns
    -------
    tuple or None
        ``(aa_gene_file, nt_gene_file, gff_file, translation_table_file,
        best_translation_table)``. The nucleotide, GFF and translation
        table paths are None when ``self.proteins`` is truthy. None is
        returned instead of a tuple when Prodigal yields no summary and
        ``self.force`` is truthy.

    Raises
    ------
    GTDBTkExit
        If Prodigal yields no summary and ``self.force`` is falsy.
    """
    # Setup output files
    output_dir = os.path.join(self.marker_gene_dir, genome_id)
    aa_gene_file = os.path.join(output_dir,
                                genome_id + self.protein_file_suffix)
    nt_gene_file = None
    gff_file = None
    translation_table_file = None
    if not self.proteins:
        nt_gene_file = os.path.join(output_dir,
                                    genome_id + self.nt_gene_file_suffix)
        gff_file = os.path.join(output_dir,
                                genome_id + self.gff_file_suffix)
        translation_table_file = os.path.join(
            output_dir, 'prodigal' + TRANSLATION_TABLE_SUFFIX)

    # Return early if files are already done. The checks run in the same
    # order as before and stop at the first file that fails.
    expected_files = (aa_gene_file, nt_gene_file, gff_file,
                      translation_table_file)
    if not self.proteins and all(
            file_has_checksum(path) for path in expected_files):
        best_tln_table = -1
        with open(translation_table_file, 'r') as tln_f:
            for line in tln_f:
                cols = line.strip().split('\t')
                if cols[0] == 'best_translation_table':
                    try:
                        best_tln_table = int(cols[1])
                    except (IndexError, ValueError):
                        # A malformed row is handled like a missing one:
                        # fall through and call genes again.
                        best_tln_table = -1
                    break

        if best_tln_table > 0:
            self.logger.info(
                'Skipping result from a previous run: {}'.format(
                    genome_id))
            return (aa_gene_file, nt_gene_file, gff_file,
                    translation_table_file, best_tln_table)

    # Did not meet the conditions to skip processing this genome, call genes.
    # NOTE(review): the positional arguments look like (cpus, verbose) --
    # confirm against the BioLib Prodigal wrapper.
    prodigal = BioLibProdigal(1, False)
    summary_stats = prodigal.run([fasta_path],
                                 output_dir,
                                 called_genes=self.proteins)

    # An error occurred in BioLib Prodigal.
    if not summary_stats:
        if self.force:
            return None
        raise GTDBTkExit(
            "Prodigal failed to call genes for: {} "
            "(to skip these genomes, re-run with --force)".format(
                genome_id))

    # Only one genome was submitted, so take the first summary entry.
    summary_stats = next(iter(summary_stats.values()))

    # Rename output files to adhere to GTDB conventions and desired genome
    # ID, writing each file's checksum right after it is moved.
    renames = [(summary_stats.aa_gene_file, aa_gene_file)]
    if not self.proteins:
        renames.append((summary_stats.nt_gene_file, nt_gene_file))
        renames.append((summary_stats.gff_file, gff_file))
    for source, destination in renames:
        shutil.move(source, destination)
        with open(destination + CHECKSUM_SUFFIX, 'w') as f:
            f.write(sha256(destination))

    if not self.proteins:
        # Save translation table information. The path computed above is
        # reused so the file written here is the same one the early-return
        # check looks for.
        with open(translation_table_file, 'w') as fout:
            fout.write('%s\t%d\n' % ('best_translation_table',
                                     summary_stats.best_translation_table))
            fout.write('%s\t%.2f\n' % ('coding_density_4',
                                       summary_stats.coding_density_4 * 100))
            fout.write('%s\t%.2f\n' % ('coding_density_11',
                                       summary_stats.coding_density_11 * 100))
            fout.write('%s\t%.2f\n' % ('probability_4',
                                       summary_stats.probability_4 * 100))
            fout.write('%s\t%.2f\n' % ('probability_11',
                                       summary_stats.probability_11 * 100))
        with open(translation_table_file + CHECKSUM_SUFFIX, 'w') as f:
            f.write(sha256(translation_table_file))

    return (aa_gene_file, nt_gene_file, gff_file, translation_table_file,
            summary_stats.best_translation_table)