def _path_to_identify_data(self, identity_dir, warn=True):
    """Collect per-genome file paths written by the 'identify' command.

    Parameters
    ----------
    identity_dir : str
        Output directory of the 'identify' command. Its DIR_MARKER_GENE
        sub-directory is scanned, one sub-directory per genome.
    warn : bool
        When True, genomes excluded for having an empty protein file are
        reported through the logger and the warnings log.

    Returns
    -------
    dict
        Maps each retained genome ID to a dict with the keys
        'aa_gene_path', 'translation_table_path', 'nt_gene_path' and
        'gff_path'. Genomes whose protein file is empty are omitted.
    """
    genes_root = os.path.join(identity_dir, DIR_MARKER_GENE)
    paths_by_gid = {}
    empty_gids = []

    for entry in os.listdir(genes_root):
        entry_dir = os.path.join(genes_root, entry)
        if not os.path.isdir(entry_dir):
            # Only directories are treated as genomes.
            continue

        protein_path = os.path.join(entry_dir,
                                    entry + self.protein_file_suffix)

        # A non-empty protein file means at least some genes were called.
        if os.path.getsize(protein_path) >= 1:
            paths_by_gid[entry] = {
                'aa_gene_path': protein_path,
                'translation_table_path': TlnTableFile.get_path(entry_dir,
                                                                entry),
                'nt_gene_path': os.path.join(
                    entry_dir, entry + self.nt_gene_file_suffix),
                'gff_path': os.path.join(
                    entry_dir, entry + self.gff_file_suffix)
            }
            continue

        empty_gids.append(entry)

    # Nothing to report, or reporting was disabled by the caller.
    if not empty_gids or not warn:
        return paths_by_gid

    n_excluded = len(empty_gids)
    self.logger.warning(
        f'Excluding {n_excluded} genomes '
        f'in the identify directory which have no genes '
        f'called (see gtdbtk.warnings.log)')
    self.warnings.warning(
        f'Excluding the following {n_excluded} genomes '
        f'which were found in the identify directory '
        f'with no genes called.')
    for excluded_gid in empty_gids:
        self.warnings.info(excluded_gid)

    return paths_by_gid
def _run_prodigal(self, genome_id, fasta_path, usr_tln_table):
    """Call genes for one genome with Prodigal, reusing earlier output.

    Parameters
    ----------
    genome_id : str
        Genome identifier; names the output sub-directory and is the
        prefix of the protein, nucleotide and GFF output files.
    fasta_path : str
        Path to FASTA file to process.
    usr_tln_table : int
        User-specified translation table, None if automatic.

    Returns
    -------
    tuple or None
        ``(aa_gene_file, nt_gene_file, gff_file, translation_table_file,
        best_translation_table, skipped)``. ``skipped`` is True when all
        output files already had a checksum and Prodigal was not run,
        and False when Prodigal was run. None is returned when Prodigal
        produced no summary statistics and ``self.force`` is set.

    Raises
    ------
    Exception
        If Prodigal produced no summary statistics and ``self.force``
        is not set.
    """
    # Resolve every output location up front.
    genome_dir = os.path.join(self.marker_gene_dir, genome_id)
    aa_gene_file = os.path.join(genome_dir,
                                genome_id + self.protein_file_suffix)
    nt_gene_file = os.path.join(genome_dir,
                                genome_id + self.nt_gene_file_suffix)
    gff_file = os.path.join(genome_dir, genome_id + self.gff_file_suffix)
    translation_table_file = os.path.join(
        genome_dir, 'prodigal_translation_table.tsv')
    out_files = (nt_gene_file, gff_file,
                 translation_table_file, aa_gene_file)

    # If every output already carries a checksum, treat the genome as
    # processed and reuse the stored translation table.
    checksum_flags = [file_has_checksum(path) for path in out_files]
    if all(checksum_flags):
        cached_table = TlnTableFile(translation_table_file)
        cached_table.read()
        self.warnings.info(f'Skipped Prodigal processing for: {genome_id}')
        return (aa_gene_file, nt_gene_file, gff_file,
                translation_table_file, cached_table.best_tln_table, True)

    # Run Prodigal on this single FASTA file.
    runner = BioLibProdigal(1, False)
    run_results = runner.run([fasta_path],
                             genome_dir,
                             called_genes=False,
                             translation_table=usr_tln_table)

    # A falsy result signals that BioLib Prodigal hit an error.
    if not run_results:
        if not self.force:
            raise Exception(
                "An error was encountered while running Prodigal.")
        return None

    # Only one genome was submitted, so take the first entry.
    stats = run_results[next(iter(run_results.keys()))]

    # Rename the output files to adhere to GTDB conventions and the
    # desired genome ID.
    renames = (('aa_gene_file', aa_gene_file),
               ('nt_gene_file', nt_gene_file),
               ('gff_file', gff_file))
    for stats_attr, destination in renames:
        shutil.move(getattr(stats, stats_attr), destination)

    # Persist the translation table information; coding densities are
    # stored as percentages rounded to two decimals.
    best_table = stats.best_translation_table
    density_4 = round(stats.coding_density_4 * 100, 2)
    density_11 = round(stats.coding_density_11 * 100, 2)
    new_table = TlnTableFile(translation_table_file,
                             best_tln_table=best_table,
                             coding_density_4=density_4,
                             coding_density_11=density_11)
    new_table.write()

    # Write a checksum file next to each output so a later run can skip
    # this genome.
    for path in out_files:
        if path is None:
            continue
        with open(path + CHECKSUM_SUFFIX, 'w') as checksum_fh:
            checksum_fh.write(sha256(path))

    return (aa_gene_file, nt_gene_file, gff_file, translation_table_file,
            stats.best_translation_table, False)