def call_genes(self, options): """Call genes command""" self.logger.info('') self.logger.info('*******************************************************************************') self.logger.info(' [CompareM - call_genes] Identifying genes within genomes.') self.logger.info('*******************************************************************************') make_sure_path_exists(options.output_dir) genome_files = self._genome_files(options.genome_dir, options.genome_ext) if not genome_files: self.logger.warning(' [Warning] No genome files found. Check the --genome_ext flag used to identify genomes.') sys.exit() prodigal = Prodigal(options.cpus) summary_stats = prodigal.run(genome_files, False, options.force_table, False, options.output_dir) # write gene calling summary fout = open(os.path.join(options.output_dir, 'call_genes.summary.tsv'), 'w') fout.write('Genome Id\tSelected translation table\tTable 4 coding density\tTable 11 coding density\n') for genome_id, stats in summary_stats.iteritems(): fout.write('%s\t%d\t%.2f%%\t%.2f%%\n' % (genome_id, stats.best_translation_table, stats.coding_density_4, stats.coding_density_11)) fout.close() self.logger.info('') self.logger.info(' Identified genes written to: %s' % options.output_dir) self.time_keeper.print_time_stamp()
def call_genes(self, options): """Call genes command""" self.logger.info('') self.logger.info('*******************************************************************************') self.logger.info(' [RefineM - call_genes] Identifying genes within genomes.') self.logger.info('*******************************************************************************') check_dir_exists(options.genome_nt_dir) make_sure_path_exists(options.output_dir) genome_files = self._genome_files(options.genome_nt_dir, options.genome_ext) if not self._check_nuclotide_seqs(genome_files): self.logger.warning('[Warning] All files must contain nucleotide sequences.') sys.exit() # call genes in genomes prodigal = Prodigal(options.cpus) prodigal.run(genome_files, options.output_dir) self.logger.info(' Genes in genomes written to: %s' % options.output_dir) # call genes in unbinned scaffolds if options.unbinned_file: unbinned_output_dir = os.path.join(options.output_dir, 'unbinned') prodigal.run([options.unbinned_file], unbinned_output_dir, meta=True) self.logger.info(' Genes in unbinned scaffolds written to: %s' % unbinned_output_dir) self.time_keeper.print_time_stamp()
def call_genes(self, options): """Call genes command""" make_sure_path_exists(options.output_dir) genome_files = self._input_files(options.input_genomes, options.file_ext) prodigal = Prodigal(options.cpus, not options.silent) summary_stats = prodigal.run(genome_files, options.output_dir, False, options.force_table, False) # write gene calling summary fout = open(os.path.join(options.output_dir, 'call_genes.summary.tsv'), 'w') fout.write( 'Genome Id\tSelected translation table\tTable 4 coding density\tTable 11 coding density\n' ) for genome_id, stats in summary_stats.items(): fout.write('%s\t%d\t%.2f%%\t%.2f%%\n' % (genome_id, stats.best_translation_table, stats.coding_density_4, stats.coding_density_11)) fout.close() self.logger.info('Identified genes written to: %s' % options.output_dir)
def call_genes(self, options): """Call genes command""" make_sure_path_exists(options.output_dir) genome_files = self._input_files(options.input_genomes, options.file_ext) prodigal = Prodigal(options.cpus, not options.silent) summary_stats = prodigal.run(genome_files, options.output_dir, called_genes=False, translation_table=options.force_table, meta=False, closed_ends=True) # write gene calling summary fout = open(os.path.join(options.output_dir, 'call_genes.summary.tsv'), 'w') fout.write('Genome Id\tSelected translation table\tTable 4 coding density\tTable 11 coding density\n') for genome_id, stats in summary_stats.items(): fout.write('%s\t%d\t%.2f%%\t%.2f%%\n' % (genome_id, stats.best_translation_table, stats.coding_density_4, stats.coding_density_11)) fout.close() self.logger.info('Identified genes written to: %s' % options.output_dir)
def _runProdigal(self, fasta_path): """Run Prodigal. Parameters ---------- fasta_path : str Path to FASTA file to process. """ temp_dir, fasta_file = os.path.split(fasta_path) output_dir = os.path.join(temp_dir, self.userAnnotationDir) genome_id = fasta_file[0:fasta_file.rfind('_')] prodigal = BioLibProdigal(1, False) summary_stats = prodigal.run([fasta_path], output_dir) summary_stats = summary_stats[summary_stats.keys()[0]] # rename output files to adhere to GTDB conventions aa_gene_file = os.path.join( output_dir, genome_id + ConfigMetadata.PROTEIN_FILE_SUFFIX) shutil.move(summary_stats.aa_gene_file, aa_gene_file) nt_gene_file = os.path.join( output_dir, genome_id + ConfigMetadata.NT_GENE_FILE_SUFFIX) shutil.move(summary_stats.nt_gene_file, nt_gene_file) gff_file = os.path.join(output_dir, genome_id + ConfigMetadata.GFF_FILE_SUFFIX) shutil.move(summary_stats.gff_file, gff_file) # save translation table information translation_table_file = os.path.join( output_dir, 'prodigal_translation_table.tsv') fout = open(translation_table_file, 'w') fout.write( '%s\t%d\n' % ('best_translation_table', summary_stats.best_translation_table)) fout.write('%s\t%.2f\n' % ('coding_density_4', summary_stats.coding_density_4 * 100)) fout.write( '%s\t%.2f\n' % ('coding_density_11', summary_stats.coding_density_11 * 100)) fout.close() checksum = sha256(aa_gene_file) fout = open(aa_gene_file + ConfigMetadata.CHECKSUM_SUFFIX, 'w') fout.write(checksum) fout.close() return (aa_gene_file, nt_gene_file, gff_file, translation_table_file)
def call_genes(self, options): """Call genes command""" check_dir_exists(options.genome_nt_dir) make_sure_path_exists(options.output_dir) genome_files = self._genome_files(options.genome_nt_dir, options.genome_ext) if not self._check_nuclotide_seqs(genome_files): self.logger.warning('All files must contain nucleotide sequences.') sys.exit() # call genes in genomes prodigal = Prodigal(options.cpus) prodigal.run(genome_files, options.output_dir) self.logger.info('Genes in genomes written to: %s' % options.output_dir) # call genes in unbinned scaffolds if options.unbinned_file: unbinned_output_dir = os.path.join(options.output_dir, 'unbinned') prodigal.run([options.unbinned_file], unbinned_output_dir, meta=True) self.logger.info('Genes in unbinned scaffolds written to: %s' % unbinned_output_dir)
def _run_prodigal(self, genome_id, fasta_path): """Run Prodigal. Parameters ---------- fasta_path : str Path to FASTA file to process. """ output_dir = os.path.join(self.marker_gene_dir, genome_id) prodigal = BioLibProdigal(1, False) summary_stats = prodigal.run([fasta_path], output_dir, called_genes=self.proteins) summary_stats = summary_stats[summary_stats.keys()[0]] # rename output files to adhere to GTDB conventions and desired genome ID aa_gene_file = os.path.join(output_dir, genome_id + self.protein_file_suffix) shutil.move(summary_stats.aa_gene_file, aa_gene_file) nt_gene_file = None gff_file = None translation_table_file = None if not self.proteins: nt_gene_file = os.path.join(output_dir, genome_id + self.nt_gene_file_suffix) shutil.move(summary_stats.nt_gene_file, nt_gene_file) gff_file = os.path.join(output_dir, genome_id + self.gff_file_suffix) shutil.move(summary_stats.gff_file, gff_file) # save translation table information translation_table_file = os.path.join(output_dir, 'prodigal_translation_table.tsv') fout = open(translation_table_file, 'w') fout.write('%s\t%d\n' % ('best_translation_table', summary_stats.best_translation_table)) fout.write('%s\t%.2f\n' % ('coding_density_4', summary_stats.coding_density_4 * 100)) fout.write('%s\t%.2f\n' % ('coding_density_11', summary_stats.coding_density_11 * 100)) fout.close() return (aa_gene_file, nt_gene_file, gff_file, translation_table_file)
def _run_prodigal(self, genome_paths): """Run Prodigal on genomes.""" # get genome path and translation table for each file self.logger.info('Determining genomic file and translation table for each of the %d genomes.' % len(genome_paths)) genome_files = [] translation_table = {} for gid, gpath in genome_paths.items(): assembly_id = os.path.basename(os.path.normpath(gpath)) canonical_gid = assembly_id[0:assembly_id.find('_', 4)] genome_file = os.path.join(gpath, assembly_id + '_genomic.fna') if os.path.exists(genome_file): if os.stat(genome_file).st_size == 0: self.logger.warning('Genomic file appears to be empty: %s' % genome_file) continue genome_files.append(genome_file) else: self.logger.warning('Genomic file appears to be missing: %s' % genome_file) gff_file = os.path.join(gpath, assembly_id + '_genomic.gff') if os.path.exists(gff_file): if os.stat(gff_file).st_size == 0: self.logger.warning('GFF appears to be empty: %s' % gff_file) continue tt = self._parse_translation_table(gff_file) if tt: translation_table[canonical_gid] = tt else: translation_table[canonical_gid] = None self.logger.warning('Unable to determine translation table for: %s' % gff_file) sys.exit(-1) else: self.logger.warning('GFF appears to be missing: %s' % gff_file) sys.exit(-1) # run Prodigal on each genome self.logger.info('Running Prodigal on %d genomes.' % len(genome_paths)) prodigal = Prodigal(cpus=self.cpus) summary_stats = prodigal.run(genome_files, translation_table=translation_table, output_dir=self.tmp_dir) # move results into individual genome directories self.logger.info('Moving files and calculating checksums.') for genome_file in genome_files: genome_path, genome_id = ntpath.split(genome_file) genome_id = remove_extension(genome_id) canonical_gid = genome_id[0:genome_id.find('_', 4)] aa_gene_file = os.path.join(self.tmp_dir, genome_id + '_genes.faa') nt_gene_file = os.path.join(self.tmp_dir, genome_id + '_genes.fna') gff_file = os.path.join(self.tmp_dir, genome_id + '.gff') genome_root = genome_id[0:genome_id.find('_', 4)] prodigal_path = os.path.join(genome_path, 'prodigal') if not os.path.exists(prodigal_path): os.makedirs(prodigal_path) new_aa_gene_file = os.path.join(prodigal_path, genome_root + '_protein.faa') new_nt_gene_file = os.path.join(prodigal_path, genome_root + '_protein.fna') new_gff_file = os.path.join(prodigal_path, genome_root + '_protein.gff') os.system('mv %s %s' % (aa_gene_file, new_aa_gene_file)) os.system('mv %s %s' % (nt_gene_file, new_nt_gene_file)) os.system('mv %s %s' % (gff_file, new_gff_file)) # save translation table information translation_table_file = os.path.join(prodigal_path, 'prodigal_translation_table.tsv') fout = open(translation_table_file, 'w') if translation_table[canonical_gid]: fout.write('%s\t%d\t%s\n' % ('best_translation_table', summary_stats[genome_id].best_translation_table, 'used table specified by NCBI')) else: fout.write('%s\t%d\n' % ('best_translation_table', summary_stats[genome_id].best_translation_table)) fout.write('%s\t%.2f\n' % ('coding_density_4', summary_stats[genome_id].coding_density_4 * 100)) fout.write('%s\t%.2f\n' % ('coding_density_11', summary_stats[genome_id].coding_density_11 * 100)) fout.close() checksum = sha256(new_aa_gene_file) fout = open(new_aa_gene_file + '.sha256', 'w') fout.write(checksum) fout.close()
def run(self, input_dir, tmp_dir, threads): # get path to all unprocessed genome files print 'Reading genomes.' genome_files = [] for genome_dir in os.listdir(input_dir): cur_genome_dir = os.path.join(input_dir, genome_dir) if not os.path.isdir(cur_genome_dir): continue for assembly_id in os.listdir(cur_genome_dir): assembly_dir = os.path.join(cur_genome_dir, assembly_id) genome_id = assembly_id[0:assembly_id.find('_', 4)] # check if prodigal has already been called aa_gene_file = os.path.join(assembly_dir, 'prodigal', genome_id + '_protein.faa') if os.path.exists(aa_gene_file): # verify checksum checksum_file = aa_gene_file + '.sha256' if os.path.exists(checksum_file): checksum = sha256(aa_gene_file) cur_checksum = open(checksum_file).readline().strip() if checksum == cur_checksum: continue genome_file = os.path.join(assembly_dir, assembly_id + '_genomic.fna') if os.path.exists(genome_file): if os.stat(genome_file).st_size == 0: print '[Warning] Genome file appears to be empty: %s' % genome_file else: genome_files.append(genome_file) print ' Number of unprocessed genomes: %d' % len(genome_files) # run prodigal on each genome print 'Running prodigal.' prodigal = Prodigal(cpus=threads) summary_stats = prodigal.run(genome_files, output_dir=tmp_dir) # move results into individual genome directories print 'Moving files and calculating checksums.' for genome_file in genome_files: genome_path, genome_id = ntpath.split(genome_file) genome_id = remove_extension(genome_id) aa_gene_file = os.path.join(tmp_dir, genome_id + '_genes.faa') nt_gene_file = os.path.join(tmp_dir, genome_id + '_genes.fna') gff_file = os.path.join(tmp_dir, genome_id + '.gff') genome_root = genome_id[0:genome_id.find('_', 4)] prodigal_path = os.path.join(genome_path, 'prodigal') if not os.path.exists(prodigal_path): os.makedirs(prodigal_path) new_aa_gene_file = os.path.join(prodigal_path, genome_root + '_protein.faa') new_nt_gene_file = os.path.join(prodigal_path, genome_root + '_protein.fna') new_gff_file = os.path.join(prodigal_path, genome_root + '_protein.gff') os.system('mv %s %s' % (aa_gene_file, new_aa_gene_file)) os.system('mv %s %s' % (nt_gene_file, new_nt_gene_file)) os.system('mv %s %s' % (gff_file, new_gff_file)) # save translation table information translation_table_file = os.path.join( prodigal_path, 'prodigal_translation_table.tsv') fout = open(translation_table_file, 'w') fout.write('%s\t%d\n' % ('best_translation_table', summary_stats[genome_id].best_translation_table)) fout.write('%s\t%.2f\n' % ('coding_density_4', summary_stats[genome_id].coding_density_4 * 100)) fout.write('%s\t%.2f\n' % ('coding_density_11', summary_stats[genome_id].coding_density_11 * 100)) fout.close() checksum = sha256(new_aa_gene_file) fout = open(new_aa_gene_file + '.sha256', 'w') fout.write(checksum) fout.close()
def main(args): global debug if args["--debug"]: debug = True else: debug = None sys.excepthook = exceptionHandler nproc = int(args["--procs"]) ncpu = int(args["--cpus"]) min_len = int(args["--min-len"]) with open(args["<files>"], "r") as json_file: filename = json_file files = json.load(json_file) genome_files = files["genomes"] read_files = files["reads"] comms = list(files["reads"].keys()) tmp_dir = args["--tmp"] if not os.path.isdir(tmp_dir): os.makedirs(tmp_dir, exist_ok=True) tmp_damage = os.path.join(tmp_dir, "damage-aa") if not os.path.isdir(tmp_damage): os.makedirs(tmp_damage, exist_ok=True) out_dir = args["--out-dir"] if not os.path.isdir(out_dir): os.makedirs(out_dir, exist_ok=True) p_procs = nproc * ncpu if p_procs > len(genome_files): p_procs = len(genome_files) logging.info( "Predicting genes from genomes usin {} processes...".format(p_procs)) output_dir = os.path.join(tmp_dir, "gene_prediction") if not os.path.isdir(output_dir): os.makedirs(output_dir) prodigal = Prodigal(cpus=p_procs, verbose=True) gene_preds = prodigal.run( genome_files=genome_files, output_dir=output_dir, called_genes=False, translation_table=None, meta=False, closed_ends=False, ) gene_pred = {} for k in gene_preds.keys(): gene_pred[k] = { "faa": gene_preds[k].aa_gene_file, "fna": gene_preds[k].nt_gene_file, "translation_table": gene_preds[k].best_translation_table, } genome_ids = list(gene_pred.keys()) func = partial( pa.analyze_proteins, files=files, gene_predictions=gene_pred, min_len=min_len, outdir=tmp_damage, debug=debug, nproc=nproc, ) logging.info("Finding damage in codons...") comm_files = list(product(comms, genome_ids)) # if p_procs > len(comm_files): # p_procs = len(comm_files) if debug is True: data = list(map(func, comm_files)) else: p = MyPool(ncpu) data = list( tqdm.tqdm( p.imap_unordered(func, comm_files), total=len(comm_files), )) logging.info("Combining files...") p_procs = nproc * ncpu if p_procs > len(comms): p_procs = len(comms) func = partial(combine_files, tmp_damage=tmp_damage, out_dir=out_dir) if debug is True: ofiles = list(map(func, comms)) else: p = MyPool(p_procs) ofiles = list( tqdm.tqdm( p.imap_unordered(func, comms), total=len(comms), )) # for comm in comms: # out_suffix = ".tsv.gz" # fname = "{}_aa-damage".format(comm) # outfile = Path(out_dir, fname).with_suffix(out_suffix) # files = glob.glob(str(Path(tmp_damage, comm + "*"))) # li = [] # for file in files: # df = pd.read_csv(file, index_col=None, header=0, sep="\t") # li.append(df) # df = pd.concat(li, axis=0, ignore_index=True) # df.to_csv( # path_or_buf=outfile, # sep="\t", # header=True, # index=False, # compression="gzip", # ) logging.info("Protein analysis done.")
def run(self, input_dir, tmp_dir, threads): # get path to all unprocessed genome files print 'Reading genomes.' genome_files = [] for genome_dir in os.listdir(input_dir): cur_genome_dir = os.path.join(input_dir, genome_dir) if not os.path.isdir(cur_genome_dir): continue for assembly_id in os.listdir(cur_genome_dir): assembly_dir = os.path.join(cur_genome_dir, assembly_id) genome_id = assembly_id[0:assembly_id.find('_', 4)] # check if prodigal has already been called if False: # for safety, I am just recalling genes for all genomes right now, # but this is very efficient aa_gene_file = os.path.join(assembly_dir, 'prodigal', genome_id + '_protein.faa') if os.path.exists(aa_gene_file): # verify checksum checksum_file = aa_gene_file + '.sha256' if os.path.exists(checksum_file): checksum = sha256(aa_gene_file) cur_checksum = open(checksum_file).readline().strip() if checksum == cur_checksum: continue genome_file = os.path.join(assembly_dir, assembly_id + '_genomic.fna') if os.path.exists(genome_file): if os.stat(genome_file).st_size == 0: print '[Warning] Genome file appears to be empty: %s' % genome_file else: genome_files.append(genome_file) print ' Number of unprocessed genomes: %d' % len(genome_files) # run prodigal on each genome print 'Running prodigal.' prodigal = Prodigal(cpus=threads) summary_stats = prodigal.run(genome_files, output_dir=tmp_dir) # move results into individual genome directories print 'Moving files and calculating checksums.' for genome_file in genome_files: genome_path, genome_id = ntpath.split(genome_file) genome_id = remove_extension(genome_id) aa_gene_file = os.path.join(tmp_dir, genome_id + '_genes.faa') nt_gene_file = os.path.join(tmp_dir, genome_id + '_genes.fna') gff_file = os.path.join(tmp_dir, genome_id + '.gff') genome_root = genome_id[0:genome_id.find('_', 4)] prodigal_path = os.path.join(genome_path, 'prodigal') if not os.path.exists(prodigal_path): os.makedirs(prodigal_path) new_aa_gene_file = os.path.join(prodigal_path, genome_root + '_protein.faa') new_nt_gene_file = os.path.join(prodigal_path, genome_root + '_protein.fna') new_gff_file = os.path.join(prodigal_path, genome_root + '_protein.gff') os.system('mv %s %s' % (aa_gene_file, new_aa_gene_file)) os.system('mv %s %s' % (nt_gene_file, new_nt_gene_file)) os.system('mv %s %s' % (gff_file, new_gff_file)) # save translation table information translation_table_file = os.path.join(prodigal_path, 'prodigal_translation_table.tsv') fout = open(translation_table_file, 'w') fout.write('%s\t%d\n' % ('best_translation_table', summary_stats[genome_id].best_translation_table)) fout.write('%s\t%.2f\n' % ('coding_density_4', summary_stats[genome_id].coding_density_4 * 100)) fout.write('%s\t%.2f\n' % ('coding_density_11', summary_stats[genome_id].coding_density_11 * 100)) fout.close() checksum = sha256(new_aa_gene_file) fout = open(new_aa_gene_file + '.sha256', 'w') fout.write(checksum) fout.close()