def test_export_msa_bac(self):
    """Check that the ``export_msa`` CLI command exports the bacterial MSA.

    Runs ``gtdbtk export_msa --domain bac`` in a child process, requires a
    zero exit status, and then compares the value ``sha256`` returns for the
    exported file with the value it returns for ``CONCAT_BAC120``.
    """
    # Imported locally so the test does not rely on a module-level import.
    import sys

    path_output = os.path.join(self.dir_tmp, 'msa.faa')

    # Launch the CLI with the interpreter that is running this test. A bare
    # 'python' is resolved through PATH and may be a different interpreter
    # (without gtdbtk installed) or may not exist at all.
    args = [sys.executable, '-m', 'gtdbtk', 'export_msa',
            '--domain', 'bac', '--output', path_output]
    returncode = subprocess.call(args)
    self.assertEqual(returncode, 0)

    test_hash = sha256(path_output)
    true_hash = sha256(CONCAT_BAC120)
    self.assertEqual(test_hash, true_hash)
def _workerThread(self, queueIn, queueOut, n_skipped):
    """Process queued gene files until a ``None`` sentinel is received.

    For each gene file the genome ID is derived from the file name. If both
    the Pfam hit file and the top-hit file already have checksums the genome
    is skipped; otherwise PfamScan is run, its results are written, a
    checksum file is created and the top hits are identified.

    Parameters
    ----------
    queueIn
        Queue yielding gene file paths; a ``None`` item stops the worker.
    queueOut
        Queue that receives each gene file path once it has been handled,
        whether it was skipped or processed.
    n_skipped
        Shared counter providing ``get_lock()`` and ``value``; incremented
        once for every skipped genome.
    """
    # Exceptions are deliberately left to propagate to the caller unchanged.
    while True:
        gene_file = queueIn.get(block=True, timeout=None)
        if gene_file is None:
            break

        filename = os.path.basename(gene_file)
        genome_id = filename.replace(self.protein_file_suffix, '')
        output_hit_file = os.path.join(
            self.output_dir, genome_id,
            filename.replace(self.protein_file_suffix, self.pfam_suffix))

        # Check if this has already been processed.
        out_files = (output_hit_file,
                     TopHitPfamFile.get_path(self.output_dir, genome_id))
        if all(file_has_checksum(x) for x in out_files):
            self.warnings.info(f'Skipped Pfam processing for: {genome_id}')
            with n_skipped.get_lock():
                n_skipped.value += 1
        else:
            pfam_scan = PfamScan(cpu=self.cpus_per_genome,
                                 fasta=gene_file,
                                 dir=self.pfam_hmm_dir)
            pfam_scan.search()
            pfam_scan.write_results(output_hit_file, None, None, None, None)

            # calculate checksum
            with open(output_hit_file + self.checksum_suffix, 'w') as fh:
                fh.write(sha256(output_hit_file))

            # identify top hit for each gene
            self._topHit(output_hit_file)

        # Report the genome as handled in both the skipped and processed case.
        queueOut.put(gene_file)
def _workerThread(self, queueIn, queueOut):
    """Run hmmsearch against the TIGRFAM HMMs for each queued genome.

    Items are consumed until a ``None`` sentinel is received. A genome whose
    hit file and top-hit file both already have checksums is skipped;
    otherwise hmmsearch is executed, a checksum file is written for the hit
    file and the top hits are identified.

    Parameters
    ----------
    queueIn
        Queue yielding ``(genome_id, gene_file)`` pairs; ``None`` stops the
        worker.
    queueOut
        Queue that receives the gene file path of every handled genome,
        whether it was skipped or processed.

    Raises
    ------
    subprocess.CalledProcessError
        If hmmsearch exits with a non-zero status.
    OSError
        If the hmmsearch executable cannot be started.
    """
    # Imported locally so this block does not rely on a module-level import.
    import subprocess

    while True:
        queue_next = queueIn.get(block=True, timeout=None)
        if queue_next is None:
            break

        genome_id, gene_file = queue_next
        output_hit_file = os.path.join(
            self.output_dir, genome_id,
            '{}{}'.format(genome_id, self.tigrfam_suffix))
        output_tophit_file = os.path.join(
            self.output_dir, genome_id,
            '{}{}'.format(genome_id, self.tigrfam_top_hit_suffix))

        # Genome has already been processed
        if file_has_checksum(output_hit_file) and file_has_checksum(
                output_tophit_file):
            self.logger.info(
                'Skipping result from a previous run: {}'.format(genome_id))

        # Process this genome
        else:
            genome_dir = os.path.join(self.output_dir, genome_id)
            hmmsearch_out = os.path.join(
                genome_dir, '{}_tigrfam.out'.format(genome_id))
            make_sure_path_exists(genome_dir)

            # Pass an argument list instead of a shell string so that paths
            # containing spaces or shell metacharacters reach hmmsearch
            # verbatim and are never interpreted by a shell.
            args = ['hmmsearch',
                    '-o', hmmsearch_out,
                    '--tblout', output_hit_file,
                    '--noali', '--notextw', '--cut_nc',
                    '--cpu', '%d' % self.cpus_per_genome,
                    self.tigrfam_hmms, gene_file]

            # Fail before the checksum is written: a checksum next to the
            # output of a failed run would make the next run skip this genome.
            subprocess.check_call(args)

            # calculate checksum
            checksum = sha256(output_hit_file)
            with open(output_hit_file + self.checksum_suffix, 'w') as fout:
                fout.write(checksum)

            # identify top hit for each gene
            self._topHit(output_hit_file)

        # allow results to be processed or written to file
        queueOut.put(gene_file)
def _workerThread(self, queueIn, queueOut):
    """Consume ``(genome_id, gene_file)`` jobs and run hmmsearch on each.

    The worker stops when it receives ``None``. For every job a status tuple
    ``(returncode, genome_id, output, error)`` is placed on ``queueOut``:
    ``(0, genome_id, None, None)`` when the genome was skipped or processed
    successfully, or the hmmsearch return code and captured output when
    hmmsearch failed, after which the worker exits with that return code.
    """
    # A None item is the signal to stop consuming.
    for job in iter(lambda: queueIn.get(block=True, timeout=None), None):
        genome_id, gene_file = job

        hit_path = os.path.join(
            self.output_dir, genome_id,
            '{}{}'.format(genome_id, self.tigrfam_suffix))
        tophit_path = os.path.join(
            self.output_dir, genome_id,
            '{}{}'.format(genome_id, self.tigrfam_top_hit_suffix))

        needs_processing = not (file_has_checksum(hit_path)
                                and file_has_checksum(tophit_path))

        if needs_processing:
            work_dir = os.path.join(self.output_dir, genome_id)
            stdout_path = os.path.join(
                work_dir, '{}_tigrfam.out'.format(genome_id))
            make_sure_path_exists(work_dir)

            command = [
                'hmmsearch',
                '-o', stdout_path,
                '--tblout', hit_path,
                '--noali',
                '--notextw',
                '--cut_nc',
                '--cpu', str(self.cpus_per_genome),
                self.tigrfam_hmms,
                gene_file,
            ]
            # stderr is merged into stdout, so a single stream is captured.
            child = subprocess.Popen(command,
                                     stdout=subprocess.PIPE,
                                     stderr=subprocess.STDOUT)
            captured_out, captured_err = child.communicate()

            exit_code = child.returncode
            if exit_code != 0:
                # Report the failure to the consumer, then stop this worker.
                queueOut.put((exit_code, genome_id, captured_out, captured_err))
                sys.exit(exit_code)

            # Store the digest of the hit file alongside it.
            digest = sha256(hit_path)
            with open(hit_path + self.checksum_suffix, 'w') as digest_fh:
                digest_fh.write(digest)

            # Determine the best hit per gene.
            self._topHit(hit_path)
        else:
            # Results from an earlier run are present.
            self.logger.info(
                'Skipping result from a previous run: {}'.format(genome_id))

        # Signal that this genome is done so results can be handled.
        queueOut.put((0, genome_id, None, None))
def test_trim_msa__reference_mask_bac(self):
    """Check the trim_msa result when the bacterial reference mask is used.

    Copies ``Config.CONCAT_BAC120`` into the temporary directory as the
    untrimmed MSA, runs ``trim_msa`` with ``reference_mask='bac'`` and no
    mask file, and compares the digest of the output file with a fixed
    expected value.
    """
    path_untrimmed_msa = os.path.join(self.dir_tmp, 'untrimmed_msa.fasta')
    path_output = os.path.join(self.dir_tmp, 'trimmed_msa.fasta')
    shutil.copyfile(Config.CONCAT_BAC120, path_untrimmed_msa)

    # argparse.Namespace is the plain attribute container that parse_args()
    # produces; an ArgumentParser instance carries unrelated parser state.
    options = argparse.Namespace(
        # Required arguments
        untrimmed_msa=path_untrimmed_msa,
        output=path_output,
        # Mutex arguments
        mask_file=None,
        reference_mask='bac',
    )

    self.options_parser.trim_msa(options)

    # NOTE(review): the expected digest is 40 hex characters long, whereas a
    # SHA-256 hex digest has 64 - confirm what the sha256 helper returns.
    actual = sha256(path_output)
    expected = 'ae6e24e89540fed03b81436147f99bcd120d059a'
    self.assertEqual(actual, expected)
def test_trim_msa__reference_mask_arc(self):
    """Check the trim_msa result when the archaeal reference mask is used.

    Copies ``Config.CONCAT_AR122`` into the temporary directory as the
    untrimmed MSA, runs ``trim_msa`` with ``reference_mask='arc'`` and no
    mask file, and compares the digest of the output file with a fixed
    expected value.
    """
    path_untrimmed_msa = os.path.join(self.dir_tmp, 'untrimmed_msa.fasta')
    path_output = os.path.join(self.dir_tmp, 'trimmed_msa.fasta')
    shutil.copyfile(Config.CONCAT_AR122, path_untrimmed_msa)

    # argparse.Namespace is the plain attribute container that parse_args()
    # produces; an ArgumentParser instance carries unrelated parser state.
    options = argparse.Namespace(
        # Required arguments
        untrimmed_msa=path_untrimmed_msa,
        output=path_output,
        # Mutex arguments
        mask_file=None,
        reference_mask='arc',
    )

    self.options_parser.trim_msa(options)

    # NOTE(review): the expected digest is 40 hex characters long, whereas a
    # SHA-256 hex digest has 64 - confirm what the sha256 helper returns.
    actual = sha256(path_output)
    expected = '1146351be59ae8d27668256c5b2c425a6f38c37c'
    self.assertEqual(actual, expected)
def _run_prodigal(self, genome_id, fasta_path):
    """Call genes for one genome with Prodigal, reusing earlier results.

    If gene calling is enabled (``self.proteins`` is falsy) and all four
    output files already have checksums, the stored translation table is
    read back and the existing paths are returned without running Prodigal
    again. Otherwise Prodigal is run, its output files are moved to their
    final names, and a checksum file is written next to each of them.

    Parameters
    ----------
    genome_id : str
        Identifier used to name the output directory and output files.
    fasta_path : str
        Path to FASTA file to process.

    Returns
    -------
    tuple or None
        ``(aa_gene_file, nt_gene_file, gff_file, translation_table_file,
        best_translation_table)``. The nucleotide, GFF and translation
        table paths are ``None`` when ``self.proteins`` is set. ``None`` is
        returned when Prodigal produced no result and ``self.force`` is set.

    Raises
    ------
    GTDBTkExit
        If Prodigal produced no result and ``self.force`` is not set.
    """

    def write_checksum(path):
        # Store the digest of `path` in a sidecar file next to it.
        with open(path + CHECKSUM_SUFFIX, 'w') as f:
            f.write(sha256(path))

    # Setup output files
    output_dir = os.path.join(self.marker_gene_dir, genome_id)
    aa_gene_file = os.path.join(output_dir,
                                genome_id + self.protein_file_suffix)
    nt_gene_file = None
    gff_file = None
    translation_table_file = None
    if not self.proteins:
        nt_gene_file = os.path.join(output_dir,
                                    genome_id + self.nt_gene_file_suffix)
        gff_file = os.path.join(output_dir,
                                genome_id + self.gff_file_suffix)
        translation_table_file = os.path.join(
            output_dir, 'prodigal' + TRANSLATION_TABLE_SUFFIX)

    # Return early if files are already done
    if (not self.proteins
            and file_has_checksum(aa_gene_file)
            and file_has_checksum(nt_gene_file)
            and file_has_checksum(gff_file)
            and file_has_checksum(translation_table_file)):
        best_tln_table = -1
        with open(translation_table_file, 'r') as tln_f:
            for line in tln_f:
                cols = line.strip().split('\t')
                if cols[0] == 'best_translation_table':
                    best_tln_table = int(cols[1])
                    break

        # Only trust the previous run if a valid table was recorded.
        if best_tln_table > 0:
            self.logger.info(
                'Skipping result from a previous run: {}'.format(genome_id))
            return (aa_gene_file, nt_gene_file, gff_file,
                    translation_table_file, best_tln_table)

    # Did not meet the conditions to skip processing this genome, call genes.
    prodigal = BioLibProdigal(1, False)
    summary_stats = prodigal.run([fasta_path], output_dir,
                                 called_genes=self.proteins)

    # An error occurred in BioLib Prodigal.
    if not summary_stats:
        if self.force:
            return None
        raise GTDBTkExit(
            "Prodigal failed to call genes for: {} "
            "(to skip these genomes, re-run with --force)".format(genome_id))

    # A single FASTA file was submitted, so use the first (only) entry.
    summary_stats = list(summary_stats.values())[0]

    # rename output files to adhere to GTDB conventions and desired genome
    # ID
    shutil.move(summary_stats.aa_gene_file, aa_gene_file)
    write_checksum(aa_gene_file)

    if not self.proteins:
        shutil.move(summary_stats.nt_gene_file, nt_gene_file)
        write_checksum(nt_gene_file)

        shutil.move(summary_stats.gff_file, gff_file)
        write_checksum(gff_file)

        # save translation table information; this reuses the path that the
        # early-return check above reads, so the two cannot drift apart
        with open(translation_table_file, 'w') as fout:
            fout.write('%s\t%d\n' % ('best_translation_table',
                                     summary_stats.best_translation_table))
            # The remaining statistics are stored as percentages.
            for field in ('coding_density_4', 'coding_density_11',
                          'probability_4', 'probability_11'):
                fout.write('%s\t%.2f\n'
                           % (field, getattr(summary_stats, field) * 100))
        write_checksum(translation_table_file)

    return (aa_gene_file, nt_gene_file, gff_file, translation_table_file,
            summary_stats.best_translation_table)