def get_single_copy_hits_worker(job):
    """For a given genome, obtain the PFAM and TIGRFAM top hit files.
    Use this information to determine which hits are single copy.

    Parameters
    ----------
    job : Tuple[str, str, CopyNumberFile]
        The genome id, path to called genes, and domain-specific copy
        number file class.

    Returns
    -------
    Dict[str, Dict[str, str]]
        dict[marker id][genome id] = sequence
    """
    gid, aa_path, copy_number_file = job

    # Load the marker top hit files.
    marker_genes_dir = os.path.dirname(os.path.dirname(aa_path))
    pfam_tophit_file = TopHitPfamFile(marker_genes_dir, gid)
    tigr_tophit_file = TopHitTigrFile(marker_genes_dir, gid)
    pfam_tophit_file.read()
    tigr_tophit_file.read()

    # Process each of the genes to determine if they are single copy.
    cnf = copy_number_file('/dev/null', None)
    cnf.add_genome(gid, aa_path, pfam_tophit_file, tigr_tophit_file)
    single_copy = cnf.get_single_copy_hits(gid)

    # Store the output.
    out = defaultdict(dict)
    for marker_id, marker_d in single_copy.items():
        out[marker_id][gid] = marker_d['seq']
    return out
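# A minimal usage sketch, not from the source: fanning the worker above out
# over a multiprocessing pool and merging the per-marker results. The helper
# name, genome ids, and paths are illustrative assumptions.
import multiprocessing as mp
from collections import defaultdict


def collect_single_copy_hits(jobs, cpus=4):
    """Run get_single_copy_hits_worker over each job and merge the dicts."""
    merged = defaultdict(dict)
    with mp.Pool(processes=cpus) as pool:
        for result in pool.imap_unordered(get_single_copy_hits_worker, jobs):
            for marker_id, gid_to_seq in result.items():
                merged[marker_id].update(gid_to_seq)
    return merged


# e.g. jobs = [('genome_1', '/tmp/out/markers/genome_1/genome_1.faa',
#               CopyNumberFileBAC120)]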
def _report_identified_marker_genes(self, gene_dict, outdir, prefix):
    """Report statistics for identified marker genes."""

    # Summarise the copy number of each AR122 and BAC120 marker.
    tln_summary_file = TlnTableSummaryFile(outdir, prefix)
    ar122_copy_number_file = CopyNumberFileAR122(outdir, prefix)
    bac120_copy_number_file = CopyNumberFileBAC120(outdir, prefix)

    # Process each genome.
    for db_genome_id, info in sorted(gene_dict.items()):
        cur_marker_dir = os.path.join(outdir, DIR_MARKER_GENE)
        pfam_tophit_file = TopHitPfamFile(cur_marker_dir, db_genome_id)
        tigr_tophit_file = TopHitTigrFile(cur_marker_dir, db_genome_id)
        pfam_tophit_file.read()
        tigr_tophit_file.read()

        # Summarise each of the markers for this genome.
        ar122_copy_number_file.add_genome(db_genome_id, info.get("aa_gene_path"),
                                          pfam_tophit_file, tigr_tophit_file)
        bac120_copy_number_file.add_genome(db_genome_id, info.get("aa_gene_path"),
                                           pfam_tophit_file, tigr_tophit_file)

        # Write the best translation table to disk for this genome.
        tln_summary_file.add_genome(db_genome_id, info.get("best_translation_table"))

    # Write each of the summary files to disk.
    ar122_copy_number_file.write()
    bac120_copy_number_file.write()
    tln_summary_file.write()

    # Create symlinks to the summary files in the root directory.
    symlink_f(PATH_BAC120_MARKER_SUMMARY.format(prefix=prefix),
              os.path.join(outdir, os.path.basename(
                  PATH_BAC120_MARKER_SUMMARY.format(prefix=prefix))))
    symlink_f(PATH_AR122_MARKER_SUMMARY.format(prefix=prefix),
              os.path.join(outdir, os.path.basename(
                  PATH_AR122_MARKER_SUMMARY.format(prefix=prefix))))
    symlink_f(PATH_TLN_TABLE_SUMMARY.format(prefix=prefix),
              os.path.join(outdir, os.path.basename(
                  PATH_TLN_TABLE_SUMMARY.format(prefix=prefix))))
def test__merge_hit_files(self):
    pfam_th = TopHitPfamFile(self.dir_tmp, 'genome_1')
    pfam_th.add_hit('gene_a', 'PFAM_1', 0.05, 100)
    pfam_th.add_hit('gene_b', 'PFAM_2', 0.05, 200)
    pfam_th.add_hit('gene_c', 'PFAM_2', 0.05, 100)

    tigr_th = TopHitTigrFile(self.dir_tmp, 'genome_1')
    tigr_th.add_hit('gene_x', 'TIGR_1', 0.05, 100)

    expected = {'TIGR_1': [Hit('gene_x', 'TIGR_1', 0.05, 100)],
                'PFAM_1': [Hit('gene_a', 'PFAM_1', 0.05, 100)],
                'PFAM_2': [Hit('gene_b', 'PFAM_2', 0.05, 200),
                           Hit('gene_c', 'PFAM_2', 0.05, 100)]}
    self.assertDictEqual(expected,
                         CopyNumberFile._merge_hit_files(pfam_th, tigr_th))
def _workerThread(self, queueIn, queueOut, n_skipped):
    """Process each data item in parallel."""
    while True:
        gene_file = queueIn.get(block=True, timeout=None)
        if gene_file is None:
            break

        assembly_dir, filename = os.path.split(gene_file)
        genome_id = filename.replace(self.protein_file_suffix, '')
        genome_dir = os.path.join(self.output_dir, genome_id)
        output_hit_file = os.path.join(
            genome_dir,
            filename.replace(self.protein_file_suffix, self.tigrfam_suffix))
        hmmsearch_out = os.path.join(
            genome_dir,
            filename.replace(self.protein_file_suffix, '_tigrfam.out'))

        # Check if this genome has already been processed.
        out_files = (output_hit_file, hmmsearch_out,
                     TopHitTigrFile.get_path(self.output_dir, genome_id))
        if all(file_has_checksum(x) for x in out_files):
            self.warnings.info(f'Skipped TIGRFAM processing for: {genome_id}')
            with n_skipped.get_lock():
                n_skipped.value += 1
        else:
            args = ['hmmsearch', '-o', hmmsearch_out,
                    '--tblout', output_hit_file,
                    '--noali', '--notextw', '--cut_nc',
                    '--cpu', str(self.cpus_per_genome),
                    self.tigrfam_hmms, gene_file]
            p = subprocess.Popen(args, stdout=subprocess.PIPE, encoding='utf-8')
            stdout, stderr = p.communicate()
            if p.returncode != 0:
                raise GTDBTkExit(
                    f'Non-zero exit code returned when running hmmsearch: {stdout}')

            # Calculate the checksum for each output file.
            for out_file in (output_hit_file, hmmsearch_out):
                checksum = sha256(out_file)
                with open(out_file + self.checksum_suffix, 'w') as fh:
                    fh.write(checksum)

            # Identify the top hit for each gene.
            self._topHit(output_hit_file)

        # Allow results to be processed or written to file.
        queueOut.put(gene_file)
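# A minimal sketch, under assumptions about the surrounding manager code, of
# how a worker loop like _workerThread is typically wired up: jobs go onto an
# input queue, one None sentinel per worker terminates the loop, and the
# output queue is drained before joining. run_workers is a hypothetical name.
import multiprocessing as mp


def run_workers(worker_fn, gene_files, n_workers):
    queue_in = mp.Queue()
    queue_out = mp.Queue()
    n_skipped = mp.Value('i', 0)  # shared counter, used via get_lock()

    for gene_file in gene_files:
        queue_in.put(gene_file)
    for _ in range(n_workers):
        queue_in.put(None)  # sentinel: one per worker

    workers = [mp.Process(target=worker_fn,
                          args=(queue_in, queue_out, n_skipped))
               for _ in range(n_workers)]
    for p in workers:
        p.start()
    for _ in gene_files:
        queue_out.get()  # drain results so workers can exit cleanly
    for p in workers:
        p.join()
    return n_skipped.value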
def _topHit(self, tigrfam_file):
    """Determine the top hit to a TIGRFAM HMM for each gene.

    A gene is assigned to a single TIGRFAM family: the top hit among all
    TIGRFAM HMMs that also passes the model-specific threshold applied by
    hmmsearch.

    Parameters
    ----------
    tigrfam_file : str
        Name of the file containing hits to TIGRFAM HMMs.
    """
    assembly_dir, filename = os.path.split(tigrfam_file)
    genome_id = filename.replace(self.tigrfam_suffix, '')
    tophit_file = TopHitTigrFile(self.output_dir, genome_id)

    # Populate the top-hit file from the hmmsearch --tblout output.
    with open(tigrfam_file, 'r') as fh_tigrfam:
        for line in fh_tigrfam:
            if line[0] == '#':
                continue
            line_split = line.split()
            gene_id = line_split[0]
            hmm_id = line_split[3]
            evalue = float(line_split[4])
            bitscore = float(line_split[5])
            tophit_file.add_hit(gene_id, hmm_id, evalue, bitscore)

    # Write the top-hit file to disk and calculate its checksum.
    tophit_file.write()
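# For reference, a hedged sketch of the `hmmsearch --tblout` rows the parser
# above consumes (whitespace-delimited; the gene id and values are made up).
# Column 0 is the target (gene) name, column 3 the query (HMM) accession, and
# columns 4-5 the full-sequence E-value and bit score:
#
#   # target name  accession  query name  accession   E-value  score  bias ...
#   gene_1         -          TIGR00006   TIGR00006   1.2e-50  170.3   0.0 ...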
class TestTopHitTigrFile(unittest.TestCase):

    def setUp(self):
        self.dir_tmp = tempfile.mkdtemp(prefix='gtdbtk_tmp_')
        self.th = TopHitTigrFile(self.dir_tmp, 'g')

    def tearDown(self):
        self.th = None
        shutil.rmtree(self.dir_tmp)

    def test___init__(self):
        self.assertEqual(
            os.path.join(self.dir_tmp, 'g', f'g{TIGRFAM_TOP_HIT_SUFFIX}'),
            self.th.path)

    def test_add_hit(self):
        self.th.add_hit('a', 'b', 0.1, 20)
        self.th.add_hit('a', 'c', 0.1, 10)
        self.th.add_hit('x', 'y', 0.1, 20)
        self.th.add_hit('x', 'z', 0.0, 20)
        expected = {('a', Hit('a', 'b', 0.1, 20)),
                    ('x', Hit('x', 'z', 0.0, 20))}
        self.assertSetEqual(expected, set(self.th.iter_hits()))
def _workerThread(self, queueIn, queueOut, n_skipped):
    """Process each data item in parallel."""
    while True:
        gene_file = queueIn.get(block=True, timeout=None)
        if gene_file is None:
            break

        assembly_dir, filename = os.path.split(gene_file)
        genome_id = filename.replace(self.protein_file_suffix, '')
        genome_dir = os.path.join(self.output_dir, genome_id)
        output_hit_file = os.path.join(
            genome_dir,
            filename.replace(self.protein_file_suffix, self.tigrfam_suffix))
        hmmsearch_out = os.path.join(
            genome_dir,
            filename.replace(self.protein_file_suffix, '_tigrfam.out'))

        # Check if this genome has already been processed.
        out_files = (output_hit_file, hmmsearch_out,
                     TopHitTigrFile.get_path(self.output_dir, genome_id))
        if all(file_has_checksum(x) for x in out_files):
            self.warnings.info(f'Skipped TIGRFAM processing for: {genome_id}')
            with n_skipped.get_lock():
                n_skipped.value += 1
        else:
            cmd = 'hmmsearch -o %s --tblout %s --noali --notextw --cut_nc --cpu %d %s %s' % (
                hmmsearch_out, output_hit_file, self.cpus_per_genome,
                self.tigrfam_hmms, gene_file)
            os.system(cmd)

            # Calculate the checksum for each output file.
            for out_file in (output_hit_file, hmmsearch_out):
                checksum = sha256(out_file)
                with open(out_file + self.checksum_suffix, 'w') as fh:
                    fh.write(checksum)

            # Identify the top hit for each gene.
            self._topHit(output_hit_file)

        # Allow results to be processed or written to file.
        queueOut.put(gene_file)
def _report_identified_marker_genes(self, gene_dict, outdir, prefix,
                                    write_single_copy_genes):
    """Report statistics for identified marker genes."""

    # Summarise the copy number of each AR53 and BAC120 marker.
    tln_summary_file = TlnTableSummaryFile(outdir, prefix)
    ar53_copy_number_file = CopyNumberFileAR53(outdir, prefix)
    bac120_copy_number_file = CopyNumberFileBAC120(outdir, prefix)

    # Process each genome.
    for db_genome_id, info in tqdm_log(sorted(gene_dict.items()), unit='genome'):
        cur_marker_dir = os.path.join(outdir, DIR_MARKER_GENE)
        pfam_tophit_file = TopHitPfamFile(cur_marker_dir, db_genome_id)
        tigr_tophit_file = TopHitTigrFile(cur_marker_dir, db_genome_id)
        pfam_tophit_file.read()
        tigr_tophit_file.read()

        # Summarise each of the markers for this genome.
        ar53_copy_number_file.add_genome(db_genome_id, info.get("aa_gene_path"),
                                         pfam_tophit_file, tigr_tophit_file)
        bac120_copy_number_file.add_genome(db_genome_id, info.get("aa_gene_path"),
                                           pfam_tophit_file, tigr_tophit_file)

        # Write the best translation table to disk for this genome.
        tln_summary_file.add_genome(db_genome_id, info.get("best_translation_table"))

    # Write each of the summary files to disk.
    ar53_copy_number_file.write()
    bac120_copy_number_file.write()
    tln_summary_file.write()

    # Create a symlink to store the summary files in the root.
    # symlink_f(PATH_BAC120_MARKER_SUMMARY.format(prefix=prefix),
    #           os.path.join(outdir, os.path.basename(PATH_BAC120_MARKER_SUMMARY.format(prefix=prefix))))
    # symlink_f(PATH_AR53_MARKER_SUMMARY.format(prefix=prefix),
    #           os.path.join(outdir, os.path.basename(PATH_AR53_MARKER_SUMMARY.format(prefix=prefix))))
    # symlink_f(PATH_TLN_TABLE_SUMMARY.format(prefix=prefix),
    #           os.path.join(outdir, os.path.basename(PATH_TLN_TABLE_SUMMARY.format(prefix=prefix))))
    symlink_f(PATH_FAILS.format(prefix=prefix),
              os.path.join(outdir,
                           os.path.basename(PATH_FAILS.format(prefix=prefix))))

    # Write the single-copy AR53/BAC120 FASTA files to disk.
    if write_single_copy_genes:
        fasta_dir = os.path.join(outdir, DIR_IDENTIFY_FASTA)
        self.logger.info(f'Writing unaligned single-copy genes to: {fasta_dir}')

        # Iterate over each domain.
        marker_doms = list()
        marker_doms.append((Config.AR53_MARKERS['PFAM'] +
                            Config.AR53_MARKERS['TIGRFAM'],
                            ar53_copy_number_file, 'ar53'))
        marker_doms.append((Config.BAC120_MARKERS['PFAM'] +
                            Config.BAC120_MARKERS['TIGRFAM'],
                            bac120_copy_number_file, 'bac120'))
        for marker_names, marker_file, marker_d in marker_doms:

            # Create the domain-specific subdirectory.
            fasta_d_dir = os.path.join(fasta_dir, marker_d)
            make_sure_path_exists(fasta_d_dir)

            # Iterate over each marker.
            for marker_name in marker_names:
                # Strip the file extension. Note: str.rstrip is unsuitable
                # here, as it strips any trailing characters in the given
                # set rather than the literal suffix.
                marker_name = marker_name.replace('.HMM', '').replace('.hmm', '')
                marker_path = os.path.join(fasta_d_dir, f'{marker_name}.fa')

                to_write = list()
                for genome_id in sorted(gene_dict):
                    unq_hits = marker_file.get_single_copy_hits(genome_id)
                    if marker_name in unq_hits:
                        to_write.append(f'>{genome_id}')
                        to_write.append(unq_hits[marker_name]['seq'])

                if len(to_write) > 0:
                    with open(marker_path, 'w') as fh:
                        fh.write('\n'.join(to_write))
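# Illustrative sketch of the layout written above when write_single_copy_genes
# is set; the directory below stands in for DIR_IDENTIFY_FASTA and the marker
# file names are examples, not taken from a real run:
#
#   <outdir>/identify/fasta/
#       ar53/TIGR00006.fa
#       bac120/PF01025.fa
#       ...
#
# Each .fa file holds one unaligned sequence per genome in which that marker
# was identified exactly once (single copy).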
def _run_multi_align(self, db_genome_id, path, marker_set_id):
    """Return the concatenated marker sequence for a specific genome.

    :param db_genome_id: Selected genome.
    :param path: Path to the genomic FASTA file for the genome.
    :param marker_set_id: Unique ID of the marker set to use for alignment.
    """
    cur_marker_dir = os.path.dirname(os.path.dirname(path))
    pfam_tophit_file = TopHitPfamFile(cur_marker_dir, db_genome_id)
    tigr_tophit_file = TopHitTigrFile(cur_marker_dir, db_genome_id)
    pfam_tophit_file.read()
    tigr_tophit_file.read()

    if marker_set_id == 'bac120':
        copy_number_file = CopyNumberFileBAC120('/dev/null', None)
    elif marker_set_id == 'ar122':
        copy_number_file = CopyNumberFileAR122('/dev/null', None)
    else:
        raise GTDBTkException('Unknown marker set.')

    copy_number_file.add_genome(db_genome_id, path, pfam_tophit_file,
                                tigr_tophit_file)
    single_copy_hits = copy_number_file.get_single_copy_hits(db_genome_id)

    # Gather information for all marker genes.
    marker_paths = {
        "PFAM": os.path.join(self.pfam_hmm_dir, 'individual_hmms'),
        "TIGRFAM": os.path.join(os.path.dirname(self.tigrfam_hmm_dir),
                                'individual_hmms')
    }
    marker_dict_original = {}
    if marker_set_id == "bac120":
        for db_marker in sorted(self.bac120_markers):
            marker_dict_original.update(
                {marker.replace(".HMM", "").replace(".hmm", ""):
                 os.path.join(marker_paths[db_marker], marker)
                 for marker in self.bac120_markers[db_marker]})
    elif marker_set_id == "ar122":
        for db_marker in sorted(self.ar122_markers):
            marker_dict_original.update(
                {marker.replace(".HMM", "").replace(".hmm", ""):
                 os.path.join(marker_paths[db_marker], marker)
                 for marker in self.ar122_markers[db_marker]})
    elif marker_set_id == "rps23":
        # Note: unreachable, as unknown marker sets raise above.
        for db_marker in sorted(self.rps23_markers):
            marker_dict_original.update(
                {marker.replace(".HMM", "").replace(".hmm", ""):
                 os.path.join(marker_paths[db_marker], marker)
                 for marker in self.rps23_markers[db_marker]})

    # Iterate over each of the expected markers and store the gene sequence.
    gene_dict = dict()
    result_align = dict()
    for marker_id, marker_path in marker_dict_original.items():
        hit = single_copy_hits.get(marker_id)
        if hit:
            gene_dict[marker_id] = {"marker_path": marker_path,
                                    "gene": hit['hit'].gene_id,
                                    "gene_seq": hit['seq'],
                                    "bitscore": hit['hit'].bit_score}
        else:
            # Missing markers are filled with gaps spanning the HMM length.
            hmm_len = self._get_hmm_size(marker_path)
            result_align[marker_id] = '-' * hmm_len

    # Align the markers.
    result_align.update(self._run_align(gene_dict, db_genome_id))

    # Concatenate the aligned markers (sorted by marker id) and associate
    # them with the genome.
    return ''.join([x[1] for x in sorted(result_align.items())])
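# The gap-fill step above depends on _get_hmm_size; a minimal sketch of one
# plausible implementation (an assumption, the real method may differ):
# HMMER3 profile files record the model length on their 'LENG' header line.
def _get_hmm_size(self, path):
    """Return the model length (LENG) from a HMMER3 profile file."""
    with open(path) as fh:
        for line in fh:
            if line.startswith('LENG'):
                return int(line.split()[1])
    raise GTDBTkException(f'Malformed HMM file, no LENG field: {path}')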
def setUp(self):
    self.dir_tmp = tempfile.mkdtemp(prefix='gtdbtk_tmp_')

    # Build a test set of markers.
    self.test_markers = {"PFAM": ["PFAM_1.hmm", "PFAM_2.hmm", "PFAM_3.hmm"],
                         "TIGRFAM": ["TIGR_1.HMM", "TIGR_2.HMM"]}

    # Build the copy number file.
    self.cn_path = os.path.join(self.dir_tmp, 'cn.tsv')
    self.cn = CopyNumberFile(self.cn_path, 'test5', self.test_markers)

    # Store the hit files.
    self.pfam_th = dict()
    self.tigr_th = dict()
    self.faa = dict()

    # Test case for genome_1:
    #   Single Copy:  PFAM_1, TIGR_1
    #   Multi-Unique: PFAM_2
    #   Multi-Copy:   None
    #   Missing:      PFAM_3, TIGR_2
    self.pfam_th['genome_1'] = TopHitPfamFile(self.dir_tmp, 'genome_1')
    self.pfam_th['genome_1'].add_hit('gene_a', 'PFAM_1', 0.05, 100)
    self.pfam_th['genome_1'].add_hit('gene_b', 'PFAM_2', 0.05, 200)
    self.pfam_th['genome_1'].add_hit('gene_c', 'PFAM_2', 0.05, 100)
    self.pfam_th['genome_1'].write()

    self.tigr_th['genome_1'] = TopHitTigrFile(self.dir_tmp, 'genome_1')
    self.tigr_th['genome_1'].add_hit('gene_x', 'TIGR_1', 0.05, 100)

    self.faa['genome_1'] = os.path.join(self.dir_tmp, 'genome_1.faa')
    with open(self.faa['genome_1'], 'w') as fh:
        fh.write('>gene_a\n')
        fh.write('VVVVVV\n')
        fh.write('>gene_b\n')
        fh.write('AAVVPP\n')
        fh.write('>gene_c\n')
        fh.write('AAVVPP\n')
        fh.write('>gene_x\n')
        fh.write('AAAAAA\n')

    # Test case for genome_2:
    #   Single Copy:  PFAM_2
    #   Multi-Unique: TIGR_1, TIGR_2
    #   Multi-Copy:   PFAM_1, PFAM_3
    #   Missing:      None
    self.pfam_th['genome_2'] = TopHitPfamFile(self.dir_tmp, 'genome_2')
    self.pfam_th['genome_2'].add_hit('gene_a', 'PFAM_2', 0.05, 100)
    self.pfam_th['genome_2'].add_hit('gene_w', 'PFAM_1', 0.05, 100)
    self.pfam_th['genome_2'].add_hit('gene_z', 'PFAM_1', 0.05, 100)
    self.pfam_th['genome_2'].add_hit('gene_y', 'PFAM_3', 0.05, 100)
    self.pfam_th['genome_2'].add_hit('gene_x', 'PFAM_3', 0.05, 100)
    self.pfam_th['genome_2'].write()

    self.tigr_th['genome_2'] = TopHitTigrFile(self.dir_tmp, 'genome_2')
    self.tigr_th['genome_2'].add_hit('gene_w', 'TIGR_1', 0.05, 100)
    self.tigr_th['genome_2'].add_hit('gene_x', 'TIGR_1', 0.05, 200)
    self.tigr_th['genome_2'].add_hit('gene_y', 'TIGR_2', 0.01, 100)
    self.tigr_th['genome_2'].add_hit('gene_z', 'TIGR_2', 0.05, 100)

    self.faa['genome_2'] = os.path.join(self.dir_tmp, 'genome_2.faa')
    with open(self.faa['genome_2'], 'w') as fh:
        fh.write('>gene_a\n')
        fh.write('VVVVVV\n')
        fh.write('>gene_w\n')
        fh.write('AAAVVV\n')
        fh.write('>gene_x\n')
        fh.write('AAAVVV\n')
        fh.write('>gene_y\n')
        fh.write('VVVAAA\n')
        fh.write('>gene_z\n')
        fh.write('VVVAAA\n')

    # Add the genomes.
    for gid in self.faa:
        self.cn.add_genome(gid, self.faa[gid], pfam_th=self.pfam_th[gid],
                           tigr_th=self.tigr_th[gid])