def align_marker_set(self, db_genome_ids, marker_set_id):
    """Threaded alignment using hmmalign for a given set of genomes.

    One job per genome is queued for ``self.threads`` worker processes
    (``self._worker``). A single writer process (``self._writer``) is started
    alongside them and is sent ``None`` once every worker has been joined.

    Parameters
    ----------
    db_genome_ids : dict
        A dictionary containing the genome ids and aa paths to process.
        Each value is a dict from which ``'aa_gene_path'`` is read.
    marker_set_id : str
        The marker set of these genomes (bac120/ar122).

    Returns
    -------
    dict
        A dictionary of genome_ids -> aligned sequence.

    Raises
    ------
    GTDBTkException
        If a worker process finishes with a non-zero exit code.
    """
    q_worker = mp.Queue()
    q_writer = mp.Queue()

    for gid, gid_dict in db_genome_ids.items():
        q_worker.put((gid, gid_dict.get('aa_gene_path'), marker_set_id))

    # One ``None`` per worker process; presumably the stop signal consumed by
    # ``self._worker`` (not visible here -- confirm against its implementation).
    for _ in range(self.threads):
        q_worker.put(None)

    # Results are collected in a manager-backed dict shared with the workers.
    manager = mp.Manager()
    out_dict = manager.dict()

    p_workers = [
        mp.Process(target=self._worker,
                   args=(q_worker, q_writer, out_dict))
        for _ in range(self.threads)
    ]
    p_writer = mp.Process(target=self._writer,
                          args=(q_writer, len(db_genome_ids)))

    try:
        p_writer.start()
        for p_worker in p_workers:
            p_worker.start()

        for p_worker in p_workers:
            p_worker.join()

            # Gracefully terminate the program: the first failed worker
            # aborts the run; the handler below stops everything else.
            if p_worker.exitcode != 0:
                raise GTDBTkException(
                    'hmmalign returned a non-zero exit code.')

        # All workers are done, so tell the writer to finish.
        q_writer.put(None)
        p_writer.join()

    except Exception:
        for p in p_workers:
            p.terminate()
        p_writer.terminate()
        raise

    # Copy the proxy's contents into a plain dict before returning.
    return dict(out_dict.items())
def run(self, fastani_verification, genomes):
    """Using the instance defined number of CPUs, run FastANI.

    One job per entry of ``fastani_verification`` is queued for ``self.cpus``
    worker processes (``self._worker``). A single writer process
    (``self._writer``) is started alongside them and is sent ``None`` once
    every worker has been joined.

    Parameters
    ----------
    fastani_verification : dict
        Maps each user leaf to its potential nodes; every (key, value) pair
        becomes one job on the worker queue.
    genomes
        Passed through unchanged to every worker process.

    Returns
    -------
    dict
        A plain-dict copy of the manager dict handed to the workers.

    Raises
    ------
    GTDBTkException
        If a worker process finishes with a non-zero exit code.
    """
    q_worker = mp.Queue()
    q_writer = mp.Queue()

    for userleaf, potential_nodes in fastani_verification.items():
        q_worker.put((userleaf, potential_nodes))

    # One ``None`` per worker process; presumably the stop signal consumed by
    # ``self._worker`` (not visible here -- confirm against its implementation).
    for _ in range(self.cpus):
        q_worker.put(None)

    # Results are collected in a manager-backed dict shared with the workers.
    manager = mp.Manager()
    dict_out = manager.dict()

    p_workers = [
        mp.Process(target=self._worker,
                   args=(q_worker, q_writer, genomes, dict_out))
        for _ in range(self.cpus)
    ]
    p_writer = mp.Process(target=self._writer,
                          args=(q_writer, len(fastani_verification)))

    try:
        p_writer.start()
        for p_worker in p_workers:
            p_worker.start()

        for p_worker in p_workers:
            p_worker.join()

            # Gracefully terminate the program: the first failed worker
            # aborts the run; the handler below stops everything else.
            if p_worker.exitcode != 0:
                raise GTDBTkException('FastANI returned a non-zero exit code.')

        # All workers are done, so tell the writer to finish.
        q_writer.put(None)
        p_writer.join()

    except Exception:
        for p in p_workers:
            p.terminate()
        p_writer.terminate()
        raise

    # Copy the proxy's contents into a plain dict before returning.
    return dict(dict_out.items())
def trim_msa(self, untrimmed_msa, mask_type, maskid, output_file):
    """Trim the multiple sequence alignment using a mask.

    The mask is the first line of the mask file; only alignment columns
    whose mask character is ``'1'`` are kept. The trimmed sequences are
    written to ``output_file`` in FASTA format.

    Parameters
    ----------
    untrimmed_msa : str
        The path to the untrimmed MSA.
    mask_type : str
        Which mask should be used: 'reference' or 'file' (user specified).
    maskid : str
        'bac' or 'arc' when ``mask_type`` is 'reference'; the path to the
        mask file when ``mask_type`` is 'file'.
    output_file : str
        The path to the output trimmed MSA.

    Raises
    ------
    GTDBTkException
        If the ``mask_type`` / ``maskid`` combination is not recognised.
    IndexError
        If a sequence is too short to contain a retained mask column.
    """
    if maskid == 'bac' and mask_type == 'reference':
        mask = os.path.join(Config.MASK_DIR, Config.MASK_BAC120)
    elif maskid == 'arc' and mask_type == 'reference':
        mask = os.path.join(Config.MASK_DIR, Config.MASK_AR122)
    elif mask_type == 'file':
        mask = maskid
    else:
        msg = 'Command not understood.'
        self.logger.error(msg)
        raise GTDBTkException(msg)

    with open(mask, 'r') as f:
        maskstr = f.readline()

    # The retained columns are the same for every genome, so resolve them
    # once instead of re-scanning the mask per sequence. The trailing
    # newline from readline() is never '1' and is therefore ignored.
    keep_cols = [i for i, flag in enumerate(maskstr) if flag == '1']

    with open(output_file, 'w') as outfwriter:
        dict_genomes = read_fasta(untrimmed_msa, False)
        for k, v in dict_genomes.items():
            aligned_seq = ''.join([v[i] for i in keep_cols])
            outfwriter.write(">%s\n%s\n" % (k, aligned_seq))
def _run_multi_align(self, db_genome_id, path, marker_set_id):
    """
    Returns the concatenated marker sequence for a specific genome

    Markers with a single-copy hit are aligned via ``self._run_align``;
    markers without one are filled with ``'-'`` repeated to the size reported
    by ``self._get_hmm_size``. The per-marker results are concatenated in
    sorted marker-id order.

    :param db_genome_id: Selected genome
    :param path: Path to the genomic fasta file for the genome; the directory
        two levels above it is used to locate the Pfam/TIGRFAM top-hit files
    :param marker_set_id: Unique ID of marker set to use for alignment
        ('bac120' or 'ar122')
    :return: The concatenated aligned marker sequence (str)
    :raises GTDBTkException: If ``marker_set_id`` is neither 'bac120' nor
        'ar122'
    """
    cur_marker_dir = os.path.dirname(os.path.dirname(path))
    pfam_tophit_file = TopHitPfamFile(cur_marker_dir, db_genome_id)
    tigr_tophit_file = TopHitTigrFile(cur_marker_dir, db_genome_id)
    pfam_tophit_file.read()
    tigr_tophit_file.read()

    # Select the copy-number helper and the marker collection together. Any
    # other marker set is rejected here, so no further branching on
    # marker_set_id is needed below.
    if marker_set_id == 'bac120':
        copy_number_file = CopyNumberFileBAC120('/dev/null', None)
        markers_by_db = self.bac120_markers
    elif marker_set_id == 'ar122':
        copy_number_file = CopyNumberFileAR122('/dev/null', None)
        markers_by_db = self.ar122_markers
    else:
        raise GTDBTkException('Unknown marker set.')

    copy_number_file.add_genome(db_genome_id, path,
                                pfam_tophit_file, tigr_tophit_file)
    single_copy_hits = copy_number_file.get_single_copy_hits(db_genome_id)

    # gather information for all marker genes
    marker_paths = {
        "PFAM": os.path.join(self.pfam_hmm_dir, 'individual_hmms'),
        "TIGRFAM": os.path.join(os.path.dirname(self.tigrfam_hmm_dir),
                                'individual_hmms')
    }

    # Map marker id (HMM file name without its extension) -> HMM file path.
    marker_dict_original = {}
    for db_marker in sorted(markers_by_db):
        for marker in markers_by_db[db_marker]:
            marker_id = marker.replace(".HMM", "").replace(".hmm", "")
            marker_dict_original[marker_id] = os.path.join(
                marker_paths[db_marker], marker)

    # Iterate over each of the expected markers and store the gene sequence.
    gene_dict = dict()
    result_align = dict()
    for marker_id, marker_path in marker_dict_original.items():
        hit = single_copy_hits.get(marker_id)
        if hit:
            gene_dict[marker_id] = {
                "marker_path": marker_path,
                "gene": hit['hit'].gene_id,
                "gene_seq": hit['seq'],
                "bitscore": hit['hit'].bit_score
            }
        else:
            # No single-copy hit: pad with gaps to the HMM's size.
            hmm_len = self._get_hmm_size(marker_path)
            result_align[marker_id] = '-' * hmm_len

    # Align the markers.
    result_align.update(self._run_align(gene_dict, db_genome_id))

    # we concatenate the aligned markers together and associate them with
    # the genome.
    return ''.join(seq for _, seq in sorted(result_align.items()))