Exemple #1
0
    def align_marker_set(self, db_genome_ids, marker_set_id):
        """Align the markers of each genome with hmmalign, in parallel.

        One job per genome is distributed over ``self.threads`` worker
        processes (``self._worker``). A separate writer process
        (``self._writer``) is fed through its own queue and receives the
        number of genomes being processed.

        Parameters
        ----------
        db_genome_ids : dict
            Maps each genome id to a dictionary providing that genome's
            ``aa_gene_path`` entry.
        marker_set_id : str
            The marker set of these genomes (bac120/ar122).

        Returns
        -------
        dict
            A dictionary of genome_ids -> aligned sequence.

        Raises
        ------
        GTDBTkException
            If any worker process finishes with a non-zero exit code.
        """
        job_queue = mp.Queue()
        writer_queue = mp.Queue()

        # One job per genome, followed by one ``None`` entry per worker.
        for genome_id, genome_info in db_genome_ids.items():
            job = (genome_id, genome_info.get('aa_gene_path'), marker_set_id)
            job_queue.put(job)
        for _ in range(self.threads):
            job_queue.put(None)

        # Results are collected in a manager-backed dict shared with workers.
        manager = mp.Manager()
        shared_results = manager.dict()

        workers = []
        for _ in range(self.threads):
            workers.append(
                mp.Process(target=self._worker,
                           args=(job_queue, writer_queue, shared_results)))

        writer = mp.Process(target=self._writer,
                            args=(writer_queue, len(db_genome_ids)))

        try:
            writer.start()
            for worker in workers:
                worker.start()

            # Wait for the workers in order, aborting at the first failure.
            for worker in workers:
                worker.join()
                if worker.exitcode != 0:
                    raise GTDBTkException(
                        'hmmalign returned a non-zero exit code.')

            # Every worker has finished: send the writer its ``None`` entry
            # and wait for it to exit.
            writer_queue.put(None)
            writer.join()
        except Exception:
            # Stop every child process before propagating the error.
            for worker in workers:
                worker.terminate()
            writer.terminate()
            raise

        # Copy the shared proxy into an ordinary dictionary for the caller.
        return dict(shared_results.items())
Exemple #2
0
    def run(self, fastani_verification, genomes):
        """Run FastANI across ``self.cpus`` worker processes.

        Parameters
        ----------
        fastani_verification : dict
            Each ``(userleaf, potential_nodes)`` item becomes one job for
            the worker processes.
        genomes
            Passed unchanged to every worker process.

        Returns
        -------
        dict
            A plain-dict copy of the shared dictionary handed to the workers.

        Raises
        ------
        GTDBTkException
            If any worker process finishes with a non-zero exit code.
        """
        job_queue = mp.Queue()
        writer_queue = mp.Queue()

        # One job per item to verify, followed by one ``None`` per worker.
        for leaf, candidates in fastani_verification.items():
            job_queue.put((leaf, candidates))
        for _ in range(self.cpus):
            job_queue.put(None)

        # Results are collected in a manager-backed dict shared with workers.
        manager = mp.Manager()
        shared_results = manager.dict()

        workers = []
        for _ in range(self.cpus):
            workers.append(
                mp.Process(target=self._worker,
                           args=(job_queue, writer_queue, genomes,
                                 shared_results)))

        writer = mp.Process(target=self._writer,
                            args=(writer_queue, len(fastani_verification)))

        try:
            writer.start()
            for worker in workers:
                worker.start()

            # Wait for the workers in order, aborting at the first failure.
            for worker in workers:
                worker.join()
                if worker.exitcode != 0:
                    raise GTDBTkException('FastANI returned a non-zero exit code.')

            # Every worker has finished: send the writer its ``None`` entry
            # and wait for it to exit.
            writer_queue.put(None)
            writer.join()
        except Exception:
            # Stop every child process before propagating the error.
            for worker in workers:
                worker.terminate()
            writer.terminate()
            raise

        # Copy the shared proxy into an ordinary dictionary for the caller.
        return dict(shared_results.items())
Exemple #3
0
    def trim_msa(self, untrimmed_msa, mask_type, maskid, output_file):
        """Trim the multiple sequence alignment using a mask.

        Only the first line of the mask file is used. Alignment columns whose
        mask character is '1' are kept; every other column is dropped. Each
        trimmed sequence is written to ``output_file`` in FASTA format.

        Parameters
        ----------
        untrimmed_msa : str
            The path to the untrimmed MSA.
        mask_type : str
            Which mask should be used: 'reference' for a mask under
            ``Config.MASK_DIR``, or 'file' for a user specified mask.
        maskid : str
            'bac' or 'arc' when ``mask_type`` is 'reference'; the path to the
            mask file when ``mask_type`` is 'file'.
        output_file : str
            The path to the output trimmed MSA.

        Raises
        ------
        GTDBTkException
            If the combination of ``mask_type`` and ``maskid`` is not
            recognised.
        IndexError
            If a sequence is shorter than a mask position that must be kept.
        """
        if maskid == 'bac' and mask_type == 'reference':
            mask = os.path.join(Config.MASK_DIR, Config.MASK_BAC120)
        elif maskid == 'arc' and mask_type == 'reference':
            mask = os.path.join(Config.MASK_DIR, Config.MASK_AR122)
        elif mask_type == 'file':
            mask = maskid
        else:
            self.logger.error('Command not understood.')
            raise GTDBTkException('Command not understood.')

        with open(mask, 'r') as f:
            maskstr = f.readline()

        # The kept columns are identical for every sequence, so determine
        # them once instead of re-scanning the mask for each genome.
        keep_columns = [i for i, flag in enumerate(maskstr) if flag == '1']

        # Load the alignment before the output is opened for writing: a
        # failed read then cannot leave a truncated output file behind, and
        # trimming a file in place does not erase the input first.
        dict_genomes = read_fasta(untrimmed_msa, False)

        with open(output_file, 'w') as outfwriter:
            for seq_id, seq in dict_genomes.items():
                trimmed_seq = ''.join([seq[i] for i in keep_columns])
                outfwriter.write(">%s\n%s\n" % (seq_id, trimmed_seq))
Exemple #4
0
    def _run_multi_align(self, db_genome_id, path, marker_set_id):
        """
        Returns the concatenated marker sequence for a specific genome.

        Markers with a single copy hit are aligned through
        ``self._run_align``. Markers without one are represented by a run of
        '-' characters whose length is given by ``self._get_hmm_size``. The
        per-marker strings are concatenated in sorted marker id order.

        :param db_genome_id: Selected genome
        :param path: Path to the gene file of the genome; its grandparent
            directory is used as the marker directory from which the Pfam
            and TIGRFAM top hit files are read
        :param marker_set_id: Unique ID of marker set to use for alignment
            ('bac120' or 'ar122')
        :return: The concatenated aligned markers as a single string
        :raises GTDBTkException: If ``marker_set_id`` is not a known set
        """

        cur_marker_dir = os.path.dirname(os.path.dirname(path))
        pfam_tophit_file = TopHitPfamFile(cur_marker_dir, db_genome_id)
        tigr_tophit_file = TopHitTigrFile(cur_marker_dir, db_genome_id)
        pfam_tophit_file.read()
        tigr_tophit_file.read()

        # Only bac120 and ar122 are supported. Any other marker set is
        # rejected here, so the code below never sees a third set.
        if marker_set_id == 'bac120':
            copy_number_file = CopyNumberFileBAC120('/dev/null', None)
            markers_by_db = self.bac120_markers
        elif marker_set_id == 'ar122':
            copy_number_file = CopyNumberFileAR122('/dev/null', None)
            markers_by_db = self.ar122_markers
        else:
            raise GTDBTkException('Unknown marker set.')

        copy_number_file.add_genome(db_genome_id, path, pfam_tophit_file,
                                    tigr_tophit_file)
        single_copy_hits = copy_number_file.get_single_copy_hits(db_genome_id)

        # Directory holding the individual HMM files of each marker database.
        marker_paths = {
            "PFAM":
            os.path.join(self.pfam_hmm_dir, 'individual_hmms'),
            "TIGRFAM":
            os.path.join(os.path.dirname(self.tigrfam_hmm_dir),
                         'individual_hmms')
        }

        # Map each marker id (HMM file name without its extension) to the
        # path of its HMM file.
        marker_dict_original = {}
        for db_marker in sorted(markers_by_db):
            for marker in markers_by_db[db_marker]:
                marker_id = marker.replace(".HMM", "").replace(".hmm", "")
                marker_dict_original[marker_id] = os.path.join(
                    marker_paths[db_marker], marker)

        # Iterate over each of the expected markers and store the gene sequence.
        gene_dict = dict()
        result_align = dict()
        for marker_id, marker_path in marker_dict_original.items():
            hit = single_copy_hits.get(marker_id)
            if hit:
                gene_dict[marker_id] = {
                    "marker_path": marker_path,
                    "gene": hit['hit'].gene_id,
                    "gene_seq": hit['seq'],
                    "bitscore": hit['hit'].bit_score
                }
            else:
                # No single copy hit: fill the marker with gaps.
                hmm_len = self._get_hmm_size(marker_path)
                result_align[marker_id] = '-' * hmm_len

        # Align the markers.
        result_align.update(self._run_align(gene_dict, db_genome_id))

        # we concatenate the aligned markers together and associate them with
        # the genome.
        return ''.join([seq for _, seq in sorted(result_align.items())])