Example #1
0
def get_single_copy_hits_worker(job):
    """Collect the single copy marker sequences of one genome.

    The PFAM and TIGRFAM top hit files of the genome are read and handed to
    a freshly created copy number file object, which is then asked for the
    single copy hits of that genome.

    Parameters
    ----------
    job : Tuple[str, str, CopyNumberFile]
        The genome id, path to called genes, and domain-specific copy number
        file object (it is called here to create a new instance).

    Returns
    -------
    Dict[str, Dict[str, str]]
        dict[marker id][genome id] = sequence
    """
    genome_id, gene_path, copy_number_cls = job

    # The top hit files are looked up two directory levels above the
    # called genes file.
    marker_root = os.path.dirname(os.path.dirname(gene_path))
    tophits = (TopHitPfamFile(marker_root, genome_id),
               TopHitTigrFile(marker_root, genome_id))
    for tophit in tophits:
        tophit.read()

    # Build a throw-away copy number object; write() is never called on it
    # in this function.
    counter = copy_number_cls('/dev/null', None)
    counter.add_genome(genome_id, gene_path, *tophits)
    hits = counter.get_single_copy_hits(genome_id)

    # Re-key the hits as marker id -> genome id -> sequence.
    result = defaultdict(dict)
    result.update((marker_id, {genome_id: hit['seq']})
                  for marker_id, hit in hits.items())
    return result
Example #2
0
    def _report_identified_marker_genes(self, gene_dict, outdir, prefix):
        """Write the marker copy number and translation table summaries.

        Every genome in ``gene_dict`` is added to the AR122 and BAC120 copy
        number files and to the translation table summary. The three
        summary files are then written and symlinked into ``outdir``.

        Parameters
        ----------
        gene_dict : dict
            Maps a genome id to a record from which "aa_gene_path" and
            "best_translation_table" are read via ``.get``.
        outdir : str
            Output directory; also the directory the symlinks are created in.
        prefix : str
            Prefix passed to the summary files and substituted into the
            summary path templates.
        """
        translation_tables = TlnTableSummaryFile(outdir, prefix)
        archaeal_counts = CopyNumberFileAR122(outdir, prefix)
        bacterial_counts = CopyNumberFileBAC120(outdir, prefix)

        # All genomes share the same marker gene directory.
        marker_dir = os.path.join(outdir, DIR_MARKER_GENE)

        for genome_id, genome_info in sorted(gene_dict.items()):
            pfam_hits = TopHitPfamFile(marker_dir, genome_id)
            tigr_hits = TopHitTigrFile(marker_dir, genome_id)
            pfam_hits.read()
            tigr_hits.read()

            # Register the genome with both domain-specific marker sets.
            for counts in (archaeal_counts, bacterial_counts):
                counts.add_genome(genome_id,
                                  genome_info.get("aa_gene_path"),
                                  pfam_hits,
                                  tigr_hits)

            # Record the best translation table of this genome.
            translation_tables.add_genome(
                genome_id, genome_info.get("best_translation_table"))

        # Flush the summaries to disk.
        for summary in (archaeal_counts, bacterial_counts,
                        translation_tables):
            summary.write()

        # Link each summary file into the root of the output directory
        # under its own base name.
        for template in (PATH_BAC120_MARKER_SUMMARY,
                         PATH_AR122_MARKER_SUMMARY,
                         PATH_TLN_TABLE_SUMMARY):
            summary_path = template.format(prefix=prefix)
            symlink_f(summary_path,
                      os.path.join(outdir, os.path.basename(summary_path)))
Example #3
0
    def _report_identified_marker_genes(self, gene_dict, outdir, prefix,
                                        write_single_copy_genes):
        """Report statistics for identified marker genes.

        Adds every genome to the AR53 and BAC120 copy number files and to
        the translation table summary, writes those files, symlinks the
        ``PATH_FAILS`` file into ``outdir`` and, optionally, writes one
        unaligned FASTA file per marker containing the single-copy hits.

        Parameters
        ----------
        gene_dict : dict
            Maps a genome id to a record from which "aa_gene_path" and
            "best_translation_table" are read via ``.get``.
        outdir : str
            Output directory.
        prefix : str
            Prefix passed to the summary files and substituted into the
            path templates.
        write_single_copy_genes : bool
            If true, write ``<marker>.fa`` files below
            ``outdir/DIR_IDENTIFY_FASTA/{ar53,bac120}``.
        """

        # Summarise the copy number of each AR53 and BAC120 markers.
        tln_summary_file = TlnTableSummaryFile(outdir, prefix)
        ar53_copy_number_file = CopyNumberFileAR53(outdir, prefix)
        bac120_copy_number_file = CopyNumberFileBAC120(outdir, prefix)

        # The marker gene directory is the same for every genome.
        cur_marker_dir = os.path.join(outdir, DIR_MARKER_GENE)

        # Process each genome.
        for db_genome_id, info in tqdm_log(sorted(gene_dict.items()),
                                           unit='genome'):
            pfam_tophit_file = TopHitPfamFile(cur_marker_dir, db_genome_id)
            tigr_tophit_file = TopHitTigrFile(cur_marker_dir, db_genome_id)
            pfam_tophit_file.read()
            tigr_tophit_file.read()

            # Summarise each of the markers for this genome.
            aa_gene_path = info.get("aa_gene_path")
            ar53_copy_number_file.add_genome(db_genome_id,
                                             aa_gene_path,
                                             pfam_tophit_file,
                                             tigr_tophit_file)
            bac120_copy_number_file.add_genome(db_genome_id,
                                               aa_gene_path,
                                               pfam_tophit_file,
                                               tigr_tophit_file)

            # Record the best translation table for this genome.
            tln_summary_file.add_genome(db_genome_id,
                                        info.get("best_translation_table"))

        # Write each of the summary files to disk.
        ar53_copy_number_file.write()
        bac120_copy_number_file.write()
        tln_summary_file.write()

        # Only the PATH_FAILS file is symlinked into the root of the output
        # directory; the marker/translation summaries are not linked here.
        fails_path = PATH_FAILS.format(prefix=prefix)
        symlink_f(fails_path,
                  os.path.join(outdir, os.path.basename(fails_path)))

        if not write_single_copy_genes:
            return

        # Write the single copy AR53/BAC120 FASTA files to disk.
        fasta_dir = os.path.join(outdir, DIR_IDENTIFY_FASTA)
        self.logger.info(
            f'Writing unaligned single-copy genes to: {fasta_dir}')

        marker_doms = [
            (Config.AR53_MARKERS['PFAM'] + Config.AR53_MARKERS['TIGRFAM'],
             ar53_copy_number_file, 'ar53'),
            (Config.BAC120_MARKERS['PFAM'] + Config.BAC120_MARKERS['TIGRFAM'],
             bac120_copy_number_file, 'bac120'),
        ]
        genome_ids = sorted(gene_dict)

        # Iterate over each domain.
        for marker_names, marker_file, marker_d in marker_doms:

            # Create the domain-specific subdirectory.
            fasta_d_dir = os.path.join(fasta_dir, marker_d)
            make_sure_path_exists(fasta_d_dir)

            # The single-copy hits of a genome do not depend on the marker
            # being written, so fetch them once per genome instead of once
            # per (marker, genome) pair.
            # NOTE(review): assumes get_single_copy_hits() is a pure lookup
            # on the genomes added above - confirm against CopyNumberFile.
            hits_per_genome = [
                (genome_id, marker_file.get_single_copy_hits(genome_id))
                for genome_id in genome_ids
            ]

            # Iterate over each marker.
            for marker_name in marker_names:
                # Drop the HMM file extension. str.rstrip() must not be used
                # for this: its argument is a set of characters, so it would
                # also eat trailing 'H', 'M', 'h', 'm', '.', '[', ']' or '\'
                # characters that belong to the marker id itself.
                for suffix in ('.HMM', '.hmm'):
                    if marker_name.endswith(suffix):
                        marker_name = marker_name[:-len(suffix)]
                        break
                marker_path = os.path.join(fasta_d_dir, f'{marker_name}.fa')

                to_write = []
                for genome_id, unq_hits in hits_per_genome:
                    if marker_name in unq_hits:
                        to_write.append(f'>{genome_id}')
                        to_write.append(unq_hits[marker_name]['seq'])

                # Markers without any single-copy hit get no file at all.
                if to_write:
                    with open(marker_path, 'w') as fh:
                        fh.write('\n'.join(to_write))
Example #4
0
    def _run_multi_align(self, db_genome_id, path, marker_set_id):
        """
        Returns the concatenated marker sequence for a specific genome

        :param db_genome_id: Selected genome
        :param path: Path that is handed to the copy number file as the
            genome's gene file; the top hit files are looked up two
            directory levels above it (presumably the called-genes FASTA
            rather than the genomic FASTA - confirm against the caller)
        :param marker_set_id: Unique ID of marker set to use for alignment,
            either 'bac120' or 'ar122'
        :return: The per-marker alignment strings joined in marker id order;
            markers without a single-copy hit contribute a run of '-' whose
            length is the value of ``_get_hmm_size`` for that marker
        :raises GTDBTkException: If ``marker_set_id`` is not supported
        """

        cur_marker_dir = os.path.dirname(os.path.dirname(path))
        pfam_tophit_file = TopHitPfamFile(cur_marker_dir, db_genome_id)
        tigr_tophit_file = TopHitTigrFile(cur_marker_dir, db_genome_id)
        pfam_tophit_file.read()
        tigr_tophit_file.read()

        # Select the copy number implementation and the marker listing that
        # belong to the requested marker set. Anything else is rejected
        # here, so no further marker sets need handling below.
        if marker_set_id == 'bac120':
            copy_number_file = CopyNumberFileBAC120('/dev/null', None)
            markers_by_db = self.bac120_markers
        elif marker_set_id == 'ar122':
            copy_number_file = CopyNumberFileAR122('/dev/null', None)
            markers_by_db = self.ar122_markers
        else:
            raise GTDBTkException('Unknown marker set.')

        copy_number_file.add_genome(db_genome_id, path, pfam_tophit_file,
                                    tigr_tophit_file)
        single_copy_hits = copy_number_file.get_single_copy_hits(db_genome_id)

        # Directory holding the individual HMM files of each marker database.
        marker_paths = {
            "PFAM":
            os.path.join(self.pfam_hmm_dir, 'individual_hmms'),
            "TIGRFAM":
            os.path.join(os.path.dirname(self.tigrfam_hmm_dir),
                         'individual_hmms')
        }

        # Map each marker id (HMM file name without its extension) to the
        # path of its HMM file.
        marker_dict_original = {}
        for db_marker in sorted(markers_by_db):
            for marker in markers_by_db[db_marker]:
                marker_id = marker.replace(".HMM", "").replace(".hmm", "")
                marker_dict_original[marker_id] = os.path.join(
                    marker_paths[db_marker], marker)

        # Iterate over each of the expected markers and store the gene sequence.
        gene_dict = {}
        result_align = {}
        for marker_id, marker_path in marker_dict_original.items():
            hit = single_copy_hits.get(marker_id)
            if hit:
                gene_dict[marker_id] = {
                    "marker_path": marker_path,
                    "gene": hit['hit'].gene_id,
                    "gene_seq": hit['seq'],
                    "bitscore": hit['hit'].bit_score
                }
            else:
                # No single-copy hit: pad with gaps of the HMM's size.
                hmm_len = self._get_hmm_size(marker_path)
                result_align[marker_id] = '-' * hmm_len

        # Align the markers.
        result_align.update(self._run_align(gene_dict, db_genome_id))

        # we concatenate the aligned markers together and associate them with
        # the genome.
        return ''.join(aligned
                       for _, aligned in sorted(result_align.items()))