Esempio n. 1
0
def get_single_copy_hits_worker(job):
    """Collect the single copy marker sequences of one genome.

    The PFAM and TIGRFAM top hit files of the genome are read and handed,
    together with the called genes, to a fresh copy number file object which
    decides which hits are single copy.

    Parameters
    ----------
    job : Tuple[str, str, CopyNumberFile]
        The genome id, path to called genes, and domain-specific copy number file object.

    Returns
    -------
    Dict[str, Dict[str, str]]
        dict[marker id][genome id] = sequence
    """
    genome_id, gene_path, cnf_factory = job

    # The top hit files are looked up two directory levels above the genes.
    top_hit_root = os.path.dirname(os.path.dirname(gene_path))
    pfam_hits = TopHitPfamFile(top_hit_root, genome_id)
    tigr_hits = TopHitTigrFile(top_hit_root, genome_id)
    pfam_hits.read()
    tigr_hits.read()

    # The copy number object is only used in memory here (it is pointed at
    # /dev/null and this function never writes it).
    summary = cnf_factory('/dev/null', None)
    summary.add_genome(genome_id, gene_path, pfam_hits, tigr_hits)

    # Re-key the result as marker id -> genome id -> sequence.
    result = defaultdict(dict)
    for marker_id, details in summary.get_single_copy_hits(genome_id).items():
        result[marker_id][genome_id] = details['seq']
    return result
Esempio n. 2
0
    def _report_identified_marker_genes(self, gene_dict, outdir, prefix):
        """Write the marker copy number and translation table summaries.

        For every genome in ``gene_dict`` the PFAM and TIGRFAM top hit files
        are read from the marker gene directory under ``outdir`` and added to
        the AR122 and BAC120 copy number files. The best translation table of
        each genome is added to the translation table summary. All three
        summaries are written and then symlinked into ``outdir``.

        Parameters
        ----------
        gene_dict : dict
            Maps a genome id to a dict from which the ``"aa_gene_path"`` and
            ``"best_translation_table"`` entries are read.
        outdir : str
            Output directory handed to the summary files; also the directory
            in which the symlinks are created.
        prefix : str
            Prefix handed to the summary files and substituted into the
            summary path templates.
        """
        tln_summary = TlnTableSummaryFile(outdir, prefix)
        ar122_summary = CopyNumberFileAR122(outdir, prefix)
        bac120_summary = CopyNumberFileBAC120(outdir, prefix)

        # Every genome shares the same marker gene directory.
        marker_dir = os.path.join(outdir, DIR_MARKER_GENE)

        # Genomes are processed in sorted id order.
        for genome_id in sorted(gene_dict):
            info = gene_dict[genome_id]

            pfam_hits = TopHitPfamFile(marker_dir, genome_id)
            tigr_hits = TopHitTigrFile(marker_dir, genome_id)
            pfam_hits.read()
            tigr_hits.read()

            # Both marker sets are summarised from the same top hit files.
            ar122_summary.add_genome(genome_id, info.get("aa_gene_path"),
                                     pfam_hits, tigr_hits)
            bac120_summary.add_genome(genome_id, info.get("aa_gene_path"),
                                      pfam_hits, tigr_hits)

            tln_summary.add_genome(genome_id,
                                   info.get("best_translation_table"))

        # Write each of the summary files to disk.
        ar122_summary.write()
        bac120_summary.write()
        tln_summary.write()

        # Symlink each summary file into the root of the output directory.
        summary_templates = (PATH_BAC120_MARKER_SUMMARY,
                             PATH_AR122_MARKER_SUMMARY,
                             PATH_TLN_TABLE_SUMMARY)
        for template in summary_templates:
            link_src = template.format(prefix=prefix)
            link_dst = os.path.join(outdir, os.path.basename(link_src))
            symlink_f(link_src, link_dst)
Esempio n. 3
0
    def _workerThread(self, queueIn, queueOut, n_skipped):
        """Run the Pfam search for every gene file taken from ``queueIn``.

        Items are consumed until a ``None`` sentinel is received. A genome
        whose hit file and top hit file both pass ``file_has_checksum`` is
        skipped and counted in ``n_skipped``; otherwise the search is run,
        a checksum file is written next to the hit file and the top hits are
        extracted. Every handled gene file is put on ``queueOut``.

        Parameters
        ----------
        queueIn
            Queue yielding paths to gene files, terminated by ``None``.
        queueOut
            Queue receiving each gene file path once it has been handled.
        n_skipped
            Shared counter exposing ``get_lock()`` and ``value``
            (presumably a ``multiprocessing.Value`` - confirm with caller).
        """
        try:
            while True:
                gene_file = queueIn.get(block=True, timeout=None)
                if gene_file is None:
                    # Sentinel: no more work for this worker.
                    break

                # Derive the genome id and the hit file path from the name
                # of the gene file.
                gene_filename = os.path.basename(gene_file)
                genome_id = gene_filename.replace(self.protein_file_suffix, '')
                hit_filename = gene_filename.replace(self.protein_file_suffix,
                                                     self.pfam_suffix)
                output_hit_file = os.path.join(self.output_dir, genome_id,
                                               hit_filename)

                # Both outputs must carry a checksum for the genome to count
                # as already processed.
                expected_outputs = (output_hit_file,
                                    TopHitPfamFile.get_path(self.output_dir, genome_id))
                already_done = all([file_has_checksum(path)
                                    for path in expected_outputs])

                if not already_done:
                    scanner = PfamScan(cpu=self.cpus_per_genome,
                                       fasta=gene_file,
                                       dir=self.pfam_hmm_dir)
                    scanner.search()
                    scanner.write_results(output_hit_file, None, None, None, None)

                    # Store the checksum of the hit file alongside it.
                    checksum_path = output_hit_file + self.checksum_suffix
                    with open(checksum_path, 'w') as fh:
                        fh.write(sha256(output_hit_file))

                    # Identify the top hit for each gene.
                    self._topHit(output_hit_file)
                else:
                    self.warnings.info(f'Skipped Pfam processing for: {genome_id}')
                    with n_skipped.get_lock():
                        n_skipped.value += 1

                queueOut.put(gene_file)
        except Exception as error:
            raise error
Esempio n. 4
0
    def _topHit(self, pfam_file):
        """Extract the top PFAM hits from a hit file and write them out.

        A gene may be assigned to multiple PFAM families from the same
        clan. The search_pfam.pl script takes care of most of these issues,
        so the results are simply parsed here and handed to a
        ``TopHitPfamFile``.

        Parameters
        ----------
        pfam_file : str
            Name of file containing hits to PFAM HMMs.
        """

        # The genome id is the hit file name without the PFAM suffix.
        hit_filename = os.path.basename(pfam_file)
        genome_id = hit_filename.replace(self.pfam_suffix, '')
        top_hits = TopHitPfamFile(self.output_dir, genome_id)

        with open(pfam_file, 'r') as handle:
            for record in handle:
                # Skip comment lines and blank lines.
                if record.startswith('#') or not record.strip():
                    continue

                # Whitespace-separated columns: 0 = gene id, 5 = HMM id,
                # 11 = bit score, 12 = e-value.
                fields = record.split()
                gene_id, hmm_id = fields[0], fields[5]
                evalue = float(fields[12])
                bitscore = float(fields[11])
                top_hits.add_hit(gene_id, hmm_id, evalue, bitscore)

        top_hits.write()
Esempio n. 5
0
    def test__merge_hit_files(self):
        """Merging PFAM and TIGRFAM top hits groups the hits by marker id."""
        pfam_hits = (('gene_a', 'PFAM_1', 0.05, 100),
                     ('gene_b', 'PFAM_2', 0.05, 200),
                     ('gene_c', 'PFAM_2', 0.05, 100))
        tigr_hits = (('gene_x', 'TIGR_1', 0.05, 100),)

        pfam_file = TopHitPfamFile(os.path.join(self.dir_tmp), 'genome_1')
        for hit_args in pfam_hits:
            pfam_file.add_hit(*hit_args)

        tigr_file = TopHitTigrFile(os.path.join(self.dir_tmp), 'genome_1')
        for hit_args in tigr_hits:
            tigr_file.add_hit(*hit_args)

        # One list of hits per marker id.
        want = {
            'TIGR_1': [Hit('gene_x', 'TIGR_1', 0.05, 100)],
            'PFAM_1': [Hit('gene_a', 'PFAM_1', 0.05, 100)],
            'PFAM_2': [Hit('gene_b', 'PFAM_2', 0.05, 200),
                       Hit('gene_c', 'PFAM_2', 0.05, 100)],
        }
        got = CopyNumberFile._merge_hit_files(pfam_file, tigr_file)
        self.assertDictEqual(want, got)
Esempio n. 6
0
    def _report_identified_marker_genes(self, gene_dict, outdir, prefix,
                                        write_single_copy_genes):
        """Report statistics for identified marker genes.

        For every genome the PFAM and TIGRFAM top hit files are read from
        the marker gene directory under ``outdir`` and added to the AR53 and
        BAC120 copy number files; the best translation table is added to the
        translation table summary. The three summaries are written, the
        ``PATH_FAILS`` file is symlinked into ``outdir`` and, if requested,
        one unaligned FASTA file per marker is written for each domain.

        Parameters
        ----------
        gene_dict : dict
            Maps a genome id to a dict from which the ``"aa_gene_path"`` and
            ``"best_translation_table"`` entries are read.
        outdir : str
            Output directory handed to the summary files; the symlink and
            the FASTA directory are created below it.
        prefix : str
            Prefix handed to the summary files and substituted into
            ``PATH_FAILS``.
        write_single_copy_genes : bool
            If true, write the single copy AR53/BAC120 genes as FASTA files.
        """

        # Summarise the copy number of each AR53 and BAC120 markers.
        tln_summary_file = TlnTableSummaryFile(outdir, prefix)
        ar53_copy_number_file = CopyNumberFileAR53(outdir, prefix)
        bac120_copy_number_file = CopyNumberFileBAC120(outdir, prefix)

        # Process each genome.
        for db_genome_id, info in tqdm_log(sorted(gene_dict.items()),
                                           unit='genome'):
            cur_marker_dir = os.path.join(outdir, DIR_MARKER_GENE)
            pfam_tophit_file = TopHitPfamFile(cur_marker_dir, db_genome_id)
            tigr_tophit_file = TopHitTigrFile(cur_marker_dir, db_genome_id)
            pfam_tophit_file.read()
            tigr_tophit_file.read()

            # Summarise each of the markers for this genome.
            ar53_copy_number_file.add_genome(db_genome_id,
                                             info.get("aa_gene_path"),
                                             pfam_tophit_file,
                                             tigr_tophit_file)
            bac120_copy_number_file.add_genome(db_genome_id,
                                               info.get("aa_gene_path"),
                                               pfam_tophit_file,
                                               tigr_tophit_file)

            # Write the best translation table to disk for this genome.
            tln_summary_file.add_genome(db_genome_id,
                                        info.get("best_translation_table"))

        # Write each of the summary files to disk.
        ar53_copy_number_file.write()
        bac120_copy_number_file.write()
        tln_summary_file.write()

        # Create a symlink to store the summary files in the root.
        # symlink_f(PATH_BAC120_MARKER_SUMMARY.format(prefix=prefix),
        #           os.path.join(outdir, os.path.basename(PATH_BAC120_MARKER_SUMMARY.format(prefix=prefix))))
        # symlink_f(PATH_AR53_MARKER_SUMMARY.format(prefix=prefix),
        #           os.path.join(outdir, os.path.basename(PATH_AR53_MARKER_SUMMARY.format(prefix=prefix))))
        # symlink_f(PATH_TLN_TABLE_SUMMARY.format(prefix=prefix),
        #           os.path.join(outdir, os.path.basename(PATH_TLN_TABLE_SUMMARY.format(prefix=prefix))))
        symlink_f(
            PATH_FAILS.format(prefix=prefix),
            os.path.join(outdir,
                         os.path.basename(PATH_FAILS.format(prefix=prefix))))

        # Write the single copy AR53/BAC120 FASTA files to disk.
        if write_single_copy_genes:
            fasta_dir = os.path.join(outdir, DIR_IDENTIFY_FASTA)
            self.logger.info(
                f'Writing unaligned single-copy genes to: {fasta_dir}')

            # The genome order is the same for every marker; sort it once.
            genome_ids = sorted(gene_dict)

            # Iterate over each domain.
            marker_doms = [
                (Config.AR53_MARKERS['PFAM'] + Config.AR53_MARKERS['TIGRFAM'],
                 ar53_copy_number_file, 'ar53'),
                (Config.BAC120_MARKERS['PFAM'] +
                 Config.BAC120_MARKERS['TIGRFAM'],
                 bac120_copy_number_file, 'bac120'),
            ]
            for marker_names, marker_file, marker_d in marker_doms:

                # Create the domain-specific subdirectory.
                fasta_d_dir = os.path.join(fasta_dir, marker_d)
                make_sure_path_exists(fasta_d_dir)

                # Iterate over each marker.
                for marker_name in marker_names:
                    # Drop the HMM file extension to obtain the marker id.
                    # str.rstrip() is not suitable here: its argument is a
                    # set of characters, not a suffix or a regex, so it
                    # would also eat trailing 'H', 'M', 'h', 'm', '.', '['
                    # or ']' characters that belong to the id itself.
                    marker_id = marker_name.replace('.HMM', '') \
                                           .replace('.hmm', '')
                    marker_path = os.path.join(fasta_d_dir,
                                               f'{marker_id}.fa')

                    # Collect a FASTA record for every genome in which this
                    # marker is single copy.
                    to_write = list()
                    for genome_id in genome_ids:
                        unq_hits = marker_file.get_single_copy_hits(genome_id)
                        if marker_id in unq_hits:
                            to_write.append(f'>{genome_id}')
                            to_write.append(unq_hits[marker_id]['seq'])

                    # Markers without any single copy hit get no file.
                    if to_write:
                        with open(marker_path, 'w') as fh:
                            fh.write('\n'.join(to_write))
Esempio n. 7
0
    def _run_multi_align(self, db_genome_id, path, marker_set_id):
        """
        Returns the concatenated marker sequence for a specific genome

        Markers with a single copy hit are aligned via ``_run_align``;
        every other marker of the set is filled with ``'-'`` repeated to
        the size of its HMM. The per-marker results are joined in sorted
        marker id order.

        :param db_genome_id: Selected genome
        :param path: Path to the sequence file of the genome; the top hit
            files are looked up two directory levels above it.
            NOTE(review): previously documented as the genomic fasta file,
            but it is handed to ``add_genome`` - confirm it is the called
            genes file.
        :param marker_set_id: Unique ID of marker set to use for alignment
            (``'bac120'`` or ``'ar122'``)
        :return: The concatenated, aligned marker sequences as one string
        :raises GTDBTkException: If ``marker_set_id`` is not a known set
        """

        cur_marker_dir = os.path.dirname(os.path.dirname(path))
        pfam_tophit_file = TopHitPfamFile(cur_marker_dir, db_genome_id)
        tigr_tophit_file = TopHitTigrFile(cur_marker_dir, db_genome_id)
        pfam_tophit_file.read()
        tigr_tophit_file.read()

        # Select the copy number file and the marker definitions together.
        # Any other marker set id is rejected here, so no further branch on
        # marker_set_id is needed below.
        if marker_set_id == 'bac120':
            copy_number_file = CopyNumberFileBAC120('/dev/null', None)
            marker_set = self.bac120_markers
        elif marker_set_id == 'ar122':
            copy_number_file = CopyNumberFileAR122('/dev/null', None)
            marker_set = self.ar122_markers
        else:
            raise GTDBTkException('Unknown marker set.')

        copy_number_file.add_genome(db_genome_id, path, pfam_tophit_file,
                                    tigr_tophit_file)
        single_copy_hits = copy_number_file.get_single_copy_hits(db_genome_id)

        # Directory holding the individual HMM files of each database. The
        # TIGRFAM directory is resolved relative to the parent of
        # self.tigrfam_hmm_dir.
        marker_paths = {
            "PFAM":
            os.path.join(self.pfam_hmm_dir, 'individual_hmms'),
            "TIGRFAM":
            os.path.join(os.path.dirname(self.tigrfam_hmm_dir),
                         'individual_hmms')
        }

        # Map each marker id (HMM file name without extension) to the path
        # of its HMM file.
        marker_dict_original = {}
        for db_marker in sorted(marker_set):
            for marker in marker_set[db_marker]:
                marker_id = marker.replace(".HMM", "").replace(".hmm", "")
                marker_dict_original[marker_id] = os.path.join(
                    marker_paths[db_marker], marker)

        # Iterate over each of the expected markers and store the gene sequence.
        gene_dict = dict()
        result_align = dict()
        for marker_id, marker_path in marker_dict_original.items():
            hit = single_copy_hits.get(marker_id)
            if hit:
                gene_dict[marker_id] = {
                    "marker_path": marker_path,
                    "gene": hit['hit'].gene_id,
                    "gene_seq": hit['seq'],
                    "bitscore": hit['hit'].bit_score
                }
            else:
                # No single copy hit: pad with gaps for the HMM size.
                hmm_len = self._get_hmm_size(marker_path)
                result_align[marker_id] = '-' * hmm_len

        # Align the markers.
        result_align.update(self._run_align(gene_dict, db_genome_id))

        # we concatenate the aligned markers together and associate them with
        # the genome.
        return ''.join(seq for _, seq in sorted(result_align.items()))
Esempio n. 8
0
    def setUp(self):
        """Create two fixture genomes and add them to a copy number file.

        For each genome a PFAM top hit file (written to disk), a TIGRFAM
        top hit file (kept in memory only) and a protein FASTA file are
        created inside a fresh temporary directory.
        """
        self.dir_tmp = tempfile.mkdtemp(prefix='gtdbtk_tmp_')

        # Build a test set of markers
        self.test_markers = {
            "PFAM": ["PFAM_1.hmm", "PFAM_2.hmm", "PFAM_3.hmm"],
            "TIGRFAM": ["TIGR_1.HMM", "TIGR_2.HMM"]
        }

        # Build the copy number file
        self.cn_path = os.path.join(self.dir_tmp, 'cn.tsv')
        self.cn = CopyNumberFile(self.cn_path, 'test5', self.test_markers)

        # Store the hit files
        self.pfam_th = dict()
        self.tigr_th = dict()
        self.faa = dict()

        # Each fixture is:
        #   (genome id, PFAM hits, TIGRFAM hits, (gene id, sequence) pairs)
        fixtures = (
            # genome_1
            #   Single Copy: PFAM_1, TIGR_1
            #   Multi-Unique: PFAM_2
            #   Multi-Copy: None
            #   Missing: PFAM_3, TIGR_2
            ('genome_1',
             (('gene_a', 'PFAM_1', 0.05, 100),
              ('gene_b', 'PFAM_2', 0.05, 200),
              ('gene_c', 'PFAM_2', 0.05, 100)),
             (('gene_x', 'TIGR_1', 0.05, 100),),
             (('gene_a', 'VVVVVV'),
              ('gene_b', 'AAVVPP'),
              ('gene_c', 'AAVVPP'),
              ('gene_x', 'AAAAAA'))),
            # genome_2
            #   Single Copy: PFAM_2
            #   Multi-Unique: TIGR_1, TIGR_2
            #   Multi-Copy: PFAM_1, PFAM_3
            #   Missing: None
            ('genome_2',
             (('gene_a', 'PFAM_2', 0.05, 100),
              ('gene_w', 'PFAM_1', 0.05, 100),
              ('gene_z', 'PFAM_1', 0.05, 100),
              ('gene_y', 'PFAM_3', 0.05, 100),
              ('gene_x', 'PFAM_3', 0.05, 100)),
             (('gene_w', 'TIGR_1', 0.05, 100),
              ('gene_x', 'TIGR_1', 0.05, 200),
              ('gene_y', 'TIGR_2', 0.01, 100),
              ('gene_z', 'TIGR_2', 0.05, 100)),
             (('gene_a', 'VVVVVV'),
              ('gene_w', 'AAAVVV'),
              ('gene_x', 'AAAVVV'),
              ('gene_y', 'VVVAAA'),
              ('gene_z', 'VVVAAA'))),
        )

        for gid, pfam_hits, tigr_hits, genes in fixtures:
            # PFAM top hits are written to disk.
            pfam_file = TopHitPfamFile(os.path.join(self.dir_tmp), gid)
            self.pfam_th[gid] = pfam_file
            for hit_args in pfam_hits:
                pfam_file.add_hit(*hit_args)
            pfam_file.write()

            # TIGRFAM top hits are only populated in memory.
            tigr_file = TopHitTigrFile(os.path.join(self.dir_tmp), gid)
            self.tigr_th[gid] = tigr_file
            for hit_args in tigr_hits:
                tigr_file.add_hit(*hit_args)

            # Protein FASTA file holding one record per gene.
            faa_path = os.path.join(self.dir_tmp, f'{gid}.faa')
            self.faa[gid] = faa_path
            with open(faa_path, 'w') as fh:
                for gene_id, seq in genes:
                    fh.write(f'>{gene_id}\n')
                    fh.write(f'{seq}\n')

        # Add the genomes
        for gid, faa_path in self.faa.items():
            self.cn.add_genome(gid,
                               faa_path,
                               pfam_th=self.pfam_th[gid],
                               tigr_th=self.tigr_th[gid])
Esempio n. 9
0
 def setUp(self):
     """Create a temporary directory and a ``TopHitPfamFile`` for genome 'g'."""
     tmp_dir = tempfile.mkdtemp(prefix='gtdbtk_tmp_')
     self.dir_tmp = tmp_dir
     # The top hit file is constructed from the path of 'top_hit.tsv'
     # inside the temporary directory.
     self.path = os.path.join(tmp_dir, 'top_hit.tsv')
     self.th = TopHitPfamFile(self.path, 'g')