Example #1
0
    def identify(self, genomes, tln_tables, out_dir, prefix, force):
        """Call genes with Prodigal and annotate them with TIGRFAM and Pfam.

        The marker gene directory is stored on ``self.marker_gene_dir`` and
        the per-genome results returned by Prodigal are handed to
        ``_report_identified_marker_genes`` once both searches have run.

        Parameters
        ----------
        genomes : dict
            Genome IDs as the key, path to genome file as value.
        tln_tables : Dict[str, int]
            Genome ID -> translation table mapping for those user-specified.
        out_dir : str
            Path to the output directory.
        prefix : str
            Prefix to append to generated files.
        force : bool
            Overwrite any existing files.

        Raises
        ------
        GTDBTkException
            If an exception is encountered during the identify step.

        """
        check_dependencies(['prodigal', 'hmmsearch'])

        n_genomes = len(genomes)
        self.logger.info('Identifying markers in %d genomes with %d threads.' %
                         (n_genomes, self.cpus))

        # The same directory is passed to Prodigal and to both HMM searches.
        self.marker_gene_dir = os.path.join(out_dir, DIR_MARKER_GENE)

        gene_caller = Prodigal(self.cpus,
                               self.marker_gene_dir,
                               self.protein_file_suffix,
                               self.nt_gene_file_suffix,
                               self.gff_file_suffix,
                               force)
        prodigal_version = gene_caller.version
        self.logger.info(
            "Running Prodigal {} to identify genes.".format(prodigal_version))
        called_genes = gene_caller.run(genomes, tln_tables)

        # annotated genes against TIGRFAM and Pfam databases
        self.logger.info("Identifying TIGRFAM protein families.")

        # One amino acid gene file per genome; both searches take this list.
        protein_paths = []
        for genome_id in called_genes.keys():
            protein_paths.append(called_genes[genome_id]['aa_gene_path'])

        tigrfam_hmmer = TigrfamSearch(self.cpus,
                                      self.tigrfam_hmms,
                                      self.protein_file_suffix,
                                      self.tigrfam_suffix,
                                      self.tigrfam_top_hit_suffix,
                                      self.checksum_suffix,
                                      self.marker_gene_dir)
        tigrfam_hmmer.run(protein_paths)

        self.logger.info("Identifying Pfam protein families.")
        pfam_hmmer = PfamSearch(self.cpus,
                                self.pfam_hmm_dir,
                                self.protein_file_suffix,
                                self.pfam_suffix,
                                self.pfam_top_hit_suffix,
                                self.checksum_suffix,
                                self.marker_gene_dir)
        pfam_hmmer.run(protein_paths)

        # The HMMER version is read from the TIGRFAM search object.
        hmmer_version = tigrfam_hmmer.version
        self.logger.info(
            "Annotations done using HMMER {}.".format(hmmer_version))

        self._report_identified_marker_genes(called_genes, out_dir, prefix)
Example #2
0
    def identify(self, genomes, tln_tables, out_dir, prefix, force, genes,
                 write_single_copy_genes):
        """Identify marker genes in genomes.

        Genes are called with Prodigal unless ``genes`` is True, in which case
        the supplied files are used directly as the amino acid gene files.
        The gene files are then searched with ``TigrfamSearch`` and
        ``PfamSearch`` and the per-genome records are passed to
        ``_report_identified_marker_genes``.

        Sets ``self.marker_gene_dir`` and ``self.failed_genomes`` as a side
        effect.

        Parameters
        ----------
        genomes : dict
            Genome IDs as the key, path to genome file as value.
        tln_tables: Dict[str, int]
            Genome ID -> translation table mapping for those user-specified.
            Only passed on to Prodigal, i.e. unused when ``genes`` is True.
        out_dir : str
            Path to the output directory.
        prefix : str
            Prefix to append to generated files.
        force : bool
            Overwrite any existing files.
        genes : bool
            True if the supplied genomes are called genes, False otherwise.
        write_single_copy_genes : bool
            Write unique AR53/BAC120 marker files to disk.

        Raises
        ------
        GTDBTkException
            If an exception is encountered during the identify step.

        """
        # Prodigal is only invoked when genes still have to be called, so it
        # is only required in that case. The HMM searches run either way.
        # NOTE(review): assumes _report_identified_marker_genes does not need
        # Prodigal itself - confirm against its implementation.
        if genes:
            required_tools = ['hmmsearch']
        else:
            required_tools = ['prodigal', 'hmmsearch']
        check_dependencies(required_tools)

        self.logger.info(
            f'Identifying markers in {len(genomes):,} genomes with '
            f'{self.cpus} threads.')

        self.marker_gene_dir = os.path.join(out_dir, DIR_MARKER_GENE)
        self.failed_genomes = os.path.join(out_dir,
                                           PATH_FAILS.format(prefix=prefix))

        if not genes:
            prodigal = Prodigal(self.cpus, self.failed_genomes,
                                self.marker_gene_dir, self.protein_file_suffix,
                                self.nt_gene_file_suffix, self.gff_file_suffix,
                                force)
            self.logger.log(
                Config.LOG_TASK,
                f'Running Prodigal {prodigal.version} to identify genes.')
            genome_dictionary = prodigal.run(genomes, tln_tables)

        else:
            self.logger.info(
                'Using supplied genomes as called genes, skipping Prodigal.')
            # Build the same per-genome record layout with the supplied file
            # as the amino acid gene file and no other Prodigal outputs.
            genome_dictionary = {
                gid: {
                    'aa_gene_path': gpath,
                    'translation_table_path': None,
                    'nt_gene_path': None,
                    'best_translation_table': 'user_supplied',
                    'gff_path': None
                }
                for gid, gpath in genomes.items()
            }

        # annotated genes against TIGRFAM and Pfam databases
        self.logger.log(Config.LOG_TASK,
                        'Identifying TIGRFAM protein families.')
        gene_files = [
            genome_info['aa_gene_path']
            for genome_info in genome_dictionary.values()
        ]
        tigr_search = TigrfamSearch(self.cpus, self.tigrfam_hmms,
                                    self.protein_file_suffix,
                                    self.tigrfam_suffix,
                                    self.tigrfam_top_hit_suffix,
                                    self.checksum_suffix, self.marker_gene_dir)
        tigr_search.run(gene_files)

        self.logger.log(Config.LOG_TASK, 'Identifying Pfam protein families.')
        pfam_search = PfamSearch(self.cpus, self.pfam_hmm_dir,
                                 self.protein_file_suffix, self.pfam_suffix,
                                 self.pfam_top_hit_suffix,
                                 self.checksum_suffix, self.marker_gene_dir)
        pfam_search.run(gene_files)
        # The HMMER version is read from the TIGRFAM search object.
        self.logger.info(
            f'Annotations done using HMMER {tigr_search.version}.')

        self.logger.log(Config.LOG_TASK,
                        'Summarising identified marker genes.')
        self._report_identified_marker_genes(genome_dictionary, out_dir,
                                             prefix, write_single_copy_genes)
Example #3
0
    def identify(self, genomes, out_dir, prefix, force, genes):
        """Identify marker genes in genomes.

        Genes are called with Prodigal unless ``genes`` is True, in which case
        the supplied files are used directly as the amino acid gene files.
        The (genome ID, gene file) pairs are then searched with
        ``TigrfamSearch`` and ``PfamSearch`` and the per-genome records are
        passed to ``_report_identified_marker_genes``.

        Parameters
        ----------
        genomes : dict
            Genome IDs as the key, path to genome file as value.
        out_dir : str
            Path to the output directory.
        prefix : str
            Prefix to append to generated files.
        force : bool
            Overwrite any existing files.
        genes : bool
            True if the supplied genomes are called genes, False otherwise.

        Raises
        ------
        GTDBTkException
            If an exception is encountered during the identify step.

        """
        # Prodigal is only invoked when genes still have to be called, so it
        # is only required in that case. The HMM searches run either way.
        # NOTE(review): assumes _report_identified_marker_genes does not need
        # Prodigal itself - confirm against its implementation.
        if genes:
            required_tools = ['hmmsearch']
        else:
            required_tools = ['prodigal', 'hmmsearch']
        check_dependencies(required_tools)

        self.logger.info('Identifying markers in %d genomes with %d threads.' %
                         (len(genomes), self.cpus))
        marker_gene_dir = os.path.join(out_dir, DIR_MARKER_GENE)

        if not genes:
            self.logger.info("Running Prodigal to identify genes.")
            # NOTE(review): the second positional argument is passed as a
            # literal False; its meaning is defined by Prodigal's constructor,
            # which is not visible here.
            prodigal = Prodigal(self.cpus, False, marker_gene_dir,
                                self.protein_file_suffix,
                                self.nt_gene_file_suffix, self.gff_file_suffix,
                                force)
            genome_dictionary = prodigal.run(genomes)

        else:
            self.logger.info(
                'Using supplied genomes as called genes, skipping Prodigal.')
            # Build the same per-genome record layout with the supplied file
            # as the amino acid gene file and no other Prodigal outputs.
            genome_dictionary = {
                gid: {
                    'aa_gene_path': gpath,
                    'translation_table_path': None,
                    'nt_gene_path': None,
                    'best_translation_table': 'user_supplied',
                    'gff_path': None
                }
                for gid, gpath in genomes.items()
            }

        # Both searches receive (genome ID, amino acid gene file) pairs.
        gene_files = [(db_genome_id, genome_info['aa_gene_path'])
                      for db_genome_id, genome_info
                      in genome_dictionary.items()]

        # annotated genes against TIGRFAM and Pfam databases
        self.logger.info("Identifying TIGRFAM protein families.")
        tigr_search = TigrfamSearch(self.cpus, self.tigrfam_hmms,
                                    self.protein_file_suffix,
                                    self.tigrfam_suffix,
                                    self.tigrfam_top_hit_suffix,
                                    self.checksum_suffix, marker_gene_dir)
        tigr_search.run(gene_files)

        self.logger.info("Identifying Pfam protein families.")
        pfam_search = PfamSearch(self.cpus, self.pfam_hmm_dir,
                                 self.protein_file_suffix, self.pfam_suffix,
                                 self.pfam_top_hit_suffix,
                                 self.checksum_suffix, marker_gene_dir)
        pfam_search.run(gene_files)

        self._report_identified_marker_genes(genome_dictionary, out_dir,
                                             prefix)