def identify(self, genomes, tln_tables, out_dir, prefix, force):
    """Call genes and annotate them to locate marker genes.

    Genes are called with Prodigal, the resulting protein files are
    searched against the TIGRFAM and Pfam HMMs, and the identified
    marker genes are then reported.

    Parameters
    ----------
    genomes : dict
        Maps each genome ID to the path of its genome file.
    tln_tables : Dict[str, int]
        Maps genome IDs to the translation table specified by the user.
    out_dir : str
        Path of the directory that receives the output.
    prefix : str
        Prefix for the generated files.
    force : bool
        Overwrite files that already exist.

    Raises
    ------
    GTDBTkException
        If an exception is encountered during the identify step.
    """
    check_dependencies(['prodigal', 'hmmsearch'])

    start_msg = 'Identifying markers in %d genomes with %d threads.' % (
        len(genomes), self.cpus)
    self.logger.info(start_msg)

    # Gene calling: all per-genome output lives below the marker gene
    # directory.
    self.marker_gene_dir = os.path.join(out_dir, DIR_MARKER_GENE)
    gene_caller = Prodigal(
        self.cpus,
        self.marker_gene_dir,
        self.protein_file_suffix,
        self.nt_gene_file_suffix,
        self.gff_file_suffix,
        force,
    )
    prodigal_msg = "Running Prodigal {} to identify genes.".format(
        gene_caller.version)
    self.logger.info(prodigal_msg)
    called_genes = gene_caller.run(genomes, tln_tables)

    # Annotate the called genes against the TIGRFAM and Pfam databases.
    self.logger.info("Identifying TIGRFAM protein families.")
    protein_files = [
        gene_info['aa_gene_path'] for gene_info in called_genes.values()
    ]
    tigrfam = TigrfamSearch(
        self.cpus,
        self.tigrfam_hmms,
        self.protein_file_suffix,
        self.tigrfam_suffix,
        self.tigrfam_top_hit_suffix,
        self.checksum_suffix,
        self.marker_gene_dir,
    )
    tigrfam.run(protein_files)

    self.logger.info("Identifying Pfam protein families.")
    pfam = PfamSearch(
        self.cpus,
        self.pfam_hmm_dir,
        self.protein_file_suffix,
        self.pfam_suffix,
        self.pfam_top_hit_suffix,
        self.checksum_suffix,
        self.marker_gene_dir,
    )
    pfam.run(protein_files)

    # The HMMER version is read from the TIGRFAM search object.
    hmmer_msg = "Annotations done using HMMER {}.".format(tigrfam.version)
    self.logger.info(hmmer_msg)

    self._report_identified_marker_genes(called_genes, out_dir, prefix)
def identify(self, genomes, tln_tables, out_dir, prefix, force, genes,
             write_single_copy_genes):
    """Call genes (unless already supplied) and locate marker genes.

    When ``genes`` is false the genomes are run through Prodigal;
    otherwise the supplied files are used directly as protein files.
    The protein files are then searched against the TIGRFAM and Pfam
    HMMs and the identified marker genes are reported.

    Parameters
    ----------
    genomes : dict
        Maps each genome ID to the path of its genome file.
    tln_tables : Dict[str, int]
        Maps genome IDs to the translation table specified by the user.
    out_dir : str
        Path of the directory that receives the output.
    prefix : str
        Prefix for the generated files.
    force : bool
        Overwrite files that already exist.
    genes : bool
        True if the supplied genomes are called genes, False otherwise.
    write_single_copy_genes : bool
        Write unique AR53/BAC120 marker files to disk.

    Raises
    ------
    GTDBTkException
        If an exception is encountered during the identify step.
    """
    check_dependencies(['prodigal', 'hmmsearch'])

    self.logger.info(
        f'Identifying markers in {len(genomes):,} genomes with '
        f'{self.cpus} threads.')

    self.marker_gene_dir = os.path.join(out_dir, DIR_MARKER_GENE)
    failed_name = PATH_FAILS.format(prefix=prefix)
    self.failed_genomes = os.path.join(out_dir, failed_name)

    if genes:
        # Input files are already called genes: build the same record
        # layout that the Prodigal branch yields, with only the protein
        # path populated.
        self.logger.info(
            'Using supplied genomes as called genes, skipping Prodigal.')
        called_genes = {
            gid: {
                'aa_gene_path': gpath,
                'translation_table_path': None,
                'nt_gene_path': None,
                'best_translation_table': 'user_supplied',
                'gff_path': None,
            }
            for gid, gpath in genomes.items()
        }
    else:
        gene_caller = Prodigal(
            self.cpus,
            self.failed_genomes,
            self.marker_gene_dir,
            self.protein_file_suffix,
            self.nt_gene_file_suffix,
            self.gff_file_suffix,
            force,
        )
        self.logger.log(
            Config.LOG_TASK,
            f'Running Prodigal {gene_caller.version} to identify genes.')
        called_genes = gene_caller.run(genomes, tln_tables)

    # Annotate the called genes against the TIGRFAM and Pfam databases.
    self.logger.log(Config.LOG_TASK,
                    'Identifying TIGRFAM protein families.')
    protein_files = [
        gene_info['aa_gene_path'] for gene_info in called_genes.values()
    ]
    tigrfam = TigrfamSearch(
        self.cpus,
        self.tigrfam_hmms,
        self.protein_file_suffix,
        self.tigrfam_suffix,
        self.tigrfam_top_hit_suffix,
        self.checksum_suffix,
        self.marker_gene_dir,
    )
    tigrfam.run(protein_files)

    self.logger.log(Config.LOG_TASK, 'Identifying Pfam protein families.')
    pfam = PfamSearch(
        self.cpus,
        self.pfam_hmm_dir,
        self.protein_file_suffix,
        self.pfam_suffix,
        self.pfam_top_hit_suffix,
        self.checksum_suffix,
        self.marker_gene_dir,
    )
    pfam.run(protein_files)

    # The HMMER version is read from the TIGRFAM search object.
    self.logger.info(
        f'Annotations done using HMMER {tigrfam.version}.')

    self.logger.log(Config.LOG_TASK,
                    'Summarising identified marker genes.')
    self._report_identified_marker_genes(
        called_genes, out_dir, prefix, write_single_copy_genes)
def identify(self, genomes, out_dir, prefix, force, genes):
    """Call genes (unless already supplied) and locate marker genes.

    When ``genes`` is false the genomes are run through Prodigal;
    otherwise the supplied files are used directly as protein files.
    Each (genome ID, protein file) pair is then searched against the
    TIGRFAM and Pfam HMMs and the identified marker genes are reported.

    Parameters
    ----------
    genomes : dict
        Maps each genome ID to the path of its genome file.
    out_dir : str
        Path of the directory that receives the output.
    prefix : str
        Prefix for the generated files.
    force : bool
        Overwrite files that already exist.
    genes : bool
        True if the supplied genomes are called genes, False otherwise.

    Raises
    ------
    GTDBTkException
        If an exception is encountered during the identify step.
    """
    check_dependencies(['prodigal', 'hmmsearch'])

    start_msg = 'Identifying markers in %d genomes with %d threads.' % (
        len(genomes), self.cpus)
    self.logger.info(start_msg)

    marker_gene_dir = os.path.join(out_dir, DIR_MARKER_GENE)

    if genes:
        # Input files are already called genes: build the same record
        # layout that the Prodigal branch yields, with only the protein
        # path populated.
        self.logger.info(
            'Using supplied genomes as called genes, skipping Prodigal.')
        called_genes = {
            gid: {
                'aa_gene_path': gpath,
                'translation_table_path': None,
                'nt_gene_path': None,
                'best_translation_table': 'user_supplied',
                'gff_path': None,
            }
            for gid, gpath in genomes.items()
        }
    else:
        self.logger.info("Running Prodigal to identify genes.")
        gene_caller = Prodigal(
            self.cpus,
            False,
            marker_gene_dir,
            self.protein_file_suffix,
            self.nt_gene_file_suffix,
            self.gff_file_suffix,
            force,
        )
        called_genes = gene_caller.run(genomes)

    # Pair every genome ID with its protein file for the HMM searches.
    protein_files = [
        (gid, gene_info['aa_gene_path'])
        for gid, gene_info in called_genes.items()
    ]

    # Annotate the called genes against the TIGRFAM and Pfam databases.
    self.logger.info("Identifying TIGRFAM protein families.")
    tigrfam = TigrfamSearch(
        self.cpus,
        self.tigrfam_hmms,
        self.protein_file_suffix,
        self.tigrfam_suffix,
        self.tigrfam_top_hit_suffix,
        self.checksum_suffix,
        marker_gene_dir,
    )
    tigrfam.run(protein_files)

    self.logger.info("Identifying Pfam protein families.")
    pfam = PfamSearch(
        self.cpus,
        self.pfam_hmm_dir,
        self.protein_file_suffix,
        self.pfam_suffix,
        self.pfam_top_hit_suffix,
        self.checksum_suffix,
        marker_gene_dir,
    )
    pfam.run(protein_files)

    self._report_identified_marker_genes(called_genes, out_dir, prefix)