Ejemplo n.º 1
0
    def get_functions_and_sequences_dicts_from_contigs_db(self, genome_name):
        """Collect function calls and AA/DNA sequences for one genome.

        The genome entry is looked up in `self.genomes` by `genome_name`; its
        'contigs_db_path' and 'gene_caller_ids' items drive the queries.

        Returns a `(function_calls_dict, aa_sequences_dict, dna_sequences_dict)`
        tuple. The function calls dict is empty when
        `self.functions_are_available` is falsy.
        """
        genome = self.genomes[genome_name]
        db_path = genome['contigs_db_path']

        namespace = argparse.Namespace(contigs_db=db_path)
        contigs_super = dbops.ContigsSuperclass(
            namespace, r=anvio.terminal.Run(verbose=False))

        # functions are only initialized when the object says they exist
        function_calls_dict = {}
        if self.functions_are_available:
            sources = list(self.function_annotation_sources)
            contigs_super.init_functions(requested_sources=sources)
            function_calls_dict = contigs_super.gene_function_calls_dict

        # DNA sequences: the call returns a pair, of which only the second
        # element (the sequences dict) is used here
        ids_of_interest = list(genome['gene_caller_ids'])
        _, dna_sequences_dict = contigs_super.get_sequences_for_gene_callers_ids(
            gene_caller_ids_list=ids_of_interest)

        # amino acid sequences are read as a whole table straight from the db.
        # FIXME: this should be done in the contigs super.
        database = dbops.ContigsDatabase(db_path)
        aa_sequences_dict = database.db.get_table_as_dict(
            t.gene_amino_acid_sequences_table_name)
        database.disconnect()

        return (function_calls_dict, aa_sequences_dict, dna_sequences_dict)
Ejemplo n.º 2
0
    def process(self, skip_init=False):
        """Export one locus per gene caller id of interest.

        `skip_init` lets a caller that has already populated
        `self.gene_caller_ids_of_interest` bypass `self.init()`.

        When there is nothing to export, a warning is printed and the method
        returns right away, without raising and without initializing the
        contigs database. Otherwise `self.export_locus` is called once per
        gene caller id, with an output prefix of the form
        `<output_dir>/<output_file_prefix>_NNNN` (1-based, zero padded).
        """
        if not skip_init:
            self.init()

        if not len(self.gene_caller_ids_of_interest):
            self.run.warning(
                "There aren't any gene calls that match to the criteria you provided to anvi'o\
                              export locus magic. Is this yet another case of you did everything right\
                              yet anvi'o failed you? If that's the case, let us know :( This class will quietly\
                              kill this process without reporting any error since a lack of hit may be the\
                              expected outcome of some weird processes somewhere."
            )
            # Honor what the message promises: stop quietly instead of going on
            # to initialize the contigs database for an empty export loop.
            return

        self.contigs_db = dbops.ContigsSuperclass(self.args, r=self.run_object)
        self.contigs_db.init_functions()

        num_loci = len(self.gene_caller_ids_of_interest)
        for counter, gene_callers_id in enumerate(self.gene_caller_ids_of_interest, start=1):
            self.run.warning(None,
                             header="Exporting locus %d of %d" % (counter, num_loci),
                             nl_after=0)

            output_path_prefix = os.path.join(
                self.output_dir,
                "%s_%.4d" % (self.output_file_prefix, counter))

            self.export_locus(gene_callers_id, output_path_prefix)
Ejemplo n.º 3
0
    def get_functions_and_sequences_dicts_from_contigs_db(
            self, contigs_db_path, gene_caller_ids=None):
        """Returns function calls, dna and amino acid sequences for `gene_caller_ids`
           from a contigs database.

           `gene_caller_ids` may be any iterable of gene caller ids. When it is left
           as `None`, no id list is forwarded and the sequence lookup falls back to
           its own default (presumably every gene call -- verify against
           `ContigsSuperclass.get_sequences_for_gene_callers_ids`).

           The return value is a `(function_calls_dict, aa_sequences_dict,
           dna_sequences_dict)` tuple; the first item is an empty dict when
           `self.functions_are_available` is falsy."""

        args = argparse.Namespace(contigs_db=contigs_db_path)
        contigs_super = dbops.ContigsSuperclass(
            args, r=anvio.terminal.Run(verbose=False))

        # get functions
        if self.functions_are_available:
            contigs_super.init_functions(
                requested_sources=self.function_annotation_sources)
            function_calls_dict = contigs_super.gene_function_calls_dict
        else:
            function_calls_dict = {}

        # get dna sequences. `list(None)` would raise a TypeError, so the id list
        # is only forwarded when the caller actually provided one.
        sequence_kwargs = {}
        if gene_caller_ids is not None:
            sequence_kwargs['gene_caller_ids_list'] = list(gene_caller_ids)

        _, dna_sequences_dict = contigs_super.get_sequences_for_gene_callers_ids(
            **sequence_kwargs)

        # get amino acid sequences.
        # FIXME: this should be done in the contigs super.
        contigs_db = dbops.ContigsDatabase(contigs_db_path)
        aa_sequences_dict = contigs_db.db.get_table_as_dict(
            t.gene_protein_sequences_table_name)
        contigs_db.disconnect()

        return (function_calls_dict, aa_sequences_dict, dna_sequences_dict)
Ejemplo n.º 4
0
    def init(self):
        """Identify which gene calls to focus on.

        Gene caller ids come from one of three places, checked in this order:

          * `self.gene_caller_ids`: ids provided by the user, split with `self.delimiter`
          * `self.use_hmm`: HMM hits whose 'gene_name' equals `self.search_term`
          * otherwise: a search of functional annotations for `self.search_term`

        The result is stored as a set in `self.gene_caller_ids_of_interest`.
        `self.sources` is set in every mode; `self.targets` is only extended in
        the two search modes.
        """

        self.sanity_check()

        self.run.warning(None, header="Initialization bleep bloops", lc="cyan")

        if self.gene_caller_ids:
            self.run.info('Mode', 'User-provided gene caller id(s)')

            gene_caller_ids_of_interest = list(
                utils.get_gene_caller_ids_from_args(self.gene_caller_ids,
                                                    self.delimiter))
            self.sources = ['gene_caller_ids']
        elif self.use_hmm:
            self.run.info('Mode', 'HMM search')

            s = hmmops.SequencesForHMMHits(self.input_contigs_db_path,
                                           sources=self.hmm_sources)

            self.run.info('Search term', self.search_term, mc='green')
            self.run.info('HMM sources being used', ', '.join(s.sources))

            hmm_hits = utils.get_filtered_dict(s.hmm_hits, 'gene_name',
                                               {self.search_term})
            gene_caller_ids_of_interest = [
                entry['gene_callers_id'] for entry in hmm_hits.values()
            ]

            self.targets.append('HMMs')
            self.sources = s.sources
        else:
            self.run.info('Mode', 'Function search')

            contigs_db = dbops.ContigsSuperclass(self.args, r=self.run_object)
            # use functional annotation
            contigs_db.init_functions()
            self.run.info('Search term', self.search_term, mc='green')
            self.run.info('Function calls being used',
                          ', '.join(contigs_db.gene_function_call_sources))

            # only the second item of the returned pair is needed here
            _, search_report = contigs_db.search_for_gene_functions(
                [self.search_term], verbose=True)
            # gene id's of genes with the searched function
            gene_caller_ids_of_interest = [i[0] for i in search_report]

            self.targets.append('functions')
            self.sources = contigs_db.gene_function_call_sources

        # Multiple sources could annotate the same gene, so make sure the list is unique
        self.gene_caller_ids_of_interest = set(gene_caller_ids_of_interest)

        if len(self.gene_caller_ids_of_interest):
            # report through the instance's own run object, like every other
            # message in this method, rather than through a module-level one
            self.run.info('Matching genes',
                          '%d genes matched your search' %
                          len(self.gene_caller_ids_of_interest),
                          mc='green',
                          nl_after=1)
Ejemplo n.º 5
0
    def __init__(self, args, run=terminal.Run(), progress=terminal.Progress()):
        """Set up everything needed to attribute InteracDome binding frequencies.

        Reads its configuration from `args`, loads the InteracDome table, initializes
        the Pfam base class (forcing `hmmsearch` and pointing `pfam_data_dir` to the
        InteracDome data directory -- note that both are written onto the `args`
        object the caller passed in), opens the contigs database, and prepares empty
        containers for the results.

        Parameters read from `args` (all optional):
            interacdome_data_dir, information_content_cutoff, min_binding_frequency,
            min_hit_fraction, interacdome_dataset, output_file_prefix, just_do_it
        """

        self.run = run
        self.progress = progress

        self.run.warning("Anvi'o will use 'InteracDome' by Kobren and Singh (DOI: 10.1093/nar/gky1224) to attribute binding frequencies. "
                         "If you publish your findings, please do not forget to properly credit their work.", lc='green', header="CITATION")

        A = lambda x, t: t(args.__dict__[x]) if x in args.__dict__ else None
        null = lambda x: x

        def value_or(value, default):
            # Only a missing (None) value falls back to the default. A plain
            # `value or default` would also discard a user-provided 0.
            return default if value is None else value

        self.interacdome_data_dir = A('interacdome_data_dir', null) or constants.default_interacdome_data_path
        self.information_content_cutoff = value_or(A('information_content_cutoff', null), 4)
        self.min_binding_frequency = value_or(A('min_binding_frequency', null), 0)
        self.min_hit_fraction = value_or(A('min_hit_fraction', null), 0.8)
        self.interacdome_dataset = A('interacdome_dataset', null) or 'representable'
        self.output_prefix = A('output_file_prefix', null)
        self.just_do_it = A('just_do_it', null)

        self.run.warning("", header='INITIALIZATION', lc='green')
        self.run.info("Interacdome dataset used", self.interacdome_dataset)
        self.run.info("Minimum hit fraction", self.min_hit_fraction)

        self.hmm_filepath = os.path.join(self.interacdome_data_dir, 'Pfam-A.hmm')

        # Init the InteracDome table
        self.interacdome_table = InteracDomeTableData(kind=self.interacdome_dataset, interacdome_data_dir=self.interacdome_data_dir)
        self.interacdome_table.load()

        # Init the Pfam baseclass
        args.hmmer_program = 'hmmsearch' # Force use of hmmsearch
        args.pfam_data_dir = self.interacdome_data_dir
        Pfam.__init__(self, args, run=self.run, progress=self.progress)

        # Init contigs database. A separate name is used so the caller's `args`
        # is not shadowed. NOTE(review): `self.contigs_db_path` is not set in this
        # method; presumably Pfam.__init__ provides it -- confirm.
        contigs_db_args = argparse.Namespace(contigs_db=self.contigs_db_path)
        self.contigs_db = dbops.ContigsSuperclass(contigs_db_args)

        self.potentially_remove_previous_interacdome_data()

        # Init the HMM profile
        self.hmms = pfam.HMMProfile(self.hmm_filepath)

        # This dictionary is populated and cast as a dataframe. It contains all of the per-residue
        # binding frequency information for each hit
        self.bind_freq = {}

        # This dictionary (eventual dataframe) is just like self.bind_freq, except has averaged
        # binding frequencies for residue-ligand combos that have multiple contributing hits. It
        # also drops all contributing match state information
        self.avg_bind_freq = {}

        # This is a modified version of self.avg_bind_freq that is compatible with the
        # amino_acid_additional_data table structure, i.e.
        # tables.amino_acid_additional_data_table_structure
        self.amino_acid_additional_data = {}
Ejemplo n.º 6
0
    def process(self):
        """Annotate the genes of a contigs database with Pfam hits.

        Exports the amino acid sequences of gene calls to a temporary FASTA file,
        runs hmmscan against `Pfam-A.hmm.gz` found in `self.pfam_data_dir` with
        `--cut_ga`, turns every hit into a gene function call with source 'Pfam',
        stores the calls in the contigs database at `self.contigs_db_path`, and
        finally removes the temporary directory created here.
        """
        # function-scope import keeps this method self-contained
        import shutil

        hmm_file = os.path.join(self.pfam_data_dir, 'Pfam-A.hmm.gz')

        # initialize contigs database
        class Args:
            pass

        args = Args()
        args.contigs_db = self.contigs_db_path
        contigs_db = dbops.ContigsSuperclass(args)
        tmp_directory_path = filesnpaths.get_temp_directory_path()

        # export AA sequences for genes
        target_files_dict = {
            'AA:GENE': os.path.join(tmp_directory_path, 'AA_gene_sequences.fa')
        }
        contigs_db.gen_FASTA_file_of_sequences_for_gene_caller_ids(
            output_file_path=target_files_dict['AA:GENE'],
            simple_headers=True,
            rna_alphabet=False,
            report_aa_sequences=True)

        # run hmmscan
        hmmer = HMMer(target_files_dict, num_threads_to_use=self.num_threads)
        hmm_hits_file = hmmer.run_hmmscan('Pfam', 'AA', 'GENE', None, None,
                                          len(self.function_catalog), hmm_file,
                                          None, '--cut_ga')

        # parse hmmscan output
        parser = parser_modules['search']['hmmscan'](hmm_hits_file,
                                                     alphabet='AA',
                                                     context='GENE')
        search_results_dict = parser.get_search_results()

        # add functions to database; entries are keyed by a running 0-based index
        functions_dict = {
            index: {
                'gene_callers_id': hmm_hit['gene_callers_id'],
                'source': 'Pfam',
                'accession': hmm_hit['gene_hmm_id'],
                'function': self.get_function_from_catalog(hmm_hit['gene_hmm_id']),
                'e_value': hmm_hit['e_value'],
            }
            for index, hmm_hit in enumerate(search_results_dict.values())
        }

        gene_function_calls_table = TableForGeneFunctions(
            self.contigs_db_path, self.run, self.progress)
        gene_function_calls_table.create(functions_dict)

        # The temporary directory created above is no longer needed once the
        # results are stored, so do not leave it behind.
        # NOTE(review): any temp directories the HMMer object creates for itself
        # are not cleaned here -- confirm whether it exposes a cleanup method.
        shutil.rmtree(tmp_directory_path)
Ejemplo n.º 7
0
    def get_functions_dict_from_contigs_db(self, contigs_db_path):
        """Return the gene function calls dict of the contigs db at `contigs_db_path`.

        Only the sources in `self.function_annotation_sources` are requested.
        An empty dict is returned, without opening the database, when
        `self.functions_are_available` is falsy.
        """
        if self.functions_are_available:

            class Args:
                """Bare attribute holder handed to ContigsSuperclass."""

            namespace = Args()
            namespace.contigs_db = contigs_db_path

            quiet_run = anvio.terminal.Run(verbose=False)
            superclass = dbops.ContigsSuperclass(namespace, r=quiet_run)
            superclass.init_functions(
                requested_sources=self.function_annotation_sources)

            return superclass.gene_function_calls_dict

        return {}
Ejemplo n.º 8
0
    def export_locus(self, gene_callers_id, output_path_prefix):
        """Takes a gene callers ID, and exports a contigs database.

           Output path prefix should be unique for every export locus call. If the prefix you provide
           looks like this:

                >>> output_path_prefix = '/path/to/dir/file_name_prefix'

           the output files will be stored as this:

                >>> '/path/to/dir/file_name_prefix.fa'
                >>> '/path/to/dir/file_name_prefix.db'

           The locus spans `self.num_genes_list[0]` genes on one side of the target gene and
           `self.num_genes_list[1]` genes on the other, with the sides swapped for genes on
           the reverse strand. The block is computed with gene caller id arithmetic, so gene
           caller ids on a contig are assumed to be consecutive integers. If the block runs
           past either end of the contig it is trimmed and flagged as 'premature'; premature
           loci are skipped (the method returns without output) when `self.remove_partial_hits`
           is set.

           Raises ConfigError if `output_path_prefix` is an existing directory.
           """

        if os.path.isdir(output_path_prefix):
            raise ConfigError(
                "Output path prefix can't be a directory name...")

        filesnpaths.is_output_file_writable(output_path_prefix + '.fa')

        if not self.contigs_db:
            self.contigs_db = dbops.ContigsSuperclass(self.args,
                                                      r=self.run_object)
            self.contigs_db.init_functions()

        gene_call = self.contigs_db.genes_in_contigs_dict[gene_callers_id]
        contig_name = gene_call['contig']
        genes_in_contig_sorted = sorted(
            self.contigs_db.contig_name_to_genes[contig_name])

        # +1 for a gene on the forward strand, -1 otherwise
        direction = 1 if gene_call['direction'] == 'f' else -1
        is_forward = direction == 1
        premature = False

        self.run.info("Contig name", contig_name)
        self.run.info(
            "Contig length",
            self.contigs_db.contigs_basic_info[contig_name]['length'])
        self.run.info("Num genes in contig", len(genes_in_contig_sorted))
        self.run.info("Target gene call", gene_callers_id)
        self.run.info("Target gene direction",
                      "Forward" if is_forward else "Reverse",
                      mc='green' if is_forward else 'red')

        gene_1 = gene_callers_id - self.num_genes_list[0] * direction
        gene_2 = gene_callers_id + self.num_genes_list[1] * direction
        first_gene_of_the_block = min(gene_1, gene_2)
        last_gene_of_the_block = max(gene_1, gene_2)

        self.run.info(
            "First and last gene of the locus (raw)",
            "%d and %d" % (first_gene_of_the_block, last_gene_of_the_block))

        # getting the ids for the first and last genes in the contig (each entry
        # is indexable, and its first item is used as the gene caller id)
        last_gene_in_contig = genes_in_contig_sorted[-1][0]
        first_gene_in_contig = genes_in_contig_sorted[0][0]

        if last_gene_of_the_block > last_gene_in_contig:
            last_gene_of_the_block = last_gene_in_contig
            premature = True

        if first_gene_of_the_block < first_gene_in_contig:
            first_gene_of_the_block = first_gene_in_contig
            premature = True

        if premature and self.remove_partial_hits:
            self.run.info_single(
                "A premature locus is found .. the current configuration says 'skip'. Skipping.",
                mc="red",
                nl_before=1)
            return
        elif premature and not self.remove_partial_hits:
            self.run.info_single(
                "A premature locus is found .. the current configuration says 'whatevs'. Anvi'o will continue.",
                mc="yellow",
                nl_before=1,
                nl_after=1)

        self.run.info(
            "First and last gene of the locus (final)",
            "%d and %d" % (first_gene_of_the_block, last_gene_of_the_block))

        locus_start = self.contigs_db.genes_in_contigs_dict[
            first_gene_of_the_block]['start']
        locus_stop = self.contigs_db.genes_in_contigs_dict[
            last_gene_of_the_block]['stop']

        # being a performance nerd here yes: read only the one contig sequence we
        # need directly from the database instead of loading all of them
        contig_sequence = db.DB(self.input_contigs_db_path, None, ignore_version=True) \
                            .get_some_rows_from_table(t.contig_sequences_table_name,
                                                      where_clause="contig='%s'" % contig_name)[0][1]
        locus_sequence = contig_sequence[locus_start:locus_stop]

        # here we will create a gene calls dict for genes that are specific to our locus. since we trimmed
        # the contig sequence to the locus of interest, we will have to adjust start and stop positions of
        # genes in the gene calls dict. the shift is the same for every gene: the start position of the
        # first gene of the locus.
        excess = locus_start
        locus_gene_calls_dict = {}
        for g in range(first_gene_of_the_block, last_gene_of_the_block + 1):
            locus_gene_calls_dict[g] = copy.deepcopy(
                self.contigs_db.genes_in_contigs_dict[g])
            locus_gene_calls_dict[g]['start'] -= excess
            locus_gene_calls_dict[g]['stop'] -= excess

        self.run.info("Locus gene call start/stops excess (nts)", excess)

        reverse_complement = bool(not is_forward and self.reverse_complement_if_necessary)

        self.run.info('Reverse complementing everything',
                      reverse_complement,
                      mc='green')

        # report a stupid FASTA file.
        if self.include_fasta_output:
            fasta_file_path = output_path_prefix + ".fa"

            self.run.info("Output FASTA file", fasta_file_path)
            with open(fasta_file_path, 'w') as f:
                # `%` binds tighter than `or`, so the fallback for a missing search
                # term has to be parenthesized to ever take effect
                locus_header = contig_name + ' ' + \
                               '|'.join(['target:%s' % ','.join(self.targets),
                                         'sources:%s' % ','.join(self.sources),
                                         'query:%s' % (self.search_term or 'None'),
                                         'hit_contig:%s' % contig_name,
                                         'hit_gene_callers_id:%s' % str(gene_callers_id),
                                         'project_name:%s' % self.contigs_db.a_meta['project_name'].replace(' ', '_').replace("'", '_').replace('"', '_'),
                                         'locus:%s,%s' % (str(first_gene_of_the_block), str(last_gene_of_the_block)),
                                         'nt_positions_in_contig:%s:%s' % (str(locus_start), str(locus_stop)),
                                         'premature:%s' % str(premature),
                                         'reverse_complemented:%s' % str(reverse_complement)])

                # pick the sequence first, then format: this way the record ends
                # with a newline regardless of the orientation
                if reverse_complement:
                    sequence_to_report = utils.rev_comp(locus_sequence)
                else:
                    sequence_to_report = locus_sequence

                f.write('>%s\n' % locus_header)
                f.write('%s\n' % sequence_to_report)

        # report a fancy anvi'o contigs database
        self.store_locus_as_contigs_db(contig_name, locus_sequence,
                                       locus_gene_calls_dict,
                                       output_path_prefix, reverse_complement)
Ejemplo n.º 9
0
    def process(self):
        """Annotate the genes of a contigs database with Pfam hits.

        Exports the amino acid sequences of gene calls to a temporary FASTA file,
        runs hmmscan against `Pfam-A.hmm.gz` found in `self.pfam_data_dir` with
        `--cut_ga`, and stores each hit as a gene function call with source 'Pfam'
        in the contigs database at `self.contigs_db_path`.

        If the search yields no hits file, or the parsed results are empty, 'Pfam'
        is still registered as a functional source with no hits. Temporary
        directories are removed at the end unless `anvio.DEBUG` is set; in the
        no-hits-file case they are always removed.
        """
        hmm_file = os.path.join(self.pfam_data_dir, 'Pfam-A.hmm.gz')

        # initialize contigs database
        class Args:
            pass

        args = Args()
        args.contigs_db = self.contigs_db_path
        contigs_db = dbops.ContigsSuperclass(args)
        tmp_directory_path = filesnpaths.get_temp_directory_path()

        # get an instance of gene functions table
        gene_function_calls_table = TableForGeneFunctions(
            self.contigs_db_path, self.run, self.progress)

        # export AA sequences for genes
        target_files_dict = {
            'AA:GENE': os.path.join(tmp_directory_path, 'AA_gene_sequences.fa')
        }
        contigs_db.gen_FASTA_file_of_sequences_for_gene_caller_ids(
            output_file_path=target_files_dict['AA:GENE'],
            simple_headers=True,
            rna_alphabet=False,
            report_aa_sequences=True)

        # run hmmscan
        hmmer = HMMer(target_files_dict, num_threads_to_use=self.num_threads)
        hmm_hits_file = hmmer.run_hmmscan('Pfam', 'AA', 'GENE', None, None,
                                          len(self.function_catalog), hmm_file,
                                          None, '--cut_ga')

        # all messages go through the instance's run object, which is also the
        # one handed to the gene functions table above
        if not hmm_hits_file:
            self.run.info_single(
                "The HMM search returned no hits :/ So there is nothing to add to the contigs database. But "
                "now anvi'o will add PFAMs as a functional source with no hits, clean the temporary directories "
                "and gracefully quit.",
                nl_before=1,
                nl_after=1)
            shutil.rmtree(tmp_directory_path)
            hmmer.clean_tmp_dirs()
            gene_function_calls_table.add_empty_sources_to_functional_sources(
                {'Pfam'})
            return

        # parse hmmscan output
        parser = parser_modules['search']['hmmscan'](hmm_hits_file,
                                                     alphabet='AA',
                                                     context='GENE')
        search_results_dict = parser.get_search_results()

        # add functions to database; entries are keyed by a running 0-based index
        functions_dict = {}
        for index, hmm_hit in enumerate(search_results_dict.values()):
            functions_dict[index] = {
                'gene_callers_id': hmm_hit['gene_callers_id'],
                'source': 'Pfam',
                'accession': hmm_hit['gene_hmm_id'],
                'function': self.get_function_from_catalog(
                    hmm_hit['gene_hmm_id'], ok_if_missing_from_catalog=True),
                'e_value': hmm_hit['e_value'],
            }

        if functions_dict:
            gene_function_calls_table.create(functions_dict)
        else:
            self.run.warning(
                "Pfam class has no hits to process. Returning empty handed, but still adding Pfam as "
                "a functional source.")
            gene_function_calls_table.add_empty_sources_to_functional_sources(
                {'Pfam'})

        if anvio.DEBUG:
            self.run.warning(
                "The temp directories, '%s' and '%s' are kept. Please don't forget to clean those up "
                "later" % (tmp_directory_path, ', '.join(hmmer.tmp_dirs)),
                header="Debug")
        else:
            self.run.info_single(
                'Cleaning up the temp directory (you can use `--debug` if you would '
                'like to keep it for testing purposes)',
                nl_before=1,
                nl_after=1)
            shutil.rmtree(tmp_directory_path)
            hmmer.clean_tmp_dirs()
Ejemplo n.º 10
0
    def process(self, aa_sequences_file_path=None):
        """Search amino acid sequences against COGs and store the hits.

        Sequences come either from `aa_sequences_file_path` (a FASTA file) or from
        the contigs database at `self.contigs_db_path` -- exactly one of the two
        must be available. The search program is `self.search_with`; its tabular
        output is parsed according to `self.COG_version` into `self.hits`, which
        is then stored via `self.store_hits_into_contigs_db()`.

        A temporary directory is created (and removed at the end) unless
        `self.temp_dir_path` was already set, in which case it is left in place.

        Raises ConfigError for an unavailable search program, a missing formatted
        database, a missing or ambiguous sequence source, or an unknown COG version.
        """
        search_program = self.search_with

        if search_program not in self.available_search_methods:
            raise ConfigError("Let us start by making it clear that we probably like '%s' as much as you do, but it doesn't "
                              "seem to be available on your system OR recognized by the COGs class since anvi'o couldn't "
                              "find it among the available search methods. You probably need to try something else :/"
                              % search_program)

        if search_program not in self.available_db_search_program_targets:
            raise ConfigError("Anvi'o understands that you want to use '%s' to search for COGs, however, there is no "
                              "database formatted under the COGs data directory for that program :/ You may need to "
                              "re-run the COGs setup (anvi-setup-ncbi-cogs), UNLESS, you set up your COG data directory "
                              "somewhere else than what anvi'o attempts to use at the moment ('%s'). If that is the case, "
                              "this may be the best time to point the right directory using the --cog-data-dir parameter, "
                              "or the environmental variable 'ANVIO_COG_DATA_DIR'." % (search_program, self.COG_data_dir))

        # exactly one sequence source must be given
        has_fasta = bool(aa_sequences_file_path)
        has_contigs_db = bool(self.contigs_db_path)

        if not (has_fasta or has_contigs_db):
            raise ConfigError("You either need to provide an anvi'o contigs database path, or a FASTA file for AA "
                              "sequences")

        if has_fasta and has_contigs_db:
            raise ConfigError("You can't provide both an AA sequences file and a contigs database. Choose one!")

        if has_contigs_db:
            utils.is_contigs_db(self.contigs_db_path)

        if self.temp_dir_path:
            # a user-provided directory is validated and never removed
            filesnpaths.is_file_exists(self.temp_dir_path)
            filesnpaths.is_output_dir_writable(self.temp_dir_path)

            self.run.warning("Because you set the temporary directory path by hand, anvi'o will not remove its content "
                             "when it is done. But she certainly hopes that you will clean those files later.")

            self.remove_temp_dir_path = False
        else:
            self.temp_dir_path = filesnpaths.get_temp_directory_path()
            self.remove_temp_dir_path = True

        for label, value in (('COG data directory', self.COG_data_dir),
                             ('Searching with', search_program),
                             ('Directory to store temporary files', self.temp_dir_path),
                             ('Directory will be removed after the run', self.remove_temp_dir_path)):
            self.run.info(label, value)

        if not aa_sequences_file_path:
            # no FASTA given: export AA sequences from the contigs database
            aa_sequences_file_path = J(self.temp_dir_path, 'aa_sequences.fa')
            contigs_super = dbops.ContigsSuperclass(self.args)
            contigs_super.get_sequences_for_gene_callers_ids(output_file_path=aa_sequences_file_path,
                                                             report_aa_sequences=True,
                                                             simple_headers=True)

        # do the search
        tabular_hits = self.search_methods_factory[search_program](aa_sequences_file_path)

        # convert the output to a hits dict; the extra parser arguments depend on
        # the COG version
        parser_kwargs_by_version = {
            'COG14': {'target_id_parser_func': lambda x: x.split('|')[1]},
            'COG20': {},
        }

        if self.COG_version not in parser_kwargs_by_version:
            raise ConfigError("You need to edit all the if/else statements with COG version checks to ensure proper "
                              "parsing of a new generation of COG files.")

        self.hits = utils.get_BLAST_tabular_output_as_dict(tabular_hits,
                                                           **parser_kwargs_by_version[self.COG_version])

        # store hits into the contigs database
        self.store_hits_into_contigs_db()

        if self.remove_temp_dir_path:
            shutil.rmtree(self.temp_dir_path)
Ejemplo n.º 11
0
    def get_genes_of_interest(self,
                              genes_of_interest_path=None,
                              gene_caller_ids=None):
        """Return the set of gene caller ids the user is interested in.

        `gene_caller_ids` is a comma-separated string of integer ids, while
        `genes_of_interest_path` points to a single-column file with one integer
        id per line. Providing both is an error. When neither yields any id, every
        gene found in the contigs database is returned and a warning is issued.

        As a side effect, `self.genes_in_contigs_database` is set to the set of all
        gene caller ids in the contigs database described by `self.args`.

        Raises ConfigError if the contigs database has no genes, if both inputs
        are given, or if any provided id is not an integer.
        """
        genes_of_interest = None

        # identify the gene caller ids of all genes available
        self.genes_in_contigs_database = set(
            dbops.ContigsSuperclass(self.args).genes_in_splits.keys())

        if not self.genes_in_contigs_database:
            raise ConfigError(
                "This contigs database does not contain any identified genes..."
            )

        # settling genes of interest
        if genes_of_interest_path and gene_caller_ids:
            raise ConfigError(
                "You can't provide a gene caller id from the command line, and a list of gene caller ids\
                               as a file at the same time, obviously.")

        if gene_caller_ids:
            genes_of_interest = set()
            for gene in {x.strip() for x in gene_caller_ids.split(',')}:
                try:
                    genes_of_interest.add(int(gene))
                except ValueError:
                    # only a failed int conversion is expected here; anything
                    # else (e.g. KeyboardInterrupt) must not be swallowed
                    raise ConfigError(
                        "Anvi'o does not like your gene caller id '%s'..." %
                        str(gene))

        elif genes_of_interest_path:
            filesnpaths.is_file_tab_delimited(genes_of_interest_path,
                                              expected_number_of_fields=1)

            try:
                # the context manager makes sure the file handle is closed
                with open(genes_of_interest_path) as genes_file:
                    genes_of_interest = {int(line.strip()) for line in genes_file}
            except ValueError:
                raise ConfigError(
                    "Well. Anvi'o was working on your genes of interest ... and ... those gene IDs did not\
                                   look like anvi'o gene caller ids :/ Anvi'o is now sad."
                )

        if not genes_of_interest:
            # no genes of interest are specified. Assuming all, which could be innumerable--raise warning
            genes_of_interest = self.genes_in_contigs_database
            self.run.warning(
                "You did not specify any genes of interest, so anvi'o will assume all of them are of interest."
            )

        return genes_of_interest
Ejemplo n.º 12
0
    def process(self):
        """Annotate the genes of a contigs database with Pfam hits.

        Exports the amino acid sequences of gene calls to a temporary FASTA file,
        runs hmmscan against `Pfam-A.hmm.gz` found in `self.pfam_data_dir` with
        `--cut_ga`, and stores each hit as a gene function call with source 'Pfam'
        in the contigs database at `self.contigs_db_path`. Temporary directories
        are removed at the end unless `anvio.DEBUG` is set.
        """
        hmm_file = os.path.join(self.pfam_data_dir, 'Pfam-A.hmm.gz')

        # initialize contigs database
        class Args:
            pass

        args = Args()
        args.contigs_db = self.contigs_db_path
        contigs_db = dbops.ContigsSuperclass(args)
        tmp_directory_path = filesnpaths.get_temp_directory_path()

        # export AA sequences for genes
        target_files_dict = {
            'AA:GENE': os.path.join(tmp_directory_path, 'AA_gene_sequences.fa')
        }
        contigs_db.gen_FASTA_file_of_sequences_for_gene_caller_ids(
            output_file_path=target_files_dict['AA:GENE'],
            simple_headers=True,
            rna_alphabet=False,
            report_aa_sequences=True)

        # run hmmscan
        hmmer = HMMer(target_files_dict, num_threads_to_use=self.num_threads)
        hmm_hits_file = hmmer.run_hmmscan('Pfam', 'AA', 'GENE', None, None,
                                          len(self.function_catalog), hmm_file,
                                          None, '--cut_ga')

        # parse hmmscan output
        parser = parser_modules['search']['hmmscan'](hmm_hits_file,
                                                     alphabet='AA',
                                                     context='GENE')
        search_results_dict = parser.get_search_results()

        # add functions to database; entries are keyed by a running 0-based index
        functions_dict = {
            index: {
                'gene_callers_id': hmm_hit['gene_callers_id'],
                'source': 'Pfam',
                'accession': hmm_hit['gene_hmm_id'],
                'function': self.get_function_from_catalog(hmm_hit['gene_hmm_id']),
                'e_value': hmm_hit['e_value'],
            }
            for index, hmm_hit in enumerate(search_results_dict.values())
        }

        gene_function_calls_table = TableForGeneFunctions(
            self.contigs_db_path, self.run, self.progress)
        gene_function_calls_table.create(functions_dict)

        # report through the instance's run object (the same one handed to the
        # gene functions table above) rather than a module-level one
        if anvio.DEBUG:
            self.run.warning(
                "The temp directories, '%s' and '%s' are kept. Please don't forget to clean those up\
                         later" %
                (tmp_directory_path, ', '.join(hmmer.tmp_dirs)),
                header="Debug")
        else:
            self.run.info_single(
                'Cleaning up the temp directory (you can use `--debug` if you would\
                             like to keep it for testing purposes)',
                nl_before=1,
                nl_after=1)
            shutil.rmtree(tmp_directory_path)
            hmmer.clean_tmp_dirs()