Example #1
0
    def process(self):
        """Annotate gene calls in the contigs database with Pfam functions.

        Amino acid sequences of the gene calls are exported into a temporary
        FASTA file and searched with hmmscan against `Pfam-A.hmm.gz` located in
        `self.pfam_data_dir` (with `--cut_ga` passed to the search). Every hit
        is turned into a 'Pfam' function entry and handed over to
        `TableForGeneFunctions.create`.

        The temporary directories are removed at the end unless `anvio.DEBUG`
        is set, in which case their paths are reported instead.
        """

        hmm_file = os.path.join(self.pfam_data_dir, 'Pfam-A.hmm.gz')

        # initialize contigs database
        class Args: pass
        args = Args()
        args.contigs_db = self.contigs_db_path
        contigs_db = dbops.ContigsSuperclass(args)
        tmp_directory_path = filesnpaths.get_temp_directory_path()

        # export AA sequences for genes
        target_files_dict = {'AA:GENE': os.path.join(tmp_directory_path, 'AA_gene_sequences.fa')}
        contigs_db.gen_FASTA_file_of_sequences_for_gene_caller_ids(output_file_path=target_files_dict['AA:GENE'],
                                                                   simple_headers=True,
                                                                   rna_alphabet=False,
                                                                   report_aa_sequences=True)

        # run hmmscan
        hmmer = HMMer(target_files_dict, num_threads_to_use=self.num_threads)
        hmm_hits_file = hmmer.run_hmmscan('Pfam', 'AA', 'GENE', None, None, len(self.function_catalog), hmm_file, None, '--cut_ga')

        if hmm_hits_file:
            # parse hmmscan output
            parser = parser_modules['search']['hmmscan'](hmm_hits_file, alphabet='AA', context='GENE')
            search_results_dict = parser.get_search_results()

            # add functions to database; entries are keyed by a running integer id
            functions_dict = {
                entry_id: {
                    'gene_callers_id': hmm_hit['gene_callers_id'],
                    'source': 'Pfam',
                    'accession': hmm_hit['gene_hmm_id'],
                    'function': self.get_function_from_catalog(hmm_hit['gene_hmm_id']),
                    'e_value': hmm_hit['e_value'],
                }
                for entry_id, hmm_hit in enumerate(search_results_dict.values())
            }

            gene_function_calls_table = TableForGeneFunctions(self.contigs_db_path, self.run, self.progress)
            gene_function_calls_table.create(functions_dict)
        else:
            # a falsy return value means the search produced no hits file, so there
            # is nothing to hand to the parser. we skip straight to the clean up.
            run.info_single("The HMM search returned no hits :/ So there is nothing to add to the "
                            "contigs database.", nl_before=1)

        if anvio.DEBUG:
            run.warning("The temp directories, '%s' and '%s' are kept. Please don't forget to clean those up "
                        "later" % (tmp_directory_path, ', '.join(hmmer.tmp_dirs)), header="Debug")
        else:
            run.info_single('Cleaning up the temp directory (you can use `--debug` if you would '
                            'like to keep it for testing purposes)', nl_before=1, nl_after=1)
            shutil.rmtree(tmp_directory_path)
            hmmer.clean_tmp_dirs()
Example #2
0
    def populate_search_tables(self, sources=None):
        """Run HMM searches for `sources` and append their hits to the contigs database.

        `sources` is a dictionary keyed by HMM source name. Each entry is read
        for its 'target', 'kind', 'domain', 'genes', 'ref', and
        'noise_cutoff_terms' items. When it is empty or not provided, the
        sources in `anvio.data.hmm.sources` are used. If there are still no
        sources to work with, the method returns without doing anything.

        For every alphabet/context target found among the sources, the
        relevant sequences are exported into a temporary directory. HMMER is
        then run once per source, and the parsed results are stored through
        `self.append`. Hits in the CONTIG context are first pruned and
        converted into new gene calls.

        Raises `ConfigError` when:
          - a source needs gene calls but `self.genes_are_called` is false,
          - a source asks for amino acid sequences in the CONTIG context,
          - domain table output is requested for a nucleotide alphabet, or
          - the HMMER output cannot be parsed.

        Temporary files are removed at the end unless `anvio.DEBUG` is set.
        """

        # make sure the output file is OK to write.
        filesnpaths.is_output_file_writable(self.db_path, ok_if_exists=True)

        # no sources from the caller: fall back to the ones in `anvio.data.hmm`
        if not sources:
            import anvio.data.hmm
            sources = anvio.data.hmm.sources

        if not sources:
            return

        self.check_sources(sources)

        target_files_dict = {}

        tmp_directory_path = filesnpaths.get_temp_directory_path()

        hmmpressed_files = self.hmmpress_sources(sources, tmp_directory_path)

        self.run.info("Contigs DB", self.db_path)
        self.run.info("HMM sources", ', '.join(sources.keys()))

        class Args:
            pass

        # here we will go through targets and populate target_files_dict based on what we find among them.
        targets = {s['target'] for s in sources.values()}
        have_hmm_sources_with_non_RNA_contig_context = False
        for target in targets:
            alphabet, context = utils.anvio_hmm_target_term_to_alphabet_and_context(
                target)

            if not self.genes_are_called and context != "CONTIG":
                raise ConfigError(
                    "You are in trouble. The gene calling was skipped for this contigs database, yet anvi'o asked to run an "
                    "HMM profile that wishes to operate on %s context using the %s alphabet. It is not OK. You still could run "
                    "HMM profiles that does not require gene calls to be present (such as the HMM profile that identifies Ribosomal "
                    "RNAs in contigs, but for that you would have to explicitly ask for it by using the additional parameter "
                    "'--installed-hmm-profile PROFILE_NAME_HERE')." %
                    (context, alphabet))

            self.run.info('Alphabet/context target found',
                          '%s:%s' % (alphabet, context))

            if context == 'CONTIG' and alphabet != 'RNA':
                have_hmm_sources_with_non_RNA_contig_context = True

            args = Args()
            args.contigs_db = self.db_path
            contigs_db = ContigsSuperclass(args, r=terminal.Run(verbose=False))

            if context == 'GENE':
                gene_sequences_path = os.path.join(
                    tmp_directory_path, '%s_gene_sequences.fa' % alphabet)
                target_files_dict['%s:GENE' % alphabet] = gene_sequences_path
                contigs_db.get_sequences_for_gene_callers_ids(
                    output_file_path=gene_sequences_path,
                    simple_headers=True,
                    rna_alphabet=alphabet == 'RNA',
                    report_aa_sequences=alphabet == 'AA')
            elif context == 'CONTIG':
                if alphabet == 'AA':
                    raise ConfigError(
                        "You are somewhere you shouldn't be. You came here because you thought it would be OK "
                        "to ask for AA sequences in the CONTIG context. The answer to that is 'no, thanks'. If "
                        "you think this is dumb, please let us know.")

                contig_sequences_path = os.path.join(
                    tmp_directory_path, '%s_contig_sequences.fa' % alphabet)
                target_files_dict['%s:CONTIG' % alphabet] = contig_sequences_path
                utils.export_sequences_from_contigs_db(
                    self.db_path,
                    contig_sequences_path,
                    rna_alphabet=alphabet == 'RNA')

        if have_hmm_sources_with_non_RNA_contig_context:
            # in that case, we should remind people what's up.
            self.run.warning(
                "The HMM profiles that are about to be run includes at least one HMM profile that runs on "
                "contigs and not genes. Thus, this HMM operation will not be working with gene calls anvi'o "
                "already knows about. Which means, the resulting hits will need to be added as 'new gene calls' "
                "into the contigs database. So far so good. But because we are in the realm of contigs rather "
                "than genes, the resulting HMM hits will unlikely correspond to open reading frames that are "
                "supposed to be translated (such as ribosomal RNAs). While anvi'o adds new gene calls to your "
                "contigs database for these hits, it will NOT report amino acid sequences for the "
                "new gene calls that will emerge from these HMMs, expecting you to judge whether this will "
                "influence your pangenomic analyses or other things you thought you would be doing with the "
                "result of this HMM search downstream. If you do not feel like being the judge of anything today "
                "you can move on yet remember to remember this if things look somewhat weird later on.",
                header="THE MORE YOU KNOW 🌈",
                lc="green")

        commander = HMMer(target_files_dict,
                          num_threads_to_use=self.num_threads_to_use,
                          program_to_use=self.hmm_program)

        for source in sources:
            alphabet, context = utils.anvio_hmm_target_term_to_alphabet_and_context(
                sources[source]['target'])

            if alphabet in ('DNA', 'RNA') and 'domtable' in self.hmmer_desired_output:
                raise ConfigError(
                    "Domain table output was requested (probably with the --get-domtable-output flag, "
                    "does that look familiar?) but unfortunately this option is incompatible with the "
                    f"current source of HMM profiles, {source}, because this source uses a nucleotide "
                    "alphabet.")

            kind_of_search = sources[source]['kind']
            domain = sources[source]['domain']
            all_genes_searched_against = sources[source]['genes']
            hmm_model = hmmpressed_files[source]
            reference = sources[source]['ref']
            noise_cutoff_terms = sources[source]['noise_cutoff_terms']

            hmmer_output = commander.run_hmmer(
                source,
                alphabet,
                context,
                kind_of_search,
                domain,
                len(all_genes_searched_against),
                hmm_model,
                reference,
                noise_cutoff_terms,
                desired_output=self.hmmer_desired_output,
                hmmer_output_dir=self.hmmer_output_dir)

            if self.hmmer_output_dir:
                self.run.info("HMMER output directory", self.hmmer_output_dir)

            # `run_hmmer` hands back either a single path, or a tuple of the
            # table output and the domain table output.
            if isinstance(hmmer_output, tuple):
                hmm_scan_hits_txt, domain_hits_txt = hmmer_output
                self.run.info("Domain table output", domain_hits_txt)
            else:
                hmm_scan_hits_txt = hmmer_output

            if not hmm_scan_hits_txt:
                search_results_dict = {}
            else:
                try:
                    parser = parser_modules['search']['hmmer_table_output'](
                        hmm_scan_hits_txt,
                        alphabet=alphabet,
                        context=context,
                        program=self.hmm_program)
                except StupidHMMError as e:
                    # keep the original exception attached as the cause of the ConfigError
                    raise ConfigError(
                        f"Unfortunately something went wrong while anvi'o was trying to parse some HMM output for your data. "
                        f"This error is typically due to contig names that are long and variable in length, which that "
                        f"confuses HMMER and so it generates output tables that are simply unparseable. Anvi'o does its best, "
                        f"but occasionally fails, which leads to this error. If you are curious why is this happening, you can take a "
                        f"look at this issue where this issue is described: https://github.com/merenlab/anvio/issues/1564. "
                        f"Solution to this is relatively easy: use `anvi-script-reformat-fasta` with `--simplify-names` flag "
                        f"BEFORE generating your contigs database as we advice you to. Sorry you came all this way just to "
                        f"find out about this :/ Here is the origial error message anvi'o produced from the code beneath: {e}."
                    ) from e

                search_results_dict = parser.get_search_results()

            if not len(search_results_dict):
                run.info_single(
                    "The HMM source '%s' returned 0 hits. SAD (but it's stil OK)."
                    % source,
                    nl_before=1)

            if context == 'CONTIG':
                # we are in trouble here. because our search results dictionary contains no gene calls, but contig
                # names contain our hits. on the other hand, the rest of the code outside of this if statement
                # expects a `search_results_dict` with gene caller ids in it. so there are two things we need to do.
                # one is to come up with some new gene calls and add them to the contigs database. so things
                # will go smoothly downstream. two, we will need to update our `search_results_dict` so it looks
                # like a dictionary the rest of the code expects with `gene_callers_id` fields. both of these
                # steps are going to be taken care of in the following function. magic.
                num_hits_before = len(search_results_dict)
                search_results_dict = utils.get_pruned_HMM_hits_dict(
                    search_results_dict)
                num_hits_after = len(search_results_dict)

                if num_hits_before != num_hits_after:
                    self.run.info(
                        'Pruned',
                        '%d out of %d hits were removed due to redundancy' %
                        (num_hits_before - num_hits_after, num_hits_before))

                search_results_dict = self.add_new_gene_calls_to_contigs_db_and_update_serach_results_dict(
                    kind_of_search,
                    search_results_dict,
                    skip_amino_acid_sequences=True)

            self.append(source, reference, kind_of_search, domain,
                        all_genes_searched_against, search_results_dict)

        # NOTE: this import is required, not redundant. the `import anvio.data.hmm`
        #       statement near the top of this function makes `anvio` a *local* name
        #       for the entire function body, and that statement only runs when the
        #       caller did not provide any sources. without re-importing here, the
        #       local name would be unbound whenever sources are passed in.
        import anvio
        if not anvio.DEBUG:
            commander.clean_tmp_dirs()

            # every file in `target_files_dict` lives under `tmp_directory_path`,
            # so removing the directory takes care of them as well.
            shutil.rmtree(tmp_directory_path)
Example #3
0
    def process(self):
        """Search gene amino acid sequences against Pfam and store the hits as functions.

        The amino acid sequences of gene calls are written into a temporary
        FASTA file and searched with hmmscan against `Pfam-A.hmm.gz` found in
        `self.pfam_data_dir`. Hits become 'Pfam' entries in the gene functions
        table. When there is nothing to add, 'Pfam' is still registered as a
        functional source with no hits.

        Temporary directories are removed before returning, except when the
        search produced hits and `anvio.DEBUG` is set.
        """

        pfam_hmm_path = os.path.join(self.pfam_data_dir, 'Pfam-A.hmm.gz')

        # ContigsSuperclass is handed a bare object that carries only `contigs_db`
        class Args:
            pass

        db_args = Args()
        db_args.contigs_db = self.contigs_db_path
        contigs = dbops.ContigsSuperclass(db_args)
        work_dir = filesnpaths.get_temp_directory_path()

        # the table we will be writing into
        functions_table = TableForGeneFunctions(self.contigs_db_path,
                                                self.run,
                                                self.progress)

        # write amino acid sequences of genes into the temporary directory
        aa_fasta_path = os.path.join(work_dir, 'AA_gene_sequences.fa')
        fasta_per_target = {'AA:GENE': aa_fasta_path}
        contigs.gen_FASTA_file_of_sequences_for_gene_caller_ids(
            output_file_path=aa_fasta_path,
            simple_headers=True,
            rna_alphabet=False,
            report_aa_sequences=True)

        # search
        hmmer = HMMer(fasta_per_target, num_threads_to_use=self.num_threads)
        num_models = len(self.function_catalog)
        hits_path = hmmer.run_hmmscan('Pfam', 'AA', 'GENE', None, None,
                                      num_models, pfam_hmm_path, None,
                                      '--cut_ga')

        if not hits_path:
            # no output to parse: report, tidy up, register the empty source, leave
            run.info_single(
                "The HMM search returned no hits :/ So there is nothing to add to the contigs database. But "
                "now anvi'o will add PFAMs as a functional source with no hits, clean the temporary directories "
                "and gracefully quit.",
                nl_before=1,
                nl_after=1)
            shutil.rmtree(work_dir)
            hmmer.clean_tmp_dirs()
            functions_table.add_empty_sources_to_functional_sources({'Pfam'})
            return

        # parse the search output
        hits_parser = parser_modules['search']['hmmscan'](hits_path,
                                                          alphabet='AA',
                                                          context='GENE')
        hits = hits_parser.get_search_results()

        # one function entry per hit, keyed by a running integer
        pfam_functions = {
            entry_id: {
                'gene_callers_id': hit['gene_callers_id'],
                'source': 'Pfam',
                'accession': hit['gene_hmm_id'],
                'function': self.get_function_from_catalog(
                    hit['gene_hmm_id'], ok_if_missing_from_catalog=True),
                'e_value': hit['e_value'],
            }
            for entry_id, hit in enumerate(hits.values())
        }

        if not pfam_functions:
            self.run.warning(
                "Pfam class has no hits to process. Returning empty handed, but still adding Pfam as "
                "a functional source.")
            functions_table.add_empty_sources_to_functional_sources({'Pfam'})
        else:
            functions_table.create(pfam_functions)

        if anvio.DEBUG:
            # leave everything in place and tell the user where to find it
            kept_dirs = (work_dir, ', '.join(hmmer.tmp_dirs))
            run.warning(
                ("The temp directories, '%s' and '%s' are kept. Please don't forget to clean those up "
                 "later") % kept_dirs,
                header="Debug")
            return

        run.info_single(
            'Cleaning up the temp directory (you can use `--debug` if you would '
            'like to keep it for testing purposes)',
            nl_before=1,
            nl_after=1)
        shutil.rmtree(work_dir)
        hmmer.clean_tmp_dirs()
Example #4
0
    def process(self):
        """Run the InteracDome workflow on all gene calls of the contigs database.

        Amino acid sequences of every gene in `self.contigs_db` are exported
        into a temporary FASTA file and searched with HMMER against
        `self.hmm_filepath`, requesting both the standard and the domain table
        outputs. The standard output is parsed into `self.hmm_out`. If there
        are domain hits, they are filtered, binding frequencies are attributed
        and filtered, and the results are stored unless `self.bind_freq` ends
        up empty.

        Temporary directories are removed before returning, except when there
        were domain hits and `anvio.DEBUG` is set.
        """

        work_dir = filesnpaths.get_temp_directory_path()
        gene_caller_ids = list(self.contigs_db.genes_in_contigs_dict.keys())

        self.run.info("num genes that HMM will be run on",
                      len(gene_caller_ids))

        # write the amino acid sequences of genes into the temporary directory
        aa_fasta_path = os.path.join(work_dir, 'AA_gene_sequences.fa')
        fasta_per_target = {'AA:DOMAIN': aa_fasta_path}
        self.contigs_db.get_sequences_for_gene_callers_ids(
            gene_caller_ids_list=gene_caller_ids,
            output_file_path=aa_fasta_path,
            simple_headers=True,
            report_aa_sequences=True)

        # search; only the standard output is used below, the domain table path is not
        hmmer = HMMer(fasta_per_target,
                      num_threads_to_use=self.num_threads,
                      program_to_use=self.hmm_program)
        standard_output_path, _domtable_path = hmmer.run_hmmer(
            source='InteracDome',
            alphabet='AA',
            context='DOMAIN',
            kind=None,
            domain=None,
            num_genes_in_model=len(self.function_catalog),
            hmm=self.hmm_filepath,
            ref=None,
            noise_cutoff_terms='--cut_ga',
            desired_output=('standard', 'domtable'),
        )

        self.run.warning("", header='HMMER results', lc='green')
        std_output_parser = parser_modules['search']['hmmer_std_output']
        self.hmm_out = std_output_parser(standard_output_path,
                                         context='interacdome')

        # summary numbers for the user
        self.run.info('num total domain hits', self.hmm_out.dom_hits.shape[0])
        genes_with_hits = self.hmm_out.dom_hits['corresponding_gene_call'].unique()
        self.run.info('num unique genes', genes_with_hits.shape[0])
        hmms_with_hits = self.hmm_out.dom_hits['pfam_id'].unique()
        self.run.info('num unique HMMs', hmms_with_hits.shape[0])

        if self.hmm_out.dom_hits.shape[0] == 0:
            # nothing to work with: tidy up and leave
            self.run.info_single(
                "The HMM search returned no hits :/ So there is nothing to do. Anvi'o "
                "will now clean the temporary directories and gracefully quit.",
                nl_before=1,
                nl_after=1)
            shutil.rmtree(work_dir)
            hmmer.clean_tmp_dirs()
            return

        self.filter_hits()
        self.attribute_binding_frequencies()
        self.filter_positions()

        # both tables are ordered by the same columns
        sort_columns = ['gene_callers_id', 'ligand', 'codon_order_in_gene']
        self.bind_freq = self.bind_freq.sort_values(by=sort_columns)
        self.avg_bind_freq = self.avg_bind_freq.sort_values(by=sort_columns)

        if not self.bind_freq.empty:
            self.store()
        else:
            self.run.warning(
                "There are 0 HMM hits, so there is nothing to do :( Binding frequencies were not "
                "added to your database",
                header="Oh no...")

        if anvio.DEBUG:
            # leave everything in place and tell the user where to find it
            kept_dirs = (work_dir, ', '.join(hmmer.tmp_dirs))
            self.run.warning(
                ("The temp directories, '%s' and '%s' are kept. Please don't forget to "
                 "clean those up later") % kept_dirs,
                header="Debug")
            return

        self.run.info_single(
            "Cleaning up the temp directory (you can use `--debug` if you would "
            "like to keep it for testing purposes)",
            nl_before=1,
            nl_after=1)
        shutil.rmtree(work_dir)
        hmmer.clean_tmp_dirs()
Example #5
0
    def populate_search_tables(self, sources=None):
        """Run hmmscan for `sources` and append their hits to the contigs database.

        `sources` is a dictionary keyed by HMM source name. Each entry is read
        for its 'target', 'kind', 'domain', 'genes', 'model', 'ref', and
        'noise_cutoff_terms' items. When it is empty or not provided, the
        sources in `anvio.data.hmm.sources` are used. If there are still no
        sources to work with, the method returns without doing anything.

        For every alphabet/context target found among the sources, the
        relevant sequences are exported into a temporary directory. hmmscan is
        then run once per source, and the parsed results are stored through
        `self.append`. Hits in the CONTIG context are first pruned and
        converted into new gene calls.

        Raises `ConfigError` when a source needs gene calls but
        `self.genes_are_called` is false, or when a source asks for amino acid
        sequences in the CONTIG context.

        Temporary files are removed at the end unless `anvio.DEBUG` is set.
        """

        # no sources from the caller: fall back to the ones in `anvio.data.hmm`
        if not sources:
            import anvio.data.hmm
            sources = anvio.data.hmm.sources

        if not sources:
            return

        target_files_dict = {}

        tmp_directory_path = filesnpaths.get_temp_directory_path()

        class Args: pass

        # here we will go through targets and populate target_files_dict based on what we find among them.
        targets = {s['target'] for s in sources.values()}
        for target in targets:
            alphabet, context = utils.anvio_hmm_target_term_to_alphabet_and_context(target)

            if not self.genes_are_called and context != "CONTIG":
                raise ConfigError("You are in trouble. The gene calling was skipped for this contigs database, yet anvi'o asked to run an "
                                  "HMM profile that wishes to operate on %s context using the %s alphabet. It is not OK. You still could run "
                                  "HMM profiles that does not require gene calls to be present (such as the HMM profile that identifies Ribosomal "
                                  "RNAs in contigs, but for that you would have to explicitly ask for it by using the additional parameter "
                                  "'--installed-hmm-profile Ribosomal_RNAs')." % (context, alphabet))

            self.run.info('Target found', '%s:%s' % (alphabet, context))

            args = Args()
            args.contigs_db = self.db_path
            contigs_db = ContigsSuperclass(args, r=terminal.Run(verbose=False))

            if context == 'GENE':
                gene_sequences_path = os.path.join(tmp_directory_path, '%s_gene_sequences.fa' % alphabet)
                target_files_dict['%s:GENE' % alphabet] = gene_sequences_path
                contigs_db.gen_FASTA_file_of_sequences_for_gene_caller_ids(output_file_path=gene_sequences_path,
                                                                           simple_headers=True,
                                                                           rna_alphabet=alphabet == 'RNA',
                                                                           report_aa_sequences=alphabet == 'AA')
            elif context == 'CONTIG':
                if alphabet == 'AA':
                    raise ConfigError("You are somewhere you shouldn't be. You came here because you thought it would be OK "
                                      "to ask for AA sequences in the CONTIG context. The answer to that is 'no, thanks'. If "
                                      "you think this is dumb, please let us know.")

                contig_sequences_path = os.path.join(tmp_directory_path, '%s_contig_sequences.fa' % alphabet)
                target_files_dict['%s:CONTIG' % alphabet] = contig_sequences_path
                utils.export_sequences_from_contigs_db(self.db_path,
                                                       contig_sequences_path,
                                                       rna_alphabet=alphabet == 'RNA')

        commander = HMMer(target_files_dict, num_threads_to_use=self.num_threads_to_use)

        for source in sources:
            alphabet, context = utils.anvio_hmm_target_term_to_alphabet_and_context(sources[source]['target'])

            kind_of_search = sources[source]['kind']
            domain = sources[source]['domain']
            all_genes_searched_against = sources[source]['genes']
            hmm_model = sources[source]['model']
            reference = sources[source]['ref']
            noise_cutoff_terms = sources[source]['noise_cutoff_terms']

            hmm_scan_hits_txt = commander.run_hmmscan(source,
                                                      alphabet,
                                                      context,
                                                      kind_of_search,
                                                      domain,
                                                      len(all_genes_searched_against),
                                                      hmm_model,
                                                      reference,
                                                      noise_cutoff_terms)

            # a falsy return value means there is no hits file to parse
            if not hmm_scan_hits_txt:
                search_results_dict = {}
            else:
                parser = parser_modules['search']['hmmscan'](hmm_scan_hits_txt, alphabet=alphabet, context=context)
                search_results_dict = parser.get_search_results()

            if not len(search_results_dict):
                run.info_single("The HMM source '%s' returned 0 hits. SAD (but it's stil OK)." % source, nl_before=1)

            if context == 'CONTIG':
                # we are in trouble here. because our search results dictionary contains no gene calls, but contig
                # names that contain our hits. on the other hand, the rest of the code outside of this if statement
                # expects a `search_results_dict` with gene callers id in it. so there are two things we need
                # to do. one is to come up with some new gene calls and add them to the contigs database. so things
                # will go smoothly downstream. two, we will need to update our `search_results_dict` so it looks
                # like a dictionary the rest of the code expects with `gene_callers_id` fields. both of these
                # steps are going to be taken care of in the following function. magic.

                if source != "Ribosomal_RNAs":
                    self.run.warning("You just called an HMM profile that runs on contigs and not genes. Because this HMM "
                                     "operation is not directly working with gene calls anvi'o already knows about, the resulting "
                                     "hits will need to be added as 'new gene calls' into the contigs database. So far so good. "
                                     "But because we are in the contigs realm rater than genes realm, it is likely that "
                                     "resulting hits will not correspond to open reading frames that are supposed to be "
                                     "translated (such as ribosomal RNAs), because otherwise you would be working with genes "
                                     "instad of defining CONTIGS as your context in that HMM profile you just used unless you "
                                     "not sure what you are doing. Hence, anvi'o will not report amino acid sequences for the "
                                     "new gene calls it will recover through these HMMs. Please take a moment and you be the "
                                     "judge of whether this will influence your pangenomic analyses or other things you thought "
                                     "you would be doing with the result of this HMM search downstream. If you do not feel like "
                                     "being the judge of anything today you can move on yet remember to remember this if things "
                                     "look somewhat weird later on.",
                                     header="Psst. Your fancy HMM profile '%s' speaking" % source,
                                     lc="green")

                num_hits_before = len(search_results_dict)
                search_results_dict = utils.get_pruned_HMM_hits_dict(search_results_dict)
                num_hits_after = len(search_results_dict)

                if num_hits_before != num_hits_after:
                    self.run.info('Pruned', '%d out of %d hits were removed due to redundancy' % (num_hits_before - num_hits_after, num_hits_before))

                search_results_dict = self.add_new_gene_calls_to_contigs_db_and_update_serach_results_dict(kind_of_search,
                                                                                                           search_results_dict,
                                                                                                           skip_amino_acid_sequences=True)

            self.append(source, reference, kind_of_search, domain, all_genes_searched_against, search_results_dict)

        # NOTE: this import is required, not redundant. the `import anvio.data.hmm`
        #       statement near the top of this function makes `anvio` a *local* name
        #       for the entire function body, and that statement only runs when the
        #       caller did not provide any sources. without re-importing here, the
        #       local name would be unbound whenever sources are passed in.
        import anvio
        if not anvio.DEBUG:
            # imported locally since this function is the only place that needs it
            import shutil

            commander.clean_tmp_dirs()

            # every file in `target_files_dict` lives under `tmp_directory_path`, so
            # removing the directory removes them too, and no longer leaves the
            # empty temporary directory behind.
            shutil.rmtree(tmp_directory_path)
Example #6
0
    def populate_search_tables(self, sources={}):
        # if we end up generating a temporary file for amino acid sequences:
        if not len(sources):
            import anvio.data.hmm
            sources = anvio.data.hmm.sources

        if not sources:
            return

        target_files_dict = {}

        tmp_directory_path = filesnpaths.get_temp_directory_path()

        # here we will go through targets and populate target_files_dict based on what we find among them.
        targets = set([s['target'] for s in list(sources.values())])
        for target in targets:

            alphabet, context = utils.anvio_hmm_target_term_to_alphabet_and_context(
                target)

            self.run.info('Target found', '%s:%s' % (alphabet, context))

            class Args:
                pass

            args = Args()
            args.contigs_db = self.db_path
            contigs_db = ContigsSuperclass(args)

            if context == 'GENE':
                target_files_dict['%s:GENE' % alphabet] = os.path.join(
                    tmp_directory_path, '%s_gene_sequences.fa' % alphabet)
                contigs_db.gen_FASTA_file_of_sequences_for_gene_caller_ids(
                    output_file_path=target_files_dict['%s:GENE' % alphabet],
                    simple_headers=True,
                    rna_alphabet=True if alphabet == 'RNA' else False,
                    report_aa_sequences=True if alphabet == 'AA' else False)
            elif context == 'CONTIG':
                if alphabet == 'AA':
                    raise ConfigError(
                        "You are somewhere you shouldn't be. You came here because you thought it would be OK\
                                       to ask for AA sequences in the CONTIG context. The answer to that is 'no, thanks'. If\
                                       you think this is dumb, please let us know."
                    )
                else:
                    target_files_dict['%s:CONTIG' % alphabet] = os.path.join(
                        tmp_directory_path,
                        '%s_contig_sequences.fa' % alphabet)
                    utils.export_sequences_from_contigs_db(
                        self.db_path,
                        target_files_dict['%s:CONTIG' % alphabet],
                        rna_alphabet=True if alphabet == 'RNA' else False)

        commander = HMMer(target_files_dict,
                          num_threads_to_use=self.num_threads_to_use)

        for source in sources:
            alphabet, context = utils.anvio_hmm_target_term_to_alphabet_and_context(
                sources[source]['target'])

            kind_of_search = sources[source]['kind']
            domain = sources[source]['domain']
            all_genes_searched_against = sources[source]['genes']
            hmm_model = sources[source]['model']
            reference = sources[source]['ref']
            noise_cutoff_terms = sources[source]['noise_cutoff_terms']

            hmm_scan_hits_txt = commander.run_hmmscan(
                source, alphabet, context, kind_of_search, domain,
                all_genes_searched_against, hmm_model, reference,
                noise_cutoff_terms)

            if not hmm_scan_hits_txt:
                search_results_dict = {}
            else:
                parser = parser_modules['search']['hmmscan'](hmm_scan_hits_txt,
                                                             alphabet=alphabet,
                                                             context=context)
                search_results_dict = parser.get_search_results()

            if not len(search_results_dict):
                run.info_single(
                    "The HMM source '%s' returned 0 hits. SAD (but it's stil OK)."
                    % source,
                    nl_before=1)

            if context == 'CONTIG':
                # we are in trouble here. because our search results dictionary contains no gene calls, but contig
                # names that contain our hits. on the other hand, the rest of the code outside of this if statement
                # expects a `search_results_dict` with gene callers id in it. so there are two things we need to do
                # to do. one is to come up with some new gene calls and add them to the contigs database. so things
                # will go smoothly downstream. two, we will need to update our `search_results_dict` so it looks
                # like a a dictionary the rest of the code expects with `gene_callers_id` fields. both of these
                # steps are going to be taken care of in the following function. magic.

                self.run.warning(
                    "Alright! You just called an HMM profile that runs on contigs. Because it is not\
                                 working with anvi'o gene calls directly, the resulting hits will need to be added\
                                 as 'new gene calls' into the contigs database. This is a new feature, and if it\
                                 starts screwing things up for you please let us know. Other than that you're pretty\
                                 much golden. Carry on.",
                    header="Psst. Your fancy HMM profile '%s' speaking" %
                    source,
                    lc="green")

                num_hits_before = len(search_results_dict)
                search_results_dict = utils.get_pruned_HMM_hits_dict(
                    search_results_dict)
                num_hits_after = len(search_results_dict)

                if num_hits_before != num_hits_after:
                    self.run.info(
                        'Pruned',
                        '%d out of %d hits were removed due to redundancy' %
                        (num_hits_before - num_hits_after, num_hits_before))

                search_results_dict = self.add_new_gene_calls_to_contigs_db_and_update_serach_results_dict(
                    kind_of_search, search_results_dict)

            self.append(source, reference, kind_of_search, domain,
                        all_genes_searched_against, search_results_dict)

        # FIXME: I have no clue why importing the anvio module is necessary at this point,
        #        but without this, mini test fails becasue "`anvio.DEBUG` is being used
        #        before initialization". nonsense.
        import anvio
        if not anvio.DEBUG:
            commander.clean_tmp_dirs()
            for v in list(target_files_dict.values()):
                os.remove(v)
Example #7
0
    def process(self):
        """Annotate the gene calls of a contigs database with Pfam hits.

        The amino acid sequences of the genes are exported to a FASTA file in
        a fresh temporary directory, searched with hmmscan against
        `Pfam-A.hmm.gz` (looked up under `self.pfam_data_dir`, with `--cut_ga`
        passed as the last `run_hmmscan` argument), and every parsed hit is
        written to the contigs database as a 'Pfam' function call.

        The temporary directories are kept when `anvio.DEBUG` is set and
        removed otherwise.
        """
        pfam_hmm_path = os.path.join(self.pfam_data_dir, 'Pfam-A.hmm.gz')

        # bare namespace-like object that carries the contigs database path
        # into `ContigsSuperclass`
        class Args:
            pass

        db_args = Args()
        db_args.contigs_db = self.contigs_db_path
        contigs = dbops.ContigsSuperclass(db_args)
        work_dir = filesnpaths.get_temp_directory_path()

        # dump the amino acid sequences of all genes into the temp directory
        aa_fasta_path = os.path.join(work_dir, 'AA_gene_sequences.fa')
        fasta_by_target = {'AA:GENE': aa_fasta_path}
        contigs.gen_FASTA_file_of_sequences_for_gene_caller_ids(
            output_file_path=aa_fasta_path,
            simple_headers=True,
            rna_alphabet=False,
            report_aa_sequences=True)

        # search the exported sequences against the Pfam models
        scanner = HMMer(fasta_by_target, num_threads_to_use=self.num_threads)
        num_models = len(self.function_catalog)
        hits_path = scanner.run_hmmscan('Pfam', 'AA', 'GENE', None, None,
                                        num_models, pfam_hmm_path, None,
                                        '--cut_ga')

        # turn the hmmscan output into a dictionary of hits
        hmmscan_parser = parser_modules['search']['hmmscan']
        hits = hmmscan_parser(hits_path, alphabet='AA',
                              context='GENE').get_search_results()

        # one function entry per hit, keyed by a running integer
        annotations = {
            entry_id: {
                'gene_callers_id': hit['gene_callers_id'],
                'source': 'Pfam',
                'accession': hit['gene_hmm_id'],
                'function': self.get_function_from_catalog(hit['gene_hmm_id']),
                'e_value': hit['e_value'],
            }
            for entry_id, hit in enumerate(hits.values())
        }

        functions_table = TableForGeneFunctions(self.contigs_db_path,
                                                self.run, self.progress)
        functions_table.create(annotations)

        if anvio.DEBUG:
            kept_paths = (work_dir, ', '.join(scanner.tmp_dirs))
            run.warning(
                "The temp directories, '%s' and '%s' are kept. Please don't forget to clean those up\
                         later" % kept_paths,
                header="Debug")
            return

        run.info_single(
            'Cleaning up the temp directory (you can use `--debug` if you would\
                             like to keep it for testing purposes)',
            nl_before=1,
            nl_after=1)
        shutil.rmtree(work_dir)
        scanner.clean_tmp_dirs()
Example #8
0
    def populate_search_tables(self, sources=None):
        """Run every HMM source against the contigs database and store the hits.

        For each distinct target among the sources, the matching sequences are
        exported to a FASTA file in a temporary directory. Each source is then
        searched with hmmscan, its output is parsed, and the results are handed
        to `self.append`. Hits of sources that run in the CONTIG context are
        first pruned and turned into new gene calls.

        `sources` maps a source name to a dictionary that provides the keys
        'target', 'kind', 'domain', 'genes', 'model', 'ref' and
        'noise_cutoff_terms'. When it is empty or None, `anvio.data.hmm.sources`
        is used instead; if that is empty as well the method returns without
        doing anything.

        Raises `ConfigError` if a source asks for amino acid sequences in the
        CONTIG context.
        """
        # The conditional `import anvio.data.hmm` below binds `anvio` as a name
        # local to this whole function. Whenever the caller provided `sources`
        # that import never ran, and the `anvio.DEBUG` lookup at the end failed
        # because the local name was unbound. Importing the top-level package
        # unconditionally, up front, keeps the name bound on every path.
        import anvio

        if not sources:
            import anvio.data.hmm
            sources = anvio.data.hmm.sources

        if not sources:
            return

        target_files_dict = {}

        tmp_directory_path = filesnpaths.get_temp_directory_path()

        # here we will go through targets and populate target_files_dict based on what we find among them.
        targets = {s['target'] for s in sources.values()}
        for target in targets:

            alphabet, context = utils.anvio_hmm_target_term_to_alphabet_and_context(target)

            self.run.info('Target found', '%s:%s' % (alphabet, context))

            class Args: pass
            args = Args()
            args.contigs_db = self.db_path
            contigs_db = ContigsSuperclass(args, r=terminal.Run(verbose=False))

            if context == 'GENE':
                target_files_dict['%s:GENE' % alphabet] = os.path.join(tmp_directory_path, '%s_gene_sequences.fa' % alphabet)
                contigs_db.gen_FASTA_file_of_sequences_for_gene_caller_ids(output_file_path=target_files_dict['%s:GENE' % alphabet],
                                                                           simple_headers=True,
                                                                           rna_alphabet=alphabet == 'RNA',
                                                                           report_aa_sequences=alphabet == 'AA')
            elif context == 'CONTIG':
                if alphabet == 'AA':
                    raise ConfigError("You are somewhere you shouldn't be. You came here because you thought it would be OK\
                                       to ask for AA sequences in the CONTIG context. The answer to that is 'no, thanks'. If\
                                       you think this is dumb, please let us know.")

                target_files_dict['%s:CONTIG' % alphabet] = os.path.join(tmp_directory_path, '%s_contig_sequences.fa' % alphabet)
                utils.export_sequences_from_contigs_db(self.db_path,
                                                       target_files_dict['%s:CONTIG' % alphabet],
                                                       rna_alphabet=alphabet == 'RNA')

        commander = HMMer(target_files_dict, num_threads_to_use=self.num_threads_to_use)

        for source in sources:
            alphabet, context = utils.anvio_hmm_target_term_to_alphabet_and_context(sources[source]['target'])

            kind_of_search = sources[source]['kind']
            domain = sources[source]['domain']
            all_genes_searched_against = sources[source]['genes']
            hmm_model = sources[source]['model']
            reference = sources[source]['ref']
            noise_cutoff_terms = sources[source]['noise_cutoff_terms']

            hmm_scan_hits_txt = commander.run_hmmscan(source,
                                                      alphabet,
                                                      context,
                                                      kind_of_search,
                                                      domain,
                                                      len(all_genes_searched_against),
                                                      hmm_model,
                                                      reference,
                                                      noise_cutoff_terms)

            # a falsy return value from `run_hmmscan` is treated as "no hits"
            if not hmm_scan_hits_txt:
                search_results_dict = {}
            else:
                parser = parser_modules['search']['hmmscan'](hmm_scan_hits_txt, alphabet=alphabet, context=context)
                search_results_dict = parser.get_search_results()

            if not search_results_dict:
                # NOTE(review): this is the only message in this method that goes
                # through the module-level `run` rather than `self.run` -- confirm
                # whether that is intentional.
                run.info_single("The HMM source '%s' returned 0 hits. SAD (but it's stil OK)." % source, nl_before=1)

            if context == 'CONTIG':
                # we are in trouble here, because our search results dictionary contains no gene calls, but
                # the names of the contigs that contain our hits. on the other hand, the rest of the code
                # outside of this if statement expects a `search_results_dict` with gene callers ids in it.
                # so there are two things we need to do. one is to come up with some new gene calls and add
                # them to the contigs database, so things will go smoothly downstream. two, we need to update
                # our `search_results_dict` so it looks like the dictionary the rest of the code expects, with
                # `gene_callers_id` fields. both of these steps are taken care of by
                # `add_new_gene_calls_to_contigs_db_and_update_serach_results_dict` below.

                if source != "Ribosomal_RNAs":
                    self.run.warning("You just called an HMM profile that runs on contigs and not genes. Because this HMM\
                                      operation is not directly working with gene calls anvi'o already knows about, the resulting\
                                      hits will need to be added as 'new gene calls' into the contigs database. So far so good.\
                                      But blecause we are in the contigs realm rater than genes realm, it is likely that\
                                      resulting hits will not correspond to open reading frames that are supposed to be\
                                      translated (such as ribosomal RNAs), because otherwise you would be working with genes\
                                      instad of defining CONTIGS as your context in that HMM profile you just used unless you\
                                      not sure what you are doing. Hence, anvi'o will not report amino acid sequences for the\
                                      new gene calls it will recover through these HMMs. Please take a moment and you be the\
                                      judge of whether this will influence your pangenomic analyses or other things you thought\
                                      you would be doing with the result of this HMM search downstream. If you do not feel like\
                                      being the judge of anything today you can move on yet remember to remember this if things\
                                      look somewhat weird later on.",
                                     header="Psst. Your fancy HMM profile '%s' speaking" % source,
                                     lc="green")

                num_hits_before = len(search_results_dict)
                search_results_dict = utils.get_pruned_HMM_hits_dict(search_results_dict)
                num_hits_after = len(search_results_dict)

                if num_hits_before != num_hits_after:
                    self.run.info('Pruned', '%d out of %d hits were removed due to redundancy' % (num_hits_before - num_hits_after, num_hits_before))

                search_results_dict = self.add_new_gene_calls_to_contigs_db_and_update_serach_results_dict(kind_of_search,
                                                                                                           search_results_dict,
                                                                                                           skip_amino_acid_sequences=True)

            self.append(source, reference, kind_of_search, domain, all_genes_searched_against, search_results_dict)

        # outside of debug mode, remove the HMMer temp directories and the
        # FASTA files exported above
        if not anvio.DEBUG:
            commander.clean_tmp_dirs()
            for fasta_path in target_files_dict.values():
                os.remove(fasta_path)