Example #1
0
    def populate_search_tables(self, sources={}):
        # make sure the output file is OK to write.
        filesnpaths.is_output_file_writable(self.db_path, ok_if_exists=True)

        # if we end up generating a temporary file for amino acid sequences:
        if not len(sources):
            import anvio.data.hmm
            sources = anvio.data.hmm.sources

        if not sources:
            return

        self.check_sources(sources)

        target_files_dict = {}

        tmp_directory_path = filesnpaths.get_temp_directory_path()

        hmmpressed_files = self.hmmpress_sources(sources, tmp_directory_path)

        self.run.info("Contigs DB", self.db_path)
        self.run.info("HMM sources", ', '.join(sources.keys()))

        # here we will go through targets and populate target_files_dict based on what we find among them.
        targets = set([s['target'] for s in list(sources.values())])
        have_hmm_sources_with_non_RNA_contig_context = False
        for target in targets:
            alphabet, context = utils.anvio_hmm_target_term_to_alphabet_and_context(
                target)

            if not self.genes_are_called and context != "CONTIG":
                raise ConfigError(
                    "You are in trouble. The gene calling was skipped for this contigs database, yet anvi'o asked to run an "
                    "HMM profile that wishes to operate on %s context using the %s alphabet. It is not OK. You still could run "
                    "HMM profiles that does not require gene calls to be present (such as the HMM profile that identifies Ribosomal "
                    "RNAs in contigs, but for that you would have to explicitly ask for it by using the additional parameter "
                    "'--installed-hmm-profile PROFILE_NAME_HERE')." %
                    (context, alphabet))

            self.run.info('Alphabet/context target found',
                          '%s:%s' % (alphabet, context))

            if context == 'CONTIG' and alphabet != 'RNA':
                have_hmm_sources_with_non_RNA_contig_context = True

            class Args:
                pass

            args = Args()
            args.contigs_db = self.db_path
            contigs_db = ContigsSuperclass(args, r=terminal.Run(verbose=False))

            if context == 'GENE':
                target_files_dict['%s:GENE' % alphabet] = os.path.join(
                    tmp_directory_path, '%s_gene_sequences.fa' % alphabet)
                contigs_db.get_sequences_for_gene_callers_ids(
                    output_file_path=target_files_dict['%s:GENE' % alphabet],
                    simple_headers=True,
                    rna_alphabet=True if alphabet == 'RNA' else False,
                    report_aa_sequences=True if alphabet == 'AA' else False)
            elif context == 'CONTIG':
                if alphabet == 'AA':
                    raise ConfigError(
                        "You are somewhere you shouldn't be. You came here because you thought it would be OK "
                        "to ask for AA sequences in the CONTIG context. The answer to that is 'no, thanks'. If "
                        "you think this is dumb, please let us know.")
                else:
                    target_files_dict['%s:CONTIG' % alphabet] = os.path.join(
                        tmp_directory_path,
                        '%s_contig_sequences.fa' % alphabet)
                    utils.export_sequences_from_contigs_db(
                        self.db_path,
                        target_files_dict['%s:CONTIG' % alphabet],
                        rna_alphabet=True if alphabet == 'RNA' else False)

        if have_hmm_sources_with_non_RNA_contig_context:
            # in that case, we should remind people what's up.
            self.run.warning(
                "The HMM profiles that are about to be run includes at least one HMM profile that runs on "
                "contigs and not genes. Thus, this HMM operation will not be working with gene calls anvi'o "
                "already knows about. Which means, the resulting hits will need to be added as 'new gene calls' "
                "into the contigs database. So far so good. But because we are in the realm of contigs rather "
                "than genes, the resulting HMM hits will unlikely correspond to open reading frames that are "
                "supposed to be translated (such as ribosomal RNAs). While anvi'o adds new gene calls to your "
                "contigs database for these hits, it will NOT report amino acid sequences for the "
                "new gene calls that will emerge from these HMMs, expecting you to judge whether this will "
                "influence your pangenomic analyses or other things you thought you would be doing with the "
                "result of this HMM search downstream. If you do not feel like being the judge of anything today "
                "you can move on yet remember to remember this if things look somewhat weird later on.",
                header="THE MORE YOU KNOW 🌈",
                lc="green")

        commander = HMMer(target_files_dict,
                          num_threads_to_use=self.num_threads_to_use,
                          program_to_use=self.hmm_program)

        for source in sources:
            alphabet, context = utils.anvio_hmm_target_term_to_alphabet_and_context(
                sources[source]['target'])

            if alphabet in ['DNA', 'RNA'
                            ] and 'domtable' in self.hmmer_desired_output:
                raise ConfigError(
                    "Domain table output was requested (probably with the --get-domtable-output flag, "
                    "does that look familiar?) but unfortunately this option is incompatible with the "
                    f"current source of HMM profiles, {source}, because this source uses a nucleotide "
                    "alphabet.")

            kind_of_search = sources[source]['kind']
            domain = sources[source]['domain']
            all_genes_searched_against = sources[source]['genes']
            hmm_model = hmmpressed_files[source]
            reference = sources[source]['ref']
            noise_cutoff_terms = sources[source]['noise_cutoff_terms']

            hmmer_output = commander.run_hmmer(
                source,
                alphabet,
                context,
                kind_of_search,
                domain,
                len(all_genes_searched_against),
                hmm_model,
                reference,
                noise_cutoff_terms,
                desired_output=self.hmmer_desired_output,
                hmmer_output_dir=self.hmmer_output_dir)

            if self.hmmer_output_dir:
                self.run.info("HMMER output directory", self.hmmer_output_dir)

            if not isinstance(hmmer_output, tuple):
                hmm_scan_hits_txt = hmmer_output
            else:
                hmm_scan_hits_txt, domain_hits_txt = hmmer_output
                self.run.info("Domain table output", domain_hits_txt)

            if not hmm_scan_hits_txt:
                search_results_dict = {}
            else:
                try:
                    parser = parser_modules['search']['hmmer_table_output'](
                        hmm_scan_hits_txt,
                        alphabet=alphabet,
                        context=context,
                        program=self.hmm_program)
                except StupidHMMError as e:
                    raise ConfigError(
                        f"Unfortunately something went wrong while anvi'o was trying to parse some HMM output for your data. "
                        f"This error is typically due to contig names that are long and variable in length, which that "
                        f"confuses HMMER and so it generates output tables that are simply unparseable. Anvi'o does its best, "
                        f"but occasionally fails, which leads to this error. If you are curious why is this happening, you can take a "
                        f"look at this issue where this issue is described: https://github.com/merenlab/anvio/issues/1564. "
                        f"Solution to this is relatively easy: use `anvi-script-reformat-fasta` with `--simplify-names` flag "
                        f"BEFORE generating your contigs database as we advice you to. Sorry you came all this way just to "
                        f"find out about this :/ Here is the origial error message anvi'o produced from the code beneath: {e}."
                    )

                search_results_dict = parser.get_search_results()

            if not len(search_results_dict):
                run.info_single(
                    "The HMM source '%s' returned 0 hits. SAD (but it's stil OK)."
                    % source,
                    nl_before=1)

            if context == 'CONTIG':
                # we are in trouble here. because our search results dictionary contains no gene calls, but contig
                # names contain our hits. on the other hand, the rest of the code outside of this if statement
                # expects a `search_results_dict` with gene caller ids in it. so there are two things we need to do.
                # one is to come up with some new gene calls and add them to the contigs database. so things
                # will go smoothly downstream. two, we will need to update our `search_results_dict` so it looks
                # like a a dictionary the rest of the code expects with `gene_callers_id` fields. both of these
                # steps are going to be taken care of in the following function. magic.
                num_hits_before = len(search_results_dict)
                search_results_dict = utils.get_pruned_HMM_hits_dict(
                    search_results_dict)
                num_hits_after = len(search_results_dict)

                if num_hits_before != num_hits_after:
                    self.run.info(
                        'Pruned',
                        '%d out of %d hits were removed due to redundancy' %
                        (num_hits_before - num_hits_after, num_hits_before))

                search_results_dict = self.add_new_gene_calls_to_contigs_db_and_update_serach_results_dict(
                    kind_of_search,
                    search_results_dict,
                    skip_amino_acid_sequences=True)

            self.append(source, reference, kind_of_search, domain,
                        all_genes_searched_against, search_results_dict)

        # FIXME: I have no clue why importing the anvio module is necessary at this point,
        #        but without this, mini test fails becasue "`anvio.DEBUG` is being used
        #        before initialization". nonsense.
        import anvio
        if not anvio.DEBUG:
            commander.clean_tmp_dirs()
            for v in list(target_files_dict.values()):
                os.remove(v)

            shutil.rmtree(tmp_directory_path)
Example #2
0
    def populate_search_tables(self, sources={}):
        # if we end up generating a temporary file for amino acid sequences:
        if not len(sources):
            import anvio.data.hmm
            sources = anvio.data.hmm.sources

        if not sources:
            return

        target_files_dict = {}

        tmp_directory_path = filesnpaths.get_temp_directory_path()

        # here we will go through targets and populate target_files_dict based on what we find among them.
        targets = set([s['target'] for s in list(sources.values())])
        for target in targets:
            alphabet, context = utils.anvio_hmm_target_term_to_alphabet_and_context(target)

            if not self.genes_are_called and context != "CONTIG":
                raise ConfigError("You are in trouble. The gene calling was skipped for this contigs database, yet anvi'o asked to run an\
                                   HMM profile that wishes to operate on %s context using the %s alphabet. It is not OK. You still could run\
                                   HMM profiles that does not require gene calls to be present (such as the HMM profile that identifies Ribosomal\
                                   RNAs in contigs, but for that you would have to explicitly ask for it by using the additional parameter\
                                   '--installed-hmm-profile Ribosomal_RNAs')." % (context, alphabet))

            self.run.info('Target found', '%s:%s' % (alphabet, context))

            class Args: pass
            args = Args()
            args.contigs_db = self.db_path
            contigs_db = ContigsSuperclass(args, r=terminal.Run(verbose=False))

            if context == 'GENE':
                target_files_dict['%s:GENE' % alphabet] = os.path.join(tmp_directory_path, '%s_gene_sequences.fa' % alphabet)
                contigs_db.gen_FASTA_file_of_sequences_for_gene_caller_ids(output_file_path=target_files_dict['%s:GENE' % alphabet],
                                                                           simple_headers=True,
                                                                           rna_alphabet=True if alphabet=='RNA' else False,
                                                                           report_aa_sequences=True if alphabet=='AA' else False)
            elif context == 'CONTIG':
                if alphabet == 'AA':
                    raise ConfigError("You are somewhere you shouldn't be. You came here because you thought it would be OK\
                                       to ask for AA sequences in the CONTIG context. The answer to that is 'no, thanks'. If\
                                       you think this is dumb, please let us know.")
                else:
                    target_files_dict['%s:CONTIG' % alphabet] = os.path.join(tmp_directory_path, '%s_contig_sequences.fa' % alphabet)
                    utils.export_sequences_from_contigs_db(self.db_path,
                                                           target_files_dict['%s:CONTIG' % alphabet],
                                                           rna_alphabet=True if alphabet=='RNA' else False)

        commander = HMMer(target_files_dict, num_threads_to_use=self.num_threads_to_use)

        for source in sources:
            alphabet, context = utils.anvio_hmm_target_term_to_alphabet_and_context(sources[source]['target'])

            kind_of_search = sources[source]['kind']
            domain = sources[source]['domain']
            all_genes_searched_against = sources[source]['genes']
            hmm_model = sources[source]['model']
            reference = sources[source]['ref']
            noise_cutoff_terms = sources[source]['noise_cutoff_terms']

            hmm_scan_hits_txt = commander.run_hmmscan(source,
                                                      alphabet,
                                                      context,
                                                      kind_of_search,
                                                      domain,
                                                      len(all_genes_searched_against),
                                                      hmm_model,
                                                      reference,
                                                      noise_cutoff_terms)

            if not hmm_scan_hits_txt:
                search_results_dict = {}
            else:
                parser = parser_modules['search']['hmmscan'](hmm_scan_hits_txt, alphabet=alphabet, context=context)
                search_results_dict = parser.get_search_results()

            if not len(search_results_dict):
                run.info_single("The HMM source '%s' returned 0 hits. SAD (but it's stil OK)." % source, nl_before=1)


            if context == 'CONTIG':
                # we are in trouble here. because our search results dictionary contains no gene calls, but contig
                # names that contain our hits. on the other hand, the rest of the code outside of this if statement
                # expects a `search_results_dict` with gene callers id in it. so there are two things we need to do
                # to do. one is to come up with some new gene calls and add them to the contigs database. so things
                # will go smoothly downstream. two, we will need to update our `search_results_dict` so it looks
                # like a a dictionary the rest of the code expects with `gene_callers_id` fields. both of these
                # steps are going to be taken care of in the following function. magic.

                if source != "Ribosomal_RNAs":
                    self.run.warning("You just called an HMM profile that runs on contigs and not genes. Because this HMM\
                                      operation is not directly working with gene calls anvi'o already knows about, the resulting\
                                      hits will need to be added as 'new gene calls' into the contigs database. So far so good.\
                                      But because we are in the contigs realm rater than genes realm, it is likely that\
                                      resulting hits will not correspond to open reading frames that are supposed to be\
                                      translated (such as ribosomal RNAs), because otherwise you would be working with genes\
                                      instad of defining CONTIGS as your context in that HMM profile you just used unless you\
                                      not sure what you are doing. Hence, anvi'o will not report amino acid sequences for the\
                                      new gene calls it will recover through these HMMs. Please take a moment and you be the\
                                      judge of whether this will influence your pangenomic analyses or other things you thought\
                                      you would be doing with the result of this HMM search downstream. If you do not feel like\
                                      being the judge of anything today you can move on yet remember to remember this if things\
                                      look somewhat weird later on.",
                                     header="Psst. Your fancy HMM profile '%s' speaking" % source,
                                     lc="green")

                num_hits_before = len(search_results_dict)
                search_results_dict = utils.get_pruned_HMM_hits_dict(search_results_dict)
                num_hits_after = len(search_results_dict)

                if num_hits_before != num_hits_after:
                    self.run.info('Pruned', '%d out of %d hits were removed due to redundancy' % (num_hits_before - num_hits_after, num_hits_before))

                search_results_dict = self.add_new_gene_calls_to_contigs_db_and_update_serach_results_dict(kind_of_search,
                                                                                                           search_results_dict,
                                                                                                           skip_amino_acid_sequences=True)

            self.append(source, reference, kind_of_search, domain, all_genes_searched_against, search_results_dict)

        # FIXME: I have no clue why importing the anvio module is necessary at this point,
        #        but without this, mini test fails becasue "`anvio.DEBUG` is being used
        #        before initialization". nonsense.
        import anvio
        if not anvio.DEBUG:
            commander.clean_tmp_dirs()
            for v in list(target_files_dict.values()):
                os.remove(v)
Example #3
0
    def populate_search_tables(self, sources={}):
        # if we end up generating a temporary file for amino acid sequences:
        if not len(sources):
            import anvio.data.hmm
            sources = anvio.data.hmm.sources

        if not sources:
            return

        target_files_dict = {}

        tmp_directory_path = filesnpaths.get_temp_directory_path()

        # here we will go through targets and populate target_files_dict based on what we find among them.
        targets = set([s['target'] for s in list(sources.values())])
        for target in targets:

            alphabet, context = utils.anvio_hmm_target_term_to_alphabet_and_context(
                target)

            self.run.info('Target found', '%s:%s' % (alphabet, context))

            class Args:
                pass

            args = Args()
            args.contigs_db = self.db_path
            contigs_db = ContigsSuperclass(args)

            if context == 'GENE':
                target_files_dict['%s:GENE' % alphabet] = os.path.join(
                    tmp_directory_path, '%s_gene_sequences.fa' % alphabet)
                contigs_db.gen_FASTA_file_of_sequences_for_gene_caller_ids(
                    output_file_path=target_files_dict['%s:GENE' % alphabet],
                    simple_headers=True,
                    rna_alphabet=True if alphabet == 'RNA' else False,
                    report_aa_sequences=True if alphabet == 'AA' else False)
            elif context == 'CONTIG':
                if alphabet == 'AA':
                    raise ConfigError(
                        "You are somewhere you shouldn't be. You came here because you thought it would be OK\
                                       to ask for AA sequences in the CONTIG context. The answer to that is 'no, thanks'. If\
                                       you think this is dumb, please let us know."
                    )
                else:
                    target_files_dict['%s:CONTIG' % alphabet] = os.path.join(
                        tmp_directory_path,
                        '%s_contig_sequences.fa' % alphabet)
                    utils.export_sequences_from_contigs_db(
                        self.db_path,
                        target_files_dict['%s:CONTIG' % alphabet],
                        rna_alphabet=True if alphabet == 'RNA' else False)

        commander = HMMer(target_files_dict,
                          num_threads_to_use=self.num_threads_to_use)

        for source in sources:
            alphabet, context = utils.anvio_hmm_target_term_to_alphabet_and_context(
                sources[source]['target'])

            kind_of_search = sources[source]['kind']
            domain = sources[source]['domain']
            all_genes_searched_against = sources[source]['genes']
            hmm_model = sources[source]['model']
            reference = sources[source]['ref']
            noise_cutoff_terms = sources[source]['noise_cutoff_terms']

            hmm_scan_hits_txt = commander.run_hmmscan(
                source, alphabet, context, kind_of_search, domain,
                all_genes_searched_against, hmm_model, reference,
                noise_cutoff_terms)

            if not hmm_scan_hits_txt:
                search_results_dict = {}
            else:
                parser = parser_modules['search']['hmmscan'](hmm_scan_hits_txt,
                                                             alphabet=alphabet,
                                                             context=context)
                search_results_dict = parser.get_search_results()

            if not len(search_results_dict):
                run.info_single(
                    "The HMM source '%s' returned 0 hits. SAD (but it's stil OK)."
                    % source,
                    nl_before=1)

            if context == 'CONTIG':
                # we are in trouble here. because our search results dictionary contains no gene calls, but contig
                # names that contain our hits. on the other hand, the rest of the code outside of this if statement
                # expects a `search_results_dict` with gene callers id in it. so there are two things we need to do
                # to do. one is to come up with some new gene calls and add them to the contigs database. so things
                # will go smoothly downstream. two, we will need to update our `search_results_dict` so it looks
                # like a a dictionary the rest of the code expects with `gene_callers_id` fields. both of these
                # steps are going to be taken care of in the following function. magic.

                self.run.warning(
                    "Alright! You just called an HMM profile that runs on contigs. Because it is not\
                                 working with anvi'o gene calls directly, the resulting hits will need to be added\
                                 as 'new gene calls' into the contigs database. This is a new feature, and if it\
                                 starts screwing things up for you please let us know. Other than that you're pretty\
                                 much golden. Carry on.",
                    header="Psst. Your fancy HMM profile '%s' speaking" %
                    source,
                    lc="green")

                num_hits_before = len(search_results_dict)
                search_results_dict = utils.get_pruned_HMM_hits_dict(
                    search_results_dict)
                num_hits_after = len(search_results_dict)

                if num_hits_before != num_hits_after:
                    self.run.info(
                        'Pruned',
                        '%d out of %d hits were removed due to redundancy' %
                        (num_hits_before - num_hits_after, num_hits_before))

                search_results_dict = self.add_new_gene_calls_to_contigs_db_and_update_serach_results_dict(
                    kind_of_search, search_results_dict)

            self.append(source, reference, kind_of_search, domain,
                        all_genes_searched_against, search_results_dict)

        # FIXME: I have no clue why importing the anvio module is necessary at this point,
        #        but without this, mini test fails becasue "`anvio.DEBUG` is being used
        #        before initialization". nonsense.
        import anvio
        if not anvio.DEBUG:
            commander.clean_tmp_dirs()
            for v in list(target_files_dict.values()):
                os.remove(v)
Example #4
0
    def populate_search_tables(self, contigs_db_path):
        utils.is_contigs_db(contigs_db_path)

        info_table = hmmops.SequencesForHMMHits(contigs_db_path).hmm_hits_info

        if self.source_name in info_table:
            if self.just_do_it:
                TablesForHMMHits(contigs_db_path,
                                 run=self.run,
                                 progress=self.progress).remove_source(
                                     self.source_name)
            else:
                raise ConfigError(
                    "There is already information for %s in the database :/ Anvi'o will not overwrite this "
                    "unless you ask for it explicitly. You can either use `anvi-delete-hmms` to remove it first, "
                    "or run `anvi-scan-trnas` with `--just-do-it` flag so anvi'o would remove it for you."
                    % (self.source_name))

        filesnpaths.is_output_file_writable(contigs_db_path, ok_if_exists=True)

        contig_sequences_fasta_path = os.path.join(self.tmp_directory_path,
                                                   'contig_sequences.fa')

        utils.export_sequences_from_contigs_db(contigs_db_path,
                                               contig_sequences_fasta_path)

        search_results_dict = self.run_trnascan_on_FASTA(
            fasta_file_path=contig_sequences_fasta_path)

        # At this point we need to turn this search_results_dict into one that matches how it is used
        # in HMM operations. Here is an entry from tRNA results dict:
        #
        # {1: {'contig': 'Bfragilis_0100_000000000001',
        #      'trna_no': '1',
        #      'start': 135361,
        #      'stop': 135433,
        #      'amino_acid': 'Thr',
        #      'anticodon': 'CGT',
        #      'score': 67.6}}
        #
        # and here is one exmple from the rRNA HMMs results dict:
        #
        # {1: {'entry_id': 0,
        #      'gene_name': 'Bacterial_23S_rRNA',
        #      'gene_hmm_id': '-',
        #      'contig_name': 'Bfragilis_0100_000000000001',
        #      'start': 1110877,
        #      'stop': 1113757,
        #      'e_value': 0.0}}
        #
        # so we will have to make the former look like the latter. I have the feeling that the
        # score / e_value will cause issues later :(

        missing_amino_acids = Counter()
        missing_anticodons = Counter()
        entries_to_remove = set([])
        for entry_id in search_results_dict:
            entry = search_results_dict[entry_id]

            aa, anticodon = entry['amino_acid'], entry['anticodon']

            if anticodon not in self.anticodons:
                missing_anticodons[anticodon] += 1
                entries_to_remove.add(entry_id)
                continue

            if aa not in self.amino_acids:
                missing_amino_acids[aa] += 1
                entries_to_remove.add(entry_id)
                continue

            aa_codon = '%s_%s' % (aa, anticodon)

            entry['gene_name'] = aa_codon
            entry['e_value'] = entry['score']
            entry['gene_hmm_id'] = '-'
            if entry['stop'] > entry['start']:
                # so we are forward
                entry['start'] = entry[
                    'start'] - 1  # setting the pythonic start.
            else:
                # so this one is reverse
                entry['stop'] = entry['stop'] - 1

            # just to double check for surprises (see https://github.com/merenlab/anvio/issues/1367 for details)
            for pos in ['start', 'stop']:
                if entry[pos] < 0:
                    entry[pos] = 0

        for entry_id in entries_to_remove:
            search_results_dict.pop(entry_id)

        self.run.info("Num tRNA genes recovered", len(search_results_dict))

        if len(missing_anticodons):
            info_line = ', '.join([
                '%s (%d)' % (anticodon, missing_anticodons[anticodon])
                for anticodon in missing_anticodons
            ])
            self.run.warning(
                "While anvi'o was trying to parse the output from tRNAScan-SE, it "
                "became clear that some of the codons the tool identified was not "
                "known to anvi'o, so we conservatively discareded those entries. "
                "Here is the list of codons that were discareded and their frequency "
                "among your contigs: '%s'." % (info_line),
                header="WEIRD CODONS ALERT")

        if len(missing_amino_acids):
            info_line = ', '.join([
                '%s (%d)' % (amino_acid, missing_amino_acids[amino_acid])
                for amino_acid in missing_amino_acids
            ])
            self.run.warning(
                "While anvi'o was trying to parse the output from tRNAScan-SE, it "
                "run into some amino acid names that were not known to anvi'o. "
                "All those entries are now gone :/ But here is the list of amino "
                "acids and their frequencies: '%s'." % (info_line),
                header="WEIRD AMINO ACIDS ALERT")

        search_results_dict = utils.get_pruned_HMM_hits_dict(
            search_results_dict)

        tables_for_hmm_hits = TablesForHMMHits(contigs_db_path,
                                               run=self.run,
                                               progress=self.progress)
        search_results_dict = tables_for_hmm_hits.add_new_gene_calls_to_contigs_db_and_update_serach_results_dict(
            self.kind_of_search,
            search_results_dict,
            skip_amino_acid_sequences=True)
        tables_for_hmm_hits.append_to_hmm_hits_table(
            self.source_name, self.reference, self.kind_of_search, self.domain,
            self.all_genes_searched_against, search_results_dict)

        # when the code comes all the way here, the entries in the search results dict already look like
        # this, so we have a gene callers id for the newly generate genes for tRNAs. we will use it
        # to populate a functions dict and submit it to the contigs database as well:
        #
        #     {'contig_name': 'Bfragilis_0100_000000000001',
        #      'trna_no': '1',
        #      'start': 135361,
        #      'stop': 135433,
        #      'amino_acid': 'Thr',
        #      'anticodon': 'CGT',
        #      'score': 67.6,
        #      'gene_name': 'Thr_ACG',
        #      'e_value': 67.6,
        #      'gene_hmm_id': '-',
        #      'gene_callers_id': 4502}
        #
        functions_dict = {}
        for entry_id in search_results_dict:
            entry = search_results_dict[entry_id]

            function_text = 'tRNA gene for amino acid %s (anticodon:%s; score:%.1f; intron_start:%d; intron_end:%d)' \
                                            % (entry['amino_acid'], entry['anticodon'], entry['score'], entry['intron_start'], entry['intron_end'])

            functions_dict[entry_id] = {
                'gene_callers_id':
                entry['gene_callers_id'],
                'source':
                self.source_name,
                'accession':
                '%s_%s_%d' % (entry['amino_acid'], entry['anticodon'],
                              entry['gene_callers_id']),
                'function':
                function_text,
                'e_value':
                0.0
            }

        gene_function_calls_table = TableForGeneFunctions(
            contigs_db_path,
            run=terminal.Run(verbose=False),
            progress=terminal.Progress(verbose=False))
        gene_function_calls_table.create(functions_dict)

        if not anvio.DEBUG:
            self.clean_tmp_directory()
            self.run.info_single(
                "Temp directory is now cleaned (if you would like to keep it the "
                "next time use the flag `--debug`).",
                nl_before=1)
        else:
            self.run.info_single(
                "Due to the `--debug` flag, anvi'o did not remove the temoporary files "
                "directory (which is still at '%s')." %
                (self.tmp_directory_path),
                nl_before=1)
Example #5
0
    def populate_search_tables(self, sources={}):
        # if we end up generating a temporary file for amino acid sequences:
        if not len(sources):
            import anvio.data.hmm
            sources = anvio.data.hmm.sources

        if not sources:
            return

        target_files_dict = {}

        tmp_directory_path = filesnpaths.get_temp_directory_path()

        # here we will go through targets and populate target_files_dict based on what we find among them.
        targets = set([s['target'] for s in list(sources.values())])
        for target in targets:

            alphabet, context = utils.anvio_hmm_target_term_to_alphabet_and_context(target)

            self.run.info('Target found', '%s:%s' % (alphabet, context))

            class Args: pass
            args = Args()
            args.contigs_db = self.db_path
            contigs_db = ContigsSuperclass(args, r=terminal.Run(verbose=False))

            if context == 'GENE':
                target_files_dict['%s:GENE' % alphabet] = os.path.join(tmp_directory_path, '%s_gene_sequences.fa' % alphabet)
                contigs_db.gen_FASTA_file_of_sequences_for_gene_caller_ids(output_file_path=target_files_dict['%s:GENE' % alphabet],
                                                                           simple_headers=True,
                                                                           rna_alphabet=True if alphabet=='RNA' else False,
                                                                           report_aa_sequences=True if alphabet=='AA' else False)
            elif context == 'CONTIG':
                if alphabet == 'AA':
                    raise ConfigError("You are somewhere you shouldn't be. You came here because you thought it would be OK\
                                       to ask for AA sequences in the CONTIG context. The answer to that is 'no, thanks'. If\
                                       you think this is dumb, please let us know.")
                else:
                    target_files_dict['%s:CONTIG' % alphabet] = os.path.join(tmp_directory_path, '%s_contig_sequences.fa' % alphabet)
                    utils.export_sequences_from_contigs_db(self.db_path,
                                                           target_files_dict['%s:CONTIG' % alphabet],
                                                           rna_alphabet=True if alphabet=='RNA' else False)

        commander = HMMer(target_files_dict, num_threads_to_use=self.num_threads_to_use)

        for source in sources:
            alphabet, context = utils.anvio_hmm_target_term_to_alphabet_and_context(sources[source]['target'])

            kind_of_search = sources[source]['kind']
            domain = sources[source]['domain']
            all_genes_searched_against = sources[source]['genes']
            hmm_model = sources[source]['model']
            reference = sources[source]['ref']
            noise_cutoff_terms = sources[source]['noise_cutoff_terms']

            hmm_scan_hits_txt = commander.run_hmmscan(source,
                                                      alphabet,
                                                      context,
                                                      kind_of_search,
                                                      domain,
                                                      len(all_genes_searched_against),
                                                      hmm_model,
                                                      reference,
                                                      noise_cutoff_terms)

            if not hmm_scan_hits_txt:
                search_results_dict = {}
            else:
                parser = parser_modules['search']['hmmscan'](hmm_scan_hits_txt, alphabet=alphabet, context=context)
                search_results_dict = parser.get_search_results()

            if not len(search_results_dict):
                run.info_single("The HMM source '%s' returned 0 hits. SAD (but it's stil OK)." % source, nl_before=1)


            if context == 'CONTIG':
                # we are in trouble here. because our search results dictionary contains no gene calls, but contig
                # names that contain our hits. on the other hand, the rest of the code outside of this if statement
                # expects a `search_results_dict` with gene callers id in it. so there are two things we need to do
                # to do. one is to come up with some new gene calls and add them to the contigs database. so things
                # will go smoothly downstream. two, we will need to update our `search_results_dict` so it looks
                # like a a dictionary the rest of the code expects with `gene_callers_id` fields. both of these
                # steps are going to be taken care of in the following function. magic.

                if source != "Ribosomal_RNAs":
                    self.run.warning("You just called an HMM profile that runs on contigs and not genes. Because this HMM\
                                      operation is not directly working with gene calls anvi'o already knows about, the resulting\
                                      hits will need to be added as 'new gene calls' into the contigs database. So far so good.\
                                      But blecause we are in the contigs realm rater than genes realm, it is likely that\
                                      resulting hits will not correspond to open reading frames that are supposed to be\
                                      translated (such as ribosomal RNAs), because otherwise you would be working with genes\
                                      instad of defining CONTIGS as your context in that HMM profile you just used unless you\
                                      not sure what you are doing. Hence, anvi'o will not report amino acid sequences for the\
                                      new gene calls it will recover through these HMMs. Please take a moment and you be the\
                                      judge of whether this will influence your pangenomic analyses or other things you thought\
                                      you would be doing with the result of this HMM search downstream. If you do not feel like\
                                      being the judge of anything today you can move on yet remember to remember this if things\
                                      look somewhat weird later on.",
                                     header="Psst. Your fancy HMM profile '%s' speaking" % source,
                                     lc="green")

                num_hits_before = len(search_results_dict)
                search_results_dict = utils.get_pruned_HMM_hits_dict(search_results_dict)
                num_hits_after = len(search_results_dict)

                if num_hits_before != num_hits_after:
                    self.run.info('Pruned', '%d out of %d hits were removed due to redundancy' % (num_hits_before - num_hits_after, num_hits_before))

                search_results_dict = self.add_new_gene_calls_to_contigs_db_and_update_serach_results_dict(kind_of_search,
                                                                                                           search_results_dict,
                                                                                                           skip_amino_acid_sequences=True)

            self.append(source, reference, kind_of_search, domain, all_genes_searched_against, search_results_dict)

        # FIXME: I have no clue why importing the anvio module is necessary at this point,
        #        but without this, mini test fails becasue "`anvio.DEBUG` is being used
        #        before initialization". nonsense.
        import anvio
        if not anvio.DEBUG:
            commander.clean_tmp_dirs()
            for v in list(target_files_dict.values()):
                os.remove(v)