Example #1
0
    def store_annotations_in_db(self, drop_previous_annotations=False):
        if not self.contigs_db_path:
            raise ConfigError("EggNOGMapper::store_annotations_in_db() is speaking: you can't really call this function if you inherited\
                                this class without a contigs database path :/ Well, yes, OK, you can call it, but it wouldn't work. Happy?")

        if not len(self.annotations_dict):
            raise ConfigError('Annotations dictionary is empty :/ There is nothing to add to the database.')

        gene_functions_table = TableForGeneFunctions(self.contigs_db_path)
        gene_functions_table.create(self.annotations_dict, drop_previous_annotations_first=drop_previous_annotations)
Example #2
0
    def store_annotations_in_db(self, drop_previous_annotations=False):
        if not self.contigs_db_path:
            raise ConfigError("EggNOGMapper::store_annotations_in_db() is speaking: you can't really call this function if you inherited "
                               "this class without a contigs database path :/ Well, yes, OK, you can call it, but it wouldn't work. Happy?")

        if not len(self.annotations_dict):
            raise ConfigError('Annotations dictionary is empty :/ There is nothing to add to the database.')

        gene_functions_table = TableForGeneFunctions(self.contigs_db_path)
        gene_functions_table.create(self.annotations_dict, drop_previous_annotations_first=drop_previous_annotations)
Example #3
0
    def process(self):
        hmm_file = os.path.join(self.pfam_data_dir, 'Pfam-A.hmm.gz')

        # initialize contigs database
        class Args:
            pass

        args = Args()
        args.contigs_db = self.contigs_db_path
        contigs_db = dbops.ContigsSuperclass(args)
        tmp_directory_path = filesnpaths.get_temp_directory_path()

        # export AA sequences for genes
        target_files_dict = {
            'AA:GENE': os.path.join(tmp_directory_path, 'AA_gene_sequences.fa')
        }
        contigs_db.gen_FASTA_file_of_sequences_for_gene_caller_ids(
            output_file_path=target_files_dict['AA:GENE'],
            simple_headers=True,
            rna_alphabet=False,
            report_aa_sequences=True)

        # run hmmscan
        hmmer = HMMer(target_files_dict, num_threads_to_use=self.num_threads)
        hmm_hits_file = hmmer.run_hmmscan('Pfam', 'AA', 'GENE', None, None,
                                          len(self.function_catalog), hmm_file,
                                          None, '--cut_ga')

        # parse hmmscan output
        parser = parser_modules['search']['hmmscan'](hmm_hits_file,
                                                     alphabet='AA',
                                                     context='GENE')
        search_results_dict = parser.get_search_results()

        # add functions to database
        functions_dict = {}
        counter = 0
        for hmm_hit in search_results_dict.values():
            functions_dict[counter] = {
                'gene_callers_id': hmm_hit['gene_callers_id'],
                'source': 'Pfam',
                'accession': hmm_hit['gene_hmm_id'],
                'function':
                self.get_function_from_catalog(hmm_hit['gene_hmm_id']),
                'e_value': hmm_hit['e_value'],
            }

            counter += 1

        gene_function_calls_table = TableForGeneFunctions(
            self.contigs_db_path, self.run, self.progress)
        gene_function_calls_table.create(functions_dict)
Example #4
0
    def process(self):
        hmm_file = os.path.join(self.pfam_data_dir, 'Pfam-A.hmm.gz')

        # initialize contigs database
        class Args: pass
        args = Args()
        args.contigs_db = self.contigs_db_path
        contigs_db = dbops.ContigsSuperclass(args)
        tmp_directory_path = filesnpaths.get_temp_directory_path()

        # export AA sequences for genes
        target_files_dict = {'AA:GENE': os.path.join(tmp_directory_path, 'AA_gene_sequences.fa')}
        contigs_db.gen_FASTA_file_of_sequences_for_gene_caller_ids(output_file_path=target_files_dict['AA:GENE'],
                                                                   simple_headers=True,
                                                                   rna_alphabet=False,
                                                                   report_aa_sequences=True)

        # run hmmscan
        hmmer = HMMer(target_files_dict, num_threads_to_use=self.num_threads)
        hmm_hits_file = hmmer.run_hmmscan('Pfam', 'AA', 'GENE', None, None, len(self.function_catalog), hmm_file, None, '--cut_ga')

        # parse hmmscan output
        parser = parser_modules['search']['hmmscan'](hmm_hits_file, alphabet='AA', context='GENE')
        search_results_dict = parser.get_search_results()

        # add functions to database
        functions_dict = {}
        counter = 0
        for hmm_hit in search_results_dict.values():
            functions_dict[counter] = {
                'gene_callers_id': hmm_hit['gene_callers_id'],
                'source': 'Pfam',
                'accession': hmm_hit['gene_hmm_id'],
                'function': self.get_function_from_catalog(hmm_hit['gene_hmm_id']),
                'e_value': hmm_hit['e_value'],
            }

            counter += 1

        gene_function_calls_table = TableForGeneFunctions(self.contigs_db_path, self.run, self.progress)
        gene_function_calls_table.create(functions_dict)

        if anvio.DEBUG:
            run.warning("The temp directories, '%s' and '%s' are kept. Please don't forget to clean those up\
                         later" % (tmp_directory_path, ', '.join(hmmer.tmp_dirs)), header="Debug")
        else:
            run.info_single('Cleaning up the temp directory (you can use `--debug` if you would\
                             like to keep it for testing purposes)', nl_before=1, nl_after=1)
            shutil.rmtree(tmp_directory_path)
            hmmer.clean_tmp_dirs()
Example #5
0
    def append_to_gene_functions_table(self, source, search_results_dict):
        """Append custom HMM hits to the gene functions table in the contigs database."""

        # get an instance of gene functions table
        gene_function_calls_table = TableForGeneFunctions(self.db_path, self.run, self.progress)

        # first we massage the hmm_hits dictionary to match expected input for the gene_functions table
        if search_results_dict:
            for entry_id in search_results_dict:
                hit = search_results_dict[entry_id]
                hit['source'] = source
                hit['accession'] = hit['gene_hmm_id']
                hit['function'] = hit['gene_name']

            gene_function_calls_table.create(search_results_dict)
        else:
            self.run.warning("There are no hits to add to the database. Returning empty handed, "
                             f"but still adding {source} as a functional source.")
            gene_function_calls_table.add_empty_sources_to_functional_sources({source})
Example #6
0
    def store_locus_as_contigs_db(self,
                                  contig_name,
                                  sequence,
                                  gene_calls,
                                  output_path_prefix,
                                  reverse_complement=False):
        """Generates a contigs database and a blank profile for a given locus"""

        temporary_files = []

        # dealing with some output file business.
        E = lambda e: output_path_prefix + e
        locus_output_db_path = E(".db")
        locus_sequence_fasta = E("_sequence.fa")
        locus_external_gene_calls = E("_external_gene_calls.txt")
        temporary_files.extend(
            [locus_external_gene_calls, locus_sequence_fasta])

        # we will generate a blank profile database at the end of this. let's get the directory
        # business sorted.
        profile_output_dir = output_path_prefix + '-PROFILE'
        if os.path.exists(profile_output_dir):
            if self.overwrite_output_destinations:
                filesnpaths.shutil.rmtree(profile_output_dir)
            else:
                raise ConfigError(
                    "The directory %s exists, which kinda messes things up here. Either remove\
                                   it manually, or use the flag  --overwrite-output-destinations so anvi'o can\
                                   do it for you." % profile_output_dir)

        # sort out the contigs database output path
        if filesnpaths.is_file_exists(locus_output_db_path, dont_raise=True):
            if self.overwrite_output_destinations:
                os.remove(locus_output_db_path)
            else:
                raise ConfigError(
                    "There is already a contigs database at the output file path :( Either remove it first,\
                                   or use the --overwrite-output-destinations flag to give anvi'o full authority to wipe\
                                   your disk.")

        # do we need to reverse complement this guy? if yes, we will take care of the contigs sequence and
        # gene calls here, and remember this for later.
        gene_calls_list = list(gene_calls.keys())
        if reverse_complement:
            sequence = utils.rev_comp(sequence)
            gene_calls, gene_caller_id_conversion_dict = utils.rev_comp_gene_calls_dict(
                gene_calls, sequence)
        else:
            gene_caller_id_conversion_dict = dict([
                (gene_calls_list[g], g) for g in range(0, len(gene_calls_list))
            ])
            new_gene_calls = {}
            for g in range(0, len(gene_calls_list)):
                gene_call = copy.deepcopy(gene_calls[gene_calls_list[g]])
                new_gene_calls[g] = gene_call
            gene_calls = new_gene_calls

        # write the sequene as a temporary FASTA file since the design of ContigsDatabase::create
        # will work seamlessly with this approach:
        with open(locus_sequence_fasta, 'w') as f:
            f.write('>%s\n%s\n' % (contig_name, sequence))

        # similarly, here we will store external gene calls so there will be no gene calling during
        # the generation of the contigs database
        headers = [
            'gene_callers_id', 'contig', 'start', 'stop', 'direction',
            'partial', 'source', 'version'
        ]
        utils.store_dict_as_TAB_delimited_file(gene_calls,
                                               locus_external_gene_calls,
                                               headers=headers)

        # this is where magic happens. we ask anvi'o to create a contigs database for us.
        args = argparse.Namespace(
            contigs_fasta=locus_sequence_fasta,
            project_name=os.path.basename(output_path_prefix),
            split_length=sys.maxsize,
            kmer_size=4,
            external_gene_calls=locus_external_gene_calls)
        dbops.ContigsDatabase(locus_output_db_path,
                              run=self.run_object).create(args)

        # while we are at it, here we generate a blank profile, too. so visualization of the
        # new contigs database for debugging or other purposes through anvi'o.
        args = argparse.Namespace(
            blank_profile=True,
            contigs_db=locus_output_db_path,
            skip_hierarchical_clustering=False,
            output_dir=profile_output_dir,
            sample_name=os.path.basename(output_path_prefix))
        profiler.BAMProfiler(args, r=self.run_object)._run()

        # so we have a contigs database! but there isn't much in it. the following where clause will
        # help us read from the tables of the original contigs database, and store it into the
        # new one throughout the following sections of the code.
        where_clause = "gene_callers_id in (%s)" % ', '.join(
            ['"%d"' % g for g in gene_caller_id_conversion_dict])

        # a lousy anonymous function to read data from tables given the gene calls of interest
        R = lambda table_name: db.DB(self.input_contigs_db_path, None, ignore_version=True) \
                                              .get_some_rows_from_table_as_dict(table_name,
                                                                                where_clause=where_clause,
                                                                                error_if_no_data=False)

        G = lambda g: gene_caller_id_conversion_dict[g]

        ############################################################################################
        # DO FUNCTIONS
        ###########################################################################################
        function_calls = R(t.gene_function_calls_table_name)

        for entry_id in function_calls:
            function_calls[entry_id]['gene_callers_id'] = G(
                function_calls[entry_id]['gene_callers_id'])

        gene_function_calls_table = TableForGeneFunctions(locus_output_db_path,
                                                          run=self.run_object)
        gene_function_calls_table.create(function_calls)

        self.run.info("Output contigs DB path", locus_output_db_path)
        self.run.info("Output blank profile DB path",
                      os.path.join(profile_output_dir, 'PROFILE.db'))

        ############################################################################################
        # DO AMINO ACID SEQUENCES -- we are using external gene calls to generate the new contigs
        #                            database, but amino acid sequnces are kept in a different table
        #                            and anvi'o checks whether provided gene calls resolve to amino
        #                            acid sequences with proper starts and stops. if not, it skips
        #                            them. but amino acid sequences for each gene call was stored
        #                            in the original contigs database, and the best practice is to
        #                            carry them into the new one. so here we will remove all data
        #                            from the amino acid seqeunces table in the new database, and
        #                            copy the contents from the original one.
        ############################################################################################
        amino_acid_sequences = R(t.gene_amino_acid_sequences_table_name)

        entries = [(gene_caller_id_conversion_dict[g],
                    amino_acid_sequences[g]['sequence'])
                   for g in amino_acid_sequences]
        db.DB(locus_output_db_path, None, ignore_version=True).insert_many(
            t.gene_amino_acid_sequences_table_name, entries=entries)

        ############################################################################################
        # REMOVE TEMP FILES
        ###########################################################################################
        if anvio.DEBUG:
            self.run.info_single(
                "Temp output files were kept for inspection due to --debug")
        else:
            [os.remove(f) for f in temporary_files]
Example #7
0
    def process(self):
        hmm_file = os.path.join(self.pfam_data_dir, 'Pfam-A.hmm.gz')

        # initialize contigs database
        class Args:
            pass

        args = Args()
        args.contigs_db = self.contigs_db_path
        contigs_db = dbops.ContigsSuperclass(args)
        tmp_directory_path = filesnpaths.get_temp_directory_path()

        # get an instance of gene functions table
        gene_function_calls_table = TableForGeneFunctions(
            self.contigs_db_path, self.run, self.progress)

        # export AA sequences for genes
        target_files_dict = {
            'AA:GENE': os.path.join(tmp_directory_path, 'AA_gene_sequences.fa')
        }
        contigs_db.gen_FASTA_file_of_sequences_for_gene_caller_ids(
            output_file_path=target_files_dict['AA:GENE'],
            simple_headers=True,
            rna_alphabet=False,
            report_aa_sequences=True)

        # run hmmscan
        hmmer = HMMer(target_files_dict, num_threads_to_use=self.num_threads)
        hmm_hits_file = hmmer.run_hmmscan('Pfam', 'AA', 'GENE', None, None,
                                          len(self.function_catalog), hmm_file,
                                          None, '--cut_ga')

        if not hmm_hits_file:
            run.info_single(
                "The HMM search returned no hits :/ So there is nothing to add to the contigs database. But "
                "now anvi'o will add PFAMs as a functional source with no hits, clean the temporary directories "
                "and gracefully quit.",
                nl_before=1,
                nl_after=1)
            shutil.rmtree(tmp_directory_path)
            hmmer.clean_tmp_dirs()
            gene_function_calls_table.add_empty_sources_to_functional_sources(
                {'Pfam'})
            return

        # parse hmmscan output
        parser = parser_modules['search']['hmmscan'](hmm_hits_file,
                                                     alphabet='AA',
                                                     context='GENE')
        search_results_dict = parser.get_search_results()

        # add functions to database
        functions_dict = {}
        counter = 0
        for hmm_hit in search_results_dict.values():
            functions_dict[counter] = {
                'gene_callers_id':
                hmm_hit['gene_callers_id'],
                'source':
                'Pfam',
                'accession':
                hmm_hit['gene_hmm_id'],
                'function':
                self.get_function_from_catalog(
                    hmm_hit['gene_hmm_id'], ok_if_missing_from_catalog=True),
                'e_value':
                hmm_hit['e_value'],
            }

            counter += 1

        if functions_dict:
            gene_function_calls_table.create(functions_dict)
        else:
            self.run.warning(
                "Pfam class has no hits to process. Returning empty handed, but still adding Pfam as "
                "a functional source.")
            gene_function_calls_table.add_empty_sources_to_functional_sources(
                {'Pfam'})

        if anvio.DEBUG:
            run.warning(
                "The temp directories, '%s' and '%s' are kept. Please don't forget to clean those up "
                "later" % (tmp_directory_path, ', '.join(hmmer.tmp_dirs)),
                header="Debug")
        else:
            run.info_single(
                'Cleaning up the temp directory (you can use `--debug` if you would '
                'like to keep it for testing purposes)',
                nl_before=1,
                nl_after=1)
            shutil.rmtree(tmp_directory_path)
            hmmer.clean_tmp_dirs()
Example #8
0
    def store_hits_into_contigs_db(self):
        if not self.hits:
            self.run.warning(
                "COGs class has no hits to process. Returning empty handed.")
            return

        cogs_data = COGsData(self.args)
        cogs_data.init_p_id_to_cog_id_dict()

        functions_dict = {}
        self.__entry_id = 0

        def add_entry(gene_callers_id, source, accession, function, e_value):
            functions_dict[self.__entry_id] = {
                'gene_callers_id': int(gene_callers_id),
                'source': source,
                'accession': accession,
                'function': function,
                'e_value': float(e_value)
            }
            self.__entry_id += 1

        # let's keep track of hits that match to missing COGs
        hits_for_missing_cogs = 0
        missing_cogs_found = set([])

        for gene_callers_id in self.hits:
            ncbi_protein_id = self.hits[gene_callers_id]['hit']

            in_proteins_FASTA_not_in_cogs_CSV = []
            if ncbi_protein_id not in cogs_data.p_id_to_cog_id:
                in_proteins_FASTA_not_in_cogs_CSV.append(
                    (ncbi_protein_id, gene_callers_id), )
            else:
                COG_ids = cogs_data.p_id_to_cog_id[ncbi_protein_id]

            annotations = []
            categories = set([])
            for COG_id in COG_ids:
                # is missing?
                if COG_id in cogs_data.missing_cogs:
                    missing_cogs_found.add(COG_id)
                    hits_for_missing_cogs += 1
                    continue

                # resolve categories
                for category in cogs_data.cogs[COG_id]['categories']:
                    categories.add(category)

                # append annotation
                annotations.append(cogs_data.cogs[COG_id]['annotation'])

            # all these shitty heuristics... If there are multiple COG ids or categories, separate them from each other by '!!!' so parsing
            # them later is possible. Am I embarrassed? Yes. Is there a better way of doing this efficiently? Absolutely. What time is it?
            # 9pm. Where am I? In the lab. Is it OK for me to let this slip away if it means for me to go home sooner? Yes, probably. Am I
            # gonna remember this crap in the code for the next two months at random times in the shower and feel bad about myself? F**k yes.
            add_entry(gene_callers_id, 'COG_FUNCTION', '!!!'.join(COG_ids),
                      '!!!'.join(annotations),
                      self.hits[gene_callers_id]['evalue'])
            add_entry(gene_callers_id, 'COG_CATEGORY', '!!!'.join(categories),
                      '!!!'.join(categories), 0.0)

        # store hits in contigs db.
        gene_function_calls_table = TableForGeneFunctions(
            self.contigs_db_path, self.run, self.progress)
        gene_function_calls_table.create(functions_dict)

        if len(missing_cogs_found):
            self.run.warning(
                'Although your COGs are successfully added to the database, there were some COG IDs your genes hit\
                              were among the ones that were not described in the raw data. Here is the list of %d COG IDs that\
                              were hit %d times: %s.' %
                (len(missing_cogs_found), hits_for_missing_cogs,
                 ', '.join(missing_cogs_found)))

        if len(in_proteins_FASTA_not_in_cogs_CSV):
            # so some of the hits represented in the FASTA file from the NCBI were not put in the
            # CSV file from NCBI to associate them with COGs
            report_output_file_path = filesnpaths.get_temp_file_path()
            report_output = open(report_output_file_path, 'w')
            report_output.write('anvio_gene_callers_id\tNCBI_protein_id\n')

            for protein_id, gene_callers_id in in_proteins_FASTA_not_in_cogs_CSV:
                report_output.write('%s\t%s\n' % (gene_callers_id, protein_id))

            report_output.close()

            self.run.warning("This is important. %s hits for your genes that appeared in the proteins FASTA file from the NCBI had protein\
                              IDs that were not described in the CSV file from the NCBI that associates each protein ID with a COG function.\
                              That's OK if you don't care. But if you would like to take a look, anvi'o stored a report\
                              file for you at %s" \
                        % (len(in_proteins_FASTA_not_in_cogs_CSV), report_output_file_path))
Example #9
0
    def populate_search_tables(self, contigs_db_path):
        utils.is_contigs_db(contigs_db_path)

        info_table = hmmops.SequencesForHMMHits(contigs_db_path).hmm_hits_info

        if self.source_name in info_table:
            if self.just_do_it:
                TablesForHMMHits(contigs_db_path,
                                 run=self.run,
                                 progress=self.progress).remove_source(
                                     self.source_name)
            else:
                raise ConfigError(
                    "There is already information for %s in the database :/ Anvi'o will not overwrite this "
                    "unless you ask for it explicitly. You can either use `anvi-delete-hmms` to remove it first, "
                    "or run `anvi-scan-trnas` with `--just-do-it` flag so anvi'o would remove it for you."
                    % (self.source_name))

        filesnpaths.is_output_file_writable(contigs_db_path, ok_if_exists=True)

        contig_sequences_fasta_path = os.path.join(self.tmp_directory_path,
                                                   'contig_sequences.fa')

        utils.export_sequences_from_contigs_db(contigs_db_path,
                                               contig_sequences_fasta_path)

        search_results_dict = self.run_trnascan_on_FASTA(
            fasta_file_path=contig_sequences_fasta_path)

        # At this point we need to turn this search_results_dict into one that matches how it is used
        # in HMM operations. Here is an entry from tRNA results dict:
        #
        # {1: {'contig': 'Bfragilis_0100_000000000001',
        #      'trna_no': '1',
        #      'start': 135361,
        #      'stop': 135433,
        #      'amino_acid': 'Thr',
        #      'anticodon': 'CGT',
        #      'score': 67.6}}
        #
        # and here is one exmple from the rRNA HMMs results dict:
        #
        # {1: {'entry_id': 0,
        #      'gene_name': 'Bacterial_23S_rRNA',
        #      'gene_hmm_id': '-',
        #      'contig_name': 'Bfragilis_0100_000000000001',
        #      'start': 1110877,
        #      'stop': 1113757,
        #      'e_value': 0.0}}
        #
        # so we will have to make the former look like the latter. I have the feeling that the
        # score / e_value will cause issues later :(

        missing_amino_acids = Counter()
        missing_anticodons = Counter()
        entries_to_remove = set([])
        for entry_id in search_results_dict:
            entry = search_results_dict[entry_id]

            aa, anticodon = entry['amino_acid'], entry['anticodon']

            if anticodon not in self.anticodons:
                missing_anticodons[anticodon] += 1
                entries_to_remove.add(entry_id)
                continue

            if aa not in self.amino_acids:
                missing_amino_acids[aa] += 1
                entries_to_remove.add(entry_id)
                continue

            aa_codon = '%s_%s' % (aa, anticodon)

            entry['gene_name'] = aa_codon
            entry['e_value'] = entry['score']
            entry['gene_hmm_id'] = '-'
            if entry['stop'] > entry['start']:
                # so we are forward
                entry['start'] = entry[
                    'start'] - 1  # setting the pythonic start.
            else:
                # so this one is reverse
                entry['stop'] = entry['stop'] - 1

            # just to double check for surprises (see https://github.com/merenlab/anvio/issues/1367 for details)
            for pos in ['start', 'stop']:
                if entry[pos] < 0:
                    entry[pos] = 0

        for entry_id in entries_to_remove:
            search_results_dict.pop(entry_id)

        self.run.info("Num tRNA genes recovered", len(search_results_dict))

        if len(missing_anticodons):
            info_line = ', '.join([
                '%s (%d)' % (anticodon, missing_anticodons[anticodon])
                for anticodon in missing_anticodons
            ])
            self.run.warning(
                "While anvi'o was trying to parse the output from tRNAScan-SE, it "
                "became clear that some of the codons the tool identified was not "
                "known to anvi'o, so we conservatively discareded those entries. "
                "Here is the list of codons that were discareded and their frequency "
                "among your contigs: '%s'." % (info_line),
                header="WEIRD CODONS ALERT")

        if len(missing_amino_acids):
            info_line = ', '.join([
                '%s (%d)' % (amino_acid, missing_amino_acids[amino_acid])
                for amino_acid in missing_amino_acids
            ])
            self.run.warning(
                "While anvi'o was trying to parse the output from tRNAScan-SE, it "
                "run into some amino acid names that were not known to anvi'o. "
                "All those entries are now gone :/ But here is the list of amino "
                "acids and their frequencies: '%s'." % (info_line),
                header="WEIRD AMINO ACIDS ALERT")

        search_results_dict = utils.get_pruned_HMM_hits_dict(
            search_results_dict)

        tables_for_hmm_hits = TablesForHMMHits(contigs_db_path,
                                               run=self.run,
                                               progress=self.progress)
        search_results_dict = tables_for_hmm_hits.add_new_gene_calls_to_contigs_db_and_update_serach_results_dict(
            self.kind_of_search,
            search_results_dict,
            skip_amino_acid_sequences=True)
        tables_for_hmm_hits.append_to_hmm_hits_table(
            self.source_name, self.reference, self.kind_of_search, self.domain,
            self.all_genes_searched_against, search_results_dict)

        # when the code comes all the way here, the entries in the search results dict already look like
        # this, so we have a gene callers id for the newly generate genes for tRNAs. we will use it
        # to populate a functions dict and submit it to the contigs database as well:
        #
        #     {'contig_name': 'Bfragilis_0100_000000000001',
        #      'trna_no': '1',
        #      'start': 135361,
        #      'stop': 135433,
        #      'amino_acid': 'Thr',
        #      'anticodon': 'CGT',
        #      'score': 67.6,
        #      'gene_name': 'Thr_ACG',
        #      'e_value': 67.6,
        #      'gene_hmm_id': '-',
        #      'gene_callers_id': 4502}
        #
        functions_dict = {}
        for entry_id in search_results_dict:
            entry = search_results_dict[entry_id]

            function_text = 'tRNA gene for amino acid %s (anticodon:%s; score:%.1f; intron_start:%d; intron_end:%d)' \
                                            % (entry['amino_acid'], entry['anticodon'], entry['score'], entry['intron_start'], entry['intron_end'])

            functions_dict[entry_id] = {
                'gene_callers_id':
                entry['gene_callers_id'],
                'source':
                self.source_name,
                'accession':
                '%s_%s_%d' % (entry['amino_acid'], entry['anticodon'],
                              entry['gene_callers_id']),
                'function':
                function_text,
                'e_value':
                0.0
            }

        gene_function_calls_table = TableForGeneFunctions(
            contigs_db_path,
            run=terminal.Run(verbose=False),
            progress=terminal.Progress(verbose=False))
        gene_function_calls_table.create(functions_dict)

        if not anvio.DEBUG:
            self.clean_tmp_directory()
            self.run.info_single(
                "Temp directory is now cleaned (if you would like to keep it the "
                "next time use the flag `--debug`).",
                nl_before=1)
        else:
            self.run.info_single(
                "Due to the `--debug` flag, anvi'o did not remove the temoporary files "
                "directory (which is still at '%s')." %
                (self.tmp_directory_path),
                nl_before=1)
Example #10
0
    def store_hits_into_contigs_db(self):
        if not self.hits:
            self.run.warning("COGs class has no hits to process. Returning empty handed.")
            return

        cogs_data = COGsData(self.args)
        cogs_data.init_p_id_to_cog_id_dict()

        functions_dict = {}
        self.__entry_id = 0


        def add_entry(gene_callers_id, source, accession, function, e_value):
            functions_dict[self.__entry_id] = {'gene_callers_id': int(gene_callers_id),
                                               'source': source,
                                               'accession': accession,
                                               'function': function,
                                               'e_value': float(e_value)}
            self.__entry_id += 1

        # let's keep track of hits that match to missing COGs
        hits_for_missing_cogs = 0
        missing_cogs_found = set([])

        for gene_callers_id in self.hits:
            ncbi_protein_id = self.hits[gene_callers_id]['hit']

            in_proteins_FASTA_not_in_cogs_CSV = []
            if ncbi_protein_id not in cogs_data.p_id_to_cog_id:
                in_proteins_FASTA_not_in_cogs_CSV.append((ncbi_protein_id, gene_callers_id),)
            else:
                COG_ids = cogs_data.p_id_to_cog_id[ncbi_protein_id]

            annotations = []
            categories = set([])
            for COG_id in COG_ids:
                # is missing?
                if COG_id in cogs_data.missing_cogs:
                    missing_cogs_found.add(COG_id)
                    hits_for_missing_cogs += 1
                    continue

                # resolve categories
                for category in cogs_data.cogs[COG_id]['categories']:
                    categories.add(category)

                # append annotation
                annotations.append(cogs_data.cogs[COG_id]['annotation'])

            # all these shitty heuristics... If there are multiple COG ids or categories, separate them from each other by '!!!' so parsing
            # them later is possible. Am I embarrassed? Yes. Is there a better way of doing this efficiently? Absolutely. What time is it?
            # 9pm. Where am I? In the lab. Is it OK for me to let this slip away if it means for me to go home sooner? Yes, probably. Am I
            # gonna remember this crap in the code for the next two months at random times in the shower and feel bad about myself? F**k yes.
            add_entry(gene_callers_id, 'COG_FUNCTION', '!!!'.join(COG_ids), '!!!'.join(annotations), self.hits[gene_callers_id]['evalue'])
            add_entry(gene_callers_id, 'COG_CATEGORY', '!!!'.join(categories), '!!!'.join(categories), 0.0)

        # store hits in contigs db.
        gene_function_calls_table = TableForGeneFunctions(self.contigs_db_path, self.run, self.progress)
        gene_function_calls_table.create(functions_dict)

        if len(missing_cogs_found):
            self.run.warning('Although your COGs are successfully added to the database, there were some COG IDs your genes hit\
                              were among the ones that were not described in the raw data. Here is the list of %d COG IDs that\
                              were hit %d times: %s.' % (len(missing_cogs_found), hits_for_missing_cogs, ', '.join(missing_cogs_found)))

        if len(in_proteins_FASTA_not_in_cogs_CSV):
            # so some of the hits represented in the FASTA file from the NCBI were not put in the
            # CSV file from NCBI to associate them with COGs
            report_output_file_path = filesnpaths.get_temp_file_path()
            report_output = open(report_output_file_path, 'w')
            report_output.write('anvio_gene_callers_id\tNCBI_protein_id\n')

            for protein_id, gene_callers_id in in_proteins_FASTA_not_in_cogs_CSV:
                report_output.write('%s\t%s\n' % (gene_callers_id, protein_id))

            report_output.close()

            self.run.warning("This is important. %s hits for your genes that appeared in the proteins FASTA file from the NCBI had protein\
                              IDs that were not described in the CSV file from the NCBI that associates each protein ID with a COG function.\
                              That's OK if you don't care. But if you would like to take a look, anvi'o stored a report\
                              file for you at %s" \
                        % (len(in_proteins_FASTA_not_in_cogs_CSV), report_output_file_path))
Example #11
0
    def store_hits_into_contigs_db(self):
        if not self.hits:
            self.run.warning("COGs class has no hits to process. Returning empty handed, but still adding COGs as "
                             "functional sources.")
            gene_function_calls_table = TableForGeneFunctions(self.contigs_db_path, self.run, self.progress)

            if self.COG_version == 'COG14':
                gene_function_calls_table.add_empty_sources_to_functional_sources({'COG14_FUNCTION', 'COG14_CATEGORY'})
            elif self.COG_version == 'COG20':
                gene_function_calls_table.add_empty_sources_to_functional_sources({'COG20_FUNCTION', 'COG20_CATEGORY', 'COG20_PATHWAY'})
            else:
                raise ConfigError("You need to edit all the if/else statements with COG version checks to ensure proper "
                                  "parsing of a new generation of COG files.")
            return

        cogs_data = COGsData(self.args)
        cogs_data.init_p_id_to_cog_id_dict()

        functions_dict = {}
        self.__entry_id = 0


        def add_entry(gene_callers_id, source, accession, function, e_value):
            functions_dict[self.__entry_id] = {'gene_callers_id': int(gene_callers_id),
                                               'source': source,
                                               'accession': accession,
                                               'function': function,
                                               'e_value': float(e_value)}
            self.__entry_id += 1

        # let's keep track of hits that match to missing COGs
        hits_for_missing_cogs = 0
        hits_for_missing_ncbi_protein_ids = 0

        missing_cogs_found = set([])
        missing_ncbi_protein_ids_found = set([])

        for gene_callers_id in self.hits:
            ncbi_protein_id = self.hits[gene_callers_id]['hit']

            in_proteins_FASTA_not_in_cogs_CSV = []
            if ncbi_protein_id not in cogs_data.p_id_to_cog_id:
                in_proteins_FASTA_not_in_cogs_CSV.append((ncbi_protein_id, gene_callers_id),)
                missing_ncbi_protein_ids_found.add(ncbi_protein_id)
                hits_for_missing_ncbi_protein_ids += 1
                continue

            COG_ids = cogs_data.p_id_to_cog_id[ncbi_protein_id]

            annotations = []
            categories = []
            pathways = []
            category_descriptions = []
            for COG_id in COG_ids:
                # is missing?
                if COG_id in cogs_data.missing_cogs:
                    missing_cogs_found.add(COG_id)
                    hits_for_missing_cogs += 1
                    continue

                # resolve categories
                for category in cogs_data.cogs[COG_id]['categories']:
                    categories.append(category)
                    category_descriptions.append(cogs_data.categories[category])

                # append annotation
                annotations.append(cogs_data.cogs[COG_id]['annotation'])

                if self.COG_version == 'COG14':
                    pass
                elif self.COG_version == 'COG20':
                    if cogs_data.cogs[COG_id]['pathway']:
                        pathways.append(cogs_data.cogs[COG_id]['pathway'])
                else:
                    raise ConfigError("You need to edit all the if/else statements with COG version checks to ensure proper "
                                      "parsing of a new generation of COG files.")

            # all these shitty heuristics... If there are multiple COG ids or categories, separate them from each other by '!!!' so parsing
            # them later is possible. Am I embarrassed? Yes. Is there a better way of doing this efficiently? Absolutely. What time is it?
            # 9pm. Where am I? In the lab. Is it OK for me to let this slip away if it means for me to go home sooner? Yes, probably. Am I
            # gonna remember this crap in the code for the next two months at random times in the shower and feel bad about myself? F**k yes.
            if self.COG_version == 'COG14':
                add_entry(gene_callers_id, f'{self.COG_version}_FUNCTION', '!!!'.join(COG_ids), '!!!'.join(annotations), self.hits[gene_callers_id]['evalue'])
                add_entry(gene_callers_id, f'{self.COG_version}_CATEGORY', '!!!'.join(categories), '!!!'.join(category_descriptions), 0.0)
            elif self.COG_version == 'COG20':
                add_entry(gene_callers_id, f'{self.COG_version}_FUNCTION', '!!!'.join(COG_ids), '!!!'.join(annotations), self.hits[gene_callers_id]['evalue'])
                add_entry(gene_callers_id, f'{self.COG_version}_CATEGORY', '!!!'.join(categories), '!!!'.join(category_descriptions), 0.0)
                if len(pathways):
                    add_entry(gene_callers_id, f'{self.COG_version}_PATHWAY', '!!!'.join(COG_ids), '!!!'.join(pathways), self.hits[gene_callers_id]['evalue'])
            else:
                raise ConfigError("You need to edit all the if/else statements with COG version checks to ensure proper "
                                  "parsing of a new generation of COG files.")

        # store hits in contigs db.
        gene_function_calls_table = TableForGeneFunctions(self.contigs_db_path, self.run, self.progress)
        gene_function_calls_table.create(functions_dict)

        if len(missing_cogs_found):
            self.run.warning('Although your COGs are successfully added to the database, there were some COG IDs your genes hit '
                             'were among the ones that were not described in the raw data. Here is the list of %d COG IDs that '
                             'were hit %d times: %s.' % (len(missing_cogs_found), hits_for_missing_cogs, ', '.join(missing_cogs_found)))

        if len(missing_ncbi_protein_ids_found):
            self.run.warning("Well. Your COGs were successfully added to the database, but there were some garbage anvi'o brushed "
                             "off under the rug. There were %d genes in your database that hit %d protein IDs in NCBIs COGs database, "
                             "but since NCBI did not release what COGs they correspond to in the database they made available (that "
                             "helps us to resolve protein IDs to COG ids), we could not annotate those genes with functions. Anvi'o "
                             "apologizes on behalf of all computer scientists for half-done stuff we often force biologists to deal "
                             "with. If you want to do some Googling, these were the offending protein IDs: '%s'." % \
                                        (hits_for_missing_ncbi_protein_ids, len(missing_ncbi_protein_ids_found), ', '.join([str(s) for s in missing_ncbi_protein_ids_found])))

        if len(in_proteins_FASTA_not_in_cogs_CSV):
            # so some of the hits represented in the FASTA file from the NCBI were not put in the
            # CSV file from NCBI to associate them with COGs
            report_output_file_path = filesnpaths.get_temp_file_path()
            report_output = open(report_output_file_path, 'w')
            report_output.write('anvio_gene_callers_id\tNCBI_protein_id\n')

            for protein_id, gene_callers_id in in_proteins_FASTA_not_in_cogs_CSV:
                report_output.write('%s\t%s\n' % (gene_callers_id, protein_id))

            report_output.close()

            self.run.warning("This is important. %s hits for your genes that appeared in the proteins FASTA file from the NCBI had protein "
                             "IDs that were not described in the CSV file from the NCBI that associates each protein ID with a COG function. "
                             "That's OK if you don't care. But if you would like to take a look, anvi'o stored a report "
                             "file for you at %s" \
                        % (len(in_proteins_FASTA_not_in_cogs_CSV), report_output_file_path))
Example #12
0
    def process(self):
        hmm_file = os.path.join(self.pfam_data_dir, 'Pfam-A.hmm.gz')

        # initialize contigs database
        class Args:
            pass

        args = Args()
        args.contigs_db = self.contigs_db_path
        contigs_db = dbops.ContigsSuperclass(args)
        tmp_directory_path = filesnpaths.get_temp_directory_path()

        # export AA sequences for genes
        target_files_dict = {
            'AA:GENE': os.path.join(tmp_directory_path, 'AA_gene_sequences.fa')
        }
        contigs_db.gen_FASTA_file_of_sequences_for_gene_caller_ids(
            output_file_path=target_files_dict['AA:GENE'],
            simple_headers=True,
            rna_alphabet=False,
            report_aa_sequences=True)

        # run hmmscan
        hmmer = HMMer(target_files_dict, num_threads_to_use=self.num_threads)
        hmm_hits_file = hmmer.run_hmmscan('Pfam', 'AA', 'GENE', None, None,
                                          len(self.function_catalog), hmm_file,
                                          None, '--cut_ga')

        # parse hmmscan output
        parser = parser_modules['search']['hmmscan'](hmm_hits_file,
                                                     alphabet='AA',
                                                     context='GENE')
        search_results_dict = parser.get_search_results()

        # add functions to database
        functions_dict = {}
        counter = 0
        for hmm_hit in search_results_dict.values():
            functions_dict[counter] = {
                'gene_callers_id': hmm_hit['gene_callers_id'],
                'source': 'Pfam',
                'accession': hmm_hit['gene_hmm_id'],
                'function':
                self.get_function_from_catalog(hmm_hit['gene_hmm_id']),
                'e_value': hmm_hit['e_value'],
            }

            counter += 1

        gene_function_calls_table = TableForGeneFunctions(
            self.contigs_db_path, self.run, self.progress)
        gene_function_calls_table.create(functions_dict)

        if anvio.DEBUG:
            run.warning(
                "The temp directories, '%s' and '%s' are kept. Please don't forget to clean those up\
                         later" %
                (tmp_directory_path, ', '.join(hmmer.tmp_dirs)),
                header="Debug")
        else:
            run.info_single(
                'Cleaning up the temp directory (you can use `--debug` if you would\
                             like to keep it for testing purposes)',
                nl_before=1,
                nl_after=1)
            shutil.rmtree(tmp_directory_path)
            hmmer.clean_tmp_dirs()
Example #13
0
    def store_locus_as_contigs_db(self, contig_name, sequence, gene_calls, output_path_prefix, reverse_complement=False):
        """Generates a contigs database and a blank profile for a given locus"""

        temporary_files = []

        # dealing with some output file business.
        E = lambda e: output_path_prefix + e
        locus_output_db_path = E(".db")
        locus_sequence_fasta = E("_sequence.fa")
        locus_external_gene_calls = E("_external_gene_calls.txt")
        temporary_files.extend([locus_external_gene_calls, locus_sequence_fasta])

        # we will generate a blank profile database at the end of this. let's get the directory
        # business sorted.
        profile_output_dir = output_path_prefix + '-PROFILE'
        if os.path.exists(profile_output_dir):
            if self.overwrite_output_destinations:
                filesnpaths.shutil.rmtree(profile_output_dir)
            else:
                raise ConfigError("The directory %s exists, which kinda messes things up here. Either remove\
                                   it manually, or use the flag  --overwrite-output-destinations so anvi'o can\
                                   do it for you." % profile_output_dir)

        # sort out the contigs database output path
        if filesnpaths.is_file_exists(locus_output_db_path, dont_raise=True):
            if self.overwrite_output_destinations:
                os.remove(locus_output_db_path)
            else:
                raise ConfigError("There is already a contigs database at the output file path :( Either remove it first,\
                                   or use the --overwrite-output-destinations flag to give anvi'o full authority to wipe\
                                   your disk.")

        # do we need to reverse complement this guy? if yes, we will take care of the contigs sequence and
        # gene calls here, and remember this for later.
        gene_calls_list = list(gene_calls.keys())
        if reverse_complement:
            sequence = utils.rev_comp(sequence)
            gene_calls, gene_caller_id_conversion_dict = utils.rev_comp_gene_calls_dict(gene_calls, sequence)
        else:
            gene_caller_id_conversion_dict = dict([(gene_calls_list[g], g) for g in range(0, len(gene_calls_list))])
            new_gene_calls = {}
            for g in range(0, len(gene_calls_list)):
                gene_call = copy.deepcopy(gene_calls[gene_calls_list[g]])
                new_gene_calls[g] = gene_call
            gene_calls = new_gene_calls


        # write the sequene as a temporary FASTA file since the design of ContigsDatabase::create
        # will work seamlessly with this approach:
        with open(locus_sequence_fasta, 'w') as f:
            f.write('>%s\n%s\n' % (contig_name, sequence))

        # similarly, here we will store external gene calls so there will be no gene calling during
        # the generation of the contigs database
        headers = ['gene_callers_id', 'contig', 'start', 'stop', 'direction', 'partial', 'source', 'version']
        utils.store_dict_as_TAB_delimited_file(gene_calls, locus_external_gene_calls, headers=headers)

        # this is where magic happens. we ask anvi'o to create a contigs database for us.
        args = argparse.Namespace(contigs_fasta=locus_sequence_fasta,
                                  project_name=os.path.basename(output_path_prefix),
                                  split_length=sys.maxsize,
                                  kmer_size=4,
                                  external_gene_calls=locus_external_gene_calls)
        dbops.ContigsDatabase(locus_output_db_path, run=self.run_object).create(args)

        # while we are at it, here we generate a blank profile, too. so visualization of the
        # new contigs database for debugging or other purposes through anvi'o.
        args = argparse.Namespace(blank_profile=True,
                                  contigs_db=locus_output_db_path,
                                  skip_hierarchical_clustering=False,
                                  output_dir=profile_output_dir,
                                  sample_name=os.path.basename(output_path_prefix))
        profiler.BAMProfiler(args, r=self.run_object)._run()

        # so we have a contigs database! but there isn't much in it. the following where clause will
        # help us read from the tables of the original contigs database, and store it into the
        # new one throughout the following sections of the code.
        where_clause = "gene_callers_id in (%s)" % ', '.join(['"%d"' % g for g in gene_caller_id_conversion_dict])

        # a lousy anonymous function to read data from tables given the gene calls of interest
        R = lambda table_name: db.DB(self.input_contigs_db_path, None, ignore_version=True) \
                                              .get_some_rows_from_table_as_dict(table_name,
                                                                                where_clause=where_clause,
                                                                                error_if_no_data=False)

        G = lambda g: gene_caller_id_conversion_dict[g]

        ############################################################################################
        # DO FUNCTIONS
        ###########################################################################################
        function_calls = R(t.gene_function_calls_table_name)

        for entry_id in function_calls:
            function_calls[entry_id]['gene_callers_id'] = G(function_calls[entry_id]['gene_callers_id'])

        gene_function_calls_table = TableForGeneFunctions(locus_output_db_path, run=self.run_object)
        gene_function_calls_table.create(function_calls)

        self.run.info("Output contigs DB path", locus_output_db_path)
        self.run.info("Output blank profile DB path", os.path.join(profile_output_dir, 'PROFILE.db'))

        ############################################################################################
        # DO AMINO ACID SEQUENCES -- we are using external gene calls to generate the new contigs
        #                            database, but amino acid sequnces are kept in a different table
        #                            and anvi'o checks whether provided gene calls resolve to amino
        #                            acid sequences with proper starts and stops. if not, it skips
        #                            them. but amino acid sequences for each gene call was stored
        #                            in the original contigs database, and the best practice is to
        #                            carry them into the new one. so here we will remove all data
        #                            from the amino acid seqeunces table in the new database, and
        #                            copy the contents from the original one.
        ############################################################################################
        amino_acid_sequences = R(t.gene_amino_acid_sequences_table_name)

        entries = [(gene_caller_id_conversion_dict[g], amino_acid_sequences[g]['sequence']) for g in amino_acid_sequences]
        db.DB(locus_output_db_path, None, ignore_version=True).insert_many(t.gene_amino_acid_sequences_table_name, entries=entries)

        ############################################################################################
        # REMOVE TEMP FILES
        ###########################################################################################
        if anvio.DEBUG:
            self.run.info_single("Temp output files were kept for inspection due to --debug")
        else:
            [os.remove(f) for f in temporary_files]