コード例 #1
0
ファイル: hmmops.py プロジェクト: paczian/anvio
    def __init__(self, contigs_db_path, sources = set([]), run = run, progress = progress):
        """Load the HMM search tables of a contigs database, optionally limited to `sources`.

        Parameters
        ==========
        contigs_db_path : str
            Path handed to `db.DB` together with `anvio.__contigs__version__`.
        sources : set
            Names of the HMM sources to keep. Falsy members are discarded. An empty
            set means every source found in the HMM hits info table is used.
        run, progress
            Accepted for interface compatibility; not used in this initializer.

        Raises
        ======
        ConfigError
            If `sources` is not a set, or if a requested source is not present in
            the HMM hits info table.
        """
        if not isinstance(sources, set):
            raise ConfigError("'sources' variable has to be a set instance.")

        self.sources = set([s for s in sources if s])

        # take care of contigs db related stuff and move on (the connection is
        # released even if reading one of the tables fails):
        contigs_db = db.DB(contigs_db_path, anvio.__contigs__version__)
        try:
            self.search_info_table = contigs_db.get_table_as_dict(t.hmm_hits_info_table_name)
            self.search_table_splits = contigs_db.get_table_as_dict(t.hmm_hits_splits_table_name)
            self.search_table_contigs = contigs_db.get_table_as_dict(t.hmm_hits_contigs_table_name)
            self.contig_sequences = contigs_db.get_table_as_dict(t.contig_sequences_table_name, string_the_key = True)
        finally:
            contigs_db.disconnect()

        missing_sources = [s for s in self.sources if s not in self.search_info_table]
        if missing_sources:
            raise ConfigError('Some of the requested sources were not found in the contigs database :/\
                                Here is a list of the ones that are missing: %s' % ', '.join(missing_sources))

        if self.sources:
            self.search_table_splits = utils.get_filtered_dict(self.search_table_splits, 'source', self.sources)
            self.search_table_contigs = utils.get_filtered_dict(self.search_table_contigs, 'source', self.sources)
        else:
            # no explicit request: fall back to every source the database knows about.
            # list() keeps this a real list on Python 3, where keys() is a view.
            self.sources = list(self.search_info_table.keys())

        # create a map of all unique gene ids to contigs table entry ids for fast access:
        self.unique_id_to_contig_entry_id = {}
        for entry_id, entry in self.search_table_contigs.items():
            self.unique_id_to_contig_entry_id[entry['gene_unique_identifier']] = entry_id
コード例 #2
0
    def __init__(self, contigs_db_path, sources=set([]), run=run, progress=progress):
        """Read HMM hit tables from a contigs database and narrow them down to `sources`.

        `sources` must be a set of HMM source names; falsy members are ignored. When
        it ends up empty, all sources listed in the HMM hits info table are used and
        no filtering takes place. A ConfigError is raised when `sources` is not a
        set or when a requested source is absent from the info table.
        """
        if not isinstance(sources, set):
            raise ConfigError("'sources' variable has to be a set instance.")

        self.sources = {name for name in sources if name}

        # pull everything we need out of the contigs db in one go, then let it go:
        contigs_db = db.DB(contigs_db_path, anvio.__contigs__version__)
        fetch_table = contigs_db.get_table_as_dict
        self.hmm_hits = fetch_table(t.hmm_hits_table_name)
        self.hmm_hits_info = fetch_table(t.hmm_hits_info_table_name)
        self.hmm_hits_splits = fetch_table(t.hmm_hits_splits_table_name)
        self.contig_sequences = fetch_table(t.contig_sequences_table_name, string_the_key=True)
        self.aa_sequences = fetch_table(t.gene_protein_sequences_table_name)
        self.genes_in_contigs = fetch_table(t.genes_in_contigs_table_name)
        contigs_db.disconnect()

        not_found = [name for name in self.sources if name not in self.hmm_hits_info]
        if not_found:
            raise ConfigError('Some of the requested sources were not found in the contigs database :/\
                                Here is a list of the ones that are missing: %s' % ', '.join(not_found))

        if not self.sources:
            # nothing was requested explicitly, so every known source is in play
            self.sources = list(self.hmm_hits_info.keys())
        else:
            self.hmm_hits_splits = utils.get_filtered_dict(self.hmm_hits_splits, 'source', self.sources)
            self.hmm_hits = utils.get_filtered_dict(self.hmm_hits, 'source', self.sources)
コード例 #3
0
ファイル: hmmops.py プロジェクト: giriarteS/anvio
    def __init__(self, contigs_db_path, sources = set([]), run = run, progress = progress):
        """Load HMM hits and gene call tables from a contigs database, optionally limited to `sources`.

        Parameters
        ==========
        contigs_db_path : str
            Path handed to `db.DB` together with `anvio.__contigs__version__`.
        sources : set
            Names of the HMM sources to keep. Falsy members are discarded. An empty
            set means every source found in the HMM hits info table is used.
        run, progress
            Accepted for interface compatibility; not used in this initializer.

        Raises
        ======
        ConfigError
            If `sources` is not a set, or if a requested source is not present in
            the HMM hits info table.
        """
        if not isinstance(sources, set):
            raise ConfigError("'sources' variable has to be a set instance.")

        self.sources = set([s for s in sources if s])

        # take care of contigs db related stuff and move on (the connection is
        # released even if reading one of the tables fails):
        contigs_db = db.DB(contigs_db_path, anvio.__contigs__version__)
        try:
            self.hmm_hits = contigs_db.get_table_as_dict(t.hmm_hits_table_name)
            self.hmm_hits_info = contigs_db.get_table_as_dict(t.hmm_hits_info_table_name)
            self.hmm_hits_splits = contigs_db.get_table_as_dict(t.hmm_hits_splits_table_name)
            self.contig_sequences = contigs_db.get_table_as_dict(t.contig_sequences_table_name, string_the_key = True)
            self.genes_in_contigs = contigs_db.get_table_as_dict(t.genes_in_contigs_table_name)
        finally:
            contigs_db.disconnect()

        missing_sources = [s for s in self.sources if s not in self.hmm_hits_info]
        if missing_sources:
            raise ConfigError('Some of the requested sources were not found in the contigs database :/\
                                Here is a list of the ones that are missing: %s' % ', '.join(missing_sources))

        if self.sources:
            self.hmm_hits_splits = utils.get_filtered_dict(self.hmm_hits_splits, 'source', self.sources)
            self.hmm_hits = utils.get_filtered_dict(self.hmm_hits, 'source', self.sources)
        else:
            # no explicit request: fall back to every source the database knows about.
            # list() keeps this a real list on Python 3, where keys() is a view.
            self.sources = list(self.hmm_hits_info.keys())
コード例 #4
0
ファイル: hmmops.py プロジェクト: meren/anvio
    def filter_hmm_sequences_dict_for_bins_that_lack_more_than_N_genes(self, hmm_sequences_dict_for_splits, gene_names, max_num_genes_missing=0):
        """Drop bins that miss more than `max_num_genes_missing` of the genes in `gene_names`.

           The input is the output of `get_sequences_dict_for_hmm_hits_in_splits`. A short
           report of what is kept and what is removed is sent to `self.run`.

           Returns a tuple: (filtered dictionary, set of removed bin names). When no bin
           is removed, the input dictionary itself is returned along with an empty set."""

        missing_counts = self.get_num_genes_missing_per_bin_dict(hmm_sequences_dict_for_splits, gene_names)

        all_bins = set(missing_counts)
        bins_to_remove = {name for name, num_missing in missing_counts.items() if num_missing > max_num_genes_missing}
        bins_to_keep = all_bins - bins_to_remove

        self.run.info_single("Hi there! The anvi'o function that kills bins is speaking (we are here because you used\
                              the --max-num-genes-missing-from-bin parameter to remove bins that are not good enough for\
                              your analysis becasue they are missing lots of genes. What follows is a report of what \
                              happened.", nl_before=1, nl_after=1)

        kept_label = 'Bins that missed at most %d of %d genes (%d)' % (max_num_genes_missing, len(gene_names), len(bins_to_keep))
        removed_label = 'Bins that are no more in the analysis (%d)' % (len(bins_to_remove))
        removed_text = ', '.join(bins_to_remove) if bins_to_remove else 'None. Lovely.'

        self.run.info('All bins (%d)' % len(all_bins), ', '.join(all_bins), nl_after=1)
        self.run.info(kept_label, ', '.join(bins_to_keep), nl_after=1, mc='green')
        self.run.info(removed_label, removed_text, nl_after=1, mc='red')

        if not bins_to_remove:
            return (hmm_sequences_dict_for_splits, set([]))

        return (utils.get_filtered_dict(hmm_sequences_dict_for_splits, 'bin_id', bins_to_keep), bins_to_remove)
コード例 #5
0
    def get_collection_dict(self, collection_name):
        """Return a dictionary that maps each bin name of a collection to its list of splits.

        Parameters
        ==========
        collection_name : str
            Name of the collection; it is first passed to `self.sanity_check` and
            then used as a key into `self.collections_dict`.

        Returns
        =======
        dict
            `{bin_name: [split, ...], ...}` built from the collections splits table
            of the database the collection is stored in.
        """
        self.sanity_check(collection_name)

        c = self.collections_dict[collection_name]

        database = db.DB(c['source_db_path'], c['source_db_version'])
        try:
            collections_splits_table = database.get_table_as_dict(
                t.collections_splits_table_name)
        finally:
            # release the connection even if reading the table fails
            database.disconnect()

        # FIXME: this could be resolved with a WHERE clause in the SQL query:
        collection_dict_from_db = utils.get_filtered_dict(
            collections_splits_table, 'collection_name',
            set([collection_name]))

        collection_dict_to_return = {}

        # `in`/setdefault instead of dict.has_key(), which no longer exists in Python 3
        for entry in collection_dict_from_db.values():
            collection_dict_to_return.setdefault(entry['bin_name'], []).append(entry['split'])

        return collection_dict_to_return
コード例 #6
0
ファイル: ccollections.py プロジェクト: caglar10ur/anvio
    def get_collection_dict(self, source):
        """Return a dictionary that maps each cluster id of a collection source to its list of splits.

        Parameters
        ==========
        source : str
            Name of the collection source; it is first passed to `self.sanity_check`
            and then used as a key into `self.sources_dict`.

        Returns
        =======
        dict
            `{cluster_id: [split, ...], ...}` built from the collections splits table
            of the database the source is stored in.
        """
        self.sanity_check(source)

        c = self.sources_dict[source]

        database = db.DB(c['source_db_path'], c['source_db_version'])
        try:
            collections_splits_table = database.get_table_as_dict(t.collections_splits_table_name)
        finally:
            # release the connection even if reading the table fails
            database.disconnect()

        # FIXME: this could be resolved with a WHERE clause in the SQL query:
        collection = utils.get_filtered_dict(collections_splits_table, 'source', set([source]))

        collection_dict = {}

        # `in`/setdefault instead of dict.has_key(), which no longer exists in Python 3
        for entry in collection.values():
            collection_dict.setdefault(entry['cluster_id'], []).append(entry['split'])

        return collection_dict
コード例 #7
0
ファイル: bottleroutes.py プロジェクト: lindechun/anvio
    def get_hmm_hit_from_bin(self, bin_name, gene_name):
        """Return, as a JSON string, the longest HMM hit for `gene_name` found in `bin_name`.

        On success the JSON object has the keys 'sequence' and 'header'. Every
        failure is reported as a JSON object with a single 'error' key: not being
        in 'collection' mode, an empty collection, an unknown bin name, or a bin
        with no hit for the requested gene.
        """
        if self.interactive.mode != 'collection':
            return json.dumps({
                'error':
                "HMM hits from bins can only be requested in 'collection' mode. You are doing something wrong..."
            })

        if not self.interactive.collection:
            return json.dumps({
                'error':
                "You are in 'collection' mode, but your collection is empty. You are killing me."
            })

        # an unknown bin used to surface as an unhandled KeyError; report it like
        # every other problem instead.
        if bin_name not in self.interactive.collection:
            return json.dumps({
                'error':
                "Sorry. There is no bin named '%s' in your collection." % bin_name
            })

        hmm_sequences_dict = self.interactive.hmm_access.get_sequences_dict_for_hmm_hits_in_splits(
            {bin_name: set(self.interactive.collection[bin_name])})
        gene_sequences = utils.get_filtered_dict(hmm_sequences_dict,
                                                 'gene_name', set([gene_name]))

        if not gene_sequences:
            return json.dumps({
                'error':
                "Sorry. It seems %s does not have a hit for %s." %
                (bin_name, gene_name)
            })

        # longest hit wins; ties on length are broken by the larger gene id, which
        # is what sorting the (length, id) pairs in reverse and taking the first did.
        unique_id_for_longest_hit = max(
            (gene_sequences[gene_id]['length'], gene_id)
            for gene_id in gene_sequences)[1]

        header, sequence = self.interactive.hmm_access.get_FASTA_header_and_sequence_for_gene_unique_id(
            gene_sequences, unique_id_for_longest_hit)

        return json.dumps({'sequence': sequence, 'header': header})
コード例 #8
0
    def store_sequences_for_hmm_hits(self):
        """Write the HMM hit sequences of this bin, one '<source>-hmm-sequences.txt' output per source.

        Nothing is done when `self.summary.quick` is set. The sources are those named
        in `self.summary.hmm_searches_header` followed by `self.summary.completeness.sources`.
        Each hit is written as a '>header' line followed by its sequence.
        """
        if self.summary.quick:
            return

        hmm_access = SequencesForHMMHits(self.summary.contigs_db_path)
        sequences_for_bin = hmm_access.get_hmm_sequences_dict_for_splits({self.bin_id: self.split_ids})

        sources_from_header = [search_source for search_type, search_source in self.summary.hmm_searches_header]
        sources_from_completeness = self.summary.completeness.sources

        for source_name in sources_from_header + sources_from_completeness:
            hits_for_source = utils.get_filtered_dict(sequences_for_bin, 'source', set([source_name]))

            out = self.get_output_file_handle('%s-hmm-sequences.txt' % source_name, key=source_name)

            for unique_id in hits_for_source:
                # the lookup goes through the unfiltered dictionary, exactly as before
                header, sequence = hmm_access.get_FASTA_header_and_sequence_for_gene_unique_id(sequences_for_bin, unique_id)
                out.write('>%s\n%s\n' % (header, sequence))
コード例 #9
0
ファイル: ccollections.py プロジェクト: banfieldlab/anvio
    def get_collection_dict(self, collection_name):
        """Return a dictionary that maps each bin name of a collection to its list of splits.

        Parameters
        ==========
        collection_name : str
            Name of the collection; it is first passed to `self.sanity_check` and
            then used as a key into `self.collections_dict`.

        Returns
        =======
        dict
            `{bin_name: [split, ...], ...}` built from the collections splits table
            of the database the collection is stored in.
        """
        self.sanity_check(collection_name)

        c = self.collections_dict[collection_name]

        database = db.DB(c['source_db_path'], c['source_db_version'])
        try:
            collections_splits_table = database.get_table_as_dict(t.collections_splits_table_name)
        finally:
            # release the connection even if reading the table fails
            database.disconnect()

        # FIXME: this could be resolved with a WHERE clause in the SQL query:
        collection_dict_from_db = utils.get_filtered_dict(collections_splits_table, 'collection_name', set([collection_name]))

        collection_dict_to_return = {}

        # `in`/setdefault instead of dict.has_key(), which no longer exists in Python 3
        for entry in collection_dict_from_db.values():
            collection_dict_to_return.setdefault(entry['bin_name'], []).append(entry['split'])

        return collection_dict_to_return
コード例 #10
0
ファイル: hmmops.py プロジェクト: QinLab/anvio
    def filter_hmm_sequences_dict_for_bins_that_lack_more_than_N_genes(
            self,
            hmm_sequences_dict_for_splits,
            gene_names,
            max_num_genes_missing=0):
        """Drop bins that miss more than `max_num_genes_missing` of the genes in `gene_names`.

           The input is the output of `get_sequences_dict_for_hmm_hits_in_splits`.

           Returns a tuple: (filtered dictionary, set of removed bin names). When no bin
           is removed, the input dictionary itself is returned along with an empty set."""

        missing_counts = self.get_num_genes_missing_per_bin_dict(
            hmm_sequences_dict_for_splits, gene_names)

        bins_to_remove = {
            name for name, num_missing in missing_counts.items()
            if num_missing > max_num_genes_missing
        }

        if not bins_to_remove:
            return (hmm_sequences_dict_for_splits, set([]))

        bins_to_keep = set(missing_counts) - bins_to_remove
        filtered = utils.get_filtered_dict(hmm_sequences_dict_for_splits, 'bin_id', bins_to_keep)

        return (filtered, bins_to_remove)
コード例 #11
0
    def get_collection_dict(self, source):
        """Return a dictionary that maps each cluster id of a collection source to its list of splits.

        Parameters
        ==========
        source : str
            Name of the collection source; it is first passed to `self.sanity_check`
            and then used as a key into `self.sources_dict`.

        Returns
        =======
        dict
            `{cluster_id: [split, ...], ...}` built from the collections splits table
            of the database the source is stored in.
        """
        self.sanity_check(source)

        c = self.sources_dict[source]

        database = db.DB(c['source_db_path'], c['source_db_version'])
        try:
            collections_splits_table = database.get_table_as_dict(
                t.collections_splits_table_name)
        finally:
            # release the connection even if reading the table fails
            database.disconnect()

        # FIXME: this could be resolved with a WHERE clause in the SQL query:
        collection = utils.get_filtered_dict(collections_splits_table,
                                             'source', set([source]))

        collection_dict = {}

        # `in`/setdefault instead of dict.has_key(), which no longer exists in Python 3
        for entry in collection.values():
            collection_dict.setdefault(entry['cluster_id'], []).append(entry['split'])

        return collection_dict
コード例 #12
0
    def filter_hmm_sequences_dict_for_bins_that_lack_more_than_N_genes(self, hmm_sequences_dict_for_splits, gene_names, max_num_genes_missing=0):
        """Drop bins that miss more than `max_num_genes_missing` of the genes in `gene_names`.

           The input is the output of `get_sequences_dict_for_hmm_hits_in_splits`. A short
           report of what is kept and what is removed is sent to `self.run`.

           Returns a tuple: (filtered dictionary, set of removed bin names). When no bin
           is removed, the input dictionary itself is returned along with an empty set."""

        missing_counts = self.get_num_genes_missing_per_bin_dict(hmm_sequences_dict_for_splits, gene_names)

        all_bins = set(missing_counts)
        bins_to_remove = {name for name, num_missing in missing_counts.items() if num_missing > max_num_genes_missing}
        bins_to_keep = all_bins - bins_to_remove

        self.run.info_single("Hi there! The anvi'o function that kills bins is speaking (we are here because you used\
                              the --max-num-genes-missing-from-bin parameter to remove bins that are not good enough for\
                              your analysis becasue they are missing lots of genes. What follows is a report of what \
                              happened.", nl_before=1, nl_after=1)

        kept_label = 'Bins that missed at most %d of %d genes (%d)' % (max_num_genes_missing, len(gene_names), len(bins_to_keep))
        removed_label = 'Bins that are no more in the analysis (%d)' % (len(bins_to_remove))
        removed_text = ', '.join(bins_to_remove) if bins_to_remove else 'None. Lovely.'

        self.run.info('All bins (%d)' % len(all_bins), ', '.join(all_bins), nl_after=1)
        self.run.info(kept_label, ', '.join(bins_to_keep), nl_after=1, mc='green')
        self.run.info(removed_label, removed_text, nl_after=1, mc='red')

        if not bins_to_remove:
            return (hmm_sequences_dict_for_splits, set([]))

        return (utils.get_filtered_dict(hmm_sequences_dict_for_splits, 'bin_id', bins_to_keep), bins_to_remove)
コード例 #13
0
ファイル: hmmops.py プロジェクト: gitter-badger/anvio
    def get_hmm_sequences_dict_for_splits(self, splits_dict):
        """Collect the sequences of HMM hits that fall into the splits of the given bins.

           `splits_dict` is what you get from ccollections.GetSplitNamesInBins(args).get_dict(),
           and its structure goes like this:

                {
                    'bin_x': set(['split_a', 'split_b', ...]),
                    'bin_y': set(['split_c', 'split_d', ...]),
                    ...
                }

           The result is keyed by the unique identifier of each gene. Every value holds
           'sequence', 'source', 'bin_id', 'gene_name', 'e_value', 'contig', 'start',
           'stop', and 'length'. A gene that shows up in more than one split is reported
           once, for the first split it is seen in. An empty dictionary is returned when
           there are no hits.
        """

        split_names = set().union(*splits_dict.values())

        hits_in_splits = utils.get_filtered_dict(self.hmm_hits_splits, 'split', split_names)

        split_name_to_bin_id = {split_name: bin_id
                                for bin_id in splits_dict
                                for split_name in splits_dict[bin_id]}

        if not hits_in_splits:
            return {}

        sequences_by_gene = {}
        seen_unique_ids = set([])

        for split_entry in hits_in_splits.values():
            hit = self.hmm_hits[split_entry['hmm_hit_entry_id']]

            split_name = split_entry['split']
            source = hit['source']
            gene_name = hit['gene_name']
            e_value = hit['e_value']
            unique_id = hit['gene_unique_identifier']

            # a gene spanning several splits must only be reported once
            if unique_id in seen_unique_ids:
                continue
            seen_unique_ids.add(unique_id)

            gene_call = self.genes_in_contigs[hit['gene_callers_id']]

            contig_name = gene_call['contig']
            start = gene_call['start']
            stop = gene_call['stop']
            sequence = self.contig_sequences[contig_name]['sequence'][start:stop]

            record = {'sequence': sequence,
                      'source': source,
                      'bin_id': split_name_to_bin_id[split_name],
                      'gene_name': gene_name,
                      'e_value': e_value,
                      'contig': contig_name,
                      'start': start,
                      'stop': stop,
                      'length': stop - start}

            sequences_by_gene[unique_id] = record

        return sequences_by_gene
コード例 #14
0
ファイル: hmmops.py プロジェクト: giriarteS/anvio
    def get_hmm_sequences_dict_for_splits(self, splits_dict):
        """Collect the sequences of HMM hits that fall into the splits of the given bins.

           `splits_dict` is what you get from ccollections.GetSplitNamesInBins(args).get_dict(),
           and its structure goes like this:

                {
                    'bin_x': set(['split_a', 'split_b', ...]),
                    'bin_y': set(['split_c', 'split_d', ...]),
                    ...
                }

           The result is keyed by the unique identifier of each gene. Every value holds
           'sequence', 'source', 'bin_id', 'gene_name', 'e_value', 'contig', 'start',
           'stop', and 'length'. A gene that shows up in more than one split is reported
           once, for the first split it is seen in. An empty dictionary is returned when
           there are no hits.
        """

        split_names = set().union(*splits_dict.values())

        hits_in_splits = utils.get_filtered_dict(self.hmm_hits_splits, 'split', split_names)

        split_name_to_bin_id = {split_name: bin_id
                                for bin_id in splits_dict
                                for split_name in splits_dict[bin_id]}

        if not hits_in_splits:
            return {}

        sequences_by_gene = {}
        seen_unique_ids = set([])

        for split_entry in hits_in_splits.values():
            hit = self.hmm_hits[split_entry['hmm_hit_entry_id']]

            split_name = split_entry['split']
            source = hit['source']
            gene_name = hit['gene_name']
            e_value = hit['e_value']
            unique_id = hit['gene_unique_identifier']

            # a gene spanning several splits must only be reported once
            if unique_id in seen_unique_ids:
                continue
            seen_unique_ids.add(unique_id)

            gene_call = self.genes_in_contigs[hit['gene_callers_id']]

            contig_name = gene_call['contig']
            start = gene_call['start']
            stop = gene_call['stop']
            sequence = self.contig_sequences[contig_name]['sequence'][start:stop]

            record = {'sequence': sequence,
                      'source': source,
                      'bin_id': split_name_to_bin_id[split_name],
                      'gene_name': gene_name,
                      'e_value': e_value,
                      'contig': contig_name,
                      'start': start,
                      'stop': stop,
                      'length': stop - start}

            sequences_by_gene[unique_id] = record

        return sequences_by_gene
コード例 #15
0
ファイル: splitter.py プロジェクト: satish162/anvio
    def init(self):
        """Identify which gene calls to focus on and record where they came from.

        Exactly one of three modes is used, checked in this order:

            - `self.gene_caller_ids` is set: the ids are parsed from that value
              with `self.delimiter`.
            - `self.use_hmm` is set: gene calls of HMM hits whose gene name equals
              `self.search_term`, limited to `self.hmm_sources`.
            - otherwise: gene calls reported by a function search for
              `self.search_term`.

        Sets `self.sources` and `self.gene_caller_ids_of_interest` (a set), and in
        the HMM and function modes appends a label to `self.targets`.
        """

        self.sanity_check()

        self.run.warning(None, header="Initialization bleep bloops", lc="cyan")

        if self.gene_caller_ids:
            # mode 1: the user told us exactly which gene calls they want
            self.run.info('Mode', 'User-provided gene caller id(s)')

            gene_caller_ids_of_interest = list(
                utils.get_gene_caller_ids_from_args(self.gene_caller_ids,
                                                    self.delimiter))
            self.sources = ['gene_caller_ids']
        elif self.use_hmm:
            # mode 2: find gene calls through HMM hits that carry the search term as gene name
            self.run.info('Mode', 'HMM search')

            s = hmmops.SequencesForHMMHits(self.input_contigs_db_path,
                                           sources=self.hmm_sources)

            self.run.info('Search term', self.search_term, mc='green')
            self.run.info('HMM sources being used', ', '.join(s.sources))

            hmm_hits = utils.get_filtered_dict(s.hmm_hits, 'gene_name',
                                               {self.search_term})
            gene_caller_ids_of_interest = [
                entry['gene_callers_id'] for entry in hmm_hits.values()
            ]

            self.targets.append('HMMs')
            self.sources = s.sources
        else:
            # mode 3: fall back to searching functional annotations
            self.run.info('Mode', 'Function search')

            contigs_db = dbops.ContigsSuperclass(self.args, r=self.run_object)
            # use functional annotation
            contigs_db.init_functions()
            self.run.info('Search term', self.search_term, mc='green')
            self.run.info('Function calls being used',
                          ', '.join(contigs_db.gene_function_call_sources))

            # only the second element of the returned pair is used below
            foo, search_report = contigs_db.search_for_gene_functions(
                [self.search_term], verbose=True)
            # gene id's of genes with the searched function
            gene_caller_ids_of_interest = [i[0] for i in search_report]

            self.targets.append('functions')
            self.sources = contigs_db.gene_function_call_sources

        # Multiple sources could annotate the same gene, so make sure the list is unique
        self.gene_caller_ids_of_interest = set(gene_caller_ids_of_interest)

        # NOTE(review): this call goes through the bare name `run`, not `self.run` as
        # everywhere else in this method -- presumably a module-level object; confirm.
        # Nothing visible here handles the case where no gene matched.
        if len(self.gene_caller_ids_of_interest):
            run.info('Matching genes',
                     '%d genes matched your search' %
                     len(self.gene_caller_ids_of_interest),
                     mc='green',
                     nl_after=1)
コード例 #16
0
ファイル: hmmops.py プロジェクト: paczian/anvio
    def __init__(self,
                 contigs_db_path,
                 sources=set([]),
                 run=run,
                 progress=progress):
        """Load the HMM search tables of a contigs database, optionally limited to `sources`.

        Parameters
        ==========
        contigs_db_path : str
            Path handed to `db.DB` together with `anvio.__contigs__version__`.
        sources : set
            Names of the HMM sources to keep. Falsy members are discarded. An empty
            set means every source found in the HMM hits info table is used.
        run, progress
            Accepted for interface compatibility; not used in this initializer.

        Raises
        ======
        ConfigError
            If `sources` is not a set, or if a requested source is not present in
            the HMM hits info table.
        """
        if not isinstance(sources, set):
            raise ConfigError("'sources' variable has to be a set instance.")

        self.sources = set([s for s in sources if s])

        # take care of contigs db related stuff and move on (the connection is
        # released even if reading one of the tables fails):
        contigs_db = db.DB(contigs_db_path, anvio.__contigs__version__)
        try:
            self.search_info_table = contigs_db.get_table_as_dict(
                t.hmm_hits_info_table_name)
            self.search_table_splits = contigs_db.get_table_as_dict(
                t.hmm_hits_splits_table_name)
            self.search_table_contigs = contigs_db.get_table_as_dict(
                t.hmm_hits_contigs_table_name)
            self.contig_sequences = contigs_db.get_table_as_dict(
                t.contig_sequences_table_name, string_the_key=True)
        finally:
            contigs_db.disconnect()

        missing_sources = [
            s for s in self.sources if s not in self.search_info_table
        ]
        if missing_sources:
            raise ConfigError('Some of the requested sources were not found in the contigs database :/\
                                Here is a list of the ones that are missing: %s' % ', '.join(
                missing_sources))

        if self.sources:
            self.search_table_splits = utils.get_filtered_dict(
                self.search_table_splits, 'source', self.sources)
            self.search_table_contigs = utils.get_filtered_dict(
                self.search_table_contigs, 'source', self.sources)
        else:
            # no explicit request: fall back to every source the database knows about.
            # list() keeps this a real list on Python 3, where keys() is a view.
            self.sources = list(self.search_info_table.keys())

        # create a map of all unique gene ids to contigs table entry ids for fast access:
        self.unique_id_to_contig_entry_id = {}
        for entry_id, entry in self.search_table_contigs.items():
            self.unique_id_to_contig_entry_id[entry['gene_unique_identifier']] = entry_id
コード例 #17
0
ファイル: completeness.py プロジェクト: mortonjt/anvio
    def get_info_for_splits(self, split_names, min_e_value=1e-5):
        """Summarize completion and redundancy per HMM source for a set of splits.

        Hits with an e-value larger than `min_e_value` are ignored. For each source in
        `self.sources` the returned dictionary holds:

            - 'percent_complete': distinct gene names observed, as a percentage of
              the number of genes in `self.genes_in_db[source]`.
            - 'percent_redundancy': extra copies of gene names seen more than once,
              as a percentage of the same total.
            - 'redundants': for each gene name seen more than once, a list with the
              `self.splits_unique_gene_id_occurs` entry of each of its unique gene ids.
        """
        hits_in_splits = utils.get_filtered_dict(self.hmm_hits_splits_table, 'split', split_names)

        # per source: unique gene id -> hit summary, and gene name -> unique gene ids
        info_dict = {source: {} for source in self.sources}
        gene_name_to_unique_id = {source: {} for source in self.sources}

        for entry in hits_in_splits.values():
            hmm_hit = self.hmm_hits_table[entry['hmm_hit_entry_id']]

            e_value = hmm_hit['e_value']
            if e_value > min_e_value:
                continue

            source = hmm_hit['source']
            gene_name = hmm_hit['gene_name']
            percentage = entry['percentage_in_split']
            gene_unique_id = hmm_hit['gene_unique_identifier']

            hits_for_source = info_dict[source]
            if gene_unique_id not in hits_for_source:
                hits_for_source[gene_unique_id] = {'gene_name': gene_name, 'percentage': percentage, 'e_value': e_value}
            else:
                # the same gene seen in another split: add up the fractions
                hits_for_source[gene_unique_id]['percentage'] += percentage

            gene_name_to_unique_id[source].setdefault(gene_name, set()).add(gene_unique_id)

        results_dict = {source: {} for source in self.sources}

        for source in self.sources:
            result = results_dict[source]
            genes_count = Counter([v['gene_name'] for v in info_dict[source].values()])

            result['percent_complete'] = len(genes_count) * 100.0 / len(self.genes_in_db[source])

            multi_copy_genes = [g for g in genes_count if genes_count[g] > 1]
            extra_copies = sum([genes_count[g] - 1 for g in multi_copy_genes])
            result['percent_redundancy'] = extra_copies * 100.0 / len(self.genes_in_db[source])

            # which splits contribute the same single-copy gene
            result['redundants'] = {
                gene_name: [self.splits_unique_gene_id_occurs[unique_gene_id]
                            for unique_gene_id in gene_name_to_unique_id[source][gene_name]]
                for gene_name in multi_copy_genes
            }

        return results_dict
コード例 #18
0
    def get_genes_and_functions_from_contigs_db(self, contigs_db_path):
        """This method will extract a list of gene attributes from each contig within a contigsDB.

        Only annotations whose source equals `self.annotation_source` are considered.
        Genes without such an annotation are reported with the accession
        "unknown-function".

        Parameters
        ==========
        contigs_db_path : str
            Path handed to `dbops.ContigsDatabase`.

        Returns
        =======
        output : list of lists
            first element is gene_caller_id, second is function accession (with
            spaces removed), third is the contig name
        """

        # get contigsDB
        contigs_db = dbops.ContigsDatabase(contigs_db_path)

        # extract contigs names
        genes_in_contigs = contigs_db.db.get_table_as_dict(
            t.genes_in_contigs_table_name)

        # extract annotations and filter for the sources designated by user using self.annotation_source
        annotations_dict = contigs_db.db.get_table_as_dict(
            t.gene_function_calls_table_name)
        annotations_dict = utils.get_filtered_dict(
            annotations_dict, 'source', set([self.annotation_source]))

        # Make dict with gene-caller-id:accession
        gene_callers_id_to_accession_dict = {
            entry['gene_callers_id']: entry['accession']
            for entry in annotations_dict.values()
        }

        # List of lists [gene-caller-id, accession, contig-name]
        genes_and_functions_list = []
        for gene_callers_id in genes_in_contigs:
            if gene_callers_id in gene_callers_id_to_accession_dict:
                accession = gene_callers_id_to_accession_dict[gene_callers_id].replace(" ", "")
            else:
                # adding in "unknown annotation" if there is none
                accession = "unknown-function"

            # Always key on the gene caller id itself. The unannotated case used to
            # rely on a running counter, which only matches the id when ids happen
            # to be 0..N-1 in iteration order.
            contig_name = genes_in_contigs[gene_callers_id]['contig']
            genes_and_functions_list.append([gene_callers_id, accession, contig_name])

        return genes_and_functions_list
コード例 #19
0
ファイル: hmmops.py プロジェクト: meren/anvio
    def get_hmm_hits_in_splits(self, splits_dict):
        """Return the HMM hits found in the given splits, along with a split-to-bin lookup.

        `splits_dict` maps each bin id to an iterable of split names.

        Returns a tuple of two items: `self.hmm_hits_splits` filtered through
        `utils.get_filtered_dict` on its 'split' field using every split name
        found in `splits_dict`, and a dictionary mapping each split name to the
        bin id it was listed under.
        """
        # pool the split names of every bin into a single set
        every_split_name = set().union(*splits_dict.values())

        matching_hits = utils.get_filtered_dict(self.hmm_hits_splits, 'split', every_split_name)

        # if a split is listed under more than one bin, the bin visited last wins
        bin_id_for_split = {name: bin_id
                            for bin_id, names in splits_dict.items()
                            for name in names}

        return matching_hits, bin_id_for_split
コード例 #20
0
    def get_hmm_hits_in_splits(self, splits_dict):
        """Collect HMM hits for the splits listed in `splits_dict` (bin id -> split names).

        Returns `(hits_in_splits, split_name_to_bin_id)`: the entries of
        `self.hmm_hits_splits` kept by `utils.get_filtered_dict` for the 'split'
        field, and a reverse lookup from split name to bin id.
        """
        # first pass: the set of all split names across bins
        wanted_splits = set()
        for splits_of_bin in splits_dict.values():
            wanted_splits.update(splits_of_bin)

        hits_in_splits = utils.get_filtered_dict(self.hmm_hits_splits, 'split', wanted_splits)

        # second pass: reverse lookup from a split name to the bin that holds it
        split_name_to_bin_id = dict((split_name, bin_id)
                                    for bin_id in splits_dict
                                    for split_name in splits_dict[bin_id])

        return hits_in_splits, split_name_to_bin_id
コード例 #21
0
ファイル: summarizer.py プロジェクト: caglar10ur/anvio
    def store_sequences_for_hmm_hits(self):
        """Write the HMM hit sequences of this bin as FASTA entries, one output per HMM source.

        Sequences are obtained for `self.split_ids` under `self.bin_id`. For each
        source coming from `self.summary.hmm_searches_header` and from
        `self.summary.completeness.sources`, the matching entries are written to
        the handle returned by `self.get_output_file_handle`.
        """
        hmm_access = SequencesForHMMHits(self.summary.annotation_db_path)
        sequences_in_bin = hmm_access.get_hmm_sequences_dict_for_splits({self.bin_id: self.split_ids})

        # the header holds (search type, search source) pairs; only the source is needed
        sources_in_header = [pair[1] for pair in self.summary.hmm_searches_header]
        sources_in_completeness = self.summary.completeness.sources

        for source in sources_in_header + sources_in_completeness:
            sequences_for_source = utils.get_filtered_dict(sequences_in_bin, 'source', set([source]))
            output = self.get_output_file_handle('%s-hmm-sequences.txt' % source, key = source)

            for unique_id in sequences_for_source:
                header, sequence = hmm_access.get_FASTA_header_and_sequence_for_gene_unique_id(sequences_in_bin, unique_id)
                output.write('>%s\n%s\n' % (header, sequence))
コード例 #22
0
ファイル: splitter.py プロジェクト: AstrobioMike/anvio
    def init(self):
        """Identify the gene calls to focus on and store them in `self.gene_caller_ids_of_interest`.

        Gene calls are collected in one of three modes, checked in this order:

          - `self.gene_caller_ids` is set: ids are parsed from it using `self.delimiter`.
          - `self.use_hmm` is set: gene calls of the HMM hits whose 'gene_name'
            matches `self.search_term`.
          - otherwise: gene calls reported by a function search for `self.search_term`.

        `self.sources` is set in every mode, and 'HMMs' or 'functions' is appended
        to `self.targets` in the last two.
        """

        self.sanity_check()

        self.run.warning(None, header="Initialization bleep bloops", lc="cyan")

        if self.gene_caller_ids:
            self.run.info('Mode', 'User-provided gene caller id(s)')

            gene_caller_ids_of_interest = list(utils.get_gene_caller_ids_from_args(self.gene_caller_ids, self.delimiter))
            self.sources = ['gene_caller_ids']
        elif self.use_hmm:
            self.run.info('Mode', 'HMM search')

            s = hmmops.SequencesForHMMHits(self.input_contigs_db_path, sources=self.hmm_sources)

            self.run.info('Search term', self.search_term, mc='green')
            self.run.info('HMM sources being used', ', '.join(s.sources))

            hmm_hits = utils.get_filtered_dict(s.hmm_hits, 'gene_name', {self.search_term})
            gene_caller_ids_of_interest = [entry['gene_callers_id'] for entry in hmm_hits.values()]

            self.targets.append('HMMs')
            self.sources = s.sources
        else:
            self.run.info('Mode', 'Function search')

            contigs_db = dbops.ContigsSuperclass(self.args, r=self.run_object)
            # use functional annotation
            contigs_db.init_functions()
            self.run.info('Search term', self.search_term, mc='green')
            self.run.info('Function calls being used', ', '.join(contigs_db.gene_function_call_sources))

            _, search_report = contigs_db.search_for_gene_functions([self.search_term], verbose=True)
            # the first item of each search report entry is the gene caller id
            gene_caller_ids_of_interest = [i[0] for i in search_report]

            self.targets.append('functions')
            self.sources = contigs_db.gene_function_call_sources

        # Multiple sources could annotate the same gene, so make sure the list is unique
        self.gene_caller_ids_of_interest = set(gene_caller_ids_of_interest)

        if self.gene_caller_ids_of_interest:
            # report through the instance's own run object like every other message
            # in this method, rather than through a module-level `run`
            self.run.info('Matching genes',
                          '%d genes matched your search' % len(self.gene_caller_ids_of_interest),
                          mc='green', nl_after=1)
コード例 #23
0
ファイル: hmmops.py プロジェクト: yinx843/anvio
    def filter_hmm_sequences_dict_from_genes_that_occur_in_less_than_N_bins(self, hmm_sequences_dict_for_splits, min_num_bins_gene_occurs=None):
        """Drop genes from `hmm_sequences_dict_for_splits` that occur in too few bins.

           `min_num_bins_gene_occurs` is the minimum number of bins a gene has to occur in
           to stay in the dictionary. It must be a non-negative integer that does not exceed
           the number of distinct 'bin_id' values in the input; otherwise a ConfigError is raised.

           Returns a tuple of (dictionary, set of removed gene names). When no gene is removed,
           the input dictionary itself is returned together with an empty set."""

        if not isinstance(min_num_bins_gene_occurs, int):
            raise ConfigError("Funny. Someone called the function to filter gene names from HMM sequences dictionary if they occur in less than "
                              "a certain amount. But they didn't sen an integer for that amount :/")

        if min_num_bins_gene_occurs < 0:
            raise ConfigError("But the minimum number of bins a gene is expected to be found can't be a negative value now. Right? :/")

        # every distinct bin that has at least one entry in the input
        all_bins = set(entry['bin_id'] for entry in hmm_sequences_dict_for_splits.values())
        num_bins = len(all_bins)

        if min_num_bins_gene_occurs > num_bins:
            raise ConfigError("You are asking anvi'o to remove any gene that occurs in less than %d genomes (or bins), however, it seems you have only "
                              "%s genomes. Either you set a parameter that exceeds the number of genomes you actually have, or the previous filters "
                              "applied to your set of genes have removed all genes from some or all of your genomes :/ Anvi'o cannot know here what might "
                              "have gone wrong, but it kinda believes that it is all on your at this point :/" % (min_num_bins_gene_occurs, num_bins))

        occurrences = self.get_gene_num_occurrences_across_bins(hmm_sequences_dict_for_splits)

        all_genes = set(occurrences.keys())
        genes_to_remove = set(gene for gene in all_genes if occurrences[gene] < min_num_bins_gene_occurs)
        genes_to_keep = all_genes.difference(genes_to_remove)

        report_intro = ("Hi! The anvi'o function that was supposed to remove genes that were occurring in "
                        "less than X number of bins due to the use of `--min-num-bins-gene-occurs` is "
                        "speaking. What follows is a report of what happened after anvi'o tried to remove "
                        "genes that were occurring in at least %d of the %d bins you had at this point."
                        % (min_num_bins_gene_occurs, num_bins))
        self.run.info_single(report_intro, nl_before=1, nl_after=1)

        self.run.info('All genes (%d)' % len(all_genes), ', '.join(all_genes), nl_after=1)
        self.run.info('Genes occurred in at least %d of %d bins (%d)' % (min_num_bins_gene_occurs, num_bins, len(genes_to_keep)), ', '.join(genes_to_keep), nl_after=1, mc='green')
        self.run.info('Genes that are no more in the analysis (%d)' % (len(genes_to_remove)), ', '.join(genes_to_remove) if genes_to_remove else 'None.', nl_after=1, mc='red')

        # nothing to drop: hand the input back untouched
        if not genes_to_remove:
            return (hmm_sequences_dict_for_splits, set())

        return (utils.get_filtered_dict(hmm_sequences_dict_for_splits, 'gene_name', genes_to_keep), genes_to_remove)
コード例 #24
0
ファイル: hmmops.py プロジェクト: meren/anvio
    def filter_hmm_sequences_dict_from_genes_that_occur_in_less_than_N_bins(self, hmm_sequences_dict_for_splits, min_num_bins_gene_occurs=None):
        """This takes in your `hmm_sequences_dict_for_splits`, and removes genes that rarely occur across bins.

           The `min_num_bins_gene_occurs` parameter defines the minimum number of bins a gene must be
           present in. Genes that do not meet that criterion are removed.

           Returns a tuple of (dictionary, set of removed gene names). When no gene is removed, the
           input dictionary itself is returned together with an empty set.

           Raises ConfigError if `min_num_bins_gene_occurs` is not an integer, is negative, or exceeds
           the number of distinct 'bin_id' values found in `hmm_sequences_dict_for_splits`."""

        # NOTE: messages below are built with adjacent string literals rather than backslash
        # continuations inside a literal, which would embed the source indentation in the text.
        if not isinstance(min_num_bins_gene_occurs, int):
            raise ConfigError("Funny. Someone called the function to filter gene names from HMM sequences dictionary if they occur in less than "
                              "a certain amount. But they didn't sen an integer for that amount :/")

        if min_num_bins_gene_occurs < 0:
            raise ConfigError("But the minimum number of bins a gene is expected to be found can't be a negative value now. Right? :/")

        all_bins = set(entry['bin_id'] for entry in hmm_sequences_dict_for_splits.values())

        if min_num_bins_gene_occurs > len(all_bins):
            raise ConfigError("You are asking anvi'o to remove any gene that occurs in less than %d genomes (or bins), however, it seems you have only "
                              "%s genomes. Either you set a parameter that exceeds the number of genomes you actually have, or the previous filters "
                              "applied to your set of genes have removed all genes from some or all of your genomes :/ Anvi'o cannot know here what might "
                              "have gone wrong, but it kinda believes that it is all on your at this point :/" % (min_num_bins_gene_occurs, len(all_bins)))

        gene_occurrences_across_bins = self.get_gene_num_occurrences_across_bins(hmm_sequences_dict_for_splits)

        all_genes = set(gene_occurrences_across_bins.keys())
        genes_to_remove = set(gene_name for gene_name in all_genes
                              if gene_occurrences_across_bins[gene_name] < min_num_bins_gene_occurs)
        genes_to_keep = all_genes.difference(genes_to_remove)

        self.run.info_single("Hi! The anvi'o function that was supposed to remove genes that were occurring in "
                             "less than X number of bins due to the use of `--min-num-bins-gene-occurs` is "
                             "speaking. What follows is a report of what happened after anvi'o tried to remove "
                             "genes that were occurring in at least %d of the %d bins you had at this point."
                             % (min_num_bins_gene_occurs, len(all_bins)), nl_before=1, nl_after=1)

        self.run.info('All genes (%d)' % len(all_genes), ', '.join(all_genes), nl_after=1)
        self.run.info('Genes occurred in at least %d of %d bins (%d)' % (min_num_bins_gene_occurs, len(all_bins), len(genes_to_keep)), ', '.join(genes_to_keep), nl_after=1, mc='green')
        self.run.info('Genes that are no more in the analysis (%d)' % (len(genes_to_remove)), ', '.join(genes_to_remove) if genes_to_remove else 'None.', nl_after=1, mc='red')

        if genes_to_remove:
            return (utils.get_filtered_dict(hmm_sequences_dict_for_splits, 'gene_name', genes_to_keep), genes_to_remove)

        # nothing was removed, so the input dictionary is returned as is
        return (hmm_sequences_dict_for_splits, set())
コード例 #25
0
    def filter_hmm_sequences_dict_from_genes_that_occur_in_less_than_N_bins(self, hmm_sequences_dict_for_splits, min_num_bins_gene_occurs=None):
        """This takes in your `hmm_sequences_dict_for_splits`, and removes genes that rarely occur across bins.

           The `min_num_bins_gene_occurs` parameter defines the minimum number of bins a gene must be
           present in. Genes that do not meet that criterion are removed.

           Returns a tuple of (dictionary, set of removed gene names). When no gene is removed, the
           input dictionary itself is returned together with an empty set.

           Raises ConfigError if `min_num_bins_gene_occurs` is not an integer, is negative, or exceeds
           the number of distinct 'bin_id' values found in `hmm_sequences_dict_for_splits`."""

        # NOTE: messages below are built with adjacent string literals rather than backslash
        # continuations inside a literal, which would embed the source indentation in the text.
        if not isinstance(min_num_bins_gene_occurs, int):
            raise ConfigError("Funny. Someone called the function to filter gene names from HMM sequences dictionary if they occur in less than "
                              "a certain amount. But they didn't sen an integer for that amount :/")

        if min_num_bins_gene_occurs < 0:
            raise ConfigError("But the minimum number of bins a gene is expected to be found can't be a negative value now. Right? :/")

        all_bins = set(entry['bin_id'] for entry in hmm_sequences_dict_for_splits.values())

        if min_num_bins_gene_occurs > len(all_bins):
            raise ConfigError("OK. Well. This is awkward. You have like %d bins, eh? And you are asking anvi'o to remove any "
                              "that occurs in less than %d bins. Do you see the problem here? Maybe it is time to take a break "
                              "from work :(" % (len(all_bins), min_num_bins_gene_occurs))

        gene_occurrences_across_bins = self.get_gene_num_occurrences_across_bins(hmm_sequences_dict_for_splits)

        all_genes = set(gene_occurrences_across_bins.keys())
        genes_to_remove = set(gene_name for gene_name in all_genes
                              if gene_occurrences_across_bins[gene_name] < min_num_bins_gene_occurs)
        genes_to_keep = all_genes.difference(genes_to_remove)

        self.run.info_single("Hi! The anvi'o function that was supposed to remove genes that were occurring in "
                             "less than X number of bins due to the use of `--min-num-bins-gene-occurs` is "
                             "speaking. What follows is a report of what happened after anvi'o tried to remove "
                             "genes that were occurring in at least %d of the %d bins you had at this point."
                             % (min_num_bins_gene_occurs, len(all_bins)), nl_before=1, nl_after=1)

        self.run.info('All genes (%d)' % len(all_genes), ', '.join(all_genes), nl_after=1)
        self.run.info('Genes occurred in at least %d of %d bins (%d)' % (min_num_bins_gene_occurs, len(all_bins), len(genes_to_keep)), ', '.join(genes_to_keep), nl_after=1, mc='green')
        self.run.info('Genes that are no more in the analysis (%d)' % (len(genes_to_remove)), ', '.join(genes_to_remove) if genes_to_remove else 'None.', nl_after=1, mc='red')

        if genes_to_remove:
            return (utils.get_filtered_dict(hmm_sequences_dict_for_splits, 'gene_name', genes_to_keep), genes_to_remove)

        # nothing was removed, so the input dictionary is returned as is
        return (hmm_sequences_dict_for_splits, set())
コード例 #26
0
ファイル: ccollections.py プロジェクト: AstrobioMike/anvio
    def get_bins_info_dict(self, collection_name):
        """Return a dictionary of bin name -> {'html_color': ..., 'source': ...} for a collection.

        The bins info table is read from the database registered for
        `collection_name` in `self.collections_dict`, and only the rows whose
        'collection_name' matches are used.
        """
        self.sanity_check(collection_name)

        c = self.collections_dict[collection_name]

        database = db.DB(c['source_db_path'], c['source_db_version'])
        try:
            collections_bins_info_table = database.get_table_as_dict(t.collections_bins_info_table_name)
        finally:
            # make sure the connection is released even if reading the table fails
            database.disconnect()

        # FIXME: this could be resolved with a WHERE clause in the SQL query:
        collections_bins_info_table_filtered = utils.get_filtered_dict(collections_bins_info_table, 'collection_name', set([collection_name]))

        return {v['bin_name']: {'html_color': v['html_color'], 'source': v['source']}
                for v in collections_bins_info_table_filtered.values()}
コード例 #27
0
ファイル: ccollections.py プロジェクト: simatei/anvio
    def get_bins_info_dict(self, collection_name):
        """Build a lookup of color and source information for the bins of `collection_name`.

        Returns a dictionary keyed by bin name, where each value is a dictionary
        with the keys 'html_color' and 'source' taken from the bins info table of
        the database that `self.collections_dict` lists for the collection.
        """
        self.sanity_check(collection_name)

        c = self.collections_dict[collection_name]

        database = db.DB(c['source_db_path'], c['source_db_version'])
        try:
            bins_info_table = database.get_table_as_dict(t.collections_bins_info_table_name)
        finally:
            # do not leave the database connection open if the read raises
            database.disconnect()

        # FIXME: this could be resolved with a WHERE clause in the SQL query:
        bins_info_for_collection = utils.get_filtered_dict(bins_info_table, 'collection_name', set([collection_name]))

        bins_info_dict = {}
        for row in bins_info_for_collection.values():
            bins_info_dict[row['bin_name']] = {'html_color': row['html_color'], 'source': row['source']}

        return bins_info_dict
コード例 #28
0
ファイル: ccollections.py プロジェクト: caglar10ur/anvio
    def get_collection_colors(self, source):
        """Return a dictionary of cluster id -> html color for the collection `source`.

        The colors table is read from the database registered for `source` in
        `self.sources_dict`, and only the rows whose 'source' matches are used.
        """
        self.sanity_check(source)

        c = self.sources_dict[source]

        database = db.DB(c['source_db_path'], c['source_db_version'])
        try:
            collections_colors = database.get_table_as_dict(t.collections_colors_table_name)
        finally:
            # make sure the connection is released even if reading the table fails
            database.disconnect()

        # FIXME: this could be resolved with a WHERE clause in the SQL query:
        collection = utils.get_filtered_dict(collections_colors, 'source', set([source]))

        collection_color_dict = {}
        for entry in collection.values():
            collection_color_dict[entry['cluster_id']] = entry['htmlcolor']

        return collection_color_dict
コード例 #29
0
    def get_collection_colors(self, source):
        """Look up the html color of every cluster in the collection `source`.

        Returns a dictionary keyed by 'cluster_id' with 'htmlcolor' values, built
        from the rows of the colors table whose 'source' field equals `source`.
        """
        self.sanity_check(source)

        c = self.sources_dict[source]

        database = db.DB(c['source_db_path'], c['source_db_version'])
        try:
            collections_colors = database.get_table_as_dict(
                t.collections_colors_table_name)
        finally:
            # do not leave the database connection open if the read raises
            database.disconnect()

        # FIXME: this could be resolved with a WHERE clause in the SQL query:
        collection = utils.get_filtered_dict(collections_colors, 'source',
                                             set([source]))

        return dict((entry['cluster_id'], entry['htmlcolor'])
                    for entry in collection.values())
コード例 #30
0
ファイル: bottleroutes.py プロジェクト: ascendo/anvio
def get_hmm_hit_from_bin(args, d, request, response, bin_name, gene_name):
    """Return, as a JSON string, the longest HMM hit for `gene_name` in the bin `bin_name`.

    On success the JSON object holds the keys 'sequence' and 'header'. Every
    failure (not in 'collection' mode, empty collection, unknown bin, no hit for
    the gene) is reported as a JSON object with a single 'error' key.
    """
    set_default_headers(response)

    if d.mode != 'collection':
        return json.dumps({'error': "HMM hits from bins can only be requested in 'collection' mode. You are doing something wrong..."})

    if not d.collection:
        return json.dumps({'error': "You are in 'collection' mode, but your collection is empty. You are killing me."})

    # the bin name comes from the request; answer with an error message like the
    # other failure paths instead of letting the lookup below raise a KeyError
    if bin_name not in d.collection:
        return json.dumps({'error': "Sorry. There is no bin named %s in your collection." % bin_name})

    hmm_sequences_dict = d.hmm_access.get_hmm_sequences_dict_for_splits({bin_name: set(d.collection[bin_name])})
    gene_sequences = utils.get_filtered_dict(hmm_sequences_dict, 'gene_name', set([gene_name]))

    if not gene_sequences:
        return json.dumps({'error': "Sorry. It seems %s does not have a hit for %s." % (bin_name, gene_name)})

    # longest hit wins; ties on length are broken by the larger gene id, which is
    # what sorting (length, gene_id) pairs in reverse and taking the first one does
    unique_id_for_longest_hit = max(gene_sequences, key=lambda gene_id: (gene_sequences[gene_id]['length'], gene_id))

    header, sequence = d.hmm_access.get_FASTA_header_and_sequence_for_gene_unique_id(gene_sequences, unique_id_for_longest_hit)

    return json.dumps({'sequence': sequence, 'header': header})
コード例 #31
0
ファイル: hmmops.py プロジェクト: QinLab/anvio
    def filter_hmm_sequences_dict_from_genes_that_occur_in_less_than_N_bins(
            self,
            hmm_sequences_dict_for_splits,
            min_num_bins_gene_occurs=None):
        """This takes in your `hmm_sequences_dict_for_splits`, and removes genes that rarely occur across bins.

           The `min_num_bins_gene_occurs` parameter defines the minimum number of bins a gene must be
           present in. Genes that do not meet that criterion are removed.

           Returns a tuple of (dictionary, set of removed gene names). When no gene is removed, the
           input dictionary itself is returned together with an empty set.

           Raises ConfigError if `min_num_bins_gene_occurs` is not an integer or is negative."""

        if not isinstance(min_num_bins_gene_occurs, int):
            # adjacent literals instead of a backslash continuation inside the string,
            # which would embed the source indentation in the message
            raise ConfigError(
                "Funny. Someone called the function to filter gene names from HMM sequences dictionary if they occur in less than "
                "a certain amount. But they didn't sen an integer for that amount :/"
            )

        if min_num_bins_gene_occurs < 0:
            raise ConfigError(
                "But the minimum number of bins a gene is expected to be found can't be a negative value now. Right? :/"
            )

        gene_occurrences_across_bins = self.get_gene_num_occurrences_across_bins(
            hmm_sequences_dict_for_splits)

        all_genes = set(gene_occurrences_across_bins.keys())
        genes_to_remove = set(
            gene_name for gene_name in all_genes
            if gene_occurrences_across_bins[gene_name] < min_num_bins_gene_occurs)
        genes_to_keep = all_genes.difference(genes_to_remove)

        if genes_to_remove:
            return (utils.get_filtered_dict(hmm_sequences_dict_for_splits,
                                            'gene_name',
                                            genes_to_keep), genes_to_remove)

        # nothing was removed, so the input dictionary is returned as is
        return (hmm_sequences_dict_for_splits, set())
コード例 #32
0
ファイル: completeness.py プロジェクト: mschecht/anvio
    def __init__(self,
                 contigs_db_path,
                 scg_domain_classifier_path=None,
                 source_requested=None,
                 run=run,
                 progress=progress):
        """Read the single-copy core gene HMM hits stored in a contigs database.

        Parameters:
          contigs_db_path: path of the contigs database to read from.
          scg_domain_classifier_path: accepted but not used in this method.
          source_requested: when given, only this single-copy HMM source is kept.
            A ConfigError is raised if it is not among the single-copy sources
            found in the database.
          run, progress: terminal objects used for reporting.

        If the database holds single-copy sources for domains the SCG domain
        predictor does not know, a warning is issued and
        `self.initialized_properly` is set to False.
        """
        self.run = run
        self.progress = progress
        self.initialized_properly = True

        self.SCG_domain_predictor = scgdomainclassifier.Predict(
            argparse.Namespace(),
            run=terminal.Run(verbose=False),
            progress=self.progress)

        # hi db
        contigs_db = dbops.ContigsDatabase(contigs_db_path)

        # read info table to get what is available in the db
        info_table = contigs_db.db.get_table_as_dict(
            t.hmm_hits_info_table_name)

        # identify and remove non-single-copy sources of hmm search results:
        non_singlecopy_sources = {
            k for k in info_table
            if info_table[k]['search_type'] != 'singlecopy'
        }
        singlecopy_sources = {
            k for k in info_table
            if info_table[k]['search_type'] == 'singlecopy'
        }
        for non_singlecopy_source in non_singlecopy_sources:
            info_table.pop(non_singlecopy_source)

        # get the hmm hits table
        self.hmm_hits_table = contigs_db.db.get_table_as_dict(
            t.hmm_hits_table_name)

        # read search table (which holds hmmscan hits for splits).
        self.hmm_hits_splits_table = utils.get_filtered_dict(
            contigs_db.db.get_table_as_dict(t.hmm_hits_splits_table_name),
            'source', singlecopy_sources)

        # an example entry in self.hmm_hits_splits_table looks like this:
        #
        # {
        #    'percentage_in_split'   : 69.6763202725724,
        #    'source'                : u'Bacteria_74',
        #    'split'                 : u'ANTARCTICAAQUATIC_SMPL_SITE231_3.0UMcontig18439_split_00001',
        #    'hmm_hit_entry_id'      : 1
        # }
        #

        # a little convenience for potential clients: the first whitespace-separated
        # token of each source's 'ref' field that starts with 'http'
        self.http_refs = {}
        for source_in_db in info_table:
            self.http_refs[source_in_db] = [
                h for h in info_table[source_in_db]['ref'].split()
                if h.startswith('http')
            ][0]

        self.genes_in_db = {
            s: info_table[s]['genes'].split(', ')
            for s in info_table
        }

        # we're done with the db
        contigs_db.disconnect()

        self.sources = list(info_table.keys())
        self.domains = {info_table[source]['domain'] for source in self.sources}
        self.source_to_domain = {
            source: info_table[source]['domain']
            for source in self.sources
        }
        self.domain_to_sources = [(domain, [
            source for source in self.sources
            if info_table[source]['domain'] == domain
        ]) for domain in self.domains]

        self.domains_missing_in_SCG_domain_predictor = [
            d for d in self.domains
            if d not in self.SCG_domain_predictor.SCG_domains
        ]
        if self.domains_missing_in_SCG_domain_predictor:
            num_domains_missing = len(
                self.domains_missing_in_SCG_domain_predictor)
            # adjacent literals instead of backslash continuations inside the string,
            # which would embed the source indentation in the message
            self.run.warning(
                "OK. We have a problem. You seem to have single-copy core gene collections for among your HMM hits %s that "
                "are not included when the anvi'o domain predictor was trained :/ Here is the list of domains that are making "
                "us upset here: \"%s\". This means either you put a new HMM single-copy core gene collection to the anvi'o HMMs "
                "directory, or gave it as a parameter, and run `anvi-run-hmms` without updating the classifier anvi'o uses to "
                "resolve domains for proper completion/redundancy estimates."
                % ('a domain' if num_domains_missing == 1 else '%s domains' % num_domains_missing,
                   ', '.join(self.domains_missing_in_SCG_domain_predictor)))
            self.initialized_properly = False

        if source_requested:
            if source_requested not in self.sources:
                raise ConfigError(
                    'Requested source "%s" is not one of the single-copy gene sources found in the database.'
                    % source_requested)

            # filter out sources that are not requested
            self.sources = [source_requested]
            self.genes_in_db = {
                source_requested: self.genes_in_db[source_requested]
            }
            self.hmm_hits_splits_table = utils.get_filtered_dict(
                self.hmm_hits_splits_table, 'source', set([source_requested]))

        # these will be very useful later. trust me.
        self.unique_gene_id_to_gene_name = {}
        self.splits_unique_gene_id_occurs = {}
        for entry in self.hmm_hits_splits_table.values():
            hmm_hit = self.hmm_hits_table[entry['hmm_hit_entry_id']]
            gene_unique_identifier = hmm_hit['gene_unique_identifier']

            if gene_unique_identifier not in self.unique_gene_id_to_gene_name:
                self.unique_gene_id_to_gene_name[
                    gene_unique_identifier] = hmm_hit['gene_name']

            # collect every split the gene occurs in, in table iteration order
            self.splits_unique_gene_id_occurs.setdefault(
                gene_unique_identifier, []).append(entry['split'])
コード例 #33
0
ファイル: completeness.py プロジェクト: mortonjt/anvio
    def __init__(self, contigs_db_path, source=None, run=run, progress=progress):
        """Read the single-copy gene HMM hits stored in a contigs database.

        Parameters:
          contigs_db_path: path of the contigs database to read from.
          source: when given, only this single-copy HMM source is kept. A
            ConfigError is raised if it is not among the single-copy sources
            found in the database.
          run, progress: terminal objects used for reporting.
        """
        self.run = run
        self.progress = progress

        # hi db
        contigs_db = dbops.ContigsDatabase(contigs_db_path)

        # read info table to get what is available in the db
        info_table = contigs_db.db.get_table_as_dict(t.hmm_hits_info_table_name)

        # identify and remove non-single-copy sources of hmm search results:
        non_singlecopy_sources = set([k for k in info_table if info_table[k]['search_type'] != 'singlecopy'])
        singlecopy_sources = set([k for k in info_table if info_table[k]['search_type'] == 'singlecopy'])
        for non_singlecopy_source in non_singlecopy_sources:
            info_table.pop(non_singlecopy_source)

        # get the hmm hits table
        self.hmm_hits_table = contigs_db.db.get_table_as_dict(t.hmm_hits_table_name)

        # read search table (which holds hmmscan hits for splits).
        self.hmm_hits_splits_table = utils.get_filtered_dict(contigs_db.db.get_table_as_dict(t.hmm_hits_splits_table_name), 'source', singlecopy_sources)

        # an example entry in self.hmm_hits_splits_table looks like this:
        #
        # {
        #    'percentage_in_split'   : 69.6763202725724,
        #    'source'                : u'Campbell_et_al',
        #    'split'                 : u'ANTARCTICAAQUATIC_SMPL_SITE231_3.0UMcontig18439_split_00001',
        #    'hmm_hit_entry_id'      : 1
        # }
        #

        # a little convenience for potential clients: the first whitespace-separated
        # token of each source's 'ref' field that starts with 'http'
        self.http_refs = {}
        for source_in_db in info_table:
            self.http_refs[source_in_db] = [h for h in info_table[source_in_db]['ref'].split() if h.startswith('http')][0]

        self.genes_in_db = dict([(s, info_table[s]['genes'].split(', ')) for s in info_table])

        # we're done with the db
        contigs_db.disconnect()

        # keep a real list (not a dictionary view) so the attribute behaves the
        # same under Python 2 and Python 3
        self.sources = list(info_table.keys())

        if source:
            if source not in self.sources:
                # call form of raise: the `raise E, msg` statement is a syntax error in Python 3
                raise ConfigError('Source "%s" is not one of the single-copy gene sources found in the database.' % source)

            # filter out sources that are not requested
            self.sources = [source]
            self.genes_in_db = {source: self.genes_in_db[source]}
            self.hmm_hits_splits_table = utils.get_filtered_dict(self.hmm_hits_splits_table, 'source', set([source]))

        # these will be very useful later. trust me.
        self.unique_gene_id_to_gene_name = {}
        self.splits_unique_gene_id_occurs = {}
        for entry in self.hmm_hits_splits_table.values():
            hmm_hit = self.hmm_hits_table[entry['hmm_hit_entry_id']]
            gene_unique_identifier = hmm_hit['gene_unique_identifier']

            if gene_unique_identifier not in self.unique_gene_id_to_gene_name:
                self.unique_gene_id_to_gene_name[gene_unique_identifier] = hmm_hit['gene_name']

            if gene_unique_identifier not in self.splits_unique_gene_id_occurs:
                self.splits_unique_gene_id_occurs[gene_unique_identifier] = [entry['split']]
            else:
                self.splits_unique_gene_id_occurs[gene_unique_identifier].append(entry['split'])
コード例 #34
0
ファイル: completeness.py プロジェクト: wangmz0617/anvio
    def __init__(self,
                 contigs_db_path,
                 scg_domain_classifier_path=None,
                 source_requested=None,
                 run=run,
                 progress=progress):
        self.run = run
        self.progress = progress

        self.SCG_comain_predictor = scgdomainclassifier.Predict(
            argparse.Namespace(),
            run=terminal.Run(verbose=False),
            progress=self.progress)

        # hi db
        contigs_db = dbops.ContigsDatabase(contigs_db_path)

        # read info table to get what is available in the db
        info_table = contigs_db.db.get_table_as_dict(
            t.hmm_hits_info_table_name)

        # identify and remove non-single-copy sources of hmm search results:
        non_singlecopy_sources = set([
            k for k in list(info_table.keys())
            if info_table[k]['search_type'] != 'singlecopy'
        ])
        singlecopy_sources = set([
            k for k in list(info_table.keys())
            if info_table[k]['search_type'] == 'singlecopy'
        ])
        for non_singlecopy_source in non_singlecopy_sources:
            info_table.pop(non_singlecopy_source)

        # get the hmm hits table
        self.hmm_hits_table = contigs_db.db.get_table_as_dict(
            t.hmm_hits_table_name)

        # read search table (which holds hmmscan hits for splits).
        self.hmm_hits_splits_table = utils.get_filtered_dict(
            contigs_db.db.get_table_as_dict(t.hmm_hits_splits_table_name),
            'source', singlecopy_sources)

        # an example entry in self.hmm_hits_splits_table looks loke this:
        #
        # {
        #    'percentage_in_split'   : 69.6763202725724,
        #    'source'                : u'Campbell_et_al',
        #    'split'                 : u'ANTARCTICAAQUATIC_SMPL_SITE231_3.0UMcontig18439_split_00001',
        #    'hmm_hit_entry_id'      : 1
        # }
        #

        # a little convenience for potential clients:
        self.http_refs = {}
        for source_in_db in info_table:
            self.http_refs[source_in_db] = [
                h for h in info_table[source_in_db]['ref'].split()
                if h.startswith('http')
            ][0]

        self.genes_in_db = dict([(s, info_table[s]['genes'].split(', '))
                                 for s in info_table])

        # we're done with the db
        contigs_db.disconnect()

        self.sources = list(info_table.keys())
        self.domains = set(
            [info_table[source]['domain'] for source in self.sources])
        self.source_to_domain = dict([(source, info_table[source]['domain'])
                                      for source in self.sources])
        self.domain_to_sources = [(domain, [
            source for source in self.sources
            if info_table[source]['domain'] == domain
        ]) for domain in self.domains]

        self.missing_SCG_HMMs = [
            s for s in self.SCG_comain_predictor.SCG_sources
            if s not in self.sources
        ]
        if len(self.missing_SCG_HMMs):
            self.run.warning("Sad news :( Anvi'o completion estimations require all single-copy core gene (SCG) collections to be run on contigs\
                              databases. Yet your contigs database '%s' is lacking these default ones: '%s'. If you need completion estimates,\
                              the easiest solution is to run `anvi-run-hmms` on your contigs database. This will make sure all SCG HMM hits are \
                              stored in your contigs database. Alternatively you can run the same program with `-I XXX` parameter where XXX is one\
                              of the missing SCG HMM collection (in case you are only missing one of the SCG collections)." \
                                                                                    % (contigs_db_path, ', '.join(self.missing_SCG_HMMs)))

        if source_requested:
            if source_requested not in self.sources:
                raise ConfigError(
                    'Requested source "%s" is not one of the single-copy gene sources found in the database.'
                    % source_requested)

            # filter out sources that are not requested
            self.sources = [source_requested]
            self.genes_in_db = {
                source_requested: self.genes_in_db[source_requested]
            }
            self.hmm_hits_splits_table = utils.get_filtered_dict(
                self.hmm_hits_splits_table, 'source', set([source_requested]))

        self.unique_gene_id_to_gene_name = {}
        self.splits_unique_gene_id_occurs = {}
        # these will be very useful later. trust me.
        for entry in list(self.hmm_hits_splits_table.values()):
            hmm_hit = self.hmm_hits_table[entry['hmm_hit_entry_id']]
            gene_unique_identifier = hmm_hit['gene_unique_identifier']

            if gene_unique_identifier not in self.unique_gene_id_to_gene_name:
                self.unique_gene_id_to_gene_name[
                    gene_unique_identifier] = hmm_hit['gene_name']

            if gene_unique_identifier not in self.splits_unique_gene_id_occurs:
                self.splits_unique_gene_id_occurs[gene_unique_identifier] = [
                    entry['split']
                ]
            else:
                self.splits_unique_gene_id_occurs[
                    gene_unique_identifier].append(entry['split'])
コード例 #35
0
ファイル: hmmops.py プロジェクト: meren/anvio
    def get_sequences_dict_for_hmm_hits_in_splits(self, splits_dict, return_amino_acid_sequences=False, return_best_hits=False):
        """Collect one sequence record per HMM hit found in the given splits.

        `splits_dict` is what ccollections.GetSplitNamesInBins(args).get_dict()
        returns, i.e. bin names mapped to the split names they contain:

            {
                'bin_x': set of split names,
                'bin_y': set of split names,
                ...
            }

        The returned dictionary is keyed by '<source>___<gene unique identifier>'
        and every hit appears in it only once, even when it is listed for more
        than one split. DNA sequences are returned by default (reverse
        complemented for gene calls whose direction is not 'f'); when
        `return_amino_acid_sequences` is True the sequences are taken from
        self.aa_sequences instead.

        When `return_best_hits` is True, the dictionary is handed to
        filter_hmm_sequences_dict_for_splits_to_keep_only_best_hits() and its
        result is returned.

        Side effect: self.hmm_hits_splits and self.hmm_hits are filtered down
        to self.sources; if self.sources is empty it is filled with every
        source in self.hmm_hits_info instead.
        """

        # limit the hit tables to the sources of interest, or adopt every
        # known source when none was set
        if not len(self.sources):
            self.sources = list(self.hmm_hits_info.keys())
        else:
            self.hmm_hits_splits = utils.get_filtered_dict(self.hmm_hits_splits, 'source', self.sources)
            self.hmm_hits = utils.get_filtered_dict(self.hmm_hits, 'source', self.sources)

        hits_in_splits, split_name_to_bin_id = self.get_hmm_hits_in_splits(splits_dict)

        sequences_by_hit = {}
        seen_hit_ids = set()

        for split_entry in hits_in_splits.values():
            hit = self.hmm_hits[split_entry['hmm_hit_entry_id']]

            split_name = split_entry['split']
            source, gene_name, e_value = hit['source'], hit['gene_name'], hit['e_value']
            hit_unique_id = '___'.join((source, hit['gene_unique_identifier']))

            # a hit that spans several splits must be reported only once
            if hit_unique_id in seen_hit_ids:
                continue
            seen_hit_ids.add(hit_unique_id)

            gene_callers_id = hit['gene_callers_id']
            gene_call = self.genes_in_contigs[gene_callers_id]

            contig_name = gene_call['contig']
            start = gene_call['start']
            stop = gene_call['stop']
            on_reverse_strand = gene_call['direction'] != 'f'

            if not return_amino_acid_sequences:
                sequence = self.contig_sequences[contig_name]['sequence'][start:stop]
                if on_reverse_strand:
                    sequence = utils.rev_comp(sequence)
            else:
                sequence = self.aa_sequences[gene_callers_id]['sequence']

            sequences_by_hit[hit_unique_id] = dict(sequence=sequence,
                                                   source=source,
                                                   bin_id=split_name_to_bin_id[split_name],
                                                   gene_name=gene_name,
                                                   e_value=e_value,
                                                   contig=contig_name,
                                                   start=start,
                                                   stop=stop,
                                                   gene_callers_id=gene_callers_id,
                                                   rev_comped=on_reverse_strand,
                                                   length=stop - start)

        if not return_best_hits:
            return sequences_by_hit

        return self.filter_hmm_sequences_dict_for_splits_to_keep_only_best_hits(sequences_by_hit)
コード例 #36
0
ファイル: completeness.py プロジェクト: paczian/anvio
    def __init__(self,
                 contigs_db_path,
                 source=None,
                 run=run,
                 progress=progress):
        """Read the single-copy gene HMM hits stored in a contigs database.

        Parameters:
            contigs_db_path: passed to dbops.ContigsDatabase to open the db.
            source: optional name of one single-copy HMM source. When given,
                    self.sources, self.genes_in_db and self.search_table are
                    limited to that source.
            run, progress: kept on the instance as self.run and self.progress.

        Raises:
            ConfigError: if `source` is given but is not one of the
                         single-copy sources found in the database.
        """
        self.run = run
        self.progress = progress

        # hi db
        contigs_db = dbops.ContigsDatabase(contigs_db_path)

        # read info table to get what is available in the db
        info_table = contigs_db.db.get_table_as_dict(
            t.hmm_hits_info_table_name)

        # identify and remove non-single-copy sources of hmm search results.
        # names are collected first so that info_table is not modified while
        # it is being iterated:
        singlecopy_sources = set([
            k for k in info_table
            if info_table[k]['search_type'] == 'singlecopy'
        ])
        non_singlecopy_sources = set([
            k for k in info_table if k not in singlecopy_sources
        ])
        for non_singlecopy_source in non_singlecopy_sources:
            info_table.pop(non_singlecopy_source)

        # read search table (which holds hmmscan hits for splits).
        self.search_table = utils.get_filtered_dict(
            contigs_db.db.get_table_as_dict(t.hmm_hits_splits_table_name),
            'source', singlecopy_sources)

        # an example entry in self.search_table looks like this:
        #
        # {
        #    'percentage_in_split'   : 100,
        #    'source'                : u'Campbell_et_al',
        #    'gene_unique_identifier': u'c70c1cc3025b636100fd8a910b5b7f0dd09752fc78e2a1f10ee60954',
        #    'e_value'               : 0.0013,
        #    'gene_name'             : u'UvrC_HhH_N',
        #    'split'                 : u'ANTARCTICAAQUATIC_SMPL_SITE231_3.0UMcontig18439_split_00001'
        # }
        #

        # a little convenience for potential clients: the first
        # whitespace-separated token of 'ref' that starts with 'http'
        # (an IndexError is raised if a source has no such token).
        self.http_refs = {}
        for source_in_db in info_table:
            self.http_refs[source_in_db] = [
                h for h in info_table[source_in_db]['ref'].split()
                if h.startswith('http')
            ][0]

        self.genes_in_db = dict([(s, info_table[s]['genes'].split(', '))
                                 for s in info_table])

        # we're done with the db
        contigs_db.disconnect()

        # an actual list rather than `info_table.keys()`, which is a live
        # dictionary view on Python 3
        self.sources = list(info_table.keys())

        if source:
            if source not in self.sources:
                # call-style raise: the `raise Error, message` statement form
                # is a syntax error on Python 3
                raise ConfigError(
                    'Source "%s" is not one of the single-copy gene sources found in the database.'
                    % source)

            # filter out sources that are not requested
            self.sources = [source]
            self.genes_in_db = {source: self.genes_in_db[source]}
            self.search_table = utils.get_filtered_dict(
                self.search_table, 'source', set([source]))

        self.unique_gene_id_to_gene_name = {}
        self.splits_unique_gene_id_occurs = {}
        # these will be very useful later. trust me.
        for entry in self.search_table.values():
            gene_unique_identifier = entry['gene_unique_identifier']

            if gene_unique_identifier not in self.unique_gene_id_to_gene_name:
                self.unique_gene_id_to_gene_name[
                    gene_unique_identifier] = entry['gene_name']

            if gene_unique_identifier not in self.splits_unique_gene_id_occurs:
                self.splits_unique_gene_id_occurs[gene_unique_identifier] = [
                    entry['split']
                ]
            else:
                self.splits_unique_gene_id_occurs[
                    gene_unique_identifier].append(entry['split'])
コード例 #37
0
ファイル: completeness.py プロジェクト: paczian/anvio
    def get_info_for_splits(self, split_names, min_e_value=1e-5):
        """Report completion and redundancy per HMM source for a set of splits.

        Parameters:
            split_names: split names used to filter self.search_table on its
                         'split' field.
            min_e_value: hits with an e-value greater than this are ignored
                         (despite its name, it acts as an upper bound).

        Returns:
            A dictionary keyed by source name. Each value holds:

                'percent_complete'  : number of distinct gene names with a hit,
                                      as a percentage of the number of genes
                                      listed for the source in self.genes_in_db
                'percent_redundancy': number of extra copies of gene names that
                                      were hit more than once, as a percentage
                                      of the same gene count
                'redundants'        : for each gene name hit more than once, a
                                      list with one entry per unique gene id,
                                      each entry being the list of splits
                                      stored for that id in
                                      self.splits_unique_gene_id_occurs
        """
        hits = utils.get_filtered_dict(self.search_table, 'split', split_names)

        # we need to restructure 'hits' into a dictionary that gives access to sources and genes in a more direct manner
        info_dict, gene_name_to_unique_id = {}, {}
        for source in self.sources:
            info_dict[source], gene_name_to_unique_id[source] = {}, {}

        # here we go through every hit and populate 'info_dict' and 'gene_name_to_unique_id':
        for entry in hits.values():
            if entry['e_value'] > min_e_value:
                continue

            source = entry['source']
            e_value = entry['e_value']
            gene_name = entry['gene_name']
            percentage = entry['percentage_in_split']
            gene_unique_id = entry['gene_unique_identifier']

            # `in` instead of dict.has_key(), which does not exist on Python 3
            if gene_unique_id in info_dict[source]:
                # the same gene seen in another split: add up its percentages
                info_dict[source][gene_unique_id]['percentage'] += percentage
            else:
                info_dict[source][gene_unique_id] = {
                    'gene_name': gene_name,
                    'percentage': percentage,
                    'e_value': e_value
                }

            gene_name_to_unique_id[source].setdefault(
                gene_name, set()).add(gene_unique_id)

        # here we generate the results information
        results_dict = {}
        for source in self.sources:
            results_dict[source] = {}

            num_genes_in_source = len(self.genes_in_db[source])
            genes_count = Counter(
                [v['gene_name'] for v in info_dict[source].values()])

            # report results
            results_dict[source]['percent_complete'] = len(
                genes_count) * 100.0 / num_genes_in_source

            # report redundancy:
            genes_that_occur_multiple_times = [
                g for g in genes_count if genes_count[g] > 1
            ]
            results_dict[source]['percent_redundancy'] = sum([
                genes_count[g] - 1 for g in genes_that_occur_multiple_times
            ]) * 100.0 / num_genes_in_source

            # identify splits that contribute the same single_copy_gene
            redundants = {}
            for gene_name in genes_that_occur_multiple_times:
                redundants[gene_name] = [
                    self.splits_unique_gene_id_occurs[unique_gene_id] for
                    unique_gene_id in gene_name_to_unique_id[source][gene_name]
                ]
            results_dict[source]['redundants'] = redundants

        return results_dict
コード例 #38
0
    def __init__(self,
                 contigs_db_path,
                 source_requested=None,
                 run=run,
                 progress=progress):
        """Load single-copy gene HMM hits and their metadata from a contigs db.

        Parameters:
            contigs_db_path: passed to dbops.ContigsDatabase to open the db.
            source_requested: optional name of one single-copy HMM source.
                              When given, self.sources, self.genes_in_db and
                              self.hmm_hits_splits_table are limited to it.
            run, progress: stored as self.run and self.progress.

        Raises:
            ConfigError: if `source_requested` is given but is not one of the
                         single-copy sources found in the database.
        """
        self.run, self.progress = run, progress

        contigs_db = dbops.ContigsDatabase(contigs_db_path)

        # what HMM sources does the db know about?
        info_table = contigs_db.db.get_table_as_dict(t.hmm_hits_info_table_name)

        # keep only the sources whose search type is 'singlecopy'
        singlecopy_sources = {
            name for name, row in info_table.items()
            if row['search_type'] == 'singlecopy'
        }
        for name in [n for n in info_table if n not in singlecopy_sources]:
            info_table.pop(name)

        # every hmm hit, keyed by its entry id
        self.hmm_hits_table = contigs_db.db.get_table_as_dict(t.hmm_hits_table_name)

        # hits per split, limited to single-copy sources. an entry looks like:
        #
        # {
        #    'percentage_in_split'   : 69.6763202725724,
        #    'source'                : u'Campbell_et_al',
        #    'split'                 : u'ANTARCTICAAQUATIC_SMPL_SITE231_3.0UMcontig18439_split_00001',
        #    'hmm_hit_entry_id'      : 1
        # }
        #
        all_split_hits = contigs_db.db.get_table_as_dict(t.hmm_hits_splits_table_name)
        self.hmm_hits_splits_table = utils.get_filtered_dict(
            all_split_hits, 'source', singlecopy_sources)

        # for clients: the first token in 'ref' that starts with 'http'
        self.http_refs = {}
        for name, row in info_table.items():
            links = [token for token in row['ref'].split() if token.startswith('http')]
            self.http_refs[name] = links[0]

        self.genes_in_db = {
            name: row['genes'].split(', ') for name, row in info_table.items()
        }

        # nothing else is read from the db
        contigs_db.disconnect()

        self.sources = list(info_table)
        self.source_to_domain = {
            name: info_table[name]['domain'] for name in self.sources
        }
        self.domains = set(self.source_to_domain.values())
        # note: a list of (domain, [sources]) tuples, not a dictionary
        self.domain_to_sources = [
            (domain, [name for name in self.sources
                      if self.source_to_domain[name] == domain])
            for domain in self.domains
        ]

        if source_requested:
            if source_requested not in self.sources:
                raise ConfigError(
                    'Requested source "%s" is not one of the single-copy gene sources found in the database.'
                    % source_requested)

            # drop everything that does not belong to the requested source
            self.sources = [source_requested]
            self.genes_in_db = {source_requested: self.genes_in_db[source_requested]}
            self.hmm_hits_splits_table = utils.get_filtered_dict(
                self.hmm_hits_splits_table, 'source', {source_requested})

        # lookups from unique gene ids to gene names and to the splits the
        # gene occurs in
        self.unique_gene_id_to_gene_name = {}
        self.splits_unique_gene_id_occurs = {}
        for split_hit in self.hmm_hits_splits_table.values():
            hmm_hit = self.hmm_hits_table[split_hit['hmm_hit_entry_id']]
            gene_id = hmm_hit['gene_unique_identifier']

            if gene_id not in self.unique_gene_id_to_gene_name:
                self.unique_gene_id_to_gene_name[gene_id] = hmm_hit['gene_name']

            self.splits_unique_gene_id_occurs.setdefault(
                gene_id, []).append(split_hit['split'])
コード例 #39
0
    def get_info_for_splits(self, split_names, min_e_value=1e-5):
        """Estimate completion and redundancy for a collection of splits.

        Parameters:
            split_names: split names used to filter self.hmm_hits_splits_table
                         on its 'split' field.
            min_e_value: hits with an e-value greater than this are ignored
                         (despite its name, it acts as an upper bound).

        Returns a tuple of five items:

            - Average percent completion for best matching domain
            - Average redundancy for best matching domain
            - Best matching domain for this collection of splits
            - Domain matching confidence (as reported by get_best_matching_domain)
            - A results dictionary, keyed by domain and then by HMM source,
              that explains each HMM source in each domain

        The first two are 0.0 when no best matching domain is reported, and
        the first four are None when there are no domains at all.

        For your convenience, you can call this function this way:

        p_completion, p_redundancy, domain, domain_confidence, results_dict = get_info_for_splits(s)
        """
        hmm_hits_splits_table = utils.get_filtered_dict(
            self.hmm_hits_splits_table, 'split', split_names)

        # we need to restructure 'hits' into a dictionary that gives access to sources and genes in a more direct manner
        info_dict, gene_name_to_unique_id = {}, {}
        for source in self.sources:
            info_dict[source], gene_name_to_unique_id[source] = {}, {}

        # here we go through every hit and populate 'info_dict' and 'gene_name_to_unique_id':
        for entry in hmm_hits_splits_table.values():
            hmm_hit = self.hmm_hits_table[entry['hmm_hit_entry_id']]

            if hmm_hit['e_value'] > min_e_value:
                continue

            source = hmm_hit['source']
            e_value = hmm_hit['e_value']
            gene_name = hmm_hit['gene_name']
            percentage = entry['percentage_in_split']
            gene_unique_id = hmm_hit['gene_unique_identifier']

            if gene_unique_id in info_dict[source]:
                # the same gene seen in another split: add up its percentages
                info_dict[source][gene_unique_id]['percentage'] += percentage
            else:
                info_dict[source][gene_unique_id] = {
                    'gene_name': gene_name,
                    'percentage': percentage,
                    'e_value': e_value
                }

            gene_name_to_unique_id[source].setdefault(
                gene_name, set()).add(gene_unique_id)

        # here we generate the results information
        results_dict = {}
        for domain in self.domains:
            results_dict[domain] = {}

        for source in self.sources:
            domain = self.source_to_domain[source]
            results_dict[domain][source] = {'domain': domain, 'source': source}

            genes_count = Counter(
                [v['gene_name'] for v in info_dict[source].values()])
            num_genes_in_model = len(self.genes_in_db[source])
            num_genes_with_hits = len(genes_count)

            # report num genes in the model and the num of those with hits (note that this doesn't
            # care whether those hits are contributing to redundancy or not --instead here we are
            # interested only in the 'coverage' of the model)
            results_dict[domain][source]['num_genes_in_model'] = num_genes_in_model
            results_dict[domain][source]['num_genes_in_model_with_hits'] = num_genes_with_hits
            # this key used to be written only with an accidental trailing
            # space; that spelling is kept so existing lookups keep working
            results_dict[domain][source]['num_genes_in_model_with_hits '] = num_genes_with_hits
            results_dict[domain][source]['model_coverage'] = num_genes_with_hits / num_genes_in_model

            results_dict[domain][source]['percent_completion'] = \
                num_genes_with_hits * 100.0 / num_genes_in_model

            # report redundancy:
            genes_that_occur_multiple_times = [
                g for g in genes_count if genes_count[g] > 1
            ]
            results_dict[domain][source]['percent_redundancy'] = sum([
                genes_count[g] - 1 for g in genes_that_occur_multiple_times
            ]) * 100.0 / num_genes_in_model

            # identify splits that contribute the same single_copy_gene
            redundants = {}
            for gene_name in genes_that_occur_multiple_times:
                redundants[gene_name] = [
                    self.splits_unique_gene_id_occurs[unique_gene_id] for
                    unique_gene_id in gene_name_to_unique_id[source][gene_name]
                ]
            results_dict[domain][source]['redundants'] = redundants

        if not len(results_dict):
            return (None, None, None, None, results_dict)

        best_matching_domain, domain_matching_confidence = self.get_best_matching_domain(
            results_dict)

        if best_matching_domain:
            percent_completion, percent_redundancy = self.get_average_domain_completion_and_redundancy(
                results_dict, best_matching_domain)
        else:
            percent_completion, percent_redundancy = 0.0, 0.0

        return (percent_completion, percent_redundancy, best_matching_domain,
                domain_matching_confidence, results_dict)
コード例 #40
0
ファイル: hmmops.py プロジェクト: simatei/anvio
    def get_sequences_dict_for_hmm_hits_in_splits(
            self,
            splits_dict,
            return_amino_acid_sequences=False,
            return_best_hits=False):
        """Build a dictionary of sequences for the HMM hits in a set of splits.

        `splits_dict` comes from ccollections.GetSplitNamesInBins(args).get_dict()
        and maps each bin name to the split names in that bin:

            {
                'bin_x': set of split names,
                'bin_y': set of split names,
                ...
            }

        Keys of the result are '<source>___<gene unique identifier>'; a hit
        that shows up in several splits is included only once. Sequences are
        DNA by default, reverse complemented when the gene call's direction
        is not 'f'. If `return_amino_acid_sequences` is True they are read
        from self.aa_sequences instead.

        If `return_best_hits` is True the result is passed through
        filter_hmm_sequences_dict_for_splits_to_keep_only_best_hits() first.

        Side effect: self.hmm_hits_splits and self.hmm_hits are filtered to
        self.sources, or, when self.sources is empty, self.sources is set to
        every source in self.hmm_hits_info.
        """

        have_sources = len(self.sources) > 0
        if have_sources:
            # trim both hit tables down to the sources we care about
            self.hmm_hits_splits = utils.get_filtered_dict(
                self.hmm_hits_splits, 'source', self.sources)
            self.hmm_hits = utils.get_filtered_dict(
                self.hmm_hits, 'source', self.sources)
        else:
            self.sources = list(self.hmm_hits_info.keys())

        hits_in_splits, split_name_to_bin_id = self.get_hmm_hits_in_splits(
            splits_dict)

        collected = {}
        already_handled = set()

        for split_entry in hits_in_splits.values():
            hmm_hit = self.hmm_hits[split_entry['hmm_hit_entry_id']]

            split_name = split_entry['split']
            source = hmm_hit['source']
            gene_name = hmm_hit['gene_name']
            e_value = hmm_hit['e_value']
            hit_unique_id = '___'.join(
                (source, hmm_hit['gene_unique_identifier']))

            # skip hits already recorded through another split
            if hit_unique_id in already_handled:
                continue
            already_handled.add(hit_unique_id)

            gene_callers_id = hmm_hit['gene_callers_id']
            gene_call = self.genes_in_contigs[gene_callers_id]

            contig_name = gene_call['contig']
            start, stop = gene_call['start'], gene_call['stop']
            is_forward = gene_call['direction'] == 'f'

            if return_amino_acid_sequences:
                sequence = self.aa_sequences[gene_callers_id]['sequence']
            else:
                contig_sequence = self.contig_sequences[contig_name]['sequence']
                sequence = contig_sequence[start:stop]
                if not is_forward:
                    sequence = utils.rev_comp(sequence)

            record = {
                'sequence': sequence,
                'source': source,
                'bin_id': split_name_to_bin_id[split_name],
                'gene_name': gene_name,
                'e_value': e_value,
                'contig': contig_name,
                'start': start,
                'stop': stop,
                'gene_callers_id': gene_callers_id,
                'rev_comped': not is_forward,
                'length': stop - start,
            }
            collected[hit_unique_id] = record

        if return_best_hits:
            collected = self.filter_hmm_sequences_dict_for_splits_to_keep_only_best_hits(
                collected)

        return collected
コード例 #41
0
ファイル: completeness.py プロジェクト: AstrobioMike/anvio
    def get_info_for_splits(self, split_names, min_e_value=1e-5):
        """Estimate completion and redundancy for a collection of splits.

        Parameters:
            split_names: split names used to filter self.hmm_hits_splits_table
                         on its 'split' field.
            min_e_value: hits with an e-value greater than this are ignored
                         (despite its name, it acts as an upper bound).

        Returns a tuple of five items:

            - Average percent completion for best matching domain
            - Average redundancy for best matching domain
            - Best matching domain for this collection of splits
            - Domain matching confidence (as reported by get_best_matching_domain)
            - A results dictionary, keyed by domain and then by HMM source,
              that explains each HMM source in each domain

        The first two are 0.0 when no best matching domain is reported, and
        the first four are None when there are no domains at all.

        For your convenience, you can call this function this way:

        p_completion, p_redundancy, domain, domain_confidence, results_dict = get_info_for_splits(s)
        """
        hmm_hits_splits_table = utils.get_filtered_dict(self.hmm_hits_splits_table, 'split', split_names)

        # we need to restructure 'hits' into a dictionary that gives access to sources and genes in a more direct manner
        info_dict, gene_name_to_unique_id = {}, {}
        for source in self.sources:
            info_dict[source], gene_name_to_unique_id[source] = {}, {}

        # here we go through every hit and populate 'info_dict' and 'gene_name_to_unique_id':
        for entry in hmm_hits_splits_table.values():
            hmm_hit = self.hmm_hits_table[entry['hmm_hit_entry_id']]

            if hmm_hit['e_value'] > min_e_value:
                continue

            source = hmm_hit['source']
            e_value = hmm_hit['e_value']
            gene_name = hmm_hit['gene_name']
            percentage = entry['percentage_in_split']
            gene_unique_id = hmm_hit['gene_unique_identifier']

            if gene_unique_id in info_dict[source]:
                # the same gene seen in another split: add up its percentages
                info_dict[source][gene_unique_id]['percentage'] += percentage
            else:
                info_dict[source][gene_unique_id] = {'gene_name': gene_name, 'percentage': percentage, 'e_value': e_value}

            gene_name_to_unique_id[source].setdefault(gene_name, set()).add(gene_unique_id)

        # here we generate the results information
        results_dict = {}
        for domain in self.domains:
            results_dict[domain] = {}

        for source in self.sources:
            domain = self.source_to_domain[source]
            results_dict[domain][source] = {'domain': domain, 'source': source}

            genes_count = Counter([v['gene_name'] for v in info_dict[source].values()])
            num_genes_in_model = len(self.genes_in_db[source])
            num_genes_with_hits = len(genes_count)

            # report num genes in the model and the num of those with hits (note that this doesn't
            # care whether those hits are contributing to redundancy or not --instead here we are
            # interested only in the 'coverage' of the model)
            results_dict[domain][source]['num_genes_in_model'] = num_genes_in_model
            results_dict[domain][source]['num_genes_in_model_with_hits'] = num_genes_with_hits
            # this key used to be written only with an accidental trailing
            # space; that spelling is kept so existing lookups keep working
            results_dict[domain][source]['num_genes_in_model_with_hits '] = num_genes_with_hits
            results_dict[domain][source]['model_coverage'] = num_genes_with_hits / num_genes_in_model

            results_dict[domain][source]['percent_completion'] = num_genes_with_hits * 100.0 / num_genes_in_model

            # report redundancy:
            genes_that_occur_multiple_times = [g for g in genes_count if genes_count[g] > 1]
            results_dict[domain][source]['percent_redundancy'] = sum([genes_count[g] - 1 for g in genes_that_occur_multiple_times]) * 100.0 / num_genes_in_model

            # identify splits that contribute the same single_copy_gene
            redundants = {}
            for gene_name in genes_that_occur_multiple_times:
                redundants[gene_name] = [self.splits_unique_gene_id_occurs[unique_gene_id] for unique_gene_id in gene_name_to_unique_id[source][gene_name]]
            results_dict[domain][source]['redundants'] = redundants

        if not len(results_dict):
            return (None, None, None, None, results_dict)

        best_matching_domain, domain_matching_confidence = self.get_best_matching_domain(results_dict)

        if best_matching_domain:
            percent_completion, percent_redundancy = self.get_average_domain_completion_and_redundancy(results_dict, best_matching_domain)
        else:
            percent_completion, percent_redundancy = 0.0, 0.0

        return (percent_completion, percent_redundancy, best_matching_domain, domain_matching_confidence, results_dict)
コード例 #42
0
ファイル: completeness.py プロジェクト: dagahren/anvio
    def get_info_for_splits(self,
                            split_names,
                            min_e_value=1e-5,
                            bin_name='UNKNOWN'):
        """Estimate completion and redundancy for a collection of splits.

        HMM hits that fall into `split_names` are collected, and those with an
        e-value greater than `min_e_value` are ignored. `bin_name` is handed
        over to `get_best_matching_domain` as is.

        Returns a tuple of six items:

            - percent completion for the best matching domain
            - percent redundancy for the best matching domain
            - the best matching domain
            - the domain probabilities reported by `get_best_matching_domain`
            - the info text reported by `get_best_matching_domain`
            - a results dictionary describing each HMM source in each domain

        Completion and redundancy are both 0.0 when no domain is predicted,
        when the predicted domain is one of the control domains, or when the
        predicted domain has no entry in the results dictionary. If the results
        dictionary is empty, the first four items are None.

        For your convenience, you can call this function this way:

            p_completion, p_redundancy, domain, domain_probabilities, info_text, hmm_hits_dict = get_info_for_splits(s)
            domain_confidence = domain_probabilities[domain] if domain else 0.0
        """

        hits_in_splits = utils.get_filtered_dict(self.hmm_hits_splits_table,
                                                 'split', split_names)

        # FIXME (from the original authors): the SCG / completion logic should
        # be rebuilt around the random forest domain predictor. The counters
        # below exist only to feed that predictor.
        gene_counts_by_domain = {d: Counter() for d in self.domains}

        # per-source lookups: unique gene id -> hit details, and
        # gene name -> set of unique gene ids carrying that name
        hits_by_source = {s: {} for s in self.sources}
        unique_ids_by_gene = {s: {} for s in self.sources}

        for split_entry in hits_in_splits.values():
            hit = self.hmm_hits_table[split_entry['hmm_hit_entry_id']]

            e_value = hit['e_value']
            if e_value > min_e_value:
                continue

            source = hit['source']
            domain = self.source_to_domain[source]
            gene_name = hit['gene_name']
            percentage = split_entry['percentage_in_split']
            unique_id = hit['gene_unique_identifier']

            source_hits = hits_by_source[source]
            if unique_id not in source_hits:
                source_hits[unique_id] = {
                    'gene_name': gene_name,
                    'percentage': percentage,
                    'e_value': e_value
                }
            else:
                # the same gene call seen through another split: add up
                # the share of the gene that falls into each split
                source_hits[unique_id]['percentage'] += percentage

            unique_ids_by_gene[source].setdefault(gene_name,
                                                  set()).add(unique_id)

            gene_counts_by_domain[domain][gene_name] += 1

        # build the per-domain, per-source results
        scg_hmm_hits = {d: {} for d in self.domains}

        for source in self.sources:
            domain = self.source_to_domain[source]

            genes_count = Counter(
                h['gene_name'] for h in hits_by_source[source].values())
            num_genes_in_model = len(self.genes_in_db[source])
            num_genes_with_hits = len(genes_count)

            # genes observed more than once are what make up redundancy
            repeated_genes = [g for g, c in genes_count.items() if c > 1]
            num_extra_copies = sum(genes_count[g] - 1 for g in repeated_genes)

            # for each repeated gene, the splits each of its copies occurs in
            redundants = {
                g: [
                    self.splits_unique_gene_id_occurs[uid]
                    for uid in unique_ids_by_gene[source][g]
                ]
                for g in repeated_genes
            }

            # note that the model coverage figures do not care whether hits
            # contribute to redundancy or not
            scg_hmm_hits[domain][source] = {
                'domain': domain,
                'source': source,
                'num_genes_in_model': num_genes_in_model,
                'num_genes_in_model_with_hits': num_genes_with_hits,
                'model_coverage': num_genes_with_hits / num_genes_in_model,
                'percent_completion': num_genes_with_hits * 100.0 / num_genes_in_model,
                'percent_redundancy': num_extra_copies * 100.0 / num_genes_in_model,
                'redundants': redundants,
            }

        if not scg_hmm_hits:
            return (None, None, None, None, "ANVI'O FOUND NO SCG HMM HITS :/",
                    scg_hmm_hits)

        best_matching_domain, domain_probabilities, control_domains, info_text = self.get_best_matching_domain(
            scg_hmm_hits, gene_counts_by_domain, bin_name)

        # unless a usable domain is found below, there is nothing to report
        percent_completion, percent_redundancy = 0.0, 0.0

        if best_matching_domain and best_matching_domain not in control_domains:
            if best_matching_domain in scg_hmm_hits:
                source = self.SCG_domain_predictor.SCG_domain_to_source[
                    best_matching_domain]
                domain_results = scg_hmm_hits[best_matching_domain][source]
                percent_completion = domain_results['percent_completion']
                percent_redundancy = domain_results['percent_redundancy']
            else:
                self.progress.reset()
                self.run.warning(
                    "Just so you know: Your process branched into a part of the anvi'o code that run into a weird situation.\
                                  and wishes to tell you something. This may come accross confusing, and we apologize for that. The thing is,\
                                  anvi'o is trying to estimate the completion and redundancy of a set of contigs here. Maybe it is doing it\
                                  because you are running the interactive interface and clicked on something, or you are summarizing a\
                                  collection, or doing something we never envisioned you would do. The bottom line is this: anvi'o predicts\
                                  that this set of contigs belong to the domain %s. However, it seems the single-copy core genes for that\
                                  domain were not run on this contigs database. Hence, you will not get any completion and redundancy\
                                  estimates for this one :( That is fine and things will continue to run smoothly, but we thought you should\
                                  know. Because knowledge is power .. even when you're not sure what it means."
                    % best_matching_domain)

        return (percent_completion, percent_redundancy, best_matching_domain,
                domain_probabilities, info_text, scg_hmm_hits)
コード例 #43
0
ファイル: completeness.py プロジェクト: dagahren/anvio
    def __init__(self,
                 contigs_db_path,
                 scg_domain_classifier_path=None,
                 source_requested=None,
                 run=run,
                 progress=progress):
        """Read single-copy core gene HMM hits from a contigs database.

        Parameters:
            contigs_db_path: path handed to `dbops.ContigsDatabase`.
            scg_domain_classifier_path: accepted for compatibility; it is not
                used anywhere in this constructor.
            source_requested: if given, only this single-copy HMM source is
                kept in `self.sources`, `self.genes_in_db` and
                `self.hmm_hits_splits_table`.
            run: object whose `warning` method is used to report problems.
            progress: object whose `reset` method is called before warnings;
                it is also passed to the SCG domain predictor.

        Raises:
            ConfigError: if `source_requested` is not one of the single-copy
                sources found in the database.

        `self.initialized_properly` is set to False when the domains or the
        sources known to the SCG domain predictor do not match those found in
        the contigs database.
        """
        self.run = run
        self.progress = progress
        self.initialized_properly = True

        self.SCG_domain_predictor = scgdomainclassifier.Predict(
            argparse.Namespace(),
            run=terminal.Run(verbose=False),
            progress=self.progress)

        # read everything we need from the db, and make sure we let go of it
        # even if one of the reads fails
        contigs_db = dbops.ContigsDatabase(contigs_db_path)
        try:
            # info table tells us what is available in the db
            info_table = contigs_db.db.get_table_as_dict(
                t.hmm_hits_info_table_name)

            # the hmm hits table
            self.hmm_hits_table = contigs_db.db.get_table_as_dict(
                t.hmm_hits_table_name)

            # search table (which holds hmmscan hits for splits)
            hmm_hits_splits_table = contigs_db.db.get_table_as_dict(
                t.hmm_hits_splits_table_name)
        finally:
            contigs_db.disconnect()

        # identify and remove non-single-copy sources of hmm search results:
        singlecopy_sources = {
            k for k in info_table
            if info_table[k]['search_type'] == 'singlecopy'
        }
        for non_singlecopy_source in [
                k for k in info_table if k not in singlecopy_sources
        ]:
            info_table.pop(non_singlecopy_source)

        self.hmm_hits_splits_table = utils.get_filtered_dict(
            hmm_hits_splits_table, 'source', singlecopy_sources)

        # an example entry in self.hmm_hits_splits_table looks like this:
        #
        # {
        #    'percentage_in_split'   : 69.6763202725724,
        #    'source'                : u'Bacteria_74',
        #    'split'                 : u'ANTARCTICAAQUATIC_SMPL_SITE231_3.0UMcontig18439_split_00001',
        #    'hmm_hit_entry_id'      : 1
        # }
        #

        # a little convenience for potential clients: the first http link
        # found in the reference text of each source
        self.http_refs = {}
        for source_in_db in info_table:
            self.http_refs[source_in_db] = [
                h for h in info_table[source_in_db]['ref'].split()
                if h.startswith('http')
            ][0]

        self.genes_in_db = {
            s: info_table[s]['genes'].split(', ')
            for s in info_table
        }

        self.sources = list(info_table)
        self.domains = {
            info_table[source]['domain']
            for source in self.sources
        }
        self.source_to_domain = {
            source: info_table[source]['domain']
            for source in self.sources
        }
        # NOTE: this is a list of (domain, [sources]) tuples, not a dict
        self.domain_to_sources = [(domain, [
            source for source in self.sources
            if info_table[source]['domain'] == domain
        ]) for domain in self.domains]

        # compatibility sanity checks 1/2: make sure domains between domain predictor and the contigs database match
        self.domains_missing_in_SCG_domain_predictor = [
            d for d in self.domains
            if d not in self.SCG_domain_predictor.SCG_domains
        ]
        self.domains_missing_in_SCGs_run_for_contigs = [
            d for d in self.SCG_domain_predictor.SCG_domains
            if d not in self.domains
        ]

        if self.domains_missing_in_SCG_domain_predictor:
            num_domains_missing = len(
                self.domains_missing_in_SCG_domain_predictor)
            self.progress.reset()
            self.run.warning("OK. We have a problem. You seem to have single-copy core gene collections for among your HMM hits %s that\
                              are not included when the anvi'o domain predictor was trained :/ Here is the list of domains that are making\
                              us upset here: \"%s\". This means either you put a new HMM single-copy core gene collection to the anvi'o HMMs\
                              directory, or gave it as a parameter, and run `anvi-run-hmms` without updating the classifier anvi'o uses to\
                              resolve domains for proper completion/redundancy estimates." %
                             ('a domain' if num_domains_missing == 1 else '%s domains' % num_domains_missing,
                              ', '.join(self.domains_missing_in_SCG_domain_predictor)))
            self.initialized_properly = False

        if self.domains_missing_in_SCGs_run_for_contigs:
            num_domains_missing = len(
                self.domains_missing_in_SCGs_run_for_contigs)
            self.progress.reset()
            # the list reported here must be the domains that are missing from
            # the contigs database, i.e. the ones this branch is about
            self.run.warning("Things are not quite OK. It seems %d of the domains that are known to the classifier anvi'o uses to predict\
                              domains for completion estimation are missing from your contigs database. This means, you didn't run the\
                              program `anvi-run-hmms` with default parameters, or you removed some essential SCG domains from it later. Or\
                              you did something else. Who knows. Here is the list of domains that are making us upset here: \"%s\". We hope\
                              you are happy. If you want to get rid of this warning you can run `anvi-run-hmms` on this your contigs database\
                              whenever it is convenient to you, so anvi'o can make sure you have everything in the right place." %
                             (num_domains_missing, ', '.join(self.domains_missing_in_SCGs_run_for_contigs)))

            # since we just established that the user did not run these domains for their contigs database,
            # make sure none of them is in self.domains. `difference_update` removes the individual
            # domains (`discard` with a set argument would look for the whole set as a single element).
            # by construction these domains are not in self.domains, so this is only a safeguard.
            self.domains.difference_update(
                self.domains_missing_in_SCGs_run_for_contigs)

            self.initialized_properly = False

        # compatibility sanity checks 2/2: make sure sources in domain predictor to those in the contigs database
        self.sources_missing_in_SCGs_run_for_contigs = [
            s for s in self.SCG_domain_predictor.SCG_sources
            if s not in self.sources
        ]
        self.sources_missing_in_SCG_domain_predictor = [
            s for s in self.sources
            if s not in self.SCG_domain_predictor.SCG_sources
        ]
        if self.sources_missing_in_SCGs_run_for_contigs:
            num_sources_missing = len(
                self.sources_missing_in_SCGs_run_for_contigs)
            self.progress.reset()
            self.run.warning("OK. We have a VERY interesting problem. You have all the SCG domains necessary to run the predictor covered\
                              in your contigs database, however, %s that are used during the training of the domain predictor does not seem\
                              to occur in your contigs database :/ Here is the list of HMM sources that are making us upset here: \"%s\".\
                              This most likely means you are using a new version of anvi'o with older single-copy core gene sources, or you are\
                              exploring new single-copy core gene sources to see how they behave. That's all good and very exciting, but unfortunately\
                              anvi'o will not be able to predict domains due to this incompatibility here. You could solve this problem by running\
                              `anvi-run-hmms` on your contigs database, but you can also live without solving it as anvi'o will continue running\
                              by not utilizing domain-specific HMMs for completion/redundancy estimates, but giving you all the results all at once." %
                             ('an HMM source' if num_sources_missing == 1 else '%s HMM sources' % num_sources_missing,
                              ', '.join(self.sources_missing_in_SCGs_run_for_contigs)))
            self.initialized_properly = False

        if source_requested:
            if source_requested not in self.sources:
                raise ConfigError(
                    'Requested source "%s" is not one of the single-copy gene sources found in the database.'
                    % source_requested)

            # filter out sources that are not requested
            self.sources = [source_requested]
            self.genes_in_db = {
                source_requested: self.genes_in_db[source_requested]
            }
            self.hmm_hits_splits_table = utils.get_filtered_dict(
                self.hmm_hits_splits_table, 'source', {source_requested})

        # these will be very useful later: unique gene id -> gene name, and
        # unique gene id -> list of splits the gene occurs in
        self.unique_gene_id_to_gene_name = {}
        self.splits_unique_gene_id_occurs = {}
        for entry in self.hmm_hits_splits_table.values():
            hmm_hit = self.hmm_hits_table[entry['hmm_hit_entry_id']]
            gene_unique_identifier = hmm_hit['gene_unique_identifier']

            if gene_unique_identifier not in self.unique_gene_id_to_gene_name:
                self.unique_gene_id_to_gene_name[
                    gene_unique_identifier] = hmm_hit['gene_name']

            self.splits_unique_gene_id_occurs.setdefault(
                gene_unique_identifier, []).append(entry['split'])