Example #1
0
    def populate_collections_dict(self, db_path):
        """Load the collections described in the database at `db_path` into `self.collections_dict`.

        Every row of the collections info table is stored under its collection name and
        is extended with three keys: `read_only`, `source_db_path`, and `source_db_version`.
        As a side effect, `self.db_path` and `self.db_type` are set.

        Raises ConfigError when the `db_type` meta value of the database is not one of
        'contigs', 'profile', or 'pan'.
        """

        filesnpaths.is_file_exists(db_path)
        self.db_path = db_path

        # resolved once; used both for the connection and for per-collection bookkeeping
        db_version = utils.get_required_version_for_db(db_path)

        database = db.DB(db_path, db_version)
        self.db_type = database.get_meta_value('db_type')
        collections_info_table = database.get_table_as_dict(t.collections_info_table_name)
        database.disconnect()

        # collections info must be read only if its coming from the contigs database.
        # any database type that is not explicitly listed here is rejected (a bare
        # truthiness test on `self.db_type` would let unknown types slip through).
        if self.db_type == 'contigs':
            read_only = True
        elif self.db_type in ('profile', 'pan'):
            read_only = False
        else:
            raise ConfigError('Collections class does not know about this "%s" database type :/' % self.db_type)

        for collection_name in collections_info_table:
            self.collections_dict[collection_name] = collections_info_table[collection_name]
            self.collections_dict[collection_name]['read_only'] = read_only
            self.collections_dict[collection_name]['source_db_path'] = db_path
            self.collections_dict[collection_name]['source_db_version'] = db_version
Example #2
0
    def populate_collections_dict(self, db_path):
        """Load the collections described in the database at `db_path` into `self.collections_dict`.

        Every row of the collections info table is stored under its collection name and
        is extended with three keys: `read_only`, `source_db_path`, and `source_db_version`.
        As a side effect, `self.db_path` and `self.db_type` are set.

        Raises ConfigError when the `db_type` meta value of the database is not one of
        'contigs', 'profile', or 'pan'.
        """

        filesnpaths.is_file_exists(db_path)
        self.db_path = db_path

        # resolved once; used both for the connection and for per-collection bookkeeping
        db_version = utils.get_required_version_for_db(db_path)

        database = db.DB(db_path, db_version)
        self.db_type = database.get_meta_value('db_type')
        collections_info_table = database.get_table_as_dict(t.collections_info_table_name)
        database.disconnect()

        # collections info must be read only if its coming from the contigs database.
        # any database type that is not explicitly listed here is rejected (a bare
        # truthiness test on `self.db_type` would let unknown types slip through).
        if self.db_type == 'contigs':
            read_only = True
        elif self.db_type in ('profile', 'pan'):
            read_only = False
        else:
            raise ConfigError('Collections class does not know about this "%s" database type :/' % self.db_type)

        for collection_name in collections_info_table:
            self.collections_dict[collection_name] = collections_info_table[collection_name]
            self.collections_dict[collection_name]['read_only'] = read_only
            self.collections_dict[collection_name]['source_db_path'] = db_path
            self.collections_dict[collection_name]['source_db_version'] = db_version
Example #3
0
    def refresh_collections_info_table(self, collection_name):
        """Rebuild the collections info table row of `collection_name` from the collection splits table.

        Raises ConfigError if `collection_name` has no entries in the collection splits table.
        """
        connection = db.DB(self.db_path, utils.get_required_version_for_db(self.db_path))

        known_collection_names = connection.get_single_column_from_table(t.collections_splits_table_name,
                                                                         'collection_name',
                                                                         unique=True)

        if collection_name not in known_collection_names:
            connection.disconnect()
            raise ConfigError(f"The collection name '{collection_name}' is not in the collections table :/")

        # one bin name per split on purpose: the length of the raw, non-unique list
        # is the number of splits in the collection
        bin_name_per_split = connection.get_single_column_from_table(t.collections_splits_table_name,
                                                                     'bin_name',
                                                                     where_clause=f'collection_name="{collection_name}"')

        num_splits = len(bin_name_per_split)
        unique_bin_names = sorted(set(bin_name_per_split))
        connection.disconnect()

        # drop the stale row before inserting the refreshed one
        self.delete_entries_for_key('collection_name', collection_name, [t.collections_info_table_name])

        refreshed_row = (collection_name, num_splits, len(unique_bin_names), ','.join(unique_bin_names))

        connection = db.DB(self.db_path, utils.get_required_version_for_db(self.db_path))
        connection._exec('''INSERT INTO %s VALUES (?,?,?,?)''' % t.collections_info_table_name, refreshed_row)
        connection.disconnect()
Example #4
0
    def read(self):
        """Read gene-level coverage stats from the genes database.

        Returns a dict of the form `data[gene_callers_id][sample_name] = entry`, where
        `entry` is the database row with `gene_coverage_values_per_nt` converted from its
        binary blob into a numpy array, and `non_outlier_positions` converted likewise
        (or set to None when the stored value is empty).

        Returns an empty dict when the `gene_level_coverages_stored` meta value of the
        database is not set.
        """
        database = db.DB(self.db_path,
                         utils.get_required_version_for_db(self.db_path))
        coverages_stored = database.get_meta_value('gene_level_coverages_stored')
        # this connection was only needed for the meta value; close it on every path
        # so it does not linger once a new connection is opened below
        database.disconnect()

        if not coverages_stored:
            # we don't have any gene-level coverage data stored in this database
            return {}

        self.check_split_names()
        self.check_params()

        self.progress.new("Database bleep bloop")
        self.progress.update("Recovering %s stats from the genes database..." %
                             self.mode)

        database = db.DB(self.db_path,
                         utils.get_required_version_for_db(self.db_path))
        raw_data = database.get_table_as_dict(self.table_name)
        data = {}

        # here we are converting the data as it is stored in the database into something that
        # the rest of anvi'o expects to see how gene-level coverage data should look like
        for entry in raw_data.values():
            gene_callers_id, sample_name = entry['gene_callers_id'], entry['sample_name']

            if gene_callers_id not in data:
                data[gene_callers_id] = {}

            if sample_name not in data[gene_callers_id]:
                data[gene_callers_id][sample_name] = entry

            stats = data[gene_callers_id][sample_name]

            # the two blobs live in two different columns; each one must be read from its own
            g, n = stats['gene_coverage_values_per_nt'], stats['non_outlier_positions']

            stats['gene_coverage_values_per_nt'] = utils.convert_binary_blob_to_numpy_array(g, 'uint16')

            if n:
                stats['non_outlier_positions'] = utils.convert_binary_blob_to_numpy_array(n, 'uint16')
            else:
                stats['non_outlier_positions'] = None

        database.disconnect()
        self.progress.end()

        self.run.warning(None,
                         header="GENE LEVEL COVERAGE STATS RECOVERED (yay)",
                         lc="green")
        self.run.info("Mode", self.mode, mc="red")
        self.run.info("Num genes", len(data))
        self.print_info()

        return data
Example #5
0
    def populate_genes_in_contigs_table(self,
                                        gene_calls_dict,
                                        amino_acid_sequences,
                                        append_to_the_db=False):
        """Write gene calls and their amino acid sequences into the contigs database.

        When `append_to_the_db` is False, the genes in contigs and the gene amino acid
        sequences tables are emptied first. Otherwise, only the gene calls that share a
        `source` with the incoming gene calls are removed (from the genes in contigs and
        the genes in splits tables) before the new entries are inserted.
        """
        utils.is_contigs_db(self.db_path)
        contigs_db = db.DB(self.db_path, utils.get_required_version_for_db(self.db_path))

        if append_to_the_db:
            # append mode: previous entries that match the incoming sources must go,
            # otherwise we may end up with many duplicates in the db.
            incoming_sources = set([call['source'] for call in gene_calls_dict.values()])

            for incoming_source in incoming_sources:
                stale_gene_caller_ids = contigs_db.get_single_column_from_table(
                    t.genes_in_contigs_table_name,
                    'gene_callers_id',
                    where_clause="""source='%s'""" % incoming_source)

                if not stale_gene_caller_ids:
                    continue

                stale_ids_as_text = ','.join([str(g) for g in stale_gene_caller_ids])
                for table_name in (t.genes_in_contigs_table_name, t.genes_in_splits_table_name):
                    contigs_db._exec('''DELETE FROM %s WHERE gene_callers_id IN (%s)''' % (table_name, stale_ids_as_text))
        else:
            # fresh start: wipe both tables this method populates
            for table_name in (t.genes_in_contigs_table_name, t.gene_amino_acid_sequences_table_name):
                contigs_db._exec('''DELETE FROM %s''' % table_name)

        self.progress.new('Processing')
        self.progress.update('Entering %d gene calls into the db ...' % len(gene_calls_dict))

        # every column but the first one (the gene callers id) comes from the gene call itself
        gene_call_columns = t.genes_in_contigs_table_structure[1:]
        gene_call_rows = []
        for gene_callers_id, gene_call in gene_calls_dict.items():
            gene_call_rows.append(tuple([gene_callers_id] + [gene_call[column] for column in gene_call_columns]))

        contigs_db._exec_many('''INSERT INTO %s VALUES (?,?,?,?,?,?,?,?,?)''' % t.genes_in_contigs_table_name,
                              gene_call_rows)

        amino_acid_rows = []
        for gene_callers_id in gene_calls_dict:
            amino_acid_rows.append((gene_callers_id, amino_acid_sequences[gene_callers_id]))

        contigs_db._exec_many('''INSERT INTO %s VALUES (?,?)''' % t.gene_amino_acid_sequences_table_name,
                              amino_acid_rows)

        self.progress.end()

        contigs_db.disconnect()
Example #6
0
    def add(self, search_output):
        """Incrementally add new tRNA taxonomy hits to a contigs database.

           `search_output` is iterated as (gene_callers_id, anticodon, anticodon_hits)
           triples; triples with no hits are skipped.

           It is essential to run the member function `update_db_self_table_values` once
           adding new hits is complete.
        """

        self.database = db.DB(self.db_path, utils.get_required_version_for_db(self.db_path))

        # the first three columns of the table are filled in here, the rest come from each hit
        hit_columns = t.trna_taxonomy_table_structure[3:]

        rows = []
        for gene_callers_id, anticodon, anticodon_hits in search_output:
            # nothing to record for this gene
            if len(anticodon_hits) == 0:
                continue

            amino_acid = anticodon_to_AA[anticodon]

            rows.extend([gene_callers_id, amino_acid, anticodon] + [hit[column] for column in hit_columns]
                        for hit in anticodon_hits)

        self.database.insert_many(t.trna_taxonomy_table_name, rows)
        self.database.disconnect()
Example #7
0
    def list_data_keys(self):
        """Print the data keys found in the additional data table, one data group at a time.

        For 'layers' and 'items' targets the keys are listed per data group; for the
        'layer_orders' target they are all listed under a single 'default' group. At most
        five keys are shown per group unless `anvio.DEBUG` is set.
        """
        connection = db.DB(self.db_path, utils.get_required_version_for_db(self.db_path))

        def report_no_data():
            self.run.info_single("There are no additional data for '%s' in this database :/" % self.target, nl_before=1, nl_after=1, mc='red')

        keys_per_group = {}

        # data groups are relevant for additional layer or item data, but not for order data.
        # hence the group names are resolved differently depending on the target.
        if self.target in ['layers', 'items']:
            group_names = AdditionalDataBaseClass.get_group_names(self)
            for group_name in group_names:
                keys_in_this_group = connection.get_single_column_from_table(self.table_name,
                                                                             'data_key',
                                                                             unique=True,
                                                                             where_clause="data_group = '%s'" % group_name)
                keys_per_group[group_name] = sorted(keys_in_this_group)

            if not keys_per_group:
                report_no_data()
                connection.disconnect()
                return

        elif self.target in ['layer_orders']:
            order_keys = sorted(connection.get_single_column_from_table(self.table_name, 'data_key', unique=True))

            if not order_keys:
                report_no_data()
                connection.disconnect()
                return

            keys_per_group['default'] = order_keys
            group_names = ['default']

        self.run.warning('', 'DATA KEYS FOR "%s" in %d DATA GROUP(S)' % (self.target.upper(), len(group_names)), lc='yellow')

        for group_name in group_names:
            keys_in_group = keys_per_group[group_name]
            num_keys = len(keys_in_group)

            self.run.info_single('DATA GROUP "%s" WITH %d KEYS' % (group_name, num_keys), nl_before=1)

            num_keys_to_display = num_keys if anvio.DEBUG else min([5, num_keys])

            for data_key in keys_in_group[:num_keys_to_display]:
                rows = connection.get_some_rows_from_table_as_dict(self.table_name, 'data_key="%s"' % data_key)
                data_type = list(rows.values())[0]['data_type']

                if self.target == 'layer_orders':
                    description = '%s (%s)' % (data_key, data_type)
                else:
                    description = '%s (%s, describes %d %s)' % (data_key, data_type, len(rows), self.target)

                # an empty line follows only the very last key of the group
                is_last_key_of_group = data_key == keys_in_group[-1]
                self.run.info_single(description, nl_after=1 if is_last_key_of_group else 0, level=2)

            num_keys_not_displayed = num_keys - num_keys_to_display
            if num_keys_not_displayed > 0:
                self.run.info_single('(... %d more; use `--debug` to list all ...)' % num_keys_not_displayed,
                                     nl_after=1, mc='cyan', level=3)

        connection.disconnect()
Example #8
0
    def __init__(self, db_path, num_threads_to_use=1, run=run, progress=progress):
        """Set up access to the HMM hits tables of the contigs database at `db_path`.

        Stores the `contigs_db_hash` meta value of the database, initializes the `Table`
        base class, loads the gene calls, and reserves the next available ids for the
        HMM hits and HMM hits splits tables. Empty gene call tables only produce a
        warning here, they are not treated as an error.
        """
        self.num_threads_to_use = num_threads_to_use
        self.db_path = db_path

        utils.is_contigs_db(self.db_path)

        # keep a handle on the connection so that it is closed once the hash is read
        contigs_db = db.DB(self.db_path, utils.get_required_version_for_db(self.db_path))
        self.contigs_db_hash = contigs_db.get_meta_value('contigs_db_hash')
        contigs_db.disconnect()

        Table.__init__(self, self.db_path, anvio.__contigs__version__, run, progress)

        self.init_gene_calls_dict()

        if not len(self.gene_calls_dict):
            # NOTE(review): `self.genes_are_called` is not set in this method; presumably
            # it comes from `Table.__init__` -- confirm.
            if self.genes_are_called:
                self.run.warning("Tables in this contigs database that should contain gene calls are empty despite the fact that\
                                  you didn't skip the gene calling step while generating this contigs database. This probably means\
                                  that the gene caller did not find any genes among contigs. This is OK for now. But might explode\
                                  later. If it does explode and you decide to let us know about that problem, please remember to mention\
                                  this warning. By the way, this warning probably has been seen by like only 2 people on the planet. Who\
                                  works with contigs with no gene calls? A better implementation of anvi'o will unite researchers who\
                                  study weird stuff.")
            else:
                self.run.warning("It seems you have skipped gene calling step while generating your contigs database, and you have no\
                                  genes calls in tables that should contain gene calls. Anvi'o will let you go with this since some HMM\
                                  sources only operate on DNA sequences, and at this point it doesn't know which HMMs you wish to run.\
                                  If the lack of genes causes a problem, you will get another error message later probably :/")

        self.set_next_available_id(t.hmm_hits_table_name)
        self.set_next_available_id(t.hmm_hits_splits_table_name)
Example #9
0
    def __init__(self, args):
        """Resolve the target database from `args` and read the data keys already stored in it.

        The database path is taken from the first non-empty attribute among
        `pan_or_profile_db`, `profile_db`, and `pan_db`.

        Raises ConfigError if none of these provides a path, or if `self.table_name`
        is empty.
        """
        # NOTE(review): `self.table_name`, `self.run`, and `self.progress` are read but
        # never set here; presumably the inheriting class defines them -- confirm.
        args_dict = args.__dict__

        self.just_do_it = args_dict.get('just_do_it')

        # first attribute with a non-empty value wins
        for db_path_attribute in ('pan_or_profile_db', 'profile_db', 'pan_db'):
            self.db_path = args_dict.get(db_path_attribute)
            if self.db_path:
                break

        if not self.db_path:
            raise ConfigError("The AdditionalAndOrderDataBaseClass is inherited with an args object that did not\
                               contain any database path :/ Even though any of the following would\
                               have worked: `pan_or_profile_db`, `profile_db`, `pan_db` :(")

        if not self.table_name:
            raise ConfigError("The AdditionalAndOrderDataBaseClass does not know anything about the table it should\
                               be working with.")

        utils.is_pan_or_profile_db(self.db_path)
        self.db_type = utils.get_db_type(self.db_path)
        self.db_version = utils.get_required_version_for_db(self.db_path)

        connection = db.DB(self.db_path, self.db_version)
        self.additional_data_keys = connection.get_single_column_from_table(self.table_name, 'data_key')
        connection.disconnect()

        Table.__init__(self, self.db_path, self.db_version, self.run, self.progress)

        # values that stand for a null entry, keyed by data type name
        self.nulls_per_type = dict([('str', ''),
                                    ('int', 0),
                                    ('float', 0),
                                    ('stackedbar', None),
                                    ('unknown', None)])
Example #10
0
    def check_split_names(self):
        """Make sure split names in the genes database match to the expected split names

        Does nothing when `self.ignore_splits_name_check` is set. Otherwise the hash of
        `self.split_names` is compared to the `splits_hash` meta value of the database.

        Raises ConfigError when there are no split names to check, or when the two
        hashes differ.
        """

        if self.ignore_splits_name_check:
            return

        if not self.split_names:
            raise ConfigError("So you want to read gene-level coverage data from this genes database "
                              "but there is a problem. Here anvi'o is talking to the programmer: there "
                              "are two modes reading from the genes database. You either create an instance of "
                              "TableForGeneLevelCoverages with a list of `split_names` so anvi'o can make "
                              "sure the splits you are looking for are certainly those the database knows "
                              "about, OR, you set the parameter `ignore_splits_name_check` to True, so anvi'o "
                              "doesn't care about making sure everything is in order. Well. What is going on "
                              "here is that someone called the `read` function, but the instance of this "
                              "class does not know any splits, and the `ignore_splits_name_check` is False.")

        splits_hash = utils.get_hash_for_list(self.split_names)

        # keep a handle on the connection so that it is closed once the hash is read
        database = db.DB(self.db_path, utils.get_required_version_for_db(self.db_path))
        db_hash = database.get_meta_value('splits_hash')
        database.disconnect()

        if splits_hash != db_hash:
            raise ConfigError("Terrible news of the day: You have a genes database for the collection %s and bin %s. But "
                              "it seems the splits your collection and bin contained when you generated this database "
                              "has changed after its creation. Maybe you used `anvi-refine` to add or remove some? Or you "
                              "imported other data with the same collection and bin name? We can't know. You are the one "
                              "who is creative. But what we know is that this genes database at '%s' is not one that you "
                              "can use anymore. The easy solution is this: remove this database, and let anvi'o generate "
                              "another one for you. Alternatively you can run the same exact command you run right before "
                              "you get this error. Sometimes that works too." % \
                                    (self.collection_name, self.bin_name, self.db_path))
Example #11
0
 def store(self):
     """Insert every row of `self.db_entries` into the variable nucleotides table."""
     db_version = utils.get_required_version_for_db(self.db_path)
     connection = db.DB(self.db_path, db_version)

     # one placeholder per column of the table
     insert_statement = '''INSERT INTO %s VALUES (?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?)''' % t.variable_nts_table_name

     connection._exec_many(insert_statement, self.db_entries)
     connection.disconnect()
Example #12
0
    def store(self, data):
        """Write gene-level coverage stats into the genes database.

        `data` is read as `data[gene_callers_id][sample_name] = entry`. Each entry becomes
        one row that follows `self.table_structure`, with the per-nucleotide coverage
        values and the non-outlier positions stored as binary blobs. The parameters in
        `self.parameters` are written as meta values, and the database is flagged as
        containing gene-level coverages.
        """
        self.progress.new("Database bleep bloop")
        self.progress.update("Adding %s stats into the genes database..." % self.mode)

        # columns that hold arrays and must be serialized before they are stored
        blob_columns = ('gene_coverage_values_per_nt', 'non_outlier_positions')

        db_entries = []
        for entries_per_sample in data.values():
            for entry in entries_per_sample.values():
                row = []
                for column in self.table_structure:
                    value = entry[column]
                    if column in blob_columns:
                        value = utils.convert_numpy_array_to_binary_blob(np.array(value), 'uint16')
                    row.append(value)

                db_entries.append(tuple(row))

        connection = db.DB(self.db_path, utils.get_required_version_for_db(self.db_path))
        connection._exec_many('''INSERT INTO %s VALUES (?,?,?,?,?,?,?,?)''' % self.table_name, db_entries)

        # replace whatever value was stored for each parameter previously
        for parameter, parameter_value in self.parameters.items():
            connection.remove_meta_key_value_pair(parameter)
            connection.set_meta_value(parameter, parameter_value)

        connection.update_meta_value('gene_level_coverages_stored', True)
        connection.disconnect()

        self.progress.end()

        self.run.warning(None, header="GENE LEVEL COVERAGE STATS STORED", lc="green")
        self.run.info("Mode", self.mode, mc="red")
        self.run.info("Num genes", len(data))
        self.run.info("Num entries", len(db_entries))
        self.print_info()
Example #13
0
    def store(self):
        """Replace the contents of the pan gene clusters table with `self.entries`."""
        table_name = t.pan_gene_clusters_table_name

        self.delete_contents_of_table(table_name, warning=False)

        # every entry is prefixed with a freshly requested id
        rows = []
        for entry in self.entries:
            rows.append(tuple([self.next_id(table_name)] + entry))

        connection = db.DB(self.db_path, utils.get_required_version_for_db(self.db_path))
        connection._exec_many('''INSERT INTO %s VALUES (?,?,?,?,?)''' % table_name, rows)
        connection.disconnect()
Example #14
0
    def __init__(self,
                 db_path,
                 num_threads_to_use=1,
                 run=run,
                 progress=progress):
        """Set up access to the HMM hits tables of the contigs database at `db_path`.

        Stores the `contigs_db_hash` meta value of the database, initializes the `Table`
        base class, loads the gene calls, and reserves the next available ids for the
        HMM hits and HMM hits splits tables.

        Raises ConfigError if `self.genes_are_called` is false, or if no gene calls are
        found.
        """
        self.num_threads_to_use = num_threads_to_use
        self.db_path = db_path

        utils.is_contigs_db(self.db_path)

        # keep a handle on the connection so that it is closed once the hash is read
        contigs_db = db.DB(self.db_path,
                           utils.get_required_version_for_db(self.db_path))
        self.contigs_db_hash = contigs_db.get_meta_value('contigs_db_hash')
        contigs_db.disconnect()

        Table.__init__(self, self.db_path, anvio.__contigs__version__, run,
                       progress)

        # NOTE(review): `self.genes_are_called` is not set in this method; presumably it
        # comes from `Table.__init__` -- confirm.
        if not self.genes_are_called:
            raise ConfigError(
                "It seems the contigs database '%s' was created with '--skip-gene-calling' flag.\
                                Nothing to do here :/" % (self.db_path))

        self.init_gene_calls_dict()

        if not len(self.gene_calls_dict):
            raise ConfigError(
                "Tables that should contain gene calls are empty. Which probably means the gene\
                                caller reported no genes for your contigs.")

        self.set_next_available_id(t.hmm_hits_table_name)
        self.set_next_available_id(t.hmm_hits_splits_table_name)
Example #15
0
    def update_self_value(self, value=True):
        """Set the `scg_taxonomy_was_run` meta value of the contigs db to `value`"""

        db_version = utils.get_required_version_for_db(self.db_path)

        # the connection is kept on the instance as `self.database`
        connection = self.database = db.DB(self.db_path, db_version)
        connection.update_meta_value("scg_taxonomy_was_run", value)
        connection.disconnect()
Example #16
0
    def remove_source(self, source):
        """Remove everything that belongs to the HMM source `source` from the database.

        Rows are removed by `source` from the tables that have a source column, and by
        gene callers id from the remaining gene tables for the gene calls in
        `self.gene_calls_dict` that were added via this source.
        """

        source_keyed_tables = [t.hmm_hits_info_table_name,
                               t.hmm_hits_table_name,
                               t.hmm_hits_splits_table_name,
                               t.genes_in_contigs_table_name,
                               t.gene_function_calls_table_name]

        gene_callers_id_keyed_tables = [t.gene_amino_acid_sequences_table_name,
                                        t.genes_taxonomy_table_name,
                                        t.genes_in_splits_table_name]

        # tables with a 'source' column can be cleaned up directly
        self.delete_entries_for_key('source', source, source_keyed_tables)

        # gene calls that made it into the db through this HMM source
        gene_caller_ids_to_remove = {gene_callers_id
                                     for gene_callers_id, gene_call in self.gene_calls_dict.items()
                                     if gene_call['source'] == source}

        if not gene_caller_ids_to_remove:
            return

        where_clause = "gene_callers_id in (%s)" % ','.join(str(gene_callers_id) for gene_callers_id in gene_caller_ids_to_remove)

        connection = db.DB(self.db_path, utils.get_required_version_for_db(self.db_path))
        for table_name in gene_callers_id_keyed_tables:
            connection.remove_some_rows_from_table(table_name, where_clause)
        connection.disconnect()

        run.warning("%d gene caller ids that were added via the HMM source have been removed from \"%s\""
                    % (len(gene_caller_ids_to_remove), ', '.join(gene_callers_id_keyed_tables)))
Example #17
0
    def read_atomic_data_tables(self):
        """Reads atomic data for contigs and splits from the database into a dict

        Returns a tuple of two items:

            - the structure of the 'atomic_data_splits' table as reported by the last
              profile database in `self.profile_dbs_info_dict` (None if there are no
              profile databases), and
            - a dict of the form `d[target][profile_db_path] = table`, where target is
              'contigs' or 'splits'.

        Every database connection opened here is closed before moving on to the next one.
        """

        atomic_data_table_for_each_run = {}
        atomic_data_table_fields = None

        for target in ['contigs', 'splits']:
            self.progress.new("Fetching atomic %s tables" % target,
                              progress_total_items=self.num_profile_dbs)

            atomic_data_table_for_each_run[target] = {}
            target_table = 'atomic_data_%s' % target

            # counting from 1 so the progress reads (1/N) ... (N/N)
            for i, input_profile_db_path in enumerate(self.profile_dbs_info_dict, start=1):
                self.progress.update("(%d/%d) %s" % (i, self.num_profile_dbs, input_profile_db_path))
                self.progress.increment()

                # not named `db` on purpose, so that a module of that name is not shadowed
                profile_db = anvio.db.DB(input_profile_db_path,
                                         utils.get_required_version_for_db(input_profile_db_path))
                try:
                    atomic_data_table_for_each_run[target][input_profile_db_path] = profile_db.get_table_as_dict(target_table)

                    # the table structure is read while the connection is still open; the
                    # value that survives the loop is the one from the last profile db
                    if target == 'splits':
                        atomic_data_table_fields = profile_db.get_table_structure('atomic_data_splits')
                finally:
                    profile_db.disconnect()

            self.progress.end()

        return atomic_data_table_fields, atomic_data_table_for_each_run
Example #18
0
    def create(self, functions_dict, drop_previous_annotations_first=False):
        """Store the function calls in `functions_dict` in the gene function calls table.

        The sources found in `functions_dict` are first registered through
        `add_new_sources_to_functional_sources`, which also receives
        `drop_previous_annotations_first`.
        """
        self.sanity_check()

        # open connection
        contigs_db = db.DB(self.db_path, utils.get_required_version_for_db(self.db_path))

        # Add the new sources to existing sources
        gene_function_sources = {function_call['source'] for function_call in functions_dict.values()}
        self.add_new_sources_to_functional_sources(gene_function_sources,
                                                   contigs_db,
                                                   drop_previous_annotations_first=drop_previous_annotations_first)

        unique_num_genes = len({function_call['gene_callers_id'] for function_call in functions_dict.values()})

        # push the data: one row per function call, ordered like the table structure
        rows = []
        for function_call in functions_dict.values():
            rows.append(tuple(function_call[column] for column in t.gene_function_calls_table_structure))

        contigs_db._exec_many('''INSERT INTO %s VALUES (?,?,?,?,?)''' % t.gene_function_calls_table_name, rows)

        # disconnect like a pro.
        contigs_db.disconnect()

        self.run.info('Gene functions', '%d function calls from %d sources for %d unique gene calls has\
                                         been added to the contigs database.' % \
                                            (len(functions_dict), len(gene_function_sources), unique_num_genes))
Example #19
0
    def add_new_gene_calls_to_contigs_db_and_update_serach_results_dict(self, source, search_results_dict, skip_amino_acid_sequences=False):
        """Add one new gene call per HMM hit to the contigs database and update `search_results_dict`.

           HMM hits found in the context of CONTIGS do not relate to the gene calls a
           contigs database already has. One solution is to add an additional gene call
           for each hit, so there is something in the database to keep them with.

           Every entry of `search_results_dict` gets a `gene_callers_id` key (the dict is
           modified in place), and the same dict is returned."""

        # ids for the new gene calls continue from the largest id found in the db
        contigs_db = db.DB(self.db_path, utils.get_required_version_for_db(self.db_path))
        next_id = contigs_db.get_max_value_in_column('genes_in_contigs', 'gene_callers_id') + 1
        contigs_db.disconnect()

        additional_gene_calls = {}
        for hit in search_results_dict.values():
            start, stop = hit['start'], hit['stop']

            if stop > start:
                direction = 'f'
            else:
                # reverse hit: flip the coordinates so that start comes first
                direction, start, stop = 'r', stop, start

            # a gene call is complete only if its length is a multiple of three
            partial = 1 if (stop - start) % 3 else 0

            additional_gene_calls[next_id] = dict(contig=hit['contig_name'],
                                                  start=start,
                                                  stop=stop,
                                                  direction=direction,
                                                  partial=partial,
                                                  source=source,
                                                  version='unknown')

            # let the hit know about its gene call, and move on to the next id
            hit['gene_callers_id'] = next_id
            next_id += 1

        if not additional_gene_calls:
            return search_results_dict

        # update the contigs db with the gene calls in `additional_gene_calls` dict.
        gene_calls_table = TablesForGeneCalls(self.db_path, run=terminal.Run(verbose=False))
        gene_calls_table.use_external_gene_calls_to_populate_genes_in_contigs_table(
            input_file_path=None,
            gene_calls_dict=additional_gene_calls,
            ignore_internal_stop_codons=True,
            skip_amino_acid_sequences=skip_amino_acid_sequences)
        gene_calls_table.populate_genes_in_splits_tables(gene_calls_dict=additional_gene_calls)

        # refresh the gene calls dict
        self.init_gene_calls_dict()

        self.run.info('Gene calls added to db', '%d (from source "%s")' % (len(additional_gene_calls), source))

        return search_results_dict
Example #20
0
    def append(self, source, reference, kind_of_search, domain, all_genes,
               search_results_dict):
        """Store the search results of an HMM source in the database.

        Every hit in `search_results_dict` is modified in place: it gets a
        `gene_unique_identifier`, a `source`, and (once it is given an id) an
        `hmm_hit_entry_id`. Previous entries for `source` are removed first. A row
        describing the search is always added to the HMM hits info table, even when
        there are no hits to store.
        """
        # each gene gets a unique identifier, which is used to track genes that break into
        # multiple pieces due to arbitrary split boundaries. the 'source' is added to the hit
        # as well so it matches the table structure.
        for hit in search_results_dict.values():
            gene_call = self.gene_calls_dict[hit['gene_callers_id']]

            identifier_parts = [str(self.contigs_db_hash),
                                gene_call['contig'],
                                hit['gene_name'],
                                str(gene_call['start']),
                                str(gene_call['stop'])]

            hit['gene_unique_identifier'] = hashlib.sha224('_'.join(identifier_parts).encode('utf-8')).hexdigest()
            hit['source'] = source

        self.remove_source(source)

        connection = db.DB(self.db_path, utils.get_required_version_for_db(self.db_path))

        # information about this search goes into the info table no matter what
        search_info = [source, reference, kind_of_search, domain, ', '.join(all_genes)]
        connection._exec('''INSERT INTO %s VALUES (?,?,?,?,?)''' % t.hmm_hits_info_table_name, search_info)

        # nothing else to do if the search returned no hits
        if not search_results_dict:
            connection.disconnect()
            return

        hit_columns = t.hmm_hits_table_structure[1:]
        hit_rows = []
        for hit in search_results_dict.values():
            entry_id = self.next_id(t.hmm_hits_table_name)
            hit_rows.append(tuple([entry_id] + [hit[column] for column in hit_columns]))

            # tiny hack here: the unique id of the hit is fed back into the dictionary, so the
            # split-level entries generated below know who their parent is.
            hit['hmm_hit_entry_id'] = entry_id

        connection._exec_many('''INSERT INTO %s VALUES (?,?,?,?,?,?,?)''' % t.hmm_hits_table_name, hit_rows)

        split_rows = self.process_splits(search_results_dict)
        connection._exec_many('''INSERT INTO %s VALUES (?,?,?,?,?)''' % t.hmm_hits_splits_table_name, split_rows)

        connection.disconnect()
Example #21
0
    def __init__(self, db_path, run=run, progress=progress):
        """Keep the db path and reporting objects, then initialize `Table`.

        The version passed to `Table.__init__` is the one
        `utils.get_required_version_for_db` reports for `db_path`.
        """
        self.run, self.progress, self.db_path = run, progress, db_path

        required_version = utils.get_required_version_for_db(db_path)
        Table.__init__(self, self.db_path, required_version, self.run, self.progress)
Example #22
0
    def store(self):
        """Replace the contents of the pan gene clusters table with `self.entries`.

        The table is emptied first (without a warning), then every entry is
        inserted as a four-column row.
        """
        self.delete_contents_of_table(t.pan_gene_clusters_table_name, warning=False)

        rows = list(map(tuple, self.entries))

        required_version = utils.get_required_version_for_db(self.db_path)
        pan_db = db.DB(self.db_path, required_version)
        insert_statement = '''INSERT INTO %s VALUES (?,?,?,?)''' % t.pan_gene_clusters_table_name
        pan_db._exec_many(insert_statement, rows)
        pan_db.disconnect()
Example #23
0
    def get_num_entries(self):
        """Return the row count of the variable nucleotides table."""
        required_version = utils.get_required_version_for_db(self.db_path)
        connection = db.DB(self.db_path, required_version)

        row_count = connection.get_row_counts_from_table(t.variable_nts_table_name)
        connection.disconnect()

        return row_count
Example #24
0
    def populate_taxon_names_table(self):
        """Insert one row per entry of `self.taxon_names_dict` into the taxon names table.

        Each row is the taxon name id followed by the values for every column
        of `t.taxon_names_table_structure` after the first one.
        """
        database = db.DB(self.db_path, utils.get_required_version_for_db(self.db_path))

        rows = []
        for taxon_name_id, taxon_names in self.taxon_names_dict.items():
            levels = [taxon_names[level] for level in t.taxon_names_table_structure[1:]]
            rows.append(tuple([taxon_name_id] + levels))

        insert_statement = '''INSERT INTO %s VALUES (?,?,?,?,?,?,?)''' % t.taxon_names_table_name
        database._exec_many(insert_statement, rows)

        database.disconnect()
        self.run.info('Taxon names table', 'Updated with %d unique taxon names' % len(rows))
Example #25
0
    def __init__(self, db_path):
        """Prepare access to the states of a pan or profile database.

        `utils.is_pan_or_profile_db` is called on the path before the `Table`
        base is initialized; `self.init()` then fills `self.states`.
        """
        self.states = {}
        self.db_path = db_path

        utils.is_pan_or_profile_db(self.db_path)

        required_version = utils.get_required_version_for_db(db_path)
        Table.__init__(self, self.db_path, required_version, run, progress)

        self.init()
Example #26
0
    def __init__(self, db_path, run=run, progress=progress):
        """Initialize the `Table` base and the id counters of the collections tables."""
        self.db_path = db_path

        required_version = utils.get_required_version_for_db(db_path)
        Table.__init__(self, self.db_path, required_version, run, progress)

        # unique ids are needed for each of the three collections tables
        for table_name in (t.collections_bins_info_table_name,
                           t.collections_contigs_table_name,
                           t.collections_splits_table_name):
            self.set_next_available_id(table_name)
Example #27
0
    def __init__(self, db_path, run=run, progress=progress):
        """Initialize the `Table` base and an empty buffer for variable codons entries."""
        self.db_path, self.run, self.progress = db_path, run, progress

        required_version = utils.get_required_version_for_db(db_path)
        Table.__init__(self, self.db_path, required_version, run=self.run, progress=self.progress)

        # nothing buffered yet
        self.num_entries = 0
        self.db_entries = []

        self.set_next_available_id(t.variable_codons_table_name)
Example #28
0
    def __init__(self, db_path):
        """Prepare access to the states table of the database at `db_path`.

        Raises:
            ConfigError: if `utils.get_db_type` reports a database type other
                than 'profile', 'pan', or 'structure'.
        """
        self.db_path = db_path
        self.states = {}

        if utils.get_db_type(self.db_path) not in ['profile', 'pan', 'structure']:
            # interpolate the path so the message names the offending database
            raise ConfigError("Your database '%s' does not seem to have states table, which anvi'o tries to access." % self.db_path)

        Table.__init__(self, self.db_path, utils.get_required_version_for_db(db_path), run, progress)

        # load the existing states into `self.states`
        self.init()
Example #29
0
    def store_state(self, state_id, content, last_modified=None):
        """Store `content` under `state_id`, replacing any previous state with that id.

        When `last_modified` is falsy, the current local time formatted as
        "%d.%m.%Y %H:%M:%S" is used. `self.states` is reloaded from the states
        table after the insert.
        """
        self.remove_state(state_id)

        if not last_modified:
            last_modified = datetime.datetime.now().strftime("%d.%m.%Y %H:%M:%S")

        required_version = utils.get_required_version_for_db(self.db_path)
        database = db.DB(self.db_path, required_version)

        new_row = (state_id, content, last_modified)
        database._exec('''INSERT INTO %s VALUES (?,?,?)''' % t.states_table_name, new_row)
        self.states = database.get_table_as_dict(t.states_table_name)

        database.disconnect()
Example #30
0
 def store(self):
     """Insert the buffered `self.db_entries` into the variable codons table.

     The statement gets one placeholder per column of
     `t.variable_codons_table_structure`.
     """
     utils.is_profile_db(self.db_path)

     required_version = utils.get_required_version_for_db(self.db_path)
     profile_db = db.DB(self.db_path, required_version)

     # joining a string of '?' characters yields '?,?,...,?'
     placeholders = ','.join('?' * len(t.variable_codons_table_structure))
     insert_statement = '''INSERT INTO %s VALUES (%s)''' % (t.variable_codons_table_name, placeholders)

     profile_db._exec_many(insert_statement, self.db_entries)
     profile_db.disconnect()
Example #31
0
    def __init__(self, db_path, run=run, progress=progress):
        """Initialize the `Table` base and an empty buffer for variable amino acid entries."""
        self.db_path, self.run, self.progress = db_path, run, progress

        required_version = utils.get_required_version_for_db(db_path)
        Table.__init__(self, self.db_path, required_version, run=self.run, progress=self.progress)

        # nothing buffered yet
        self.num_entries = 0
        self.db_entries = []

        self.set_next_available_id(t.variable_aas_table_name)
Example #32
0
    def store_state(self, state_id, content, last_modified=None):
        """Write a state to the states table, overwriting one with the same id.

        A falsy `last_modified` is replaced with the current local time in the
        "%d.%m.%Y %H:%M:%S" format. Once the row is inserted, `self.states` is
        refreshed from the table.
        """
        self.remove_state(state_id)

        timestamp = last_modified
        if not timestamp:
            timestamp = datetime.datetime.now().strftime("%d.%m.%Y %H:%M:%S")

        states_db = db.DB(self.db_path, utils.get_required_version_for_db(self.db_path))
        insert_statement = '''INSERT INTO %s VALUES (?,?,?)''' % t.states_table_name
        states_db._exec(insert_statement, (state_id, content, timestamp))
        self.states = states_db.get_table_as_dict(t.states_table_name)

        states_db.disconnect()
Example #33
0
    def __init__(self, db_path):
        """Prepare access to the states table of the database at `db_path`.

        Raises:
            ConfigError: if `utils.get_db_type` reports a database type other
                than 'profile', 'pan', 'structure', or 'genes'.
        """
        self.db_path = db_path
        self.states = {}

        if utils.get_db_type(self.db_path) not in ['profile', 'pan', 'structure', 'genes']:
            # interpolate the path so the message names the offending database
            raise ConfigError("Your database '%s' does not seem to have states table, which anvi'o tries to access." % self.db_path)

        Table.__init__(self, self.db_path, utils.get_required_version_for_db(db_path), run, progress)

        # load the existing states into `self.states`
        self.init()
Example #34
0
    def add_new_gene_calls_to_contigs_db_and_update_serach_results_dict(self, source, search_results_dict):
        """Add a new gene call for every HMM hit and link each hit to its gene call.

           When HMM hits are searched in the context of CONTIGS, the hits do not
           relate to the gene calls already present in the contigs database. One
           solution is to add a new gene call for each hit, so the hits can be
           kept in the database.

           Every entry of `search_results_dict` is updated in place with the
           `gene_callers_id` assigned to it, and the same dict is returned."""

        # ids for the new gene calls continue after the largest one in the db
        contigs_db = db.DB(self.db_path, utils.get_required_version_for_db(self.db_path))
        next_id = contigs_db.get_max_value_in_column('genes_in_contigs', 'gene_callers_id') + 1
        contigs_db.disconnect()

        new_gene_calls = {}
        for hit in search_results_dict.values():
            start, stop = hit['start'], hit['stop']

            # a hit that does not run forward is stored with its coordinates
            # swapped and the direction set to 'r'
            direction = 'f'
            if not stop > start:
                direction = 'r'
                start, stop = stop, start

            # lengths that are not a multiple of three are flagged as partial
            partial = 1 if (stop - start) % 3 else 0

            new_gene_calls[next_id] = {'contig': hit['contig_name'],
                                       'start': start,
                                       'stop': stop,
                                       'direction': direction,
                                       'partial': partial,
                                       'source': source,
                                       'version': 'unknown'}

            # the hit now points to the gene call created for it
            hit['gene_callers_id'] = next_id
            next_id += 1

        # push the new gene calls into the contigs db
        quiet_run = terminal.Run(verbose=False)
        gene_calls_table = TablesForGeneCalls(self.db_path, run=quiet_run)
        gene_calls_table.use_external_gene_calls_to_populate_genes_in_contigs_table(input_file_path=None,
                                                                                    gene_calls_dict=new_gene_calls,
                                                                                    ignore_internal_stop_codons=True)
        gene_calls_table.populate_genes_in_splits_tables()

        # reload the gene calls so the new ones are visible to this instance
        self.init_gene_calls_dict()

        self.run.info('Gene calls added to db', '%d (from source "%s")' % (len(new_gene_calls), source))

        return search_results_dict
Example #35
0
    def __init__(self, db_path, run=run, progress=progress):
        """Initialize the `Table` base and the id counters of the collections tables.

        The version reported by `utils.get_required_version_for_db` is kept in
        `self.version` and passed to `Table.__init__`.
        """
        self.db_path = db_path
        self.version = utils.get_required_version_for_db(db_path)
        self.run = run

        Table.__init__(self, self.db_path, self.version, run, progress)

        # unique ids are needed for each of the three collections tables
        for table_name in (t.collections_bins_info_table_name,
                           t.collections_contigs_table_name,
                           t.collections_splits_table_name):
            self.set_next_available_id(table_name)
Example #36
0
    def __init__(self, db_path, run=run, progress=progress):
        """Initialize the `Table` base, the current entry count, and an empty buffer."""
        self.db_path, self.run, self.progress = db_path, run, progress

        required_version = utils.get_required_version_for_db(db_path)
        Table.__init__(self, self.db_path, required_version, run=self.run, progress=self.progress)

        # start from whatever `get_num_entries` reports for the table
        self.num_entries = self.get_num_entries()
        self.db_entries = []

        # buffer size limit; the code that enforces it is not part of this method
        self.max_num_entries_in_storage_buffer = 15000
Example #37
0
    def add_empty_sources_to_functional_sources(self, gene_function_sources):
        """Pass `gene_function_sources` to `add_new_sources_to_functional_sources`.

        A database connection is opened for the call and closed afterwards.

        Raises:
            ConfigError: if `gene_function_sources` is not exactly of type `set`.
        """
        received_type = type(gene_function_sources)
        if received_type is not set:
            raise ConfigError('The programmer who called this function forgot that gene_function_sources must be of \
                               type %s. If this is not your falut, please contact an anvi\'o developer.' % set)

        required_version = utils.get_required_version_for_db(self.db_path)
        contigs_db = db.DB(self.db_path, required_version)

        self.add_new_sources_to_functional_sources(gene_function_sources, contigs_db)

        contigs_db.disconnect()
Example #38
0
    def populate_genes_taxonomy_table(self):
        """Insert every (gene callers id, value) pair of `self.genes_taxonomy_dict`
        into the genes taxonomy table, and report how many were stored."""
        required_version = utils.get_required_version_for_db(self.db_path)
        database = db.DB(self.db_path, required_version)

        # each dict item is already the two-column row the table expects
        rows = list(self.genes_taxonomy_dict.items())
        insert_statement = '''INSERT INTO %s VALUES (?,?)''' % t.genes_taxonomy_table_name
        database._exec_many(insert_statement, rows)

        database.disconnect()

        self.run.info('Genes taxonomy table', 'Taxonomy stored for %d gene calls' % len(rows))
Example #39
0
    def store(self):
        """Flush the buffered `self.db_entries` into the indels table.

        Does nothing when the buffer is empty. Otherwise the entries are
        inserted, a summary line is printed if `anvio.DEBUG` is set, and the
        buffer is reset to an empty list.
        """
        num_buffered = len(self.db_entries)
        if num_buffered == 0:
            return

        required_version = utils.get_required_version_for_db(self.db_path)
        profile_db = db.DB(self.db_path, required_version)
        insert_statement = '''INSERT INTO %s VALUES (?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?)''' % t.indels_table_name
        profile_db._exec_many(insert_statement, self.db_entries)
        profile_db.disconnect()

        if anvio.DEBUG:
            run.info_single("INDELS: %d entries added to the indels table." % len(self.db_entries), mc="green")

        # start over with an empty buffer
        self.db_entries = []
Example #40
0
    def append(self, source, reference, kind_of_search, domain, all_genes, search_results_dict):
        """Write the HMM search results of `source` into the database.

        Hits in `search_results_dict` are modified in place: each receives the
        `source` and a `gene_unique_identifier` (SHA-224 hex digest of the
        contigs db hash, contig name, gene name, and gene call start/stop),
        which is used to track genes that arbitrary split boundaries break
        into multiple pieces. `self.remove_source(source)` is called before
        anything is inserted.
        """
        contigs_db_hash = str(self.contigs_db_hash)
        for hit in search_results_dict.values():
            gene_call = self.gene_calls_dict[hit['gene_callers_id']]

            key = '_'.join([contigs_db_hash,
                            gene_call['contig'],
                            hit['gene_name'],
                            str(gene_call['start']),
                            str(gene_call['stop'])])

            hit['gene_unique_identifier'] = hashlib.sha224(key.encode('utf-8')).hexdigest()
            hit['source'] = source

        self.remove_source(source)

        database = db.DB(self.db_path, utils.get_required_version_for_db(self.db_path))

        # describe the search in the info table
        info_row = [source, reference, kind_of_search, domain, ', '.join(all_genes)]
        database._exec('''INSERT INTO %s VALUES (?,?,?,?,?)''' % t.hmm_hits_info_table_name, info_row)

        # an empty result set leaves nothing more to store
        if len(search_results_dict) == 0:
            database.disconnect()
            return

        # store the hits. each hit remembers the id of its row, so the
        # split-level entries generated below know who their parent is.
        rows = []
        for hit in search_results_dict.values():
            row_id = self.next_id(t.hmm_hits_table_name)
            values = [hit[column] for column in t.hmm_hits_table_structure[1:]]
            rows.append(tuple([row_id] + values))
            hit['hmm_hit_entry_id'] = row_id

        database._exec_many('''INSERT INTO %s VALUES (?,?,?,?,?,?,?)''' % t.hmm_hits_table_name, rows)

        rows = self.process_splits(search_results_dict)
        database._exec_many('''INSERT INTO %s VALUES (?,?,?,?,?)''' % t.hmm_hits_splits_table_name, rows)

        database.disconnect()
Example #41
0
    def read_atomic_data_tables(self):
        """Read the atomic data of contigs and splits from every profile database.

        Each path in `self.profile_dbs_info_dict` is opened once, both of its
        `atomic_data_contigs` and `atomic_data_splits` tables are read, and the
        connection is closed again even if reading fails.

        Returns:
            A tuple `(atomic_data_table_fields, atomic_data_table_for_each_run)`.
            The first item is the structure of the `atomic_data_splits` table
            as reported by the last database read, or None when
            `self.profile_dbs_info_dict` is empty. The second item maps
            'contigs' and 'splits' to a dict of `{profile_db_path: table_dict}`.
        """
        targets = ['contigs', 'splits']

        atomic_data_table_for_each_run = {target: {} for target in targets}
        atomic_data_table_fields = None

        for input_profile_db_path in self.profile_dbs_info_dict:
            profile_db = anvio.db.DB(input_profile_db_path, utils.get_required_version_for_db(input_profile_db_path))

            try:
                for target in targets:
                    target_table = 'atomic_data_%s' % target
                    atomic_data_table_for_each_run[target][input_profile_db_path] = profile_db.get_table_as_dict(target_table)

                atomic_data_table_fields = profile_db.get_table_structure('atomic_data_splits')
            finally:
                # never leave a connection behind, one is opened per profile db
                profile_db.disconnect()

        return atomic_data_table_fields, atomic_data_table_for_each_run
Example #42
0
    def create(self, genes_taxonomy_dict, taxon_names_dict, source='unkown source'):
        """Populate the gene-level taxonomy tables of the contigs database.

        The incoming dicts are kept on the instance and checked with
        `self.sanity_check()`. If the database already names a
        `gene_level_taxonomy_source`, the splits taxonomy, taxon names, and
        genes taxonomy tables are emptied first. The three `populate_*`
        methods then fill the tables, and `source` is recorded as the new
        `gene_level_taxonomy_source`.

        Raises:
            ConfigError: if `self.genes_are_called` is false.
        """
        self.source = source

        if not self.genes_are_called:
            raise ConfigError("Something is wrong. The contigs database says that genes were not called, and here\
                                you are trying to populate taxonomy tables for genes. No, thanks.")

        self.init_gene_calls_dict()

        self.genes_taxonomy_dict = genes_taxonomy_dict
        self.taxon_names_dict = taxon_names_dict

        self.sanity_check()

        # open connection
        database = db.DB(self.db_path, utils.get_required_version_for_db(self.db_path))

        self.splits_info = database.get_table_as_dict(t.splits_info_table_name)

        # test whether the taxonomy tables were populated before
        taxonomy_source = database.get_meta_value('gene_level_taxonomy_source')
        if taxonomy_source:
            self.run.warning('Previous taxonomy information from "%s" is being replaced with the incoming data\
                              through "%s".' % (taxonomy_source, self.source))
            database._exec('''DELETE FROM %s''' % (t.splits_taxonomy_table_name))
            database._exec('''DELETE FROM %s''' % (t.taxon_names_table_name))
            database._exec('''DELETE FROM %s''' % (t.genes_taxonomy_table_name))

        # populate taxon names table
        self.populate_taxon_names_table()

        # populate genes taxonomy table
        self.populate_genes_taxonomy_table()

        # compute and push split taxonomy information.
        self.populate_splits_taxonomy_table()

        # set the source
        database.remove_meta_key_value_pair('gene_level_taxonomy_source')
        database.set_meta_value('gene_level_taxonomy_source', self.source)

        # disconnect like a pro.
        database.disconnect()
Example #43
0
    def __init__(self, db_path, num_threads_to_use=1, run=run, progress=progress):
        """Prepare access to the HMM hits tables of a contigs database.

        Reads the `contigs_db_hash` meta value, initializes the `Table` base,
        loads the gene calls, and sets the id counters for the HMM hits and
        HMM hits splits tables.

        Raises:
            ConfigError: if `self.genes_are_called` is false, or if no gene
                calls are found after `self.init_gene_calls_dict()`.
        """
        self.num_threads_to_use = num_threads_to_use
        self.db_path = db_path

        utils.is_contigs_db(self.db_path)

        # close the connection once the hash is read instead of leaving it
        # open for the lifetime of the process
        contigs_db = db.DB(self.db_path, utils.get_required_version_for_db(self.db_path))
        try:
            self.contigs_db_hash = contigs_db.get_meta_value('contigs_db_hash')
        finally:
            contigs_db.disconnect()

        Table.__init__(self, self.db_path, anvio.__contigs__version__, run, progress)

        if not self.genes_are_called:
            raise ConfigError("It seems the contigs database '%s' was created with '--skip-gene-calling' flag.\
                                Nothing to do here :/" % (self.db_path))

        self.init_gene_calls_dict()

        if not len(self.gene_calls_dict):
            raise ConfigError("Tables that should contain gene calls are empty. Which probably means the gene\
                                caller reported no genes for your contigs.")

        self.set_next_available_id(t.hmm_hits_table_name)
        self.set_next_available_id(t.hmm_hits_splits_table_name)
Example #44
0
    def populate_genes_in_contigs_table(self, gene_calls_dict, amino_acid_sequences, append_to_the_db=False):
        """Store gene calls and their amino acid sequences in the contigs database.

        Without `append_to_the_db`, the genes in contigs and gene amino acid
        sequences tables are emptied first. With it, only the gene calls that
        come from the same sources as the incoming ones are removed (from the
        genes in contigs and genes in splits tables), so re-adding a source
        does not create duplicates.

        `amino_acid_sequences` must have an entry for every key of
        `gene_calls_dict`.
        """
        utils.is_contigs_db(self.db_path)
        database = db.DB(self.db_path, utils.get_required_version_for_db(self.db_path))

        if append_to_the_db:
            # find the gene caller ids already stored for each incoming source,
            # and drop them from both tables that reference them.
            incoming_sources = {gene_call['source'] for gene_call in gene_calls_dict.values()}

            for source in incoming_sources:
                stored_ids = database.get_single_column_from_table(t.genes_in_contigs_table_name,
                                                                   'gene_callers_id',
                                                                   where_clause="""source='%s'""" % source)

                if stored_ids:
                    id_list = ','.join(map(str, stored_ids))
                    for table_name in [t.genes_in_contigs_table_name, t.genes_in_splits_table_name]:
                        database._exec('''DELETE FROM %s WHERE gene_callers_id IN (%s)''' % (table_name, id_list))
        else:
            database._exec('''DELETE FROM %s''' % (t.genes_in_contigs_table_name))
            database._exec('''DELETE FROM %s''' % (t.gene_amino_acid_sequences_table_name))

        self.progress.new('Processing')
        self.progress.update('Entering %d gene calls into the db ...' % (len(gene_calls_dict)))

        # gene calls: the id followed by every column after the first one
        gene_call_rows = []
        for gene_callers_id in gene_calls_dict:
            gene_call = gene_calls_dict[gene_callers_id]
            columns = [gene_call[h] for h in t.genes_in_contigs_table_structure[1:]]
            gene_call_rows.append(tuple([gene_callers_id] + columns))
        database._exec_many('''INSERT INTO %s VALUES (?,?,?,?,?,?,?,?)''' % t.genes_in_contigs_table_name, gene_call_rows)

        # amino acid sequences: (id, sequence) pairs
        sequence_rows = [(gene_callers_id, amino_acid_sequences[gene_callers_id]) for gene_callers_id in gene_calls_dict]
        database._exec_many('''INSERT INTO %s VALUES (?,?)''' % t.gene_amino_acid_sequences_table_name, sequence_rows)

        self.progress.end()

        database.disconnect()
Example #45
0
 def init(self):
     """Load the contents of the states table into `self.states`."""
     required_version = utils.get_required_version_for_db(self.db_path)
     states_db = db.DB(self.db_path, required_version)

     self.states = states_db.get_table_as_dict(t.states_table_name)

     states_db.disconnect()
Example #46
0
    def create(self, functions_dict, drop_previous_annotations_first=False):
        """Add the function calls in `functions_dict` to the contigs database.

        How previously stored annotations are treated depends on the
        `gene_function_sources` meta value of the database:

          - no sources in the db: the incoming sources are recorded.
          - sources in the db and `drop_previous_annotations_first`: every
            stored function call is deleted and the id counter is reset.
          - some incoming sources are already in the db: the stored function
            calls of those sources are deleted and replaced.
          - otherwise: the incoming sources are added next to the stored ones.

        Each value of `functions_dict` must provide 'source',
        'gene_callers_id', and every column of
        `t.gene_function_calls_table_structure` after the first one.
        """
        self.sanity_check()

        # incoming stuff:
        gene_function_sources = {v['source'] for v in functions_dict.values()}
        unique_num_genes = len({v['gene_callers_id'] for v in functions_dict.values()})

        # open connection
        database = db.DB(self.db_path, utils.get_required_version_for_db(self.db_path))

        # are there any previous annotations in the db:
        gene_function_sources_in_db = database.get_meta_value('gene_function_sources')
        gene_function_sources_in_db = set(gene_function_sources_in_db.split(',') if gene_function_sources_in_db else [])

        # sources that are both in the db and in the incoming data:
        gene_function_sources_both_in_db_and_incoming_dict = gene_function_sources.intersection(gene_function_sources_in_db)

        # each case below decides what must be deleted, and which sources the
        # db should list once the incoming data is stored.
        if not gene_function_sources_in_db:
            # nothing in the db yet, the incoming sources are all there is
            sources_to_record = gene_function_sources

        elif drop_previous_annotations_first:
            # there are function calls, but the user wants everything to be dropped.
            self.run.warning("As per your request, anvi'o is DROPPING all previous function calls from %d sources\
                              before adding the incoming data, which contains %d entries originating from %d sources: %s" \
                                    % (len(gene_function_sources_in_db), len(functions_dict),
                                       len(gene_function_sources), ', '.join(gene_function_sources)))

            # clean the table and reset the next available ids
            database._exec('''DELETE FROM %s''' % (t.gene_function_calls_table_name))
            self.reset_next_available_id_for_table(t.gene_function_calls_table_name)

            sources_to_record = gene_function_sources

        elif gene_function_sources_both_in_db_and_incoming_dict:
            # some of the incoming sources match what is already in the db. replace them.
            self.run.warning("Some of the annotation sources you want to add into the database are already in the db. So\
                              anvi'o will REPLACE those with the incoming data from these sources: %s" % \
                                            ', '.join(gene_function_sources_both_in_db_and_incoming_dict))

            # remove the entries of the matching sources. the source name is
            # bound as a parameter: a double-quoted token in the statement can
            # be resolved as a column name, and a quote character in the name
            # would break the statement.
            for source in gene_function_sources_both_in_db_and_incoming_dict:
                database._exec('''DELETE FROM %s WHERE source = ?''' % t.gene_function_calls_table_name, (source,))

            sources_to_record = gene_function_sources_in_db.union(gene_function_sources)

        else:
            # functions in the db, but none of them match the incoming sources. totally new stuff.
            sources_to_record = gene_function_sources_in_db.union(gene_function_sources)

        # set the sources
        database.remove_meta_key_value_pair('gene_function_sources')
        database.set_meta_value('gene_function_sources', ','.join(sources_to_record))

        # push the data
        db_entries = []
        for entry_key in functions_dict:
            function_call = functions_dict[entry_key]
            entry_id = self.next_id(t.gene_function_calls_table_name)
            db_entries.append(tuple([entry_id] + [function_call[h] for h in t.gene_function_calls_table_structure[1:]]))
        database._exec_many('''INSERT INTO %s VALUES (?,?,?,?,?,?)''' % t.gene_function_calls_table_name, db_entries)

        # disconnect like a pro.
        database.disconnect()

        self.run.info('Gene functions', '%d function calls from %d sources for %d unique gene calls has\
                                        been added to the contigs database.' % \
                                            (len(functions_dict), len(gene_function_sources), unique_num_genes))
Example #47
0
    def use_external_gene_calls_to_populate_genes_in_contigs_table(self, input_file_path, gene_calls_dict=None, ignore_internal_stop_codons=False):
        """Add genes to the contigs database.

           Either provide an `input_file_path` for external gene calls, or provide an
           external gene calls dictionary. The format should follow this:

                {
                  "1": {
                      "contig": "contig_name",
                      "start": 20,
                      "stop": 1544,
                      "direction": "f",
                      "partial": 0,
                      "source": "source_name",
                      "version": "unknown"
                  },

                  "2": {
                    (...)
                  },

                (...)
                }

            If you provide a `gene_calls_dict`, they will be APPENDED to the database. So you
            need to make sure gene caller ids in your dict does not overlap with the ones in
            the database.

        """

        # by default we assume that this is a pristine run. but if the user sends a dictionary
        append_to_the_db = False

        gene_calls_found = False
        # let's do a rigorous check whether the user provided a gene_calls_dict.
        if (gene_calls_dict is not None and gene_calls_dict is not False):
            if not isinstance(gene_calls_dict, dict):
                raise ConfigError("'Use external gene calls' function received a non-empty gene_calls_dict object,\
                                    but it is of type '%s', and not '%s'" % (type(gene_calls_dict), type({})))

            # congrats, we have a dict.
            gene_calls_found = True

            if not len(gene_calls_dict):
                # but it is empty ... silly user.
                self.run.info_single("'Use external gene calls' function found an empty gene calls dict, returning\
                                      prematurely and assuming you know what's up. If you don't, stop here and try to\
                                      identify what decisions you've made might have led you to this weird point your\
                                      workflow (or 'life', totally up to you and your mood, but anvi'o thinks you've\
                                      done great so far.", nl_before=1, nl_after=1)
                return


        if (not input_file_path and not gene_calls_found) or (input_file_path and gene_calls_found):
            raise ConfigError("You must provide either an input file, or an gene calls dict to process external\
                               gene calls. You called `use_external_gene_calls_to_populate_genes_in_contigs_table`\
                               with wrong parameters.")

        Table.__init__(self, self.db_path, anvio.__contigs__version__, self.run, self.progress, simple=True)

        # take care of gene calls dict
        if not gene_calls_found:
            gene_calls_dict = utils.get_TAB_delimited_file_as_dictionary(input_file_path,
                                                                         expected_fields=t.genes_in_contigs_table_structure,
                                                                         only_expected_fields=True,
                                                                         column_mapping=[int, str, int, int, str, int, str, str])

            if not len(gene_calls_dict):
                raise ConfigError("You provided an external gene calls file, but it returned zero gene calls. Assuming that\
                                   this is an error, anvi'o will stop here and complain. If this is not an error and you\
                                   in fact expected this, the proper way of doing this is to use `--skip-gene-calls` flag,\
                                   instead of providing an emtpy external gene calls file. You don't agree? You need this\
                                   for some weird step for you weird pipeline? Let us know, and we will consider changing\
                                   this.")

            self.run.info("External gene calls", "%d gene calls recovered and will be processed." % len(gene_calls_dict))
        else:
            # FIXME: we need to make sure the gene caller ids in the incoming directory is not going to
            #        overwrite an existing gene call. Something like this would have returned the
            #        current max, which could be cross-checked with what's in the dict:
            #
            #            contigs_db = ContigsDatabase(self.db_path)
            #            next_id = contigs_db.db.get_max_value_in_column('genes_in_contigs', 'gene_callers_id') + 1
            #            contigs_db.disconnect()
            append_to_the_db = True

        # recover amino acid sequences. during this operation we are going to have to read all contig sequences
        # into the damn memory. anvi'o is doing a pretty bad job with memory management :(
        amino_acid_sequences = {}

        contig_sequences = {}
        if self.contigs_fasta:
            fasta = u.SequenceSource(self.contigs_fasta)
            while next(fasta):
                contig_sequences[fasta.id] = {'sequence': fasta.seq}
            fasta.close()
        else:
            database = db.DB(self.db_path, utils.get_required_version_for_db(self.db_path))
            contig_sequences = database.get_table_as_dict(t.contig_sequences_table_name)

        num_genes_with_internal_stops = 0
        number_of_impartial_gene_calls = 0
        for gene_callers_id in gene_calls_dict:
            gene_call = gene_calls_dict[gene_callers_id]
            contig_name = gene_call['contig']

            if contig_name not in contig_sequences:
                # remove the partial contigs database so things don't get screwed later
                os.remove(self.db_path)
                raise ConfigError("You are in big trouble :( The contig name '%s' in your external gene callers file\
                                    does not appear to be in the contigs FASTA file. How did this happen?" % contig_name)

            if gene_call['partial']:
                amino_acid_sequences[gene_callers_id] = ''
                number_of_impartial_gene_calls += 1
                continue

            sequence = contig_sequences[contig_name]['sequence'][gene_call['start']:gene_call['stop']]
            if gene_call['direction'] == 'r':
                sequence = utils.rev_comp(sequence)

            amino_acid_sequence = utils.get_DNA_sequence_translated(sequence, gene_callers_id)

            # check if there are any internal stops:
            if amino_acid_sequence.find('*') > -1:
                if ignore_internal_stop_codons:
                    amino_acid_sequence = amino_acid_sequence.replace('*', 'X')
                    num_genes_with_internal_stops += 1
                else:
                    os.remove(self.db_path)
                    raise ConfigError("Oops. Anvi'o run into an amino acid seqeunce (that corresponds to the gene callers id '%s')\
                                       which had an internal stop codon :/ This usually indicates that your external gene calls\
                                       have problems. If you still want to continue, you can ask anvi'o to ignore internal stop\
                                       codons on your own risk. It will probably look very ugly on your screen, but here is the\
                                       DNA sequence for that gene in case you don't trust anvi'o (which only would be fair since\
                                       anvi'o does not trust you either): %s" % (str(gene_callers_id), sequence))

            amino_acid_sequences[gene_callers_id] = amino_acid_sequence

        # populate genes_in_contigs, and gene_amino_acid_sequences table in contigs db.
        self.populate_genes_in_contigs_table(gene_calls_dict, amino_acid_sequences, append_to_the_db=append_to_the_db)

        if num_genes_with_internal_stops:
            percent_genes_with_internal_stops = num_genes_with_internal_stops * 100.0 / len(gene_calls_dict)
            self.run.warning("Please read this carefully: Your external gene calls contained open reading frames with internal\
                              stop codons, and you asked anvi'o to ignore those. Anvi'o replaced internal stop codons with 'X'\
                              characters, and stored them in the contigs database that way. %d of your genes, which corresponded\
                              to %.2f%% of the total %d genes, had internal stop codons. We hope you are happy." % \
                                        (num_genes_with_internal_stops, percent_genes_with_internal_stops, len(gene_calls_dict)))

        if number_of_impartial_gene_calls:
            self.run.warning('%d of your %d gene calls were impartial, hence the translated amino acid sequences for those\
                              were not stored in the database.' % (number_of_impartial_gene_calls, len(gene_calls_dict)))
Example #48
0
    def get_amino_acid_sequences_for_genes_in_gene_calls_dict(self, gene_calls_dict, ignore_internal_stop_codons=False):
        '''Recover amino acid sequences for gene calls in a gene_calls_dict.

           During this operation we are going to have to read all contig sequences
           into memory, either from `self.contigs_fasta` (when it is set) or from the
           contig sequences table of the database at `self.db_path`.

           Parameters
           ==========
           gene_calls_dict : dict
               Maps a gene callers id to a gene call. Each gene call is read for its
               'contig', 'partial', 'start', 'stop', and 'direction' keys.
           ignore_internal_stop_codons : bool
               When True, internal stop codons ('*') are replaced with 'X' and a warning
               is issued at the end. When False, the first internal stop codon removes
               the file at `self.db_path` and raises.

           Returns
           =======
           dict
               Maps each gene callers id to its translated sequence. Partial gene calls
               are mapped to an empty string.

           Raises
           ======
           ConfigError
               If a gene call refers to an unknown contig, or if an internal stop codon
               is found while `ignore_internal_stop_codons` is False. In both cases the
               file at `self.db_path` is removed first.
        '''

        amino_acid_sequences = {}

        # nothing to translate, so there is no reason to pull every contig into memory
        if not gene_calls_dict:
            return amino_acid_sequences

        # FIXME: this is a very poor practice for memory management:
        contig_sequences = {}

        if self.contigs_fasta:
            fasta = u.SequenceSource(self.contigs_fasta)
            try:
                while next(fasta):
                    contig_sequences[fasta.id] = {'sequence': fasta.seq}
            finally:
                # release the file handle even if reading fails half way through
                fasta.close()
        else:
            database = db.DB(self.db_path, utils.get_required_version_for_db(self.db_path))
            try:
                contig_sequences = database.get_table_as_dict(t.contig_sequences_table_name)
            finally:
                # this connection used to be left open
                database.disconnect()

        num_genes_with_internal_stops = 0
        number_of_impartial_gene_calls = 0
        for gene_callers_id, gene_call in gene_calls_dict.items():
            contig_name = gene_call['contig']

            if contig_name not in contig_sequences:
                # remove the partial contigs database so things don't get screwed later
                os.remove(self.db_path)
                raise ConfigError("You are in big trouble :( The contig name '%s' in your external gene callers file\
                                    does not appear to be in the contigs FASTA file. How did this happen?" % contig_name)

            if gene_call['partial']:
                # partial gene calls are not translated; they are stored as empty sequences
                amino_acid_sequences[gene_callers_id] = ''
                number_of_impartial_gene_calls += 1
                continue

            sequence = contig_sequences[contig_name]['sequence'][gene_call['start']:gene_call['stop']]
            if gene_call['direction'] == 'r':
                sequence = utils.rev_comp(sequence)

            amino_acid_sequence = utils.get_DNA_sequence_translated(sequence, gene_callers_id)

            # check if there are any internal stops:
            if '*' in amino_acid_sequence:
                if not ignore_internal_stop_codons:
                    os.remove(self.db_path)
                    raise ConfigError("Oops. Anvi'o run into an amino acid seqeunce (that corresponds to the gene callers id '%s')\
                                       which had an internal stop codon :/ This usually indicates that your external gene calls\
                                       have problems. If you still want to continue, you can ask anvi'o to ignore internal stop\
                                       codons on your own risk. It will probably look very ugly on your screen, but here is the\
                                       DNA sequence for that gene in case you don't trust anvi'o (which only would be fair since\
                                       anvi'o does not trust you either): %s" % (str(gene_callers_id), sequence))

                amino_acid_sequence = amino_acid_sequence.replace('*', 'X')
                num_genes_with_internal_stops += 1

            amino_acid_sequences[gene_callers_id] = amino_acid_sequence

        if num_genes_with_internal_stops:
            percent_genes_with_internal_stops = num_genes_with_internal_stops * 100.0 / len(gene_calls_dict)
            self.run.warning("Please read this carefully: Your external gene calls contained open reading frames with internal\
                              stop codons, and you asked anvi'o to ignore those. Anvi'o replaced internal stop codons with 'X'\
                              characters, and stored them in the contigs database that way. %d of your genes, which corresponded\
                              to %.2f%% of the total %d genes, had internal stop codons. We hope you are happy." % \
                                        (num_genes_with_internal_stops, percent_genes_with_internal_stops, len(gene_calls_dict)))

        if number_of_impartial_gene_calls:
            self.run.warning('%d of your %d gene calls were impartial, hence the translated amino acid sequences for those\
                              were not stored in the database.' % (number_of_impartial_gene_calls, len(gene_calls_dict)))

        return amino_acid_sequences
Example #49
0
    def get_num_entries(self):
        """Return the number of rows in the variable nucleotides table of the database at `self.db_path`."""
        required_version = utils.get_required_version_for_db(self.db_path)
        connection = db.DB(self.db_path, required_version)

        # read the count first, then release the connection before handing the number back
        row_count = connection.get_row_counts_from_table(t.variable_nts_table_name)
        connection.disconnect()

        return row_count
Example #50
0
    def append(self, collection_name, collection_dict, bins_info_dict=None):
        """Store a collection in the database, replacing any existing one with the same name.

        Parameters
        ==========
        collection_name : str
            Name of the collection. Any pre-existing data stored under this name is
            removed via `self.delete` once the input has passed validation.
        collection_dict : dict
            Maps each bin name to an iterable of split names that belong to that bin.
        bins_info_dict : dict, optional
            Maps each bin name to a dict with 'source' and 'html_color' keys. When it
            is missing or empty, every bin gets a random color and the source 'UNKNOWN'.
            A dict passed in by the caller is filled in place in that case.

        Raises
        ======
        ConfigError
            If a non-empty `bins_info_dict` lacks an entry for a bin in `collection_dict`,
            or if a split name appears more than once across the bins.
        """

        # a fresh dict for every call: with a shared `{}` default, the bin info generated
        # below would leak into every later call that relies on the default.
        if bins_info_dict is None:
            bins_info_dict = {}

        utils.is_this_name_OK_for_database('collection name', collection_name, stringent=False)

        for bin_name in collection_dict:
            utils.is_this_name_OK_for_database('bin name', bin_name, stringent=False)

        if bins_info_dict:
            if set(collection_dict.keys()) - set(bins_info_dict.keys()):
                raise ConfigError('Bins in the collection dict do not match to the ones in the bins info dict.\
                                    They do not have to be identical, but for each bin id, there must be a unique\
                                    entry in the bins informaiton dict. There is something wrong with your input :/')

        # finish validating the input BEFORE touching the database, so a bad input does not
        # cost the user the collection that is already stored under this name.
        num_splits_in_collection_dict = sum(len(splits) for splits in collection_dict.values())
        splits_in_collection_dict = set(chain.from_iterable(collection_dict.values()))
        if len(splits_in_collection_dict) != num_splits_in_collection_dict:
            raise ConfigError("TablesForCollections::append: %d of the split or contig IDs appear more than once in\
                                your collections input. It is unclear to anvi'o how did you manage to do this, but we\
                                cannot go anywhere with this :/" % (num_splits_in_collection_dict - len(splits_in_collection_dict)))

        # remove any pre-existing information for 'collection_name'
        self.delete(collection_name)

        database = db.DB(self.db_path, utils.get_required_version_for_db(self.db_path))

        # how many clusters are defined in 'collection_dict'?
        bin_names = list(collection_dict.keys())

        # push information about this collection into the collections info table.
        db_entries = tuple([collection_name, num_splits_in_collection_dict, len(bin_names), ','.join(bin_names)])
        database._exec('''INSERT INTO %s VALUES (?,?,?,?)''' % t.collections_info_table_name, db_entries)

        if not bins_info_dict:
            colors = utils.get_random_colors_dict(bin_names)
            for bin_name in bin_names:
                bins_info_dict[bin_name] = {'html_color': colors[bin_name], 'source': 'UNKNOWN'}

        # populate bins info table.
        db_entries = [(self.next_id(t.collections_bins_info_table_name), collection_name, b, bins_info_dict[b]['source'], bins_info_dict[b]['html_color']) for b in bin_names]
        database._exec_many('''INSERT INTO %s VALUES (?,?,?,?,?)''' % t.collections_bins_info_table_name, db_entries)

        # populate splits table
        db_entries = []
        for bin_name in collection_dict:
            for split_name in collection_dict[bin_name]:
                db_entries.append(tuple([self.next_id(t.collections_splits_table_name), collection_name, split_name, bin_name]))
        database._exec_many('''INSERT INTO %s VALUES (?,?,?,?)''' % t.collections_splits_table_name, db_entries)
        num_splits = len(db_entries)

        # FIXME: This function can be called to populate the contigs database (via anvi-populate-collections), or
        # the profile database. when it is contigs database, the superclass Table has the self.splits_info variable
        # set when it is initialized. however, the Table instance is missing self.splits_info when it is initialized
        # with the profile database. hence some special controls for contigs db (note that collections_contigs_table
        # is only populated in the contigs database):
        if self.db_type == 'contigs':
            splits_only_in_collection_dict = [c for c in splits_in_collection_dict if c not in self.splits_info]
            splits_only_in_db = [c for c in self.splits_info if c not in splits_in_collection_dict]

            if len(splits_only_in_collection_dict):
                self.run.warning('%d of %d splits found in "%s" results are not in the database. This may be OK,\
                                          but you must be the judge of it. If this is somewhat surprising, please use caution\
                                          and make sure all is fine before going forward with you analysis.'\
                                                % (len(splits_only_in_collection_dict), len(splits_in_collection_dict), collection_name))

            if len(splits_only_in_db):
                self.run.warning('%d of %d splits found in the database were missing from the "%s" results. If this\
                                          does not make any sense, please make sure you know why before going any further.'\
                                                % (len(splits_only_in_db), len(self.splits_info), collection_name))

            # then populate contigs table.
            db_entries = self.process_contigs(collection_name, collection_dict)
            database._exec_many('''INSERT INTO %s VALUES (?,?,?,?)''' % t.collections_contigs_table_name, db_entries)

        database.disconnect()

        # keep the final report readable: list at most `num_bins_to_report` bin names
        num_bins = len(bin_names)
        num_bins_to_report = 50
        if num_bins <= num_bins_to_report:
            bins_to_report = bin_names
            bin_report_msg = "Here is a full list of the bin names in this collection: {}.".format(",".join(bins_to_report))
        else:
            bins_to_report = bin_names[:num_bins_to_report]
            bin_report_msg = "Here is a list of the first {} bin names in this collection: {}.".format(num_bins_to_report, ",".join(bins_to_report))

        self.run.info('Collections', 'The collection "%s" that describes %s splits and %s bins has been successfully added to the\
                                      database at "%s". %s' % (collection_name, pp(num_splits), pp(num_bins), self.db_path, bin_report_msg), mc='green')
Example #51
0
 def store(self):
     """Insert every row held in `self.db_entries` into the variable nucleotides table."""
     connection = db.DB(self.db_path, utils.get_required_version_for_db(self.db_path))

     # one placeholder per value of a row; the whole batch goes out in a single call
     insert_statement = '''INSERT INTO %s VALUES (?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?)''' % t.variable_nts_table_name
     connection._exec_many(insert_statement, self.db_entries)

     connection.disconnect()
Example #52
0
 def store(self):
     """Insert every row held in `self.db_entries` into the variable codons table.

     `utils.is_profile_db` is called on `self.db_path` before the database is opened.
     """
     utils.is_profile_db(self.db_path)

     connection = db.DB(self.db_path, utils.get_required_version_for_db(self.db_path))

     # as many placeholders as there are columns in the table structure
     placeholders = ','.join(['?'] * len(t.variable_codons_table_structure))
     insert_statement = '''INSERT INTO %s VALUES (%s)''' % (t.variable_codons_table_name, placeholders)
     connection._exec_many(insert_statement, self.db_entries)

     connection.disconnect()
Example #53
0
    def populate_splits_taxonomy_table(self):
        """Populate the taxonomy information per split

        Every split of every contig in `self.contigs_info` gets one row in the splits
        taxonomy table. The value is the taxon name id seen most often among the genes
        in `self.genes_taxonomy_dict` that overlap the split, or None when no such gene
        overlaps it. A one-line summary is reported through `self.run.info` at the end.
        """

        # build a dictionary for fast access to all annotated genes identified within a contig
        gene_caller_ids_in_contigs = {}
        for gene_callers_id in self.genes_taxonomy_dict:
            contig = self.gene_calls_dict[gene_callers_id]['contig']
            gene_caller_ids_in_contigs.setdefault(contig, set()).add(gene_callers_id)

        splits_dict = {}

        num_splits_processed = 0
        num_splits_with_taxonomy = 0

        for contig in self.contigs_info:
            # contigs without any annotated gene simply have nothing to go through
            gene_caller_ids = gene_caller_ids_in_contigs.get(contig, ())

            for split_name in self.contig_name_to_splits[contig]:
                num_splits_processed += 1

                # splits without any taxonomy still get a row, with an empty value
                splits_dict[split_name] = None
                start = self.splits_info[split_name]['start']
                stop = self.splits_info[split_name]['end']

                # taxonomy of every gene that overlaps with the split, even partially
                taxon_name_ids = [self.genes_taxonomy_dict[gene_callers_id]
                                  for gene_callers_id in gene_caller_ids
                                  if self.gene_calls_dict[gene_callers_id]['stop'] > start
                                  and self.gene_calls_dict[gene_callers_id]['start'] < stop]

                if not taxon_name_ids:
                    continue

                # the most frequent taxon wins; among ties, the one encountered first
                splits_dict[split_name] = Counter(taxon_name_ids).most_common(1)[0][0]
                num_splits_with_taxonomy += 1

        # open connection
        database = db.DB(self.db_path, utils.get_required_version_for_db(self.db_path))

        try:
            # push taxonomy data
            db_entries = [(split, splits_dict[split]) for split in splits_dict]
            database._exec_many('''INSERT INTO %s VALUES (?,?)''' % t.splits_taxonomy_table_name, db_entries)
        finally:
            # disconnect, even if the insert fails
            database.disconnect()

        # with no splits to process there is nothing to divide by; report 0.0% instead of crashing
        if num_splits_processed:
            percent_splits_with_taxonomy = num_splits_with_taxonomy * 100.0 / num_splits_processed
        else:
            percent_splits_with_taxonomy = 0.0

        self.run.info('Splits taxonomy', 'Input data from "%s" annotated %d of %d splits (%.1f%%) with taxonomy.'\
                                            % (self.source, num_splits_with_taxonomy, num_splits_processed,
                                               percent_splits_with_taxonomy))
Example #54
0
File: views.py Project: meren/anvio
    def __init__(self, db_path, run=run, progress=progress):
        """Remember `db_path` and initialize the `Table` base with the version required for that database."""
        self.db_path = db_path

        # the version comes from the database path itself rather than from a constant
        required_version = utils.get_required_version_for_db(db_path)
        Table.__init__(self, self.db_path, required_version, run, progress)
Example #55
0
    def populate_genes_in_splits_tables(self, gene_calls_dict=None):
        """Associate gene calls with the splits they overlap and store the result in the genes in splits table.

        The `Table` base is (re)initialized for the contigs database at `self.db_path`
        first. When `gene_calls_dict` is missing or empty, `self.gene_calls_dict` is used
        instead. Each gene call is read for its 'contig', 'start', and 'stop' keys.

        NOTE(review): `splits_dict` (per-split gene count, average gene length, and coding
        ratio) is built below but is not used by any statement visible in this part of the
        file -- confirm whether it is consumed elsewhere.
        """
        utils.is_contigs_db(self.db_path)
        Table.__init__(self, self.db_path, anvio.__contigs__version__, self.run, self.progress)
        self.set_next_available_id(t.genes_in_splits_table_name)
        self.init_gene_calls_dict()

        if not gene_calls_dict:
            gene_calls_dict = self.gene_calls_dict

        # entries are numbered starting from the next id reported for the table
        genes_in_splits = GenesInSplits(entry_id_start=self.next_id(t.genes_in_splits_table_name))
        # build a dictionary for fast access to all genes identified within a contig
        gene_calls_in_contigs_dict = {}
        for gene_callers_id in gene_calls_dict:
            contig = gene_calls_dict[gene_callers_id]['contig']
            if contig in gene_calls_in_contigs_dict:
                gene_calls_in_contigs_dict[contig].add(gene_callers_id)
            else:
                gene_calls_in_contigs_dict[contig] = set([gene_callers_id])

        # computed before the empty sets are added below, so the report counts only contigs with genes
        contigs_without_any_gene_calls = list(set(self.contigs_info.keys()) - set(gene_calls_in_contigs_dict.keys()))
        self.run.info('Contigs with at least one gene call', '%d of %d (%.1f%%)' % (len(gene_calls_in_contigs_dict),
                                                                                    len(self.contigs_info),
                                                                                    len(gene_calls_in_contigs_dict) * 100.0 / len(self.contigs_info)))

        # give every remaining contig an empty set so the lookups in the loop below cannot fail
        for contig in contigs_without_any_gene_calls:
            gene_calls_in_contigs_dict[contig] = set([])

        splits_dict = {}
        for contig in self.contigs_info:
            for split_name in self.contig_name_to_splits[contig]:
                start = self.splits_info[split_name]['start']
                stop = self.splits_info[split_name]['end']

                gene_start_stops = []
                # here we go through all genes in the contig and identify the all the ones that happen to be in
                # this particular split to generate summarized info for each split. BUT one important that is done
                # in the following loop is genes_in_splits.add call, which populates GenesInSplits class.
                for gene_callers_id in gene_calls_in_contigs_dict[contig]:
                    if gene_calls_dict[gene_callers_id]['stop'] > start and gene_calls_dict[gene_callers_id]['start'] < stop:
                        gene_start_stops.append((gene_calls_dict[gene_callers_id]['start'], gene_calls_dict[gene_callers_id]['stop']), )
                        genes_in_splits.add(split_name, start, stop, gene_callers_id, gene_calls_dict[gene_callers_id]['start'], gene_calls_dict[gene_callers_id]['stop'])

                # here we identify genes that are associated with a split even if one base of the gene spills into
                # the defined start or stop of a split, which means, split N, will include genes A, B and C in this
                # scenario:
                #
                # contig: (...)------[ gene A ]--------[     gene B    ]----[gene C]---------[    gene D    ]-----(...)
                #         (...)----------x---------------------------------------x--------------------------------(...)
                #                        ^ (split N start)                       ^ (split N stop)
                #                        |                                       |
                #                        |<-              split N              ->|
                #
                # however, when looking at the coding versus non-coding nucleotide ratios in a split, we have to make
                # sure that only the relevant portion of gene A and gene C is counted:
                total_coding_nts = 0
                for gene_start, gene_stop in gene_start_stops:
                    # clip each gene to the split boundaries before counting its nucleotides
                    total_coding_nts += (gene_stop if gene_stop < stop else stop) - (gene_start if gene_start > start else start)

                splits_dict[split_name] = {'num_genes': len(gene_start_stops),
                                           'avg_gene_length': numpy.mean([(l[1] - l[0]) for l in gene_start_stops]) if len(gene_start_stops) else 0.0,
                                           'ratio_coding': total_coding_nts * 1.0 / (stop - start),
                                           }

        # open connection
        database = db.DB(self.db_path, utils.get_required_version_for_db(self.db_path))

        # push entries for genes in splits table: the entry id, followed by the values for every
        # column named in the table structure after the first one
        db_entries = [tuple([entry_id] + [genes_in_splits.splits_to_prots[entry_id][h] for h in t.genes_in_splits_table_structure[1:]]) for entry_id in genes_in_splits.splits_to_prots]
        database._exec_many('''INSERT INTO %s VALUES (?,?,?,?,?,?)''' % t.genes_in_splits_table_name, db_entries)

        # disconnect
        database.disconnect()